From 5f5ff69353589dfcf1cd722a6a425fb7db2c0617 Mon Sep 17 00:00:00 2001 From: Kuo Wei Date: Thu, 19 Mar 2026 13:45:49 +0000 Subject: [PATCH 1/3] Update DeepSeek V3 32-node BF16 NEMO26.02 recipe with optimized launcher flags #recipebot --- .../recipe/custom_setup_experiment.py | 272 +----------------- .../recipe/launcher.sh | 62 ++-- .../recipe/recipe_launch_command.sh | 1 - .../recipe/templates/workload-job.yaml | 4 +- .../recipe/values.yaml | 42 ++- 5 files changed, 64 insertions(+), 317 deletions(-) delete mode 100644 training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/recipe_launch_command.sh diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/custom_setup_experiment.py b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/custom_setup_experiment.py index 369cfa0a..2337fdec 100644 --- a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/custom_setup_experiment.py +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/custom_setup_experiment.py @@ -1,19 +1,3 @@ -#!/usr/bin/env python3 - -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - import glob import logging import os @@ -60,109 +44,6 @@ logger = logging.getLogger(__name__) -def check_training_finished(log_file_path: str) -> bool: - """Check if training is finished.""" - with open(log_file_path, "r") as f: - log_lines = f.readlines() - log = "\n".join(log_lines) - return "StopIteration" in log or "after training is done" in log or "exiting program at iteration" in log - - -def check_slurm_timeout(log_file_path: str) -> bool: - """Check if Slurm job timed out.""" - with open(log_file_path, "r") as f: - log_lines = f.readlines() - log = "\n".join(log_lines) - return "DUE TO TIME LIMIT" in log - - -def is_flaky_failure(log_file_path: str) -> bool: - """Check if Slurm job failed due to flaky failure.""" - with open(log_file_path, "r") as f: - log_lines = f.readlines() - log = "\n".join(log_lines) - - return ( - "The server socket has failed to listen on any local network address." in log - or "Some NCCL operations have failed or timed out." in log - or "uncorrectable ECC error encountered" in log - or "illegal memory access" in log - or "illegal instruction" in log - or "torch.distributed.DistNetworkError" in log - or "Segmentation fault" in log - or "found NaN in" in log - or "For debugging consider passing CUDA_LAUNCH_BLOCKING=1" in log - or "double free or corruption" in log - or "Call to CUDA function failed." in log - or "Connection reset by peer" in log - or "invalid pointer" in log - or "malloc(): unaligned tcache chunk detected" in log - or "zmq.error.ZMQError: Address already in use" in log - or "We couldn't connect to 'https://huggingface.co'" in log - or "Unpack failed: incomplete input" in log - or "unspecified launch failure" in log - or "free(): corrupted unsorted chunks" in log - or "Segfault encountered" in log - or "Fatal glibc error" in log - or "EOFError: No data left in file" in log - ) - - -def build_performance_config(args) -> Optional[Dict[str, Any]]: - """Build performance configuration from command-line arguments. - - Args: - args: Parsed command-line arguments - - Returns: - Dictionary with performance configuration or None if performance is disabled - """ - config = {} - - performance_params = { - "timing_threshold": args.timing_threshold, - "skip_first_percent_time": args.skip_first_percent_time, - } - - for key, value in performance_params.items(): - if value is not None: - config[key] = value - - return config if config else None - - -def ensure_logs_where_written(log_file_paths: List[str]): - """Ensure logs were written to disk.""" - if len(log_file_paths) != 1: - raise FileNotFoundError( - f"Unexpected number of log files found: {log_file_paths}. Expected 1, got {len(log_file_paths)}" - ) - - -def get_job_dir_and_status_from_run(exp_name: str): - """Get job directory and status from run.""" - result_dict = run.Experiment.from_title(exp_name).status(return_dict=True) - _, job_dict = list(result_dict.items())[0] - job_dir = job_dict["local_dir"] - job_status = str(job_dict["status"]) - return job_dir, job_status - - -def maybe_increase_n_attempts_on_flaky_failure( - n_attempts: int, - max_retries: int, - is_finished_experiment: bool, - is_long_convergence_run: bool, - log_file_paths: List[str], -): - """Maybe increase number of attempts.""" - if not is_finished_experiment and not is_long_convergence_run: - if is_flaky_failure(log_file_paths[-1]): - n_attempts += 1 - else: - n_attempts = max_retries # On non-flaky failures, we don't need to restart the experiment. - - return n_attempts def main( @@ -336,151 +217,14 @@ def main( logger.info("Will launch the following command with Nemo-Run: %s", " ".join(nemorun_script.to_command())) - is_finished_experiment = False # An experiment might consist of multiple training runs, due to restarts. - is_testing_passed = False # Whether the testing passed convergence and performance validation. - error_msg = None - n_attempts = 0 - exp_name = ( - exp_name[:37] if dgxc_cluster is not None else exp_name - ) # Some k8s clusters have a limit on the length of the experiment name. - wandb_run_id = None - while n_attempts <= max_retries: - while is_finished_experiment is False: - if HAVE_WANDB: - wandb_run_id = ( - (wandb_run_id or wandb.util.generate_id()) if is_long_convergence_run else wandb.util.generate_id() - ) - executor.env_vars.update( - { - "WANDB_RUN_ID": wandb_run_id, - "WANDB_RESUME": "allow", - } - ) - if wandb_key is not None: - executor.env_vars["WANDB_API_KEY"] = wandb_key - - run.run( - nemorun_script, - executor=executor, - plugins=plugins, - dryrun=dryrun, - detach=detach, - name=exp_name, - ) - if dryrun: - logger.info("dryrun requested: exiting") - return - - def _copy_logs_to_gcp(job_dir_path): - import shutil - import glob - - artifact_dir = os.environ.get("ARTIFACT_DIR", "/tmp/artifacts") - dest_logs_dir = os.path.join(artifact_dir, "logs") - os.makedirs(dest_logs_dir, exist_ok=True) - - try: - log_files = glob.glob(f"{job_dir_path}/log-*.out") + glob.glob(f"{job_dir_path}/log-*.err") - for log_f in log_files: - shutil.copy(log_f, dest_logs_dir) - msg = f"Copied {log_f} to {dest_logs_dir}" - print(msg) - logger.info(msg) - except Exception as e: - print(f"Failed to copy logs to GCP: {e}") - logger.error(f"Failed to copy logs to GCP: {e}") - - - job_dir, job_status = get_job_dir_and_status_from_run(exp_name) - - if job_status not in ["SUCCEEDED", "SUBMITTED", "PENDING", "RUNNING"]: - _copy_logs_to_gcp(job_dir) - raise Exception(f"Experiment failed for {exp_name} with status: {job_status}.") - - if detach: - is_finished_experiment = True - is_testing_passed = True - break - - log_file_paths = list(Path(f"{job_dir}").glob("log-*_0.out")) - ensure_logs_where_written(log_file_paths) - - is_finished_experiment = ( - check_training_finished(log_file_paths[-1]) if is_long_convergence_run else (job_status == "SUCCEEDED") - ) - - n_attempts = maybe_increase_n_attempts_on_flaky_failure( - n_attempts=n_attempts, - max_retries=max_retries, - is_finished_experiment=is_finished_experiment, - is_long_convergence_run=is_long_convergence_run, - log_file_paths=log_file_paths, - ) - - if not is_finished_experiment and n_attempts <= max_retries: - logger.error(f"Starting attempt {n_attempts + 1} of {max_retries + 1} for {exp_name}") - - if not is_finished_experiment: - break - - if is_finished_experiment is True and detach is False: - log_paths = sorted( - list(glob.glob(f"{get_nemorun_home()}/experiments/{exp_name}/{exp_name}_*/{exp_name}/log-*_0.out")) - ) - - if not is_long_convergence_run: - log_paths = [log_paths[-1]] - - logger.info(f"Starting convergence check for {model_family_name}_{model_recipe_name}") - wandb_run = None - if HAVE_WANDB and wandb_key: - wandb_run = wandb.init( - project=wandb_project_name, entity=wandb_entity_name, id=wandb_run_id, resume="allow" - ) - - logger.info("Waiting 10 seconds for I/O to settle") - time.sleep(10) - - is_testing_passed, error_msg = calc_convergence_and_performance( - model_family_name=model_family_name, - model_recipe_name=model_recipe_name, - assets_dir=os.path.join(job_dir, exp_name), - log_paths=log_paths, - loss_metric="lm loss", - timing_metric="elapsed time per iteration (ms)", - alloc_metric="alloc", - max_alloc_metric="max_alloc", - golden_values_path=golden_values_path, - convergence_config=convergence_params, - performance_config=performance_params, - memory_config=memory_params, - wandb_run=wandb_run, - ) - - if wandb_run: - wandb_run.finish() - wandb.teardown(exit_code=int(not is_testing_passed)) - - if not is_long_convergence_run: - n_attempts = max_retries - is_finished_experiment = True - if not is_testing_passed: - _copy_logs_to_gcp(job_dir) - break - - if is_finished_experiment and is_testing_passed: - break - - if not is_testing_passed and error_msg is not None: - raise AssertionError(error_msg) - if is_testing_passed and error_msg is not None: - logger.warning(error_msg) - - if not is_finished_experiment: - _copy_logs_to_gcp(job_dir) - raise Exception("Megatron-Bridge CI test job failed") - elif is_finished_experiment and not detach: - logger.info("Megatron-Bridge CI test job completed successfully!") + run.run( + nemorun_script, + executor=executor, + plugins=plugins, + dryrun=dryrun, + detach=detach, + name=exp_name, + ) if __name__ == "__main__": diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/launcher.sh b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/launcher.sh index 3cb08b61..dd15b2d0 100644 --- a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/launcher.sh +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/launcher.sh @@ -7,7 +7,7 @@ EOF } parse_args() { - while [ "$1" != "" ]; do + while [[ "$1" != "" ]]; do case $(grep -o "=" <<< "$1" | wc -l) in 1 ) config_overrides+=("$1") @@ -25,15 +25,15 @@ parse_args() { config_overrides=() parse_args "$@" -if [ -z "${config_overrides}" ]; then +if [[ -z "${config_overrides[*]}" ]]; then echo "No NeMo config overrides specified" else echo "NeMo config overrides:" echo " ${config_overrides}" fi -export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH:/usr/local/nvidia/lib64" -ldconfig $LD_LIBRARY_PATH +export LD_LIBRARY_PATH="/usr/local/cuda/compat/lib:$NCCL_PLUGIN_PATH:$LD_LIBRARY_PATH" +ldconfig "$LD_LIBRARY_PATH" echo "Added $LD_LIBRARY_PATH to ldconfig:" ldconfig -p | grep libcuda | sed 's/^/ /' echo "" @@ -47,7 +47,7 @@ echo "Logging to ${explicit_log_dir}" if [[ -n "${TOKENIZER_PATH}" ]]; then echo "Getting tokenizer files" - cp ${TOKENIZER_PATH}/* . + cp "${TOKENIZER_PATH}"/* . echo "" fi @@ -56,14 +56,22 @@ echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger # Create the nsys directory. -mkdir -p ${explicit_log_dir}/nsys - -if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then - echo "--- DEBUG libnccl-env.so ---" - ls -la /usr/local/gib/lib/libnccl-env.so || echo "libnccl-env.so not found" - ls -lh /usr/local/gib/lib - echo "----------------------------" +mkdir -p "${explicit_log_dir}/nsys" + +# Collect diagnostics to a single line +kv="\"kernel_version\": \"$(uname --kernel-release)\"" +if command -v nvidia-smi &> /dev/null; then + cuda_v=$(nvidia-smi -q -x | grep -Po '(?<=).*(?=)' || true) + driver_v=$(nvidia-smi -q -x | grep -Po '(?<=).*(?=)' || true) + vbios_v=$(nvidia-smi -q -x | grep -Po '(?<=).*(?=)' | head -n1 || true) + kv="${kv}, \"cuda_version\": \"${cuda_v}\"" + kv="${kv}, \"driver_version\": \"${driver_v}\"" + kv="${kv}, \"vbios_version\": \"${vbios_v}\"" fi +echo "VERSION_DIAGNOSTICS: {${kv}}" + + +export HF_TOKEN=YOUR_HF_TOKEN cd /opt rm -rf Megatron-Bridge @@ -71,17 +79,13 @@ git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git cd Megatron-Bridge git checkout f7a9428f301fa17ac374d5e7166a63b0aa4771af git submodule update --init --recursive -sed -i -e '/return config/i \ config.dist.distributed_timeout_minutes = 30' scripts/performance/run_recipe.py +sed -i -e '/pretrain(config=recipe/i \ recipe.dist.distributed_timeout_minutes = 60' scripts/performance/run_script.py ls cp $CUSTOM_SETUP_EXPERIMENT_SCRIPT_PATH scripts/performance/ worker_command=$(cat <<- EOM if [ "\$RANK" -eq "0" ]; then - echo "--- LOCATING MEGATRON LIBRARIES ---" ; - python -c "import megatron.core; print('megatron.core:', megatron.core.__file__)" || echo "megatron.core not found" ; - python -c "import megatron.bridge; print('megatron.bridge:', megatron.bridge.__file__)" || echo "megatron.bridge not found" ; - echo "-----------------------------------" ; echo "Worker 0 is stalling for a few seconds.." ; sleep 3 ; echo "The detected environment within worker rank 0 is:" ; @@ -89,9 +93,8 @@ worker_command=$(cat <<- EOM fi ; cd /opt/Megatron-Bridge ; - export PYTHONPATH="/opt/Megatron-Bridge:/opt/Megatron-Bridge/3rdparty/Megatron-LM:\$PYTHONPATH" ; - exec numactl \ + numactl \ --cpunodebind=\$((LOCAL_RANK/4)) \ --membind=\$((LOCAL_RANK/4)) nsys profile \ -t nvtx,cuda \ @@ -100,7 +103,7 @@ worker_command=$(cat <<- EOM --capture-range=cudaProfilerApi \ --capture-range-end=stop \ --kill none \ - -o /${explicit_log_dir}/$JOB_IDENTIFIER/rank-\$RANK \ + -o "/${explicit_log_dir}/$JOB_IDENTIFIER/rank-\$RANK" \ --force-overwrite true \ --session-new "nsys-\$RANDOM-\$RANK" \ nice -10 \ @@ -110,16 +113,19 @@ worker_command=$(cat <<- EOM --model_recipe_name deepseek_v3 \ --gpus_per_node 8 \ --num_gpus 256 \ + --compute_dtype bf16 \ + --seq_length 4096 \ --global_batch_size 2048 \ --micro_batch_size 1 \ - --seq_length 4096 \ --tensor_model_parallel_size 1 \ --pipeline_model_parallel_size 16 \ + --expert_model_parallel_size 8 \ + --expert_tensor_parallel_size 1 \ --context_parallel_size 1 \ --virtual_pipeline_model_parallel_size None \ - --expert_model_parallel_size 8 \ - --compute_dtype bf16 \ - --max_steps 30 dist.distributed_timeout_minutes=30 + --recompute_modules mla_up_proj \ + --moe_a2a_overlap False \ + --max_steps 30 EOM ) @@ -138,10 +144,10 @@ torchrun \ if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then - mkdir -p ${ARTIFACT_DIR} - cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ - env > ${ARTIFACT_DIR}/environ.txt - ls ${ARTIFACT_DIR} + mkdir -p "${ARTIFACT_DIR}" + cp -r "${explicit_log_dir}"/* "${ARTIFACT_DIR}/" + env > "${ARTIFACT_DIR}/environ.txt" + ls "${ARTIFACT_DIR}" fi echo "Training completed" echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/recipe_launch_command.sh b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/recipe_launch_command.sh deleted file mode 100644 index 892961cb..00000000 --- a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/recipe_launch_command.sh +++ /dev/null @@ -1 +0,0 @@ -helm install joeywan-ubench-6wsw . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=/tmp/ubench_recipe/joeywan-ubench-6wsw/custom_setup_experiment.py --set workload.image=nvcr.io/nvidia/nemo:26.02 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/joeywan-ubench-6wsw --set queue=a4 \ No newline at end of file diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-job.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-job.yaml index 54efbb6b..b4ffa210 100644 --- a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-job.yaml +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-job.yaml @@ -62,7 +62,7 @@ spec: gke-parallelstore/memory-limit: "0" {{- end }} {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} - kueue.x-k8s.io/podset-preferred-topology: {{ .Values.tasSettings.topologyRequest | default "kubernetes.io/hostname" }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} {{- end }} {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" @@ -245,7 +245,7 @@ spec: value: "{{ $gpusPerNode }}" - name: NCCL_PLUGIN_PATH - value: /usr/local/gib/lib64 + value: /usr/local/gib/lib64:/usr/local/nvidia/lib64 {{ if $root.Values.network.gibVersion }} - name: NCCL_INIT_SCRIPT diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/values.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/values.yaml index 05e98e12..cb73da9b 100644 --- a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/values.yaml +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/values.yaml @@ -1,35 +1,33 @@ +queue: null dwsSettings: maxRunDurationSeconds: null -network: - gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1 - hostNetwork: true - ncclSettings: - - name: NCCL_DEBUG - value: INFO - - name: NCCL_TIMEOUT - value: '7200000' - subnetworks[]: null -queue: null tasSettings: topologyRequest: kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname volumes: - gcsMounts: - - bucketName: null - mountPath: null gcsVolumes: true psVolumes: false + gcsMounts: + - bucketName: null + mountPath: null workload: + gpus: 256 + image: nvcr.io/nvidia/nemo:26.02 + defaultArguments[]: null arguments[]: null configFile: custom_setup_experiment.py configPath: /workload/configs/ - defaultArguments[]: null envs: - - name: ARTIFACT_DIR - value: null - - name: GLOO_SOCKET_IFNAME - value: eth0 - - name: CUSTOM_SETUP_EXPERIMENT_SCRIPT_PATH - value: /workload/configs/custom_setup_experiment.py - gpus: 256 - image: nvcr.io/nvidia/nemo:26.02 + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: CUSTOM_SETUP_EXPERIMENT_SCRIPT_PATH + value: /workload/configs/custom_setup_experiment.py +network: + hostNetwork: true + subnetworks[]: null + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1 + ncclSettings: + - name: NCCL_DEBUG + value: WARN From a88fec2bfb975316d9262ffff7d868ecda288fe1 Mon Sep 17 00:00:00 2001 From: Kuo Wei Date: Mon, 23 Mar 2026 09:54:04 +0000 Subject: [PATCH 2/3] Move 32node recipe into nemo2602 subdirectory and update README --- .../32node-BF16-SEQ4096-GBS2048}/recipe/Chart.yaml | 0 .../32node-BF16-SEQ4096-GBS2048}/recipe/README.md | 2 +- .../recipe/custom_setup_experiment.py | 0 .../32node-BF16-SEQ4096-GBS2048}/recipe/launcher.sh | 0 .../recipe/templates/workload-config-configmap.yaml | 0 .../recipe/templates/workload-job.yaml | 0 .../recipe/templates/workload-launcher-configmap.yaml | 0 .../recipe/templates/workload-svc.yaml | 0 .../32node-BF16-SEQ4096-GBS2048}/recipe/values.yaml | 0 9 files changed, 1 insertion(+), 1 deletion(-) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO26.02 => nemo2602/32node-BF16-SEQ4096-GBS2048}/recipe/Chart.yaml (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO26.02 => nemo2602/32node-BF16-SEQ4096-GBS2048}/recipe/README.md (98%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO26.02 => nemo2602/32node-BF16-SEQ4096-GBS2048}/recipe/custom_setup_experiment.py (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO26.02 => nemo2602/32node-BF16-SEQ4096-GBS2048}/recipe/launcher.sh (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO26.02 => nemo2602/32node-BF16-SEQ4096-GBS2048}/recipe/templates/workload-config-configmap.yaml (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO26.02 => nemo2602/32node-BF16-SEQ4096-GBS2048}/recipe/templates/workload-job.yaml (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO26.02 => nemo2602/32node-BF16-SEQ4096-GBS2048}/recipe/templates/workload-launcher-configmap.yaml (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO26.02 => nemo2602/32node-BF16-SEQ4096-GBS2048}/recipe/templates/workload-svc.yaml (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO26.02 => nemo2602/32node-BF16-SEQ4096-GBS2048}/recipe/values.yaml (100%) diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/Chart.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/Chart.yaml similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/Chart.yaml rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/Chart.yaml diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/README.md b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/README.md similarity index 98% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/README.md rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/README.md index fc9352fb..f3954712 100644 --- a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/README.md +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/README.md @@ -75,7 +75,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/custom_setup_experiment.py b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/custom_setup_experiment.py similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/custom_setup_experiment.py rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/custom_setup_experiment.py diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/launcher.sh b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/launcher.sh similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/launcher.sh rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/launcher.sh diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-config-configmap.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-config-configmap.yaml similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-config-configmap.yaml rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-config-configmap.yaml diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-job.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-job.yaml similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-job.yaml rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-job.yaml diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-launcher-configmap.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-launcher-configmap.yaml similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-launcher-configmap.yaml rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-launcher-configmap.yaml diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-svc.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-svc.yaml similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-svc.yaml rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-svc.yaml diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/values.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/values.yaml similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/values.yaml rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/values.yaml From ac7eb192fb7c6933150be6dcc728485cf4e78ba0 Mon Sep 17 00:00:00 2001 From: Kuo Wei Date: Mon, 23 Mar 2026 10:10:53 +0000 Subject: [PATCH 3/3] Move 32node NEMO25.11 recipe into nemo2511 subdirectory --- .../32node-BF16-SEQ4096-GBS2048}/recipe/Chart.yaml | 0 .../32node-BF16-SEQ4096-GBS2048}/recipe/README.md | 2 +- .../recipe/custom_setup_experiment.py | 0 .../32node-BF16-SEQ4096-GBS2048}/recipe/launcher.sh | 0 .../recipe/recipe_launch_command.sh | 0 .../recipe/templates/workload-config-configmap.yaml | 0 .../recipe/templates/workload-job.yaml | 0 .../recipe/templates/workload-launcher-configmap.yaml | 0 .../recipe/templates/workload-svc.yaml | 0 .../32node-BF16-SEQ4096-GBS2048}/recipe/values.yaml | 0 10 files changed, 1 insertion(+), 1 deletion(-) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO25.11 => nemo2511/32node-BF16-SEQ4096-GBS2048}/recipe/Chart.yaml (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO25.11 => nemo2511/32node-BF16-SEQ4096-GBS2048}/recipe/README.md (98%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO25.11 => nemo2511/32node-BF16-SEQ4096-GBS2048}/recipe/custom_setup_experiment.py (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO25.11 => nemo2511/32node-BF16-SEQ4096-GBS2048}/recipe/launcher.sh (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO25.11 => nemo2511/32node-BF16-SEQ4096-GBS2048}/recipe/recipe_launch_command.sh (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO25.11 => nemo2511/32node-BF16-SEQ4096-GBS2048}/recipe/templates/workload-config-configmap.yaml (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO25.11 => nemo2511/32node-BF16-SEQ4096-GBS2048}/recipe/templates/workload-job.yaml (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO25.11 => nemo2511/32node-BF16-SEQ4096-GBS2048}/recipe/templates/workload-launcher-configmap.yaml (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO25.11 => nemo2511/32node-BF16-SEQ4096-GBS2048}/recipe/templates/workload-svc.yaml (100%) rename training/a4/deepseek_v3/megatron-bridge-pretraining-gke/{32node-BF16-SEQ4096-GBS2048-NEMO25.11 => nemo2511/32node-BF16-SEQ4096-GBS2048}/recipe/values.yaml (100%) diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/Chart.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/Chart.yaml similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/Chart.yaml rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/Chart.yaml diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/README.md b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/README.md similarity index 98% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/README.md rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/README.md index aa487339..c5095c27 100644 --- a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/README.md +++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/README.md @@ -75,7 +75,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder. git clone https://github.com/ai-hypercomputer/gpu-recipes.git cd gpu-recipes export REPO_ROOT=`git rev-parse --show-toplevel` -export RECIPE_ROOT=$REPO_ROOT/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe +export RECIPE_ROOT=$REPO_ROOT/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe cd $RECIPE_ROOT ``` diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/custom_setup_experiment.py b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/custom_setup_experiment.py similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/custom_setup_experiment.py rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/custom_setup_experiment.py diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/launcher.sh b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/launcher.sh similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/launcher.sh rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/launcher.sh diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/recipe_launch_command.sh b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/recipe_launch_command.sh similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/recipe_launch_command.sh rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/recipe_launch_command.sh diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-config-configmap.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-config-configmap.yaml similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-config-configmap.yaml rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-config-configmap.yaml diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-job.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-job.yaml similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-job.yaml rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-job.yaml diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-launcher-configmap.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-launcher-configmap.yaml similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-launcher-configmap.yaml rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-launcher-configmap.yaml diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-svc.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-svc.yaml similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-svc.yaml rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-svc.yaml diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/values.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/values.yaml similarity index 100% rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/values.yaml rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/values.yaml