From 52b7db69b447b963d26bc05c0ceaaf63856d3849 Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 18:32:10 +0000 Subject: [PATCH 1/9] Recipe for llama3.1-8b 16nodes with gbs 256/seq 8192 --- .../16node-bf16-seq8192-gbs256/Chart.yaml | 20 ++ .../16node-bf16-seq8192-gbs256/README.md | 153 ++++++++ .../16node-bf16-seq8192-gbs256/launcher.sh | 106 ++++++ ...llama3-1-8b-bf16-seq8192-gbs256-gpus128.py | 142 ++++++++ .../recipe_launch_command.sh | 1 + .../templates/workload-config-configmap.yaml | 28 ++ .../templates/workload-job.yaml | 333 ++++++++++++++++++ .../workload-launcher-configmap.yaml | 28 ++ .../templates/workload-svc.yaml | 22 ++ .../16node-bf16-seq8192-gbs256/values.yaml | 33 ++ 10 files changed, 866 insertions(+) create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/Chart.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/launcher.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus128.py create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/recipe_launch_command.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-job.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-svc.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/values.yaml diff --git 
a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/Chart.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/Chart.yaml new file mode 100644 index 00000000..af46c11a --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md new file mode 100644 index 00000000..483ed116 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md @@ -0,0 +1,153 @@ + +# Pretrain llama3-1-8b workloads on a4 GKE Node pools with Nvidia NeMo Framework + +This recipe outlines the steps for running a llama3-1-8b pretraining +workload on [a4 GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA NeMo framework](https://github.com/NVIDIA/nemo). 
+ +## Orchestration and deployment tools + +For this recipe, the following setup is used: + +- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +- Pretraining job configuration and deployment - A Helm chart is used to + configure and deploy the [Kubernetes Jobset](https://kubernetes.io/blog/2025/03/23/introducing-jobset) resource which manages the execution of the + [NeMo pretraining workload](https://github.com/NVIDIA/nemo). + +## Test environment + +This recipe has been optimized for and tested with the following configuration: + +- GKE cluster +Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4) +to create your a4 GKE cluster. + +## Training dataset + +This recipe uses a mock pretraining dataset provided by the NeMo framework. + +## Docker container image + +This recipe uses the following docker images: + +- `nvcr.io/nvidia/nemo:25.07` +- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0` + +## Run the recipe + +From your client workstation, complete the following steps: + +### Configure environment settings + +Set the environment variables to match your environment: + + ```bash + export PROJECT_ID= + export CLUSTER_REGION= + export CLUSTER_NAME= + export GCS_BUCKET= # Note: path should not be prefixed with gs:// + export KUEUE_NAME= + ``` + +Replace the following values: + + - ``: your Google Cloud project ID. + - ``: the region where your cluster is located. + - ``: the name of your GKE cluster. + - ``: the name of your Cloud Storage bucket. Don't include the `gs://` prefix. + - ``: the name of the Kueue local queue. The default queue created by the cluster toolkit is `a4`. Make sure to verify the name of the local queue in your cluster. + +Set the default project: + + ```bash + gcloud config set project $PROJECT_ID + ``` + +### Get the recipe + +Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
+ +``` +git clone https://github.com/ai-hypercomputer/gpu-recipes.git +cd gpu-recipes +export REPO_ROOT=`git rev-parse --show-toplevel` +export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256 +cd $RECIPE_ROOT +``` + +### Get cluster credentials + +``` +gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION +``` + +### Configure and submit a pretraining job + +#### Using 16 node (128 gpus) bf16 precision +To execute the job with the default settings, run the following command from +your client: + +```bash +cd $RECIPE_ROOT +export WORKLOAD_NAME=$USER-a4-llama3-1-8b-16node +helm install $WORKLOAD_NAME . -f values.yaml \ +--set-file workload_launcher=launcher.sh \ +--set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus128.py \ +--set workload.image=nvcr.io/nvidia/nemo:25.07 \ +--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ +--set volumes.gcsMounts[0].mountPath=/job-logs \ +--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ +--set queue=${KUEUE_NAME} +``` + +**Examples** + +- To set the number of training steps to 100, run the following command from + your client: + + ```bash + cd $RECIPE_ROOT + export WORKLOAD_NAME=$USER-a4-llama3-1-8b-16node + helm install $WORKLOAD_NAME . -f values.yaml \ + --set-file workload_launcher=launcher.sh \ + --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus128.py \ + --set workload.image=nvcr.io/nvidia/nemo:25.07 \ + --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ + --set volumes.gcsMounts[0].mountPath=/job-logs \ + --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ + --set queue=${KUEUE_NAME} \ + --set workload.arguments[0]="trainer.max_steps=100" + ``` + +### Monitor the job + +To check the status of pods in your job, run the following command: + +``` +kubectl get pods | grep $USER-a4-llama3-1-8b-16node +``` + +Replace the following: + +- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4-llama3-1-8b-16node. 
+ +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-a4-llama3-1-8b-16node-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. To +uninstall Helm, run the following command from your client: + +```bash +helm uninstall $USER-a4-llama3-1-8b-16node +``` \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/launcher.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/launcher.sh new file mode 100644 index 00000000..daf2bd09 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/launcher.sh @@ -0,0 +1,106 @@ +usage() +{ +cat << EOF +usage: bash ./launcher.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. 
+EOF +} + +parse_args() { + while [ "$1" != "" ]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [ -z "${config_overrides}" ]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then + export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH" + ldconfig $LD_LIBRARY_PATH + echo "Added $LD_LIBRARY_PATH to ldconfig:" + ldconfig -p | grep libcuda | sed 's/^/ /' + echo "" +fi + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp ${TOKENIZER_PATH}/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +export HF_TOKEN="" + +# Export the nemo2 config to yaml. +python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=16 \ +trainer.devices=8 \ +${config_overrides} \ +--to-yaml exported_nemo_config.yaml + +# Create the nsys directory. 
+mkdir -p ${explicit_log_dir}/nsys + +OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \ +/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \ +-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \ +--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \ +--wait all \ +torchrun \ +--nproc-per-node="8" \ +--nnodes="${NNODES}" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=16 \ +trainer.devices=8 \ +${config_overrides} + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py + cp dllogger.json ${ARTIFACT_DIR}/dllogger.json + cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus128.py b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus128.py new file mode 100644 index 00000000..4ae82ea4 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus128.py @@ -0,0 +1,142 @@ +"""Nemo2 pretraining recipe for Llama 3.1 8B model.""" + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama31_8b +from nemo.lightning.pytorch.callbacks import NsysCallback +from nemo.lightning.pytorch.callbacks.flops_callback import 
FLOPsMeasurementCallback +from nemo.utils.loggers.dllogger import DLLogger +import nemo_run as run +from scripts.performance.helpers import ( + set_primary_perf_configs, +) +from scripts.performance.utils import get_comm_overlap_callback_idx + + +def recipe( + profile_enabled: bool = False, + profile_start_step: int = 0, + profile_end_step: int = 0, + profile_ranks: str = "0", +) -> run.Partial: + """Returns a Nemo2 training recipe for Llama 3.1 8B model. + + Args: + profile_enabled: Whether to enable Nsys profiling. + profile_start_step: The step to start profiling. + profile_end_step: The step to end profiling. + profile_ranks: The ranks to profile, comma separated. + + Returns: + A Nemo2 training recipe. + """ + # Start from the Nemo standard recipe. + pretrain = llama31_8b.pretrain_recipe(performance_mode=True) + + num_nodes = 16 + num_gpus_per_node = 8 + mbs = 2 + gbs = 256 + max_steps = 30 + tp_size = 1 + pp_size = 1 + cp_size = 1 + vp_size = 1 # Virtual Pipeline Parallelism + ep_size = 1 # Expert Parallelism + enable_cuda_graphs = False + compute_dtype = "bf16" + fp8_recipe = None # Not needed for bf16 + nccl_communicator_config_path = None + use_mcore_fsdp = False + use_fsdp_double_buffer = False + use_user_buffer_registration = False + use_sharp = False + keep_fsdp_fp8_transpose_cache = False + + pretrain = set_primary_perf_configs( + pretrain, + "pre_train", + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + mbs=mbs, + gbs=gbs, + max_steps=max_steps, + tp_size=tp_size, + pp_size=pp_size, + cp_size=cp_size, + vp_size=vp_size, + ep_size=ep_size, + enable_cuda_graphs=enable_cuda_graphs, + compute_dtype=compute_dtype, + fp8_recipe=fp8_recipe, + nccl_communicator_config_path=nccl_communicator_config_path, + use_mcore_fsdp=use_mcore_fsdp, + use_fsdp_double_buffer=use_fsdp_double_buffer, + use_user_buffer_registration=use_user_buffer_registration, + use_sharp=use_sharp, + keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache, + ) + + # 
Sequence Length (model and data) + pretrain.model.config.seq_length = 8192 + pretrain.data.seq_length = 8192 + + # Set the number of steps to 50 for a quicker benchmark. + pretrain.trainer.max_steps = 50 + + # Disable validation batches. + pretrain.trainer.limit_val_batches = 0.0 + pretrain.trainer.val_check_interval = 0 + + # Add the Nsys profiling callback if enabled. + if profile_enabled: + pretrain.trainer.callbacks.append( + run.Config( + NsysCallback, + start_step=profile_start_step, + end_step=profile_end_step, + ranks=[int(x) for x in profile_ranks.split(",")], + gen_shape=False, + ) + ) + + # Add the FLOPs measurement callback. + pretrain.trainer.callbacks.append( + run.Config( + FLOPsMeasurementCallback, + model_name="llama31-8b", + model_config=pretrain.model.config, + data_config=pretrain.data, + ) + ) + + # When `performance_mode` is enabled, the Megatron communication overlap + # callback is already added to the recipe. + # https://github.com/NVIDIA-NeMo/NeMo/blob/90a396a567ebb4e8c1c41e454dc00cb71f911317/nemo/collections/llm/recipes/llama31_8b.py#L231 + comm_overlap_callback_idx = get_comm_overlap_callback_idx( + pretrain.trainer.callbacks + ) + pretrain.trainer.callbacks[ + comm_overlap_callback_idx + ].tp_comm_bootstrap_backend = "nccl" + + # Disable checkpointing. + pretrain.log.ckpt = None + pretrain.trainer.enable_checkpointing = False + + # Log every step. 
+ pretrain.trainer.log_every_n_steps = 1 + + # Enable DLLogger + dllogger_config = run.Config( + DLLogger, + verbose=True, + stdout=True, + json_file="dllogger.json", + ) + pretrain.log.extra_loggers = [dllogger_config] + + return pretrain + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_factory=recipe) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/recipe_launch_command.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/recipe_launch_command.sh new file mode 100644 index 00000000..29ae3f6a --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install vishwasreddy-ubench-6jdz . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus128.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-6jdz \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml new file mode 100644 index 00000000..a1d54cee --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-job.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-job.yaml new file mode 100644 index 00000000..ae59e456 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-job.yaml @@ -0,0 +1,333 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 8 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 8}} +{{- $root := . 
-}} + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "500m" + gke-gcsfuse/memory-limit: "1Ti" + gke-gcsfuse/ephemeral-storage-limit: "2Ti" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i }}","network":"{{ $subnetwork }}"}{{ eq $i 9 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 8 }} + 
{"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 7 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ 
$gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index provided by the Indexed Job + # This is crucial for torch.distributed initialization. 
+ - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a4.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . }} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . 
}} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml new file mode 100644 index 00000000..7026e0f1 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-svc.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-svc.yaml new file mode 100644 index 00000000..7cfe220b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/values.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/values.yaml new file mode 100644 index 00000000..d82e91f8 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/values.yaml @@ -0,0 +1,33 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: llama3-1-8b-bf16-seq8192-gbs256-gpus128.py + configPath: /workload/configs/ + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NEMO_LAUNCH_SCRIPT + value: /workload/configs/llama3-1-8b-bf16-seq8192-gbs256-gpus128.py + gpus: 128 + image: nvcr.io/nvidia/nemo:25.07 From 5f2a0253cb2e7a787f479c56f0a0df166ad8ee2b Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 11:59:08 -0700 Subject: [PATCH 2/9] Update WORKLOAD_NAME in README for consistency --- .../16node-bf16-seq8192-gbs256/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md index 483ed116..ba00e866 100644 --- a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md +++ 
b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md @@ -89,7 +89,7 @@ your client: ```bash cd $RECIPE_ROOT -export WORKLOAD_NAME=$USER-a4-llama3-1-8b-16node +export WORKLOAD_NAME=$USER-a4-llama3-1-8b helm install $WORKLOAD_NAME . -f values.yaml \ --set-file workload_launcher=launcher.sh \ --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus128.py \ @@ -107,7 +107,7 @@ helm install $WORKLOAD_NAME . -f values.yaml \ ```bash cd $RECIPE_ROOT - export WORKLOAD_NAME=$USER-a4-llama3-1-8b-16node + export WORKLOAD_NAME=$USER-a4-llama3-1-8b helm install $WORKLOAD_NAME . -f values.yaml \ --set-file workload_launcher=launcher.sh \ --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus128.py \ @@ -124,7 +124,7 @@ helm install $WORKLOAD_NAME . -f values.yaml \ To check the status of pods in your job, run the following command: ``` -kubectl get pods | grep $USER-a4-llama3-1-8b-16node +kubectl get pods | grep $USER-a4-llama3-1-8b ``` Replace the following: @@ -141,7 +141,7 @@ Information about the training job's progress, including crucial details such as loss, step count, and step time, is generated by the rank 0 process. This process runs on the pod whose name begins with `JOB_NAME_PREFIX-workload-0-0`. -For example: `$USER-a4-llama3-1-8b-16node-workload-0-0-s9zrv`. +For example: `$USER-a4-llama3-1-8b-workload-0-0-s9zrv`. ### Uninstall the Helm release @@ -149,5 +149,5 @@ You can delete the job and other resources created by the Helm chart. 
To uninstall Helm, run the following command from your client: ```bash -helm uninstall $USER-a4-llama3-1-8b-16node -``` \ No newline at end of file +helm uninstall $USER-a4-llama3-1-8b +``` From 8f5c02ebdc3e9849596c6f110df3bd8363bfbf3d Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 19:31:17 +0000 Subject: [PATCH 3/9] feat: add 8-node bf16 recipe for llama3-1-8b --- .../8node-bf16-seq8192-gbs256/Chart.yaml | 20 ++ .../8node-bf16-seq8192-gbs256/README.md | 153 ++++++++ .../8node-bf16-seq8192-gbs256/launcher.sh | 106 ++++++ .../llama3-1-8b-bf16-seq8192-gbs256-gpus64.py | 142 ++++++++ .../recipe_launch_command.sh | 1 + .../templates/workload-config-configmap.yaml | 28 ++ .../templates/workload-job.yaml | 333 ++++++++++++++++++ .../workload-launcher-configmap.yaml | 28 ++ .../templates/workload-svc.yaml | 22 ++ .../8node-bf16-seq8192-gbs256/values.yaml | 33 ++ 10 files changed, 866 insertions(+) create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/Chart.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/launcher.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus64.py create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/recipe_launch_command.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-job.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-svc.yaml create mode 100644 
training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/values.yaml diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/Chart.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/Chart.yaml new file mode 100644 index 00000000..af46c11a --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md new file mode 100644 index 00000000..cc666f48 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md @@ -0,0 +1,153 @@ + +# Pretrain llama3-1-8b workloads on a4 GKE Node pools with Nvidia NeMo Framework + +This recipe outlines the steps for running a llama3-1-8b pretraining +workload on [a4 GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA NeMo framework](https://github.com/NVIDIA/nemo). 
+
+## Orchestration and deployment tools
+
+For this recipe, the following setup is used:
+
+- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine)
+- Pretraining job configuration and deployment - A Helm chart is used to
+  configure and deploy the [Kubernetes Jobset](https://kubernetes.io/blog/2025/03/23/introducing-jobset) resource which manages the execution of the
+  [NeMo pretraining workload](https://github.com/NVIDIA/nemo).
+
+## Test environment
+
+This recipe has been optimized for and tested with the following configuration:
+
+- GKE cluster
+Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4)
+to create your a4 GKE cluster.
+
+## Training dataset
+
+This recipe uses a mock pretraining dataset provided by the NeMo framework.
+
+## Docker container image
+
+This recipe uses the following docker images:
+
+- `nvcr.io/nvidia/nemo:25.07`
+- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0`
+
+## Run the recipe
+
+From your client workstation, complete the following steps:
+
+### Configure environment settings
+
+Set the environment variables to match your environment:
+
+  ```bash
+  export PROJECT_ID=<PROJECT_ID>
+  export CLUSTER_REGION=<CLUSTER_REGION>
+  export CLUSTER_NAME=<CLUSTER_NAME>
+  export GCS_BUCKET=<GCS_BUCKET> # Note: path should not be prefixed with gs://
+  export KUEUE_NAME=<KUEUE_NAME>
+  ```
+
+Replace the following values:
+
+  - `<PROJECT_ID>`: your Google Cloud project ID.
+  - `<CLUSTER_REGION>`: the region where your cluster is located.
+  - `<CLUSTER_NAME>`: the name of your GKE cluster.
+  - `<GCS_BUCKET>`: the name of your Cloud Storage bucket. Don't include the `gs://` prefix.
+  - `<KUEUE_NAME>`: the name of the Kueue local queue. The default queue created by the cluster toolkit is `a4`. Make sure to verify the name of the local queue in your cluster.
+
+Set the default project:
+
+  ```bash
+  gcloud config set project $PROJECT_ID
+  ```
+
+### Get the recipe
+
+Clone the `gpu-recipes` repository and set a reference to the recipe folder.
+
+```
+git clone https://github.com/ai-hypercomputer/gpu-recipes.git
+cd gpu-recipes
+export REPO_ROOT=`git rev-parse --show-toplevel`
+export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256
+cd $RECIPE_ROOT
+```
+
+### Get cluster credentials
+
+```
+gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION
+```
+
+### Configure and submit a pretraining job
+
+#### Using 8 node (64 gpus) bf16 precision
+To execute the job with the default settings, run the following command from
+your client:
+
+```bash
+cd $RECIPE_ROOT
+export WORKLOAD_NAME=$USER-a4-llama3-1-8b-8node
+helm install $WORKLOAD_NAME . -f values.yaml \
+--set-file workload_launcher=launcher.sh \
+--set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus64.py \
+--set workload.image=nvcr.io/nvidia/nemo:25.07 \
+--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
+--set volumes.gcsMounts[0].mountPath=/job-logs \
+--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \
+--set queue=${KUEUE_NAME}
+```
+
+**Examples**
+
+- To set the number of training steps to 100, run the following command from
+  your client:
+
+  ```bash
+  cd $RECIPE_ROOT
+  export WORKLOAD_NAME=$USER-a4-llama3-1-8b-8node
+  helm install $WORKLOAD_NAME . -f values.yaml \
+  --set-file workload_launcher=launcher.sh \
+  --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus64.py \
+  --set workload.image=nvcr.io/nvidia/nemo:25.07 \
+  --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
+  --set volumes.gcsMounts[0].mountPath=/job-logs \
+  --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \
+  --set queue=${KUEUE_NAME} \
+  --set workload.arguments[0]="trainer.max_steps=100"
+  ```
+
+### Monitor the job
+
+To check the status of pods in your job, run the following command:
+
+```
+kubectl get pods | grep $USER-a4-llama3-1-8b-8node
+```
+
+Replace the following:
+
+- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4-llama3-1-8b-8node.
+ +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-a4-llama3-1-8b-8node-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. To +uninstall Helm, run the following command from your client: + +```bash +helm uninstall $USER-a4-llama3-1-8b-8node +``` \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/launcher.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/launcher.sh new file mode 100644 index 00000000..357d27a4 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/launcher.sh @@ -0,0 +1,106 @@ +usage() +{ +cat << EOF +usage: bash ./launcher.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. 
+EOF +} + +parse_args() { + while [ "$1" != "" ]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [ -z "${config_overrides}" ]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then + export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH" + ldconfig $LD_LIBRARY_PATH + echo "Added $LD_LIBRARY_PATH to ldconfig:" + ldconfig -p | grep libcuda | sed 's/^/ /' + echo "" +fi + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp ${TOKENIZER_PATH}/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +export HF_TOKEN="" + +# Export the nemo2 config to yaml. +python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=8 \ +trainer.devices=8 \ +${config_overrides} \ +--to-yaml exported_nemo_config.yaml + +# Create the nsys directory. 
+mkdir -p ${explicit_log_dir}/nsys + +OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \ +/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \ +-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \ +--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \ +--wait all \ +torchrun \ +--nproc-per-node="8" \ +--nnodes="${NNODES}" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=8 \ +trainer.devices=8 \ +${config_overrides} + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py + cp dllogger.json ${ARTIFACT_DIR}/dllogger.json + cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus64.py b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus64.py new file mode 100644 index 00000000..fbf01761 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus64.py @@ -0,0 +1,142 @@ +"""Nemo2 pretraining recipe for Llama 3.1 8B model.""" + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama31_8b +from nemo.lightning.pytorch.callbacks import NsysCallback +from nemo.lightning.pytorch.callbacks.flops_callback import 
FLOPsMeasurementCallback +from nemo.utils.loggers.dllogger import DLLogger +import nemo_run as run +from scripts.performance.helpers import ( + set_primary_perf_configs, +) +from scripts.performance.utils import get_comm_overlap_callback_idx + + +def recipe( + profile_enabled: bool = False, + profile_start_step: int = 0, + profile_end_step: int = 0, + profile_ranks: str = "0", +) -> run.Partial: + """Returns a Nemo2 training recipe for Llama 3.1 8B model. + + Args: + profile_enabled: Whether to enable Nsys profiling. + profile_start_step: The step to start profiling. + profile_end_step: The step to end profiling. + profile_ranks: The ranks to profile, comma separated. + + Returns: + A Nemo2 training recipe. + """ + # Start from the Nemo standard recipe. + pretrain = llama31_8b.pretrain_recipe(performance_mode=True) + + num_nodes = 8 + num_gpus_per_node = 8 + mbs = 2 + gbs = 256 + max_steps = 30 + tp_size = 1 + pp_size = 1 + cp_size = 1 + vp_size = 1 # Virtual Pipeline Parallelism + ep_size = 1 # Expert Parallelism + enable_cuda_graphs = False + compute_dtype = "bf16" + fp8_recipe = None # Not needed for bf16 + nccl_communicator_config_path = None + use_mcore_fsdp = False + use_fsdp_double_buffer = False + use_user_buffer_registration = False + use_sharp = False + keep_fsdp_fp8_transpose_cache = False + + pretrain = set_primary_perf_configs( + pretrain, + "pre_train", + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + mbs=mbs, + gbs=gbs, + max_steps=max_steps, + tp_size=tp_size, + pp_size=pp_size, + cp_size=cp_size, + vp_size=vp_size, + ep_size=ep_size, + enable_cuda_graphs=enable_cuda_graphs, + compute_dtype=compute_dtype, + fp8_recipe=fp8_recipe, + nccl_communicator_config_path=nccl_communicator_config_path, + use_mcore_fsdp=use_mcore_fsdp, + use_fsdp_double_buffer=use_fsdp_double_buffer, + use_user_buffer_registration=use_user_buffer_registration, + use_sharp=use_sharp, + keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache, + ) + + # Sequence 
Length (model and data) + pretrain.model.config.seq_length = 8192 + pretrain.data.seq_length = 8192 + + # Set the number of steps to 50 for a quicker benchmark. + pretrain.trainer.max_steps = 50 + + # Disable validation batches. + pretrain.trainer.limit_val_batches = 0.0 + pretrain.trainer.val_check_interval = 0 + + # Add the Nsys profiling callback if enabled. + if profile_enabled: + pretrain.trainer.callbacks.append( + run.Config( + NsysCallback, + start_step=profile_start_step, + end_step=profile_end_step, + ranks=[int(x) for x in profile_ranks.split(",")], + gen_shape=False, + ) + ) + + # Add the FLOPs measurement callback. + pretrain.trainer.callbacks.append( + run.Config( + FLOPsMeasurementCallback, + model_name="llama31-8b", + model_config=pretrain.model.config, + data_config=pretrain.data, + ) + ) + + # When `performance_mode` is enabled, the Megatron communication overlap + # callback is already added to the recipe. + # https://github.com/NVIDIA-NeMo/NeMo/blob/90a396a567ebb4e8c1c41e454dc00cb71f911317/nemo/collections/llm/recipes/llama31_8b.py#L231 + comm_overlap_callback_idx = get_comm_overlap_callback_idx( + pretrain.trainer.callbacks + ) + pretrain.trainer.callbacks[ + comm_overlap_callback_idx + ].tp_comm_bootstrap_backend = "nccl" + + # Disable checkpointing. + pretrain.log.ckpt = None + pretrain.trainer.enable_checkpointing = False + + # Log every step. 
+ pretrain.trainer.log_every_n_steps = 1 + + # Enable DLLogger + dllogger_config = run.Config( + DLLogger, + verbose=True, + stdout=True, + json_file="dllogger.json", + ) + pretrain.log.extra_loggers = [dllogger_config] + + return pretrain + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_factory=recipe) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/recipe_launch_command.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/recipe_launch_command.sh new file mode 100644 index 00000000..378e1475 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install vishwasreddy-ubench-c3i3 . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus64.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-c3i3 \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml new file mode 100644 index 00000000..a1d54cee --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-job.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-job.yaml new file mode 100644 index 00000000..ae59e456 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-job.yaml @@ -0,0 +1,333 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 8 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 8}} +{{- $root := . 
-}} + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "500m" + gke-gcsfuse/memory-limit: "1Ti" + gke-gcsfuse/ephemeral-storage-limit: "2Ti" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i }}","network":"{{ $subnetwork }}"}{{ eq $i 9 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 8 }} + 
{"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 7 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ 
$gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index provided by the Indexed Job + # This is crucial for torch.distributed initialization. 
+ - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a4.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . }} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . 
}} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml new file mode 100644 index 00000000..7026e0f1 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-svc.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-svc.yaml new file mode 100644 index 00000000..7cfe220b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/values.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/values.yaml new file mode 100644 index 00000000..42c40d7b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/values.yaml @@ -0,0 +1,33 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: llama3-1-8b-bf16-seq8192-gbs256-gpus64.py + configPath: /workload/configs/ + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NEMO_LAUNCH_SCRIPT + value: /workload/configs/llama3-1-8b-bf16-seq8192-gbs256-gpus64.py + gpus: 64 + image: nvcr.io/nvidia/nemo:25.07 From 2ac2b5b75a1dc95a33786d80aa7ff0f8bc1f97c6 Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 12:38:57 -0700 Subject: [PATCH 4/9] Remove '-8node' suffix from workload name --- .../8node-bf16-seq8192-gbs256/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md index cc666f48..aa44841a 100644 --- a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md +++ 
b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md @@ -89,7 +89,7 @@ your client: ```bash cd $RECIPE_ROOT -export WORKLOAD_NAME=$USER-a4-llama3-1-8b-8node +export WORKLOAD_NAME=$USER-a4-llama3-1-8b helm install $WORKLOAD_NAME . -f values.yaml \ --set-file workload_launcher=launcher.sh \ --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus64.py \ @@ -107,7 +107,7 @@ helm install $WORKLOAD_NAME . -f values.yaml \ ```bash cd $RECIPE_ROOT - export WORKLOAD_NAME=$USER-a4-llama3-1-8b-8node + export WORKLOAD_NAME=$USER-a4-llama3-1-8b helm install $WORKLOAD_NAME . -f values.yaml \ --set-file workload_launcher=launcher.sh \ --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus64.py \ @@ -124,7 +124,7 @@ helm install $WORKLOAD_NAME . -f values.yaml \ To check the status of pods in your job, run the following command: ``` -kubectl get pods | grep $USER-a4-llama3-1-8b-8node +kubectl get pods | grep $USER-a4-llama3-1-8b ``` Replace the following: @@ -141,7 +141,7 @@ Information about the training job's progress, including crucial details such as loss, step count, and step time, is generated by the rank 0 process. This process runs on the pod whose name begins with `JOB_NAME_PREFIX-workload-0-0`. -For example: `$USER-a4-llama3-1-8b-8node-workload-0-0-s9zrv`. +For example: `$USER-a4-llama3-1-8b-workload-0-0-s9zrv`. ### Uninstall the Helm release @@ -149,5 +149,5 @@ You can delete the job and other resources created by the Helm chart. 
To uninstall Helm, run the following command from your client: ```bash -helm uninstall $USER-a4-llama3-1-8b-8node -``` \ No newline at end of file +helm uninstall $USER-a4-llama3-1-8b +``` From b182d558a6b2829875b38e4fe9e716cebd6fbd54 Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 19:54:19 +0000 Subject: [PATCH 5/9] feat: add 1-node bf16 recipe for llama3-1-8b --- .../1node-bf16-seq4096-gbs256/Chart.yaml | 20 ++ .../1node-bf16-seq4096-gbs256/README.md | 153 ++++++++ .../1node-bf16-seq4096-gbs256/launcher.sh | 106 ++++++ .../llama3-1-8b-bf16-seq4096-gbs256-gpus8.py | 142 ++++++++ .../recipe_launch_command.sh | 1 + .../templates/workload-config-configmap.yaml | 28 ++ .../templates/workload-job.yaml | 333 ++++++++++++++++++ .../workload-launcher-configmap.yaml | 28 ++ .../templates/workload-svc.yaml | 22 ++ .../1node-bf16-seq4096-gbs256/values.yaml | 33 ++ 10 files changed, 866 insertions(+) create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/Chart.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/launcher.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus8.py create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/recipe_launch_command.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-job.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-svc.yaml create mode 100644 
training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/values.yaml diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/Chart.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/Chart.yaml new file mode 100644 index 00000000..af46c11a --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md new file mode 100644 index 00000000..1f1e50c0 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md @@ -0,0 +1,153 @@ + +# Pretrain llama3-1-8b workloads on a4 GKE Node pools with Nvidia NeMo Framework + +This recipe outlines the steps for running a llama3-1-8b pretraining +workload on [a4 GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA NeMo framework](https://github.com/NVIDIA/nemo). 
+ +## Orchestration and deployment tools + +For this recipe, the following setup is used: + +- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +- Pretraining job configuration and deployment - A Helm chart is used to + configure and deploy the [Kubernetes Jobset](https://kubernetes.io/blog/2025/03/23/introducing-jobset) resource which manages the execution of the + [NeMo pretraining workload](https://github.com/NVIDIA/nemo). + +## Test environment + +This recipe has been optimized for and tested with the following configuration: + +- GKE cluster +Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4) +to create your a4 GKE cluster. + +## Training dataset + +This recipe uses a mock pretraining dataset provided by the NeMo framework. + +## Docker container image + +This recipe uses the following docker images: + +- `nvcr.io/nvidia/nemo:25.11` +- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0` + +## Run the recipe + +From your client workstation, complete the following steps: + +### Configure environment settings + +Set the environment variables to match your environment: + + ```bash + export PROJECT_ID= + export CLUSTER_REGION= + export CLUSTER_NAME= + export GCS_BUCKET= # Note: path should not be prefixed with gs:// + export KUEUE_NAME= + ``` + +Replace the following values: + + - ``: your Google Cloud project ID. + - ``: the region where your cluster is located. + - ``: the name of your GKE cluster. + - ``: the name of your Cloud Storage bucket. Don't include the `gs://` prefix. + - ``: the name of the Kueue local queue. The default queue created by the cluster toolkit is `a4`. Make sure to verify the name of the local queue in your cluster. + +Set the default project: + + ```bash + gcloud config set project $PROJECT_ID + ``` + +### Get the recipe + +Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
+ +``` +git clone https://github.com/ai-hypercomputer/gpu-recipes.git +cd gpu-recipes +export REPO_ROOT=`git rev-parse --show-toplevel` +export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-1-8b/nemo-pretraining-gke/1_nodes +cd $RECIPE_ROOT +``` + +### Get cluster credentials + +``` +gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION +``` + +### Configure and submit a pretraining job + +#### Using 1 node (8 gpus) bf16 precision +To execute the job with the default settings, run the following command from +your client: + +```bash +cd $RECIPE_ROOT +export WORKLOAD_NAME=$USER-a4-llama3-1-8b-1node +helm install $WORKLOAD_NAME . -f values.yaml \ +--set-file workload_launcher=launcher.sh \ +--set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus8.py \ +--set workload.image=nvcr.io/nvidia/nemo:25.11 \ +--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ +--set volumes.gcsMounts[0].mountPath=/job-logs \ +--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ +--set queue=${KUEUE_NAME} +``` + +**Examples** + +- To set the number of training steps to 100, run the following command from + your client: + + ```bash + cd $RECIPE_ROOT + export WORKLOAD_NAME=$USER-a4-llama3-1-8b-1node + helm install $WORKLOAD_NAME . -f values.yaml \ + --set-file workload_launcher=launcher.sh \ + --set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus8.py \ + --set workload.image=nvcr.io/nvidia/nemo:25.11 \ + --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ + --set volumes.gcsMounts[0].mountPath=/job-logs \ + --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ + --set queue=${KUEUE_NAME} \ + --set workload.arguments[0]="trainer.max_steps=100" + ``` + +### Monitor the job + +To check the status of pods in your job, run the following command: + +``` +kubectl get pods | grep $USER-a4-llama3-1-8b-1node +``` + +Replace the following: + +- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4-llama3-1-8b-1node. 
+ +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-a4-llama3-1-8b-1node-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. To +uninstall Helm, run the following command from your client: + +```bash +helm uninstall $USER-a4-llama3-1-8b-1node +``` \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/launcher.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/launcher.sh new file mode 100644 index 00000000..8184f6f9 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/launcher.sh @@ -0,0 +1,106 @@ +usage() +{ +cat << EOF +usage: bash ./launcher.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. 
+EOF +} + +parse_args() { + while [ "$1" != "" ]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [ -z "${config_overrides}" ]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then + export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH" + ldconfig $LD_LIBRARY_PATH + echo "Added $LD_LIBRARY_PATH to ldconfig:" + ldconfig -p | grep libcuda | sed 's/^/ /' + echo "" +fi + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp ${TOKENIZER_PATH}/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +export HF_TOKEN="" + +# Export the nemo2 config to yaml. +python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=1 \ +trainer.devices=8 \ +${config_overrides} \ +--to-yaml exported_nemo_config.yaml + +# Create the nsys directory. 
+mkdir -p ${explicit_log_dir}/nsys + +OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \ +/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \ +-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \ +--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \ +--wait all \ +torchrun \ +--nproc-per-node="8" \ +--nnodes="${NNODES}" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=1 \ +trainer.devices=8 \ +${config_overrides} + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py + cp dllogger.json ${ARTIFACT_DIR}/dllogger.json + cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus8.py b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus8.py new file mode 100644 index 00000000..3a6a1198 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus8.py @@ -0,0 +1,142 @@ +"""Nemo2 pretraining recipe for Llama 3.1 8B model.""" + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama31_8b +from nemo.lightning.pytorch.callbacks import NsysCallback +from nemo.lightning.pytorch.callbacks.flops_callback import 
FLOPsMeasurementCallback +from nemo.utils.loggers.dllogger import DLLogger +import nemo_run as run +from scripts.performance.helpers import ( + set_primary_perf_configs, +) +from scripts.performance.utils import get_comm_overlap_callback_idx + + +def recipe( + profile_enabled: bool = False, + profile_start_step: int = 0, + profile_end_step: int = 0, + profile_ranks: str = "0", +) -> run.Partial: + """Returns a Nemo2 training recipe for Llama 3.1 8B model. + + Args: + profile_enabled: Whether to enable Nsys profiling. + profile_start_step: The step to start profiling. + profile_end_step: The step to end profiling. + profile_ranks: The ranks to profile, comma separated. + + Returns: + A Nemo2 training recipe. + """ + # Start from the Nemo standard recipe. + pretrain = llama31_8b.pretrain_recipe(performance_mode=True) + + num_nodes = 1 + num_gpus_per_node = 8 + mbs = 2 + gbs = 256 + max_steps = 30 + tp_size = 1 + pp_size = 1 + cp_size = 1 + vp_size = 1 # Virtual Pipeline Parallelism + ep_size = 1 # Expert Parallelism + enable_cuda_graphs = False + compute_dtype = "bf16" + fp8_recipe = None # Not needed for bf16 + nccl_communicator_config_path = None + use_mcore_fsdp = False + use_fsdp_double_buffer = False + use_user_buffer_registration = False + use_sharp = False + keep_fsdp_fp8_transpose_cache = False + + pretrain = set_primary_perf_configs( + pretrain, + "pre_train", + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + mbs=mbs, + gbs=gbs, + max_steps=max_steps, + tp_size=tp_size, + pp_size=pp_size, + cp_size=cp_size, + vp_size=vp_size, + ep_size=ep_size, + enable_cuda_graphs=enable_cuda_graphs, + compute_dtype=compute_dtype, + fp8_recipe=fp8_recipe, + nccl_communicator_config_path=nccl_communicator_config_path, + use_mcore_fsdp=use_mcore_fsdp, + use_fsdp_double_buffer=use_fsdp_double_buffer, + use_user_buffer_registration=use_user_buffer_registration, + use_sharp=use_sharp, + keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache, + ) + + # Sequence 
Length (model and data) + pretrain.model.config.seq_length = 4096 + pretrain.data.seq_length = 4096 + + # Set the number of steps to 50 for a quicker benchmark. + pretrain.trainer.max_steps = 50 + + # Disable validation batches. + pretrain.trainer.limit_val_batches = 0.0 + pretrain.trainer.val_check_interval = 0 + + # Add the Nsys profiling callback if enabled. + if profile_enabled: + pretrain.trainer.callbacks.append( + run.Config( + NsysCallback, + start_step=profile_start_step, + end_step=profile_end_step, + ranks=[int(x) for x in profile_ranks.split(",")], + gen_shape=False, + ) + ) + + # Add the FLOPs measurement callback. + pretrain.trainer.callbacks.append( + run.Config( + FLOPsMeasurementCallback, + model_name="llama31-8b", + model_config=pretrain.model.config, + data_config=pretrain.data, + ) + ) + + # When `performance_mode` is enabled, the Megatron communication overlap + # callback is already added to the recipe. + # https://github.com/NVIDIA-NeMo/NeMo/blob/90a396a567ebb4e8c1c41e454dc00cb71f911317/nemo/collections/llm/recipes/llama31_8b.py#L231 + comm_overlap_callback_idx = get_comm_overlap_callback_idx( + pretrain.trainer.callbacks + ) + pretrain.trainer.callbacks[ + comm_overlap_callback_idx + ].tp_comm_bootstrap_backend = "nccl" + + # Disable checkpointing. + pretrain.log.ckpt = None + pretrain.trainer.enable_checkpointing = False + + # Log every step. 
+ pretrain.trainer.log_every_n_steps = 1 + + # Enable DLLogger + dllogger_config = run.Config( + DLLogger, + verbose=True, + stdout=True, + json_file="dllogger.json", + ) + pretrain.log.extra_loggers = [dllogger_config] + + return pretrain + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_factory=recipe) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/recipe_launch_command.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/recipe_launch_command.sh new file mode 100644 index 00000000..c4d8cb3b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install vishwasreddy-ubench-d5hr . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus8.py --set workload.image=nvcr.io/nvidia/nemo:25.11 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-d5hr \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml new file mode 100644 index 00000000..a1d54cee --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-job.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-job.yaml new file mode 100644 index 00000000..ae59e456 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-job.yaml @@ -0,0 +1,333 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 8 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 8}} +{{- $root := . 
-}} + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "500m" + gke-gcsfuse/memory-limit: "1Ti" + gke-gcsfuse/ephemeral-storage-limit: "2Ti" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i }}","network":"{{ $subnetwork }}"}{{ eq $i 9 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 8 }} + 
{"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 7 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ 
$gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index provided by the Indexed Job + # This is crucial for torch.distributed initialization. 
+ - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a4.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . }} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . 
}} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml new file mode 100644 index 00000000..7026e0f1 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-svc.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-svc.yaml new file mode 100644 index 00000000..7cfe220b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/values.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/values.yaml new file mode 100644 index 00000000..66fab5bd --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/values.yaml @@ -0,0 +1,33 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: llama3-1-8b-bf16-seq4096-gbs256-gpus8.py + configPath: /workload/configs/ + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NEMO_LAUNCH_SCRIPT + value: /workload/configs/llama3-1-8b-bf16-seq4096-gbs256-gpus8.py + gpus: 8 + image: nvcr.io/nvidia/nemo:25.11 From d591ebdee4d22d1049576a190e90a0b7d1a34ff0 Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 12:57:15 -0700 Subject: [PATCH 6/9] Remove '-1node' suffix from workload name --- .../1node-bf16-seq4096-gbs256/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md index 1f1e50c0..5c32f305 100644 --- a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md +++ 
b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md @@ -89,7 +89,7 @@ your client: ```bash cd $RECIPE_ROOT -export WORKLOAD_NAME=$USER-a4-llama3-1-8b-1node +export WORKLOAD_NAME=$USER-a4-llama3-1-8b helm install $WORKLOAD_NAME . -f values.yaml \ --set-file workload_launcher=launcher.sh \ --set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus8.py \ @@ -107,7 +107,7 @@ helm install $WORKLOAD_NAME . -f values.yaml \ ```bash cd $RECIPE_ROOT - export WORKLOAD_NAME=$USER-a4-llama3-1-8b-1node + export WORKLOAD_NAME=$USER-a4-llama3-1-8b helm install $WORKLOAD_NAME . -f values.yaml \ --set-file workload_launcher=launcher.sh \ --set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus8.py \ @@ -124,7 +124,7 @@ helm install $WORKLOAD_NAME . -f values.yaml \ To check the status of pods in your job, run the following command: ``` -kubectl get pods | grep $USER-a4-llama3-1-8b-1node +kubectl get pods | grep $USER-a4-llama3-1-8b ``` Replace the following: @@ -141,7 +141,7 @@ Information about the training job's progress, including crucial details such as loss, step count, and step time, is generated by the rank 0 process. This process runs on the pod whose name begins with `JOB_NAME_PREFIX-workload-0-0`. -For example: `$USER-a4-llama3-1-8b-1node-workload-0-0-s9zrv`. +For example: `$USER-a4-llama3-1-8b-workload-0-0-s9zrv`. ### Uninstall the Helm release @@ -149,5 +149,5 @@ You can delete the job and other resources created by the Helm chart. 
To uninstall Helm, run the following command from your client: ```bash -helm uninstall $USER-a4-llama3-1-8b-1node -``` \ No newline at end of file +helm uninstall $USER-a4-llama3-1-8b +``` From dd4dda79172fc4a9412b83c504e2a4511f789916 Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 20:07:02 +0000 Subject: [PATCH 7/9] feat: add 8-node configuration for seq4096 --- .../8node-bf16-seq4096-gbs256/Chart.yaml | 20 ++ .../8node-bf16-seq4096-gbs256/README.md | 153 ++++++++ .../8node-bf16-seq4096-gbs256/launcher.sh | 106 ++++++ .../llama3-1-8b-bf16-seq4096-gbs256-gpus64.py | 142 ++++++++ .../recipe_launch_command.sh | 1 + .../templates/workload-config-configmap.yaml | 28 ++ .../templates/workload-job.yaml | 333 ++++++++++++++++++ .../workload-launcher-configmap.yaml | 28 ++ .../templates/workload-svc.yaml | 22 ++ .../8node-bf16-seq4096-gbs256/values.yaml | 33 ++ 10 files changed, 866 insertions(+) create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/Chart.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/README.md create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/launcher.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus64.py create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/recipe_launch_command.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-job.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-svc.yaml create mode 100644 
training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/values.yaml diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/Chart.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/Chart.yaml new file mode 100644 index 00000000..af46c11a --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/README.md new file mode 100644 index 00000000..7449e4d7 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/README.md @@ -0,0 +1,153 @@ + +# Pretrain llama3-1-8b workloads on a4 GKE Node pools with Nvidia NeMo Framework + +This recipe outlines the steps for running a llama3-1-8b pretraining +workload on [a4 GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA NeMo framework](https://github.com/NVIDIA/nemo). 
+ +## Orchestration and deployment tools + +For this recipe, the following setup is used: + +- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +- Pretraining job configuration and deployment - A Helm chart is used to + configure and deploy the [Kubernetes Jobset](https://kubernetes.io/blog/2025/03/23/introducing-jobset) resource which manages the execution of the + [NeMo pretraining workload](https://github.com/NVIDIA/nemo). + +## Test environment + +This recipe has been optimized for and tested with the following configuration: + +- GKE cluster +Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4) +to create your a4 GKE cluster. + +## Training dataset + +This recipe uses a mock pretraining dataset provided by the NeMo framework. + +## Docker container image + +This recipe uses the following docker images: + +- `nvcr.io/nvidia/nemo:25.11` +- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0` + +## Run the recipe + +From your client workstation, complete the following steps: + +### Configure environment settings + +Set the environment variables to match your environment: + + ```bash + export PROJECT_ID= + export CLUSTER_REGION= + export CLUSTER_NAME= + export GCS_BUCKET= # Note: path should not be prefixed with gs:// + export KUEUE_NAME= + ``` + +Replace the following values: + + - ``: your Google Cloud project ID. + - ``: the region where your cluster is located. + - ``: the name of your GKE cluster. + - ``: the name of your Cloud Storage bucket. Don't include the `gs://` prefix. + - ``: the name of the Kueue local queue. The default queue created by the cluster toolkit is `a4`. Make sure to verify the name of the local queue in your cluster. + +Set the default project: + + ```bash + gcloud config set project $PROJECT_ID + ``` + +### Get the recipe + +Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
+ +``` +git clone https://github.com/ai-hypercomputer/gpu-recipes.git +cd gpu-recipes +export REPO_ROOT=`git rev-parse --show-toplevel` +export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256 +cd $RECIPE_ROOT +``` + +### Get cluster credentials + +``` +gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION +``` + +### Configure and submit a pretraining job + +#### Using 8 nodes (64 gpus) bf16 precision +To execute the job with the default settings, run the following command from +your client: + +```bash +cd $RECIPE_ROOT +export WORKLOAD_NAME=$USER-a4-llama3-1-8b +helm install $WORKLOAD_NAME . -f values.yaml \ +--set-file workload_launcher=launcher.sh \ +--set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus64.py \ +--set workload.image=nvcr.io/nvidia/nemo:25.11 \ +--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ +--set volumes.gcsMounts[0].mountPath=/job-logs \ +--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ +--set queue=${KUEUE_NAME} +``` + +**Examples** + +- To set the number of training steps to 100, run the following command from + your client: + + ```bash + cd $RECIPE_ROOT + export WORKLOAD_NAME=$USER-a4-llama3-1-8b + helm install $WORKLOAD_NAME . -f values.yaml \ + --set-file workload_launcher=launcher.sh \ + --set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus64.py \ + --set workload.image=nvcr.io/nvidia/nemo:25.11 \ + --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ + --set volumes.gcsMounts[0].mountPath=/job-logs \ + --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ + --set queue=${KUEUE_NAME} \ + --set workload.arguments[0]="trainer.max_steps=100" + ``` + +### Monitor the job + +To check the status of pods in your job, run the following command: + +``` +kubectl get pods | grep $USER-a4-llama3-1-8b +``` + +Replace the following: + +- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4-llama3-1-8b. 
+ +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-a4-llama3-1-8b-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. To +uninstall Helm, run the following command from your client: + +```bash +helm uninstall $USER-a4-llama3-1-8b +``` \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/launcher.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/launcher.sh new file mode 100644 index 00000000..357d27a4 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/launcher.sh @@ -0,0 +1,106 @@ +usage() +{ +cat << EOF +usage: bash ./launcher.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. 
+EOF +} + +parse_args() { + while [ "$1" != "" ]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [ -z "${config_overrides}" ]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then + export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH" + ldconfig $LD_LIBRARY_PATH + echo "Added $LD_LIBRARY_PATH to ldconfig:" + ldconfig -p | grep libcuda | sed 's/^/ /' + echo "" +fi + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp ${TOKENIZER_PATH}/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +export HF_TOKEN="" + +# Export the nemo2 config to yaml. +python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=8 \ +trainer.devices=8 \ +${config_overrides} \ +--to-yaml exported_nemo_config.yaml + +# Create the nsys directory. 
+mkdir -p ${explicit_log_dir}/nsys + +OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \ +/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \ +-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \ +--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \ +--wait all \ +torchrun \ +--nproc-per-node="8" \ +--nnodes="${NNODES}" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=8 \ +trainer.devices=8 \ +${config_overrides} + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py + cp dllogger.json ${ARTIFACT_DIR}/dllogger.json + cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus64.py b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus64.py new file mode 100644 index 00000000..7b4acbba --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus64.py @@ -0,0 +1,142 @@ +"""Nemo2 pretraining recipe for Llama 3.1 8B model.""" + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama31_8b +from nemo.lightning.pytorch.callbacks import NsysCallback +from nemo.lightning.pytorch.callbacks.flops_callback import 
FLOPsMeasurementCallback +from nemo.utils.loggers.dllogger import DLLogger +import nemo_run as run +from scripts.performance.helpers import ( + set_primary_perf_configs, +) +from scripts.performance.utils import get_comm_overlap_callback_idx + + +def recipe( + profile_enabled: bool = False, + profile_start_step: int = 0, + profile_end_step: int = 0, + profile_ranks: str = "0", +) -> run.Partial: + """Returns a Nemo2 training recipe for Llama 3.1 8B model. + + Args: + profile_enabled: Whether to enable Nsys profiling. + profile_start_step: The step to start profiling. + profile_end_step: The step to end profiling. + profile_ranks: The ranks to profile, comma separated. + + Returns: + A Nemo2 training recipe. + """ + # Start from the Nemo standard recipe. + pretrain = llama31_8b.pretrain_recipe(performance_mode=True) + + num_nodes = 8 + num_gpus_per_node = 8 + mbs = 4 + gbs = 256 + max_steps = 30 + tp_size = 1 + pp_size = 1 + cp_size = 1 + vp_size = 1 # Virtual Pipeline Parallelism + ep_size = 1 # Expert Parallelism + enable_cuda_graphs = False + compute_dtype = "bf16" + fp8_recipe = None # Not needed for bf16 + nccl_communicator_config_path = None + use_mcore_fsdp = False + use_fsdp_double_buffer = False + use_user_buffer_registration = False + use_sharp = False + keep_fsdp_fp8_transpose_cache = False + + pretrain = set_primary_perf_configs( + pretrain, + "pre_train", + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + mbs=mbs, + gbs=gbs, + max_steps=max_steps, + tp_size=tp_size, + pp_size=pp_size, + cp_size=cp_size, + vp_size=vp_size, + ep_size=ep_size, + enable_cuda_graphs=enable_cuda_graphs, + compute_dtype=compute_dtype, + fp8_recipe=fp8_recipe, + nccl_communicator_config_path=nccl_communicator_config_path, + use_mcore_fsdp=use_mcore_fsdp, + use_fsdp_double_buffer=use_fsdp_double_buffer, + use_user_buffer_registration=use_user_buffer_registration, + use_sharp=use_sharp, + keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache, + ) + + # Sequence 
Length (model and data) + pretrain.model.config.seq_length = 4096 + pretrain.data.seq_length = 4096 + + # Set the number of steps to 50 for a quicker benchmark. + pretrain.trainer.max_steps = 50 + + # Disable validation batches. + pretrain.trainer.limit_val_batches = 0.0 + pretrain.trainer.val_check_interval = 0 + + # Add the Nsys profiling callback if enabled. + if profile_enabled: + pretrain.trainer.callbacks.append( + run.Config( + NsysCallback, + start_step=profile_start_step, + end_step=profile_end_step, + ranks=[int(x) for x in profile_ranks.split(",")], + gen_shape=False, + ) + ) + + # Add the FLOPs measurement callback. + pretrain.trainer.callbacks.append( + run.Config( + FLOPsMeasurementCallback, + model_name="llama31-8b", + model_config=pretrain.model.config, + data_config=pretrain.data, + ) + ) + + # When `performance_mode` is enabled, the Megatron communication overlap + # callback is already added to the recipe. + # https://github.com/NVIDIA-NeMo/NeMo/blob/90a396a567ebb4e8c1c41e454dc00cb71f911317/nemo/collections/llm/recipes/llama31_8b.py#L231 + comm_overlap_callback_idx = get_comm_overlap_callback_idx( + pretrain.trainer.callbacks + ) + pretrain.trainer.callbacks[ + comm_overlap_callback_idx + ].tp_comm_bootstrap_backend = "nccl" + + # Disable checkpointing. + pretrain.log.ckpt = None + pretrain.trainer.enable_checkpointing = False + + # Log every step. 
+ pretrain.trainer.log_every_n_steps = 1 + + # Enable DLLogger + dllogger_config = run.Config( + DLLogger, + verbose=True, + stdout=True, + json_file="dllogger.json", + ) + pretrain.log.extra_loggers = [dllogger_config] + + return pretrain + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_factory=recipe) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/recipe_launch_command.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/recipe_launch_command.sh new file mode 100644 index 00000000..f01800fc --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install vishwasreddy-ubench-48bq . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus64.py --set workload.image=nvcr.io/nvidia/nemo:25.11 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-48bq \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml new file mode 100644 index 00000000..a1d54cee --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-job.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-job.yaml new file mode 100644 index 00000000..ae59e456 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-job.yaml @@ -0,0 +1,333 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 8 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 8}} +{{- $root := . 
-}} + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "500m" + gke-gcsfuse/memory-limit: "1Ti" + gke-gcsfuse/ephemeral-storage-limit: "2Ti" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i }}","network":"{{ $subnetwork }}"}{{ eq $i 9 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 8 }} + 
{"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 7 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ 
$gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index provided by the Indexed Job + # This is crucial for torch.distributed initialization. 
+ - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a4.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . }} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . 
}} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml new file mode 100644 index 00000000..7026e0f1 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-svc.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-svc.yaml new file mode 100644 index 00000000..7cfe220b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/values.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/values.yaml new file mode 100644 index 00000000..7c5e36ae --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/values.yaml @@ -0,0 +1,33 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: llama3-1-8b-bf16-seq4096-gbs256-gpus64.py + configPath: /workload/configs/ + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NEMO_LAUNCH_SCRIPT + value: /workload/configs/llama3-1-8b-bf16-seq4096-gbs256-gpus64.py + gpus: 64 + image: nvcr.io/nvidia/nemo:25.11 From bf5372215aa8e042553e63c63456188594577a81 Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 20:20:01 +0000 Subject: [PATCH 8/9] feat: add 4-node bf16 recipe for llama3-1-8b --- .../4node-bf16-seq4096-gbs2048/Chart.yaml | 20 ++ .../4node-bf16-seq4096-gbs2048/README.md | 155 ++++++++ .../4node-bf16-seq4096-gbs2048/launcher.sh | 106 ++++++ .../llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py | 142 ++++++++ .../recipe_launch_command.sh | 1 + .../templates/workload-config-configmap.yaml | 28 ++ .../templates/workload-job.yaml | 333 ++++++++++++++++++ .../workload-launcher-configmap.yaml | 28 ++ .../templates/workload-svc.yaml | 22 ++ 
.../4node-bf16-seq4096-gbs2048/values.yaml | 33 ++ 10 files changed, 868 insertions(+) create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/Chart.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/README.md create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/launcher.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/recipe_launch_command.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-config-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-job.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-launcher-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-svc.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/values.yaml diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/Chart.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/Chart.yaml new file mode 100644 index 00000000..af46c11a --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/README.md new file mode 100644 index 00000000..adf30a26 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/README.md @@ -0,0 +1,155 @@ + +# Pretrain llama3-1-8b workloads on a4 GKE Node pools with Nvidia NeMo Framework + +This recipe outlines the steps for running a llama3-1-8b pretraining +workload on [a4 GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA NeMo framework](https://github.com/NVIDIA/nemo). + +## Orchestration and deployment tools + +For this recipe, the following setup is used: + +- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +- Pretraining job configuration and deployment - A Helm chart is used to + configure and deploy the [Kubernetes Jobset](https://kubernetes.io/blog/2025/03/23/introducing-jobset) resource which manages the execution of the + [NeMo pretraining workload](https://github.com/NVIDIA/nemo). + +## Test environment + +This recipe has been optimized for and tested with the following configuration: + +- GKE cluster +Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4) +to create your a4 GKE cluster. 
+
+## Training dataset
+
+This recipe uses a mock pretraining dataset provided by the NeMo framework.
+
+## Docker container image
+
+This recipe uses the following docker images:
+
+- `nvcr.io/nvidia/nemo:25.11`
+- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1`
+
+## Run the recipe
+
+From your client workstation, complete the following steps:
+
+### Configure environment settings
+
+Set the environment variables to match your environment:
+
+  ```bash
+  export PROJECT_ID=<PROJECT_ID>
+  export CLUSTER_REGION=<CLUSTER_REGION>
+  export CLUSTER_NAME=<CLUSTER_NAME>
+  export GCS_BUCKET=<GCS_BUCKET> # Note: path should not be prefixed with gs://
+  export KUEUE_NAME=<KUEUE_NAME>
+  export HF_TOKEN=<HF_TOKEN>
+  ```
+
+Replace the following values:
+
+  - `<PROJECT_ID>`: your Google Cloud project ID.
+  - `<CLUSTER_REGION>`: the region where your cluster is located.
+  - `<CLUSTER_NAME>`: the name of your GKE cluster.
+  - `<GCS_BUCKET>`: the name of your Cloud Storage bucket. Don't include the `gs://` prefix.
+  - `<KUEUE_NAME>`: the name of the Kueue local queue. The default queue created by the cluster toolkit is `a4`. Make sure to verify the name of the local queue in your cluster.
+  - `<HF_TOKEN>`: Your HuggingFace token.
+
+Set the default project:
+
+  ```bash
+  gcloud config set project $PROJECT_ID
+  ```
+
+### Get the recipe
+
+Clone the `gpu-recipes` repository and set a reference to the recipe folder.
+
+```
+git clone https://github.com/ai-hypercomputer/gpu-recipes.git
+cd gpu-recipes
+export REPO_ROOT=`git rev-parse --show-toplevel`
+export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048
+cd $RECIPE_ROOT
+```
+
+### Get cluster credentials
+
+```
+gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION
+```
+
+### Configure and submit a pretraining job
+
+#### Using 4 node (32 gpus) bf16 precision
+To execute the job with the default settings, run the following command from
+your client:
+
+```bash
+cd $RECIPE_ROOT
+export WORKLOAD_NAME=$USER-a4-llama3-1-8b
+helm install $WORKLOAD_NAME . 
-f values.yaml \ +--set-file workload_launcher=launcher.sh \ +--set-file workload_config=llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py \ +--set workload.image=nvcr.io/nvidia/nemo:25.11 \ +--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ +--set volumes.gcsMounts[0].mountPath=/job-logs \ +--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ +--set queue=${KUEUE_NAME} +``` + +**Examples** + +- To set the number of training steps to 100, run the following command from + your client: + + ```bash + cd $RECIPE_ROOT + export WORKLOAD_NAME=$USER-a4-llama3-1-8b + helm install $WORKLOAD_NAME . -f values.yaml \ + --set-file workload_launcher=launcher.sh \ + --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py \ + --set workload.image=nvcr.io/nvidia/nemo:25.11 \ + --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ + --set volumes.gcsMounts[0].mountPath=/job-logs \ + --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ + --set queue=${KUEUE_NAME} \ + --set workload.arguments[0]="trainer.max_steps=100" + ``` + +### Monitor the job + +To check the status of pods in your job, run the following command: + +``` +kubectl get pods | grep $USER-a4-llama3-1-8b +``` + +Replace the following: + +- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4-llama3-1-8b. + +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-a4-llama3-1-8b-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. 
To +uninstall Helm, run the following command from your client: + +```bash +helm uninstall $USER-a4-llama3-1-8b +``` diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/launcher.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/launcher.sh new file mode 100644 index 00000000..f9c58c29 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/launcher.sh @@ -0,0 +1,106 @@ +usage() +{ +cat << EOF +usage: bash ./launcher.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. +EOF +} + +parse_args() { + while [ "$1" != "" ]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [ -z "${config_overrides}" ]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then + export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH" + ldconfig $LD_LIBRARY_PATH + echo "Added $LD_LIBRARY_PATH to ldconfig:" + ldconfig -p | grep libcuda | sed 's/^/ /' + echo "" +fi + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp ${TOKENIZER_PATH}/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +export HF_TOKEN="" + +# Export the nemo2 config to yaml. 
+python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=4 \ +trainer.devices=8 \ +${config_overrides} \ +--to-yaml exported_nemo_config.yaml + +# Create the nsys directory. +mkdir -p ${explicit_log_dir}/nsys + +OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \ +/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \ +-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \ +--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \ +--wait all \ +torchrun \ +--nproc-per-node="8" \ +--nnodes="${NNODES}" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=4 \ +trainer.devices=8 \ +${config_overrides} + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py + cp dllogger.json ${ARTIFACT_DIR}/dllogger.json + cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py new file mode 100644 index 00000000..e3b90cc1 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py @@ 
-0,0 +1,142 @@ +"""Nemo2 pretraining recipe for Llama 3.1 8B model.""" + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama31_8b +from nemo.lightning.pytorch.callbacks import NsysCallback +from nemo.lightning.pytorch.callbacks.flops_callback import FLOPsMeasurementCallback +from nemo.utils.loggers.dllogger import DLLogger +import nemo_run as run +from scripts.performance.helpers import ( + set_primary_perf_configs, +) +from scripts.performance.utils import get_comm_overlap_callback_idx + + +def recipe( + profile_enabled: bool = False, + profile_start_step: int = 0, + profile_end_step: int = 0, + profile_ranks: str = "0", +) -> run.Partial: + """Returns a Nemo2 training recipe for Llama 3.1 8B model. + + Args: + profile_enabled: Whether to enable Nsys profiling. + profile_start_step: The step to start profiling. + profile_end_step: The step to end profiling. + profile_ranks: The ranks to profile, comma separated. + + Returns: + A Nemo2 training recipe. + """ + # Start from the Nemo standard recipe. 
+ pretrain = llama31_8b.pretrain_recipe(performance_mode=True) + + num_nodes = 4 + num_gpus_per_node = 8 + mbs = 4 + gbs = 2048 + max_steps = 30 + tp_size = 1 + pp_size = 1 + cp_size = 1 + vp_size = 1 # Virtual Pipeline Parallelism + ep_size = 1 # Expert Parallelism + enable_cuda_graphs = False + compute_dtype = "bf16" + fp8_recipe = None # Not needed for bf16 + nccl_communicator_config_path = None + use_mcore_fsdp = False + use_fsdp_double_buffer = False + use_user_buffer_registration = False + use_sharp = False + keep_fsdp_fp8_transpose_cache = False + + pretrain = set_primary_perf_configs( + pretrain, + "pre_train", + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + mbs=mbs, + gbs=gbs, + max_steps=max_steps, + tp_size=tp_size, + pp_size=pp_size, + cp_size=cp_size, + vp_size=vp_size, + ep_size=ep_size, + enable_cuda_graphs=enable_cuda_graphs, + compute_dtype=compute_dtype, + fp8_recipe=fp8_recipe, + nccl_communicator_config_path=nccl_communicator_config_path, + use_mcore_fsdp=use_mcore_fsdp, + use_fsdp_double_buffer=use_fsdp_double_buffer, + use_user_buffer_registration=use_user_buffer_registration, + use_sharp=use_sharp, + keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache, + ) + + # Sequence Length (model and data) + pretrain.model.config.seq_length = 4096 + pretrain.data.seq_length = 4096 + + # Set the number of steps to 50 for a quicker benchmark. + pretrain.trainer.max_steps = 50 + + # Disable validation batches. + pretrain.trainer.limit_val_batches = 0.0 + pretrain.trainer.val_check_interval = 0 + + # Add the Nsys profiling callback if enabled. + if profile_enabled: + pretrain.trainer.callbacks.append( + run.Config( + NsysCallback, + start_step=profile_start_step, + end_step=profile_end_step, + ranks=[int(x) for x in profile_ranks.split(",")], + gen_shape=False, + ) + ) + + # Add the FLOPs measurement callback. 
+ pretrain.trainer.callbacks.append( + run.Config( + FLOPsMeasurementCallback, + model_name="llama31-8b", + model_config=pretrain.model.config, + data_config=pretrain.data, + ) + ) + + # When `performance_mode` is enabled, the Megatron communication overlap + # callback is already added to the recipe. + # https://github.com/NVIDIA-NeMo/NeMo/blob/90a396a567ebb4e8c1c41e454dc00cb71f911317/nemo/collections/llm/recipes/llama31_8b.py#L231 + comm_overlap_callback_idx = get_comm_overlap_callback_idx( + pretrain.trainer.callbacks + ) + pretrain.trainer.callbacks[ + comm_overlap_callback_idx + ].tp_comm_bootstrap_backend = "nccl" + + # Disable checkpointing. + pretrain.log.ckpt = None + pretrain.trainer.enable_checkpointing = False + + # Log every step. + pretrain.trainer.log_every_n_steps = 1 + + # Enable DLLogger + dllogger_config = run.Config( + DLLogger, + verbose=True, + stdout=True, + json_file="dllogger.json", + ) + pretrain.log.extra_loggers = [dllogger_config] + + return pretrain + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_factory=recipe) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/recipe_launch_command.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/recipe_launch_command.sh new file mode 100644 index 00000000..88294653 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install vishwasreddy-ubench-squa . 
-f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py --set workload.image=nvcr.io/nvidia/nemo:25.11 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-squa \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-config-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-config-configmap.yaml new file mode 100644 index 00000000..a1d54cee --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-job.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-job.yaml new file mode 100644 index 00000000..ae59e456 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-job.yaml @@ -0,0 +1,333 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 8 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 8}} +{{- $root := . 
-}} + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "500m" + gke-gcsfuse/memory-limit: "1Ti" + gke-gcsfuse/ephemeral-storage-limit: "2Ti" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i }}","network":"{{ $subnetwork }}"}{{ eq $i 9 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 8 }} + 
{"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 7 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ 
$gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index provided by the Indexed Job + # This is crucial for torch.distributed initialization. 
+ - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a4.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . }} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . 
}} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-launcher-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-launcher-configmap.yaml new file mode 100644 index 00000000..7026e0f1 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-svc.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-svc.yaml new file mode 100644 index 00000000..7cfe220b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/values.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/values.yaml new file mode 100644 index 00000000..9d6140b2 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/values.yaml @@ -0,0 +1,33 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py + configPath: /workload/configs/ + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NEMO_LAUNCH_SCRIPT + value: /workload/configs/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py + gpus: 32 + image: nvcr.io/nvidia/nemo:25.11 From e96e9087c24cd8562fde9eae134625b51d4fa542 Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 21:10:12 +0000 Subject: [PATCH 9/9] feat: add 8-node bf16 recipe for seq8192 gbs2048 --- .../8node-bf16-seq8192-gbs2048/Chart.yaml | 20 ++ .../8node-bf16-seq8192-gbs2048/README.md | 155 ++++++++ .../8node-bf16-seq8192-gbs2048/launcher.sh | 106 ++++++ .../llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py | 145 ++++++++ .../recipe_launch_command.sh | 1 + .../templates/workload-config-configmap.yaml | 28 ++ .../templates/workload-job.yaml | 333 ++++++++++++++++++ .../workload-launcher-configmap.yaml | 28 ++ .../templates/workload-svc.yaml | 22 ++ 
.../8node-bf16-seq8192-gbs2048/values.yaml | 33 ++ 10 files changed, 871 insertions(+) create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/Chart.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/README.md create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/launcher.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/recipe_launch_command.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-config-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-job.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-launcher-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-svc.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/values.yaml diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/Chart.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/Chart.yaml new file mode 100644 index 00000000..af46c11a --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/README.md new file mode 100644 index 00000000..23454e49 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/README.md @@ -0,0 +1,155 @@ + +# Pretrain llama3-1-8b workloads on a4 GKE Node pools with Nvidia NeMo Framework + +This recipe outlines the steps for running a llama3-1-8b pretraining +workload on [a4 GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA NeMo framework](https://github.com/NVIDIA/nemo). + +## Orchestration and deployment tools + +For this recipe, the following setup is used: + +- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +- Pretraining job configuration and deployment - A Helm chart is used to + configure and deploy the [Kubernetes Jobset](https://kubernetes.io/blog/2025/03/23/introducing-jobset) resource which manages the execution of the + [NeMo pretraining workload](https://github.com/NVIDIA/nemo). + +## Test environment + +This recipe has been optimized for and tested with the following configuration: + +- GKE cluster +Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4) +to create your a4 GKE cluster. 
+
+## Training dataset
+
+This recipe uses a mock pretraining dataset provided by the NeMo framework.
+
+## Docker container image
+
+This recipe uses the following docker images:
+
+- `nvcr.io/nvidia/nemo:25.11`
+- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1`
+
+## Run the recipe
+
+From your client workstation, complete the following steps:
+
+### Configure environment settings
+
+Set the environment variables to match your environment:
+
+ ```bash
+ export PROJECT_ID=<PROJECT_ID>
+ export CLUSTER_REGION=<CLUSTER_REGION>
+ export CLUSTER_NAME=<CLUSTER_NAME>
+ export GCS_BUCKET=<GCS_BUCKET> # Note: path should not be prefixed with gs://
+ export KUEUE_NAME=<KUEUE_NAME>
+ export HF_TOKEN=<HF_TOKEN>
+ ```
+
+Replace the following values:
+
+ - `<PROJECT_ID>`: your Google Cloud project ID.
+ - `<CLUSTER_REGION>`: the region where your cluster is located.
+ - `<CLUSTER_NAME>`: the name of your GKE cluster.
+ - `<GCS_BUCKET>`: the name of your Cloud Storage bucket. Don't include the `gs://` prefix.
+ - `<KUEUE_NAME>`: the name of the Kueue local queue. The default queue created by the cluster toolkit is `a4`. Make sure to verify the name of the local queue in your cluster.
+ - `<HF_TOKEN>`: Your HuggingFace token.
+
+Set the default project:
+
+ ```bash
+ gcloud config set project $PROJECT_ID
+ ```
+
+### Get the recipe
+
+Clone the `gpu-recipes` repository and set a reference to the recipe folder.
+
+```
+git clone https://github.com/ai-hypercomputer/gpu-recipes.git
+cd gpu-recipes
+export REPO_ROOT=`git rev-parse --show-toplevel`
+export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048
+cd $RECIPE_ROOT
+```
+
+### Get cluster credentials
+
+```
+gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION
+```
+
+### Configure and submit a pretraining job
+
+#### Using 8 node (64 gpus) bf16 precision
+To execute the job with the default settings, run the following command from
+your client:
+
+```bash
+cd $RECIPE_ROOT
+export WORKLOAD_NAME=$USER-a4-llama3-1-8b
+helm install $WORKLOAD_NAME . 
-f values.yaml \ +--set-file workload_launcher=launcher.sh \ +--set-file workload_config=llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py \ +--set workload.image=nvcr.io/nvidia/nemo:25.11 \ +--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ +--set volumes.gcsMounts[0].mountPath=/job-logs \ +--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ +--set queue=${KUEUE_NAME} +``` + +**Examples** + +- To set the number of training steps to 100, run the following command from + your client: + + ```bash + cd $RECIPE_ROOT + export WORKLOAD_NAME=$USER-a4-llama3-1-8b + helm install $WORKLOAD_NAME . -f values.yaml \ + --set-file workload_launcher=launcher.sh \ + --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py \ + --set workload.image=nvcr.io/nvidia/nemo:25.11 \ + --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ + --set volumes.gcsMounts[0].mountPath=/job-logs \ + --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ + --set queue=${KUEUE_NAME} \ + --set workload.arguments[0]="trainer.max_steps=100" + ``` + +### Monitor the job + +To check the status of pods in your job, run the following command: + +``` +kubectl get pods | grep $USER-a4-llama3-1-8b +``` + +Replace the following: + +- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4-llama3-1-8b. + +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-a4-llama3-1-8b-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. 
To
+uninstall Helm, run the following command from your client:
+
+```bash
+helm uninstall $USER-a4-llama3-1-8b
+```
diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/launcher.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/launcher.sh
new file mode 100644
index 00000000..19ec7a17
--- /dev/null
+++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/launcher.sh
@@ -0,0 +1,106 @@
+usage()
+{
+cat << EOF
+usage: bash ./launcher.sh [config-override [config-override ...]]
+config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000.
+EOF
+}
+
+parse_args() {
+ while [ "$1" != "" ]; do
+ case $(grep -o "=" <<< "$1" | wc -l) in
+ 1 )
+ config_overrides+=("$1")
+ ;;
+ * )
+ echo "Invalid config override: $1"
+ usage
+ exit 1
+ esac
+ shift
+ done
+ config_overrides="${config_overrides[*]}"
+}
+
+config_overrides=()
+parse_args "$@"
+
+if [ -z "${config_overrides}" ]; then
+ echo "No NeMo config overrides specified"
+else
+ echo "NeMo config overrides:"
+ echo " ${config_overrides}"
+fi
+
+if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then
+ export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH"
+ ldconfig $LD_LIBRARY_PATH
+ echo "Added $LD_LIBRARY_PATH to ldconfig:"
+ ldconfig -p | grep libcuda | sed 's/^/ /'
+ echo ""
+fi
+
+if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then
+ explicit_log_dir=${EXPLICIT_LOG_DIR}
+else
+ explicit_log_dir=workload_logs
+fi
+echo "Logging to ${explicit_log_dir}"
+
+if [[ -n "${TOKENIZER_PATH}" ]]; then
+ echo "Getting tokenizer files"
+ cp ${TOKENIZER_PATH}/* .
+ echo ""
+fi
+
+echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes"
+
+
+pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger
+
+export HF_TOKEN="${HF_TOKEN:-}" # Keep any token supplied via the environment; never hardcode secrets here.
+
+# Export the nemo2 config to yaml.
+python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=8 \ +trainer.devices=8 \ +${config_overrides} \ +--to-yaml exported_nemo_config.yaml + +# Create the nsys directory. +mkdir -p ${explicit_log_dir}/nsys + +OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \ +/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \ +-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \ +--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \ +--wait all \ +torchrun \ +--nproc-per-node="8" \ +--nnodes="${NNODES}" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=8 \ +trainer.devices=8 \ +${config_overrides} + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py + cp dllogger.json ${ARTIFACT_DIR}/dllogger.json + cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py new file mode 100644 index 00000000..a5fd7524 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py @@ 
-0,0 +1,145 @@ +"""Nemo2 pretraining recipe for Llama 3.1 8B model.""" + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama31_8b +from nemo.lightning.pytorch.callbacks import NsysCallback +from nemo.lightning.pytorch.callbacks.flops_callback import FLOPsMeasurementCallback +from nemo.utils.loggers.dllogger import DLLogger +import nemo_run as run +from scripts.performance.helpers import ( + set_primary_perf_configs, +) +from scripts.performance.utils import get_comm_overlap_callback_idx + + +def recipe( + profile_enabled: bool = False, + profile_start_step: int = 0, + profile_end_step: int = 0, + profile_ranks: str = "0", +) -> run.Partial: + """Returns a Nemo2 training recipe for Llama 3.1 8B model. + + Args: + profile_enabled: Whether to enable Nsys profiling. + profile_start_step: The step to start profiling. + profile_end_step: The step to end profiling. + profile_ranks: The ranks to profile, comma separated. + + Returns: + A Nemo2 training recipe. + """ + # Start from the Nemo standard recipe. 
+ pretrain = llama31_8b.pretrain_recipe(performance_mode=True)
+
+ num_nodes = 8
+ num_gpus_per_node = 8
+ mbs = 1
+ gbs = 2048
+ max_steps = 30
+ tp_size = 1
+ pp_size = 1
+ cp_size = 1
+ vp_size = 1 # Virtual Pipeline Parallelism
+ ep_size = 1 # Expert Parallelism
+ enable_cuda_graphs = False
+ compute_dtype = "bf16"
+ fp8_recipe = None # Not needed for bf16
+ nccl_communicator_config_path = None
+ use_mcore_fsdp = False
+ use_fsdp_double_buffer = False
+ use_user_buffer_registration = False
+ use_sharp = False
+ keep_fsdp_fp8_transpose_cache = False
+
+ pretrain = set_primary_perf_configs(
+ pretrain,
+ "pre_train",
+ num_nodes=num_nodes,
+ num_gpus_per_node=num_gpus_per_node,
+ mbs=mbs,
+ gbs=gbs,
+ max_steps=max_steps,
+ tp_size=tp_size,
+ pp_size=pp_size,
+ cp_size=cp_size,
+ vp_size=vp_size,
+ ep_size=ep_size,
+ enable_cuda_graphs=enable_cuda_graphs,
+ compute_dtype=compute_dtype,
+ fp8_recipe=fp8_recipe,
+ nccl_communicator_config_path=nccl_communicator_config_path,
+ use_mcore_fsdp=use_mcore_fsdp,
+ use_fsdp_double_buffer=use_fsdp_double_buffer,
+ use_user_buffer_registration=use_user_buffer_registration,
+ use_sharp=use_sharp,
+ keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache,
+ )
+
+ # Sequence Length (model and data)
+ pretrain.model.config.seq_length = 8192
+ pretrain.data.seq_length = 8192
+
+ # Enable activation checkpointing to save memory
+ pretrain.model.config.recompute_granularity = "selective"
+
+ # Set max_steps for trainer
+ pretrain.trainer.max_steps = max_steps
+
+ # Disable validation batches.
+ pretrain.trainer.limit_val_batches = 0.0
+ pretrain.trainer.val_check_interval = 0
+
+ # Add the Nsys profiling callback if enabled.
+ if profile_enabled:
+ pretrain.trainer.callbacks.append(
+ run.Config(
+ NsysCallback,
+ start_step=profile_start_step,
+ end_step=profile_end_step,
+ ranks=[int(x) for x in profile_ranks.split(",")],
+ gen_shape=False,
+ )
+ )
+
+ # Add the FLOPs measurement callback. 
+ pretrain.trainer.callbacks.append( + run.Config( + FLOPsMeasurementCallback, + model_name="llama31-8b", + model_config=pretrain.model.config, + data_config=pretrain.data, + ) + ) + + # When `performance_mode` is enabled, the Megatron communication overlap + # callback is already added to the recipe. + # https://github.com/NVIDIA-NeMo/NeMo/blob/90a396a567ebb4e8c1c41e454dc00cb71f911317/nemo/collections/llm/recipes/llama31_8b.py#L231 + comm_overlap_callback_idx = get_comm_overlap_callback_idx( + pretrain.trainer.callbacks + ) + pretrain.trainer.callbacks[ + comm_overlap_callback_idx + ].tp_comm_bootstrap_backend = "nccl" + + # Disable checkpointing. + pretrain.log.ckpt = None + pretrain.trainer.enable_checkpointing = False + + # Log every step. + pretrain.trainer.log_every_n_steps = 1 + + # Enable DLLogger + dllogger_config = run.Config( + DLLogger, + verbose=True, + stdout=True, + json_file="dllogger.json", + ) + pretrain.log.extra_loggers = [dllogger_config] + + return pretrain + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_factory=recipe) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/recipe_launch_command.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/recipe_launch_command.sh new file mode 100644 index 00000000..35c3e1b8 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install vishwasreddy-ubench-46fe . 
-f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py --set workload.image=nvcr.io/nvidia/nemo:25.11 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-46fe \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-config-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-config-configmap.yaml new file mode 100644 index 00000000..a1d54cee --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-job.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-job.yaml new file mode 100644 index 00000000..ae59e456 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-job.yaml @@ -0,0 +1,333 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 8 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 8}} +{{- $root := . 
-}} + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "500m" + gke-gcsfuse/memory-limit: "1Ti" + gke-gcsfuse/ephemeral-storage-limit: "2Ti" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i }}","network":"{{ $subnetwork }}"}{{ eq $i 9 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 8 }} + 
{"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 7 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ 
$gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index provided by the Indexed Job + # This is crucial for torch.distributed initialization. 
+ - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a4.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . }} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . 
}} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-launcher-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-launcher-configmap.yaml new file mode 100644 index 00000000..7026e0f1 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-svc.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-svc.yaml new file mode 100644 index 00000000..7cfe220b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/values.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/values.yaml new file mode 100644 index 00000000..4b7c1b79 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/values.yaml @@ -0,0 +1,33 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py + configPath: /workload/configs/ + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NEMO_LAUNCH_SCRIPT + value: /workload/configs/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py + gpus: 64 + image: nvcr.io/nvidia/nemo:25.11