From 52b7db69b447b963d26bc05c0ceaaf63856d3849 Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 18:32:10 +0000 Subject: [PATCH 1/9] Recipe for llama3.1-8b 16nodes with gbs 256/seq 8192 --- .../16node-bf16-seq8192-gbs256/Chart.yaml | 20 ++ .../16node-bf16-seq8192-gbs256/README.md | 153 ++++++++ .../16node-bf16-seq8192-gbs256/launcher.sh | 106 ++++++ ...llama3-1-8b-bf16-seq8192-gbs256-gpus128.py | 142 ++++++++ .../recipe_launch_command.sh | 1 + .../templates/workload-config-configmap.yaml | 28 ++ .../templates/workload-job.yaml | 333 ++++++++++++++++++ .../workload-launcher-configmap.yaml | 28 ++ .../templates/workload-svc.yaml | 22 ++ .../16node-bf16-seq8192-gbs256/values.yaml | 33 ++ 10 files changed, 866 insertions(+) create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/Chart.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/launcher.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus128.py create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/recipe_launch_command.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-job.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-svc.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/values.yaml diff --git 
a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/Chart.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/Chart.yaml new file mode 100644 index 00000000..af46c11a --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md new file mode 100644 index 00000000..483ed116 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md @@ -0,0 +1,153 @@ + +# Pretrain llama3-1-8b workloads on a4 GKE Node pools with Nvidia NeMo Framework + +This recipe outlines the steps for running a llama3-1-8b pretraining +workload on [a4 GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA NeMo framework](https://github.com/NVIDIA/nemo). 
+ +## Orchestration and deployment tools + +For this recipe, the following setup is used: + +- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +- Pretraining job configuration and deployment - A Helm chart is used to + configure and deploy the [Kubernetes Jobset](https://kubernetes.io/blog/2025/03/23/introducing-jobset) resource which manages the execution of the + [NeMo pretraining workload](https://github.com/NVIDIA/nemo). + +## Test environment + +This recipe has been optimized for and tested with the following configuration: + +- GKE cluster +Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4) +to create your a4 GKE cluster. + +## Training dataset + +This recipe uses a mock pretraining dataset provided by the NeMo framework. + +## Docker container image + +This recipe uses the following docker images: + +- `nvcr.io/nvidia/nemo:25.07` +- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0` + +## Run the recipe + +From your client workstation, complete the following steps: + +### Configure environment settings + +Set the environment variables to match your environment: + + ```bash + export PROJECT_ID= + export CLUSTER_REGION= + export CLUSTER_NAME= + export GCS_BUCKET= # Note: path should not be prefixed with gs:// + export KUEUE_NAME= + ``` + +Replace the following values: + + - ``: your Google Cloud project ID. + - ``: the region where your cluster is located. + - ``: the name of your GKE cluster. + - ``: the name of your Cloud Storage bucket. Don't include the `gs://` prefix. + - ``: the name of the Kueue local queue. The default queue created by the cluster toolkit is `a4`. Make sure to verify the name of the local queue in your cluster. + +Set the default project: + + ```bash + gcloud config set project $PROJECT_ID + ``` + +### Get the recipe + +Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
+ +``` +git clone https://github.com/ai-hypercomputer/gpu-recipes.git +cd gpu-recipes +export REPO_ROOT=`git rev-parse --show-toplevel` +export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256 +cd $RECIPE_ROOT +``` + +### Get cluster credentials + +``` +gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION +``` + +### Configure and submit a pretraining job + +#### Using 16 node (128 gpus) bf16 precision +To execute the job with the default settings, run the following command from +your client: + +```bash +cd $RECIPE_ROOT +export WORKLOAD_NAME=$USER-a4-llama3-1-8b-16node +helm install $WORKLOAD_NAME . -f values.yaml \ +--set-file workload_launcher=launcher.sh \ +--set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus128.py \ +--set workload.image=nvcr.io/nvidia/nemo:25.07 \ +--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ +--set volumes.gcsMounts[0].mountPath=/job-logs \ +--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ +--set queue=${KUEUE_NAME} +``` + +**Examples** + +- To set the number of training steps to 100, run the following command from + your client: + + ```bash + cd $RECIPE_ROOT + export WORKLOAD_NAME=$USER-a4-llama3-1-8b-16node + helm install $WORKLOAD_NAME . -f values.yaml \ + --set-file workload_launcher=launcher.sh \ + --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus128.py \ + --set workload.image=nvcr.io/nvidia/nemo:25.07 \ + --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ + --set volumes.gcsMounts[0].mountPath=/job-logs \ + --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ + --set queue=${KUEUE_NAME} \ + --set workload.arguments[0]="trainer.max_steps=100" + ``` + +### Monitor the job + +To check the status of pods in your job, run the following command: + +``` +kubectl get pods | grep $USER-a4-llama3-1-8b-16node +``` + +Replace the following: + +- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4-llama3-1-8b-16node. 
+ +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-a4-llama3-1-8b-16node-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. To +uninstall Helm, run the following command from your client: + +```bash +helm uninstall $USER-a4-llama3-1-8b-16node +``` \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/launcher.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/launcher.sh new file mode 100644 index 00000000..daf2bd09 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/launcher.sh @@ -0,0 +1,106 @@ +usage() +{ +cat << EOF +usage: bash ./launcher.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. 
+EOF +} + +parse_args() { + while [ "$1" != "" ]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [ -z "${config_overrides}" ]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then + export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH" + ldconfig $LD_LIBRARY_PATH + echo "Added $LD_LIBRARY_PATH to ldconfig:" + ldconfig -p | grep libcuda | sed 's/^/ /' + echo "" +fi + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp ${TOKENIZER_PATH}/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +export HF_TOKEN="" + +# Export the nemo2 config to yaml. +python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=16 \ +trainer.devices=8 \ +${config_overrides} \ +--to-yaml exported_nemo_config.yaml + +# Create the nsys directory. 
+mkdir -p ${explicit_log_dir}/nsys + +OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \ +/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \ +-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \ +--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \ +--wait all \ +torchrun \ +--nproc-per-node="8" \ +--nnodes="${NNODES}" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=16 \ +trainer.devices=8 \ +${config_overrides} + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py + cp dllogger.json ${ARTIFACT_DIR}/dllogger.json + cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus128.py b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus128.py new file mode 100644 index 00000000..4ae82ea4 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus128.py @@ -0,0 +1,142 @@ +"""Nemo2 pretraining recipe for Llama 3.1 8B model.""" + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama31_8b +from nemo.lightning.pytorch.callbacks import NsysCallback +from nemo.lightning.pytorch.callbacks.flops_callback import 
FLOPsMeasurementCallback +from nemo.utils.loggers.dllogger import DLLogger +import nemo_run as run +from scripts.performance.helpers import ( + set_primary_perf_configs, +) +from scripts.performance.utils import get_comm_overlap_callback_idx + + +def recipe( + profile_enabled: bool = False, + profile_start_step: int = 0, + profile_end_step: int = 0, + profile_ranks: str = "0", +) -> run.Partial: + """Returns a Nemo2 training recipe for Llama 3.1 8B model. + + Args: + profile_enabled: Whether to enable Nsys profiling. + profile_start_step: The step to start profiling. + profile_end_step: The step to end profiling. + profile_ranks: The ranks to profile, comma separated. + + Returns: + A Nemo2 training recipe. + """ + # Start from the Nemo standard recipe. + pretrain = llama31_8b.pretrain_recipe(performance_mode=True) + + num_nodes = 16 + num_gpus_per_node = 8 + mbs = 2 + gbs = 256 + max_steps = 30 + tp_size = 1 + pp_size = 1 + cp_size = 1 + vp_size = 1 # Virtual Pipeline Parallelism + ep_size = 1 # Expert Parallelism + enable_cuda_graphs = False + compute_dtype = "bf16" + fp8_recipe = None # Not needed for bf16 + nccl_communicator_config_path = None + use_mcore_fsdp = False + use_fsdp_double_buffer = False + use_user_buffer_registration = False + use_sharp = False + keep_fsdp_fp8_transpose_cache = False + + pretrain = set_primary_perf_configs( + pretrain, + "pre_train", + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + mbs=mbs, + gbs=gbs, + max_steps=max_steps, + tp_size=tp_size, + pp_size=pp_size, + cp_size=cp_size, + vp_size=vp_size, + ep_size=ep_size, + enable_cuda_graphs=enable_cuda_graphs, + compute_dtype=compute_dtype, + fp8_recipe=fp8_recipe, + nccl_communicator_config_path=nccl_communicator_config_path, + use_mcore_fsdp=use_mcore_fsdp, + use_fsdp_double_buffer=use_fsdp_double_buffer, + use_user_buffer_registration=use_user_buffer_registration, + use_sharp=use_sharp, + keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache, + ) + + # 
Sequence Length (model and data) + pretrain.model.config.seq_length = 8192 + pretrain.data.seq_length = 8192 + + # Set the number of steps to 50 for a quicker benchmark. + pretrain.trainer.max_steps = 50 + + # Disable validation batches. + pretrain.trainer.limit_val_batches = 0.0 + pretrain.trainer.val_check_interval = 0 + + # Add the Nsys profiling callback if enabled. + if profile_enabled: + pretrain.trainer.callbacks.append( + run.Config( + NsysCallback, + start_step=profile_start_step, + end_step=profile_end_step, + ranks=[int(x) for x in profile_ranks.split(",")], + gen_shape=False, + ) + ) + + # Add the FLOPs measurement callback. + pretrain.trainer.callbacks.append( + run.Config( + FLOPsMeasurementCallback, + model_name="llama31-8b", + model_config=pretrain.model.config, + data_config=pretrain.data, + ) + ) + + # When `performance_mode` is enabled, the Megatron communication overlap + # callback is already added to the recipe. + # https://github.com/NVIDIA-NeMo/NeMo/blob/90a396a567ebb4e8c1c41e454dc00cb71f911317/nemo/collections/llm/recipes/llama31_8b.py#L231 + comm_overlap_callback_idx = get_comm_overlap_callback_idx( + pretrain.trainer.callbacks + ) + pretrain.trainer.callbacks[ + comm_overlap_callback_idx + ].tp_comm_bootstrap_backend = "nccl" + + # Disable checkpointing. + pretrain.log.ckpt = None + pretrain.trainer.enable_checkpointing = False + + # Log every step. 
+ pretrain.trainer.log_every_n_steps = 1 + + # Enable DLLogger + dllogger_config = run.Config( + DLLogger, + verbose=True, + stdout=True, + json_file="dllogger.json", + ) + pretrain.log.extra_loggers = [dllogger_config] + + return pretrain + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_factory=recipe) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/recipe_launch_command.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/recipe_launch_command.sh new file mode 100644 index 00000000..29ae3f6a --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install vishwasreddy-ubench-6jdz . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus128.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-6jdz \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml new file mode 100644 index 00000000..a1d54cee --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-job.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-job.yaml new file mode 100644 index 00000000..ae59e456 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-job.yaml @@ -0,0 +1,333 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 8 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 8}} +{{- $root := . 
-}} + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "500m" + gke-gcsfuse/memory-limit: "1Ti" + gke-gcsfuse/ephemeral-storage-limit: "2Ti" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i }}","network":"{{ $subnetwork }}"}{{ eq $i 9 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 8 }} + 
{"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 7 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ 
$gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index provided by the Indexed Job + # This is crucial for torch.distributed initialization. 
+ - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a4.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . }} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . 
}} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml new file mode 100644 index 00000000..7026e0f1 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-svc.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-svc.yaml new file mode 100644 index 00000000..7cfe220b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/values.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/values.yaml new file mode 100644 index 00000000..d82e91f8 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/values.yaml @@ -0,0 +1,33 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: llama3-1-8b-bf16-seq8192-gbs256-gpus128.py + configPath: /workload/configs/ + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NEMO_LAUNCH_SCRIPT + value: /workload/configs/llama3-1-8b-bf16-seq8192-gbs256-gpus128.py + gpus: 128 + image: nvcr.io/nvidia/nemo:25.07 From 5f2a0253cb2e7a787f479c56f0a0df166ad8ee2b Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 11:59:08 -0700 Subject: [PATCH 2/9] Update WORKLOAD_NAME in README for consistency --- .../16node-bf16-seq8192-gbs256/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md index 483ed116..ba00e866 100644 --- a/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md +++ 
b/training/a4/llama3-1-8b/nemo-pretraining-gke/16node-bf16-seq8192-gbs256/README.md @@ -89,7 +89,7 @@ your client: ```bash cd $RECIPE_ROOT -export WORKLOAD_NAME=$USER-a4-llama3-1-8b-16node +export WORKLOAD_NAME=$USER-a4-llama3-1-8b helm install $WORKLOAD_NAME . -f values.yaml \ --set-file workload_launcher=launcher.sh \ --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus128.py \ @@ -107,7 +107,7 @@ helm install $WORKLOAD_NAME . -f values.yaml \ ```bash cd $RECIPE_ROOT - export WORKLOAD_NAME=$USER-a4-llama3-1-8b-16node + export WORKLOAD_NAME=$USER-a4-llama3-1-8b helm install $WORKLOAD_NAME . -f values.yaml \ --set-file workload_launcher=launcher.sh \ --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus128.py \ @@ -124,7 +124,7 @@ helm install $WORKLOAD_NAME . -f values.yaml \ To check the status of pods in your job, run the following command: ``` -kubectl get pods | grep $USER-a4-llama3-1-8b-16node +kubectl get pods | grep $USER-a4-llama3-1-8b ``` Replace the following: @@ -141,7 +141,7 @@ Information about the training job's progress, including crucial details such as loss, step count, and step time, is generated by the rank 0 process. This process runs on the pod whose name begins with `JOB_NAME_PREFIX-workload-0-0`. -For example: `$USER-a4-llama3-1-8b-16node-workload-0-0-s9zrv`. +For example: `$USER-a4-llama3-1-8b-workload-0-0-s9zrv`. ### Uninstall the Helm release @@ -149,5 +149,5 @@ You can delete the job and other resources created by the Helm chart. 
To uninstall Helm, run the following command from your client: ```bash -helm uninstall $USER-a4-llama3-1-8b-16node -``` \ No newline at end of file +helm uninstall $USER-a4-llama3-1-8b +``` From 8f5c02ebdc3e9849596c6f110df3bd8363bfbf3d Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 19:31:17 +0000 Subject: [PATCH 3/9] feat: add 8-node bf16 recipe for llama3-1-8b --- .../8node-bf16-seq8192-gbs256/Chart.yaml | 20 ++ .../8node-bf16-seq8192-gbs256/README.md | 153 ++++++++ .../8node-bf16-seq8192-gbs256/launcher.sh | 106 ++++++ .../llama3-1-8b-bf16-seq8192-gbs256-gpus64.py | 142 ++++++++ .../recipe_launch_command.sh | 1 + .../templates/workload-config-configmap.yaml | 28 ++ .../templates/workload-job.yaml | 333 ++++++++++++++++++ .../workload-launcher-configmap.yaml | 28 ++ .../templates/workload-svc.yaml | 22 ++ .../8node-bf16-seq8192-gbs256/values.yaml | 33 ++ 10 files changed, 866 insertions(+) create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/Chart.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/launcher.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus64.py create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/recipe_launch_command.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-job.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-svc.yaml create mode 100644 
training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/values.yaml diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/Chart.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/Chart.yaml new file mode 100644 index 00000000..af46c11a --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md new file mode 100644 index 00000000..cc666f48 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md @@ -0,0 +1,153 @@ + +# Pretrain llama3-1-8b workloads on a4 GKE Node pools with Nvidia NeMo Framework + +This recipe outlines the steps for running a llama3-1-8b pretraining +workload on [a4 GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA NeMo framework](https://github.com/NVIDIA/nemo). 
+
+## Orchestration and deployment tools
+
+For this recipe, the following setup is used:
+
+- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine)
+- Pretraining job configuration and deployment - A Helm chart is used to
+  configure and deploy the [Kubernetes Jobset](https://kubernetes.io/blog/2025/03/23/introducing-jobset) resource which manages the execution of the
+  [NeMo pretraining workload](https://github.com/NVIDIA/nemo).
+
+## Test environment
+
+This recipe has been optimized for and tested with the following configuration:
+
+- GKE cluster
+Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4)
+to create your a4 GKE cluster.
+
+## Training dataset
+
+This recipe uses a mock pretraining dataset provided by the NeMo framework.
+
+## Docker container image
+
+This recipe uses the following docker images:
+
+- `nvcr.io/nvidia/nemo:25.07`
+- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0`
+
+## Run the recipe
+
+From your client workstation, complete the following steps:
+
+### Configure environment settings
+
+Set the environment variables to match your environment:
+
+  ```bash
+  export PROJECT_ID=<PROJECT_ID>
+  export CLUSTER_REGION=<CLUSTER_REGION>
+  export CLUSTER_NAME=<CLUSTER_NAME>
+  export GCS_BUCKET=<GCS_BUCKET> # Note: path should not be prefixed with gs://
+  export KUEUE_NAME=<KUEUE_NAME>
+  ```
+
+Replace the following values:
+
+  - `<PROJECT_ID>`: your Google Cloud project ID.
+  - `<CLUSTER_REGION>`: the region where your cluster is located.
+  - `<CLUSTER_NAME>`: the name of your GKE cluster.
+  - `<GCS_BUCKET>`: the name of your Cloud Storage bucket. Don't include the `gs://` prefix.
+  - `<KUEUE_NAME>`: the name of the Kueue local queue. The default queue created by the cluster toolkit is `a4`. Make sure to verify the name of the local queue in your cluster.
+
+Set the default project:
+
+  ```bash
+  gcloud config set project $PROJECT_ID
+  ```
+
+### Get the recipe
+
+Clone the `gpu-recipes` repository and set a reference to the recipe folder.
+
+```
+git clone https://github.com/ai-hypercomputer/gpu-recipes.git
+cd gpu-recipes
+export REPO_ROOT=`git rev-parse --show-toplevel`
+export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256
+cd $RECIPE_ROOT
+```
+
+### Get cluster credentials
+
+```
+gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION
+```
+
+### Configure and submit a pretraining job
+
+#### Using 8 node (64 gpus) bf16 precision
+To execute the job with the default settings, run the following command from
+your client:
+
+```bash
+cd $RECIPE_ROOT
+export WORKLOAD_NAME=$USER-a4-llama3-1-8b-8node
+helm install $WORKLOAD_NAME . -f values.yaml \
+--set-file workload_launcher=launcher.sh \
+--set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus64.py \
+--set workload.image=nvcr.io/nvidia/nemo:25.07 \
+--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
+--set volumes.gcsMounts[0].mountPath=/job-logs \
+--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \
+--set queue=${KUEUE_NAME}
+```
+
+**Examples**
+
+- To set the number of training steps to 100, run the following command from
+  your client:
+
+  ```bash
+  cd $RECIPE_ROOT
+  export WORKLOAD_NAME=$USER-a4-llama3-1-8b-8node
+  helm install $WORKLOAD_NAME . -f values.yaml \
+  --set-file workload_launcher=launcher.sh \
+  --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus64.py \
+  --set workload.image=nvcr.io/nvidia/nemo:25.07 \
+  --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \
+  --set volumes.gcsMounts[0].mountPath=/job-logs \
+  --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \
+  --set queue=${KUEUE_NAME} \
+  --set workload.arguments[0]="trainer.max_steps=100"
+  ```
+
+### Monitor the job
+
+To check the status of pods in your job, run the following command:
+
+```
+kubectl get pods | grep $USER-a4-llama3-1-8b-8node
+```
+
+Replace the following:
+
+- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4-llama3-1-8b-8node.
+ +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-a4-llama3-1-8b-8node-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. To +uninstall Helm, run the following command from your client: + +```bash +helm uninstall $USER-a4-llama3-1-8b-8node +``` \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/launcher.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/launcher.sh new file mode 100644 index 00000000..357d27a4 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/launcher.sh @@ -0,0 +1,106 @@ +usage() +{ +cat << EOF +usage: bash ./launcher.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. 
+EOF +} + +parse_args() { + while [ "$1" != "" ]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [ -z "${config_overrides}" ]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then + export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH" + ldconfig $LD_LIBRARY_PATH + echo "Added $LD_LIBRARY_PATH to ldconfig:" + ldconfig -p | grep libcuda | sed 's/^/ /' + echo "" +fi + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp ${TOKENIZER_PATH}/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +export HF_TOKEN="" + +# Export the nemo2 config to yaml. +python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=8 \ +trainer.devices=8 \ +${config_overrides} \ +--to-yaml exported_nemo_config.yaml + +# Create the nsys directory. 
+mkdir -p ${explicit_log_dir}/nsys + +OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \ +/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \ +-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \ +--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \ +--wait all \ +torchrun \ +--nproc-per-node="8" \ +--nnodes="${NNODES}" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=8 \ +trainer.devices=8 \ +${config_overrides} + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py + cp dllogger.json ${ARTIFACT_DIR}/dllogger.json + cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus64.py b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus64.py new file mode 100644 index 00000000..fbf01761 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/llama3-1-8b-bf16-seq8192-gbs256-gpus64.py @@ -0,0 +1,142 @@ +"""Nemo2 pretraining recipe for Llama 3.1 8B model.""" + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama31_8b +from nemo.lightning.pytorch.callbacks import NsysCallback +from nemo.lightning.pytorch.callbacks.flops_callback import 
FLOPsMeasurementCallback +from nemo.utils.loggers.dllogger import DLLogger +import nemo_run as run +from scripts.performance.helpers import ( + set_primary_perf_configs, +) +from scripts.performance.utils import get_comm_overlap_callback_idx + + +def recipe( + profile_enabled: bool = False, + profile_start_step: int = 0, + profile_end_step: int = 0, + profile_ranks: str = "0", +) -> run.Partial: + """Returns a Nemo2 training recipe for Llama 3.1 8B model. + + Args: + profile_enabled: Whether to enable Nsys profiling. + profile_start_step: The step to start profiling. + profile_end_step: The step to end profiling. + profile_ranks: The ranks to profile, comma separated. + + Returns: + A Nemo2 training recipe. + """ + # Start from the Nemo standard recipe. + pretrain = llama31_8b.pretrain_recipe(performance_mode=True) + + num_nodes = 8 + num_gpus_per_node = 8 + mbs = 2 + gbs = 256 + max_steps = 30 + tp_size = 1 + pp_size = 1 + cp_size = 1 + vp_size = 1 # Virtual Pipeline Parallelism + ep_size = 1 # Expert Parallelism + enable_cuda_graphs = False + compute_dtype = "bf16" + fp8_recipe = None # Not needed for bf16 + nccl_communicator_config_path = None + use_mcore_fsdp = False + use_fsdp_double_buffer = False + use_user_buffer_registration = False + use_sharp = False + keep_fsdp_fp8_transpose_cache = False + + pretrain = set_primary_perf_configs( + pretrain, + "pre_train", + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + mbs=mbs, + gbs=gbs, + max_steps=max_steps, + tp_size=tp_size, + pp_size=pp_size, + cp_size=cp_size, + vp_size=vp_size, + ep_size=ep_size, + enable_cuda_graphs=enable_cuda_graphs, + compute_dtype=compute_dtype, + fp8_recipe=fp8_recipe, + nccl_communicator_config_path=nccl_communicator_config_path, + use_mcore_fsdp=use_mcore_fsdp, + use_fsdp_double_buffer=use_fsdp_double_buffer, + use_user_buffer_registration=use_user_buffer_registration, + use_sharp=use_sharp, + keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache, + ) + + # Sequence 
Length (model and data) + pretrain.model.config.seq_length = 8192 + pretrain.data.seq_length = 8192 + + # Set the number of steps to 50 for a quicker benchmark. + pretrain.trainer.max_steps = 50 + + # Disable validation batches. + pretrain.trainer.limit_val_batches = 0.0 + pretrain.trainer.val_check_interval = 0 + + # Add the Nsys profiling callback if enabled. + if profile_enabled: + pretrain.trainer.callbacks.append( + run.Config( + NsysCallback, + start_step=profile_start_step, + end_step=profile_end_step, + ranks=[int(x) for x in profile_ranks.split(",")], + gen_shape=False, + ) + ) + + # Add the FLOPs measurement callback. + pretrain.trainer.callbacks.append( + run.Config( + FLOPsMeasurementCallback, + model_name="llama31-8b", + model_config=pretrain.model.config, + data_config=pretrain.data, + ) + ) + + # When `performance_mode` is enabled, the Megatron communication overlap + # callback is already added to the recipe. + # https://github.com/NVIDIA-NeMo/NeMo/blob/90a396a567ebb4e8c1c41e454dc00cb71f911317/nemo/collections/llm/recipes/llama31_8b.py#L231 + comm_overlap_callback_idx = get_comm_overlap_callback_idx( + pretrain.trainer.callbacks + ) + pretrain.trainer.callbacks[ + comm_overlap_callback_idx + ].tp_comm_bootstrap_backend = "nccl" + + # Disable checkpointing. + pretrain.log.ckpt = None + pretrain.trainer.enable_checkpointing = False + + # Log every step. 
+ pretrain.trainer.log_every_n_steps = 1 + + # Enable DLLogger + dllogger_config = run.Config( + DLLogger, + verbose=True, + stdout=True, + json_file="dllogger.json", + ) + pretrain.log.extra_loggers = [dllogger_config] + + return pretrain + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_factory=recipe) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/recipe_launch_command.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/recipe_launch_command.sh new file mode 100644 index 00000000..378e1475 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install vishwasreddy-ubench-c3i3 . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus64.py --set workload.image=nvcr.io/nvidia/nemo:25.07 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-c3i3 \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml new file mode 100644 index 00000000..a1d54cee --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-job.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-job.yaml new file mode 100644 index 00000000..ae59e456 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-job.yaml @@ -0,0 +1,333 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 8 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 8}} +{{- $root := . 
-}} + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "500m" + gke-gcsfuse/memory-limit: "1Ti" + gke-gcsfuse/ephemeral-storage-limit: "2Ti" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i }}","network":"{{ $subnetwork }}"}{{ eq $i 9 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 8 }} + 
{"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 7 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ 
$gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index provided by the Indexed Job + # This is crucial for torch.distributed initialization. 
+ - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a4.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . }} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . 
}} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml new file mode 100644 index 00000000..7026e0f1 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-svc.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-svc.yaml new file mode 100644 index 00000000..7cfe220b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/values.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/values.yaml new file mode 100644 index 00000000..42c40d7b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/values.yaml @@ -0,0 +1,33 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: llama3-1-8b-bf16-seq8192-gbs256-gpus64.py + configPath: /workload/configs/ + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NEMO_LAUNCH_SCRIPT + value: /workload/configs/llama3-1-8b-bf16-seq8192-gbs256-gpus64.py + gpus: 64 + image: nvcr.io/nvidia/nemo:25.07 From 2ac2b5b75a1dc95a33786d80aa7ff0f8bc1f97c6 Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 12:38:57 -0700 Subject: [PATCH 4/9] Remove '-8node' suffix from workload name --- .../8node-bf16-seq8192-gbs256/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md index cc666f48..aa44841a 100644 --- a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md +++ 
b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs256/README.md @@ -89,7 +89,7 @@ your client: ```bash cd $RECIPE_ROOT -export WORKLOAD_NAME=$USER-a4-llama3-1-8b-8node +export WORKLOAD_NAME=$USER-a4-llama3-1-8b helm install $WORKLOAD_NAME . -f values.yaml \ --set-file workload_launcher=launcher.sh \ --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus64.py \ @@ -107,7 +107,7 @@ helm install $WORKLOAD_NAME . -f values.yaml \ ```bash cd $RECIPE_ROOT - export WORKLOAD_NAME=$USER-a4-llama3-1-8b-8node + export WORKLOAD_NAME=$USER-a4-llama3-1-8b helm install $WORKLOAD_NAME . -f values.yaml \ --set-file workload_launcher=launcher.sh \ --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs256-gpus64.py \ @@ -124,7 +124,7 @@ helm install $WORKLOAD_NAME . -f values.yaml \ To check the status of pods in your job, run the following command: ``` -kubectl get pods | grep $USER-a4-llama3-1-8b-8node +kubectl get pods | grep $USER-a4-llama3-1-8b ``` Replace the following: @@ -141,7 +141,7 @@ Information about the training job's progress, including crucial details such as loss, step count, and step time, is generated by the rank 0 process. This process runs on the pod whose name begins with `JOB_NAME_PREFIX-workload-0-0`. -For example: `$USER-a4-llama3-1-8b-8node-workload-0-0-s9zrv`. +For example: `$USER-a4-llama3-1-8b-workload-0-0-s9zrv`. ### Uninstall the Helm release @@ -149,5 +149,5 @@ You can delete the job and other resources created by the Helm chart. 
To uninstall Helm, run the following command from your client: ```bash -helm uninstall $USER-a4-llama3-1-8b-8node -``` \ No newline at end of file +helm uninstall $USER-a4-llama3-1-8b +``` From b182d558a6b2829875b38e4fe9e716cebd6fbd54 Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 19:54:19 +0000 Subject: [PATCH 5/9] feat: add 1-node bf16 recipe for llama3-1-8b --- .../1node-bf16-seq4096-gbs256/Chart.yaml | 20 ++ .../1node-bf16-seq4096-gbs256/README.md | 153 ++++++++ .../1node-bf16-seq4096-gbs256/launcher.sh | 106 ++++++ .../llama3-1-8b-bf16-seq4096-gbs256-gpus8.py | 142 ++++++++ .../recipe_launch_command.sh | 1 + .../templates/workload-config-configmap.yaml | 28 ++ .../templates/workload-job.yaml | 333 ++++++++++++++++++ .../workload-launcher-configmap.yaml | 28 ++ .../templates/workload-svc.yaml | 22 ++ .../1node-bf16-seq4096-gbs256/values.yaml | 33 ++ 10 files changed, 866 insertions(+) create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/Chart.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/launcher.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus8.py create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/recipe_launch_command.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-job.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-svc.yaml create mode 100644 
training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/values.yaml diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/Chart.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/Chart.yaml new file mode 100644 index 00000000..af46c11a --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md new file mode 100644 index 00000000..1f1e50c0 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md @@ -0,0 +1,153 @@ + +# Pretrain llama3-1-8b workloads on a4 GKE Node pools with Nvidia NeMo Framework + +This recipe outlines the steps for running a llama3-1-8b pretraining +workload on [a4 GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA NeMo framework](https://github.com/NVIDIA/nemo). 
+ +## Orchestration and deployment tools + +For this recipe, the following setup is used: + +- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +- Pretraining job configuration and deployment - A Helm chart is used to + configure and deploy the [Kubernetes Jobset](https://kubernetes.io/blog/2025/03/23/introducing-jobset) resource which manages the execution of the + [NeMo pretraining workload](https://github.com/NVIDIA/nemo). + +## Test environment + +This recipe has been optimized for and tested with the following configuration: + +- GKE cluster +Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4) +to create your a4 GKE cluster. + +## Training dataset + +This recipe uses a mock pretraining dataset provided by the NeMo framework. + +## Docker container image + +This recipe uses the following docker images: + +- `nvcr.io/nvidia/nemo:25.11` +- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0` + +## Run the recipe + +From your client workstation, complete the following steps: + +### Configure environment settings + +Set the environment variables to match your environment: + + ```bash + export PROJECT_ID= + export CLUSTER_REGION= + export CLUSTER_NAME= + export GCS_BUCKET= # Note: path should not be prefixed with gs:// + export KUEUE_NAME= + ``` + +Replace the following values: + + - ``: your Google Cloud project ID. + - ``: the region where your cluster is located. + - ``: the name of your GKE cluster. + - ``: the name of your Cloud Storage bucket. Don't include the `gs://` prefix. + - ``: the name of the Kueue local queue. The default queue created by the cluster toolkit is `a4`. Make sure to verify the name of the local queue in your cluster. + +Set the default project: + + ```bash + gcloud config set project $PROJECT_ID + ``` + +### Get the recipe + +Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
+ +``` +git clone https://github.com/ai-hypercomputer/gpu-recipes.git +cd gpu-recipes +export REPO_ROOT=`git rev-parse --show-toplevel` +export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-1-8b/nemo-pretraining-gke/1_nodes +cd $RECIPE_ROOT +``` + +### Get cluster credentials + +``` +gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION +``` + +### Configure and submit a pretraining job + +#### Using 1 node (8 gpus) bf16 precision +To execute the job with the default settings, run the following command from +your client: + +```bash +cd $RECIPE_ROOT +export WORKLOAD_NAME=$USER-a4-llama3-1-8b-1node +helm install $WORKLOAD_NAME . -f values.yaml \ +--set-file workload_launcher=launcher.sh \ +--set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus8.py \ +--set workload.image=nvcr.io/nvidia/nemo:25.11 \ +--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ +--set volumes.gcsMounts[0].mountPath=/job-logs \ +--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ +--set queue=${KUEUE_NAME} +``` + +**Examples** + +- To set the number of training steps to 100, run the following command from + your client: + + ```bash + cd $RECIPE_ROOT + export WORKLOAD_NAME=$USER-a4-llama3-1-8b-1node + helm install $WORKLOAD_NAME . -f values.yaml \ + --set-file workload_launcher=launcher.sh \ + --set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus8.py \ + --set workload.image=nvcr.io/nvidia/nemo:25.11 \ + --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ + --set volumes.gcsMounts[0].mountPath=/job-logs \ + --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ + --set queue=${KUEUE_NAME} \ + --set workload.arguments[0]="trainer.max_steps=100" + ``` + +### Monitor the job + +To check the status of pods in your job, run the following command: + +``` +kubectl get pods | grep $USER-a4-llama3-1-8b-1node +``` + +Replace the following: + +- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4-llama3-1-8b-1node. 
+ +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-a4-llama3-1-8b-1node-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. To +uninstall Helm, run the following command from your client: + +```bash +helm uninstall $USER-a4-llama3-1-8b-1node +``` \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/launcher.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/launcher.sh new file mode 100644 index 00000000..8184f6f9 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/launcher.sh @@ -0,0 +1,106 @@ +usage() +{ +cat << EOF +usage: bash ./launcher.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. 
+EOF +} + +parse_args() { + while [ "$1" != "" ]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [ -z "${config_overrides}" ]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then + export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH" + ldconfig $LD_LIBRARY_PATH + echo "Added $LD_LIBRARY_PATH to ldconfig:" + ldconfig -p | grep libcuda | sed 's/^/ /' + echo "" +fi + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp ${TOKENIZER_PATH}/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +export HF_TOKEN="" + +# Export the nemo2 config to yaml. +python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=1 \ +trainer.devices=8 \ +${config_overrides} \ +--to-yaml exported_nemo_config.yaml + +# Create the nsys directory. 
+mkdir -p ${explicit_log_dir}/nsys + +OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \ +/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \ +-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \ +--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \ +--wait all \ +torchrun \ +--nproc-per-node="8" \ +--nnodes="${NNODES}" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=1 \ +trainer.devices=8 \ +${config_overrides} + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py + cp dllogger.json ${ARTIFACT_DIR}/dllogger.json + cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus8.py b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus8.py new file mode 100644 index 00000000..3a6a1198 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus8.py @@ -0,0 +1,142 @@ +"""Nemo2 pretraining recipe for Llama 3.1 8B model.""" + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama31_8b +from nemo.lightning.pytorch.callbacks import NsysCallback +from nemo.lightning.pytorch.callbacks.flops_callback import 
FLOPsMeasurementCallback +from nemo.utils.loggers.dllogger import DLLogger +import nemo_run as run +from scripts.performance.helpers import ( + set_primary_perf_configs, +) +from scripts.performance.utils import get_comm_overlap_callback_idx + + +def recipe( + profile_enabled: bool = False, + profile_start_step: int = 0, + profile_end_step: int = 0, + profile_ranks: str = "0", +) -> run.Partial: + """Returns a Nemo2 training recipe for Llama 3.1 8B model. + + Args: + profile_enabled: Whether to enable Nsys profiling. + profile_start_step: The step to start profiling. + profile_end_step: The step to end profiling. + profile_ranks: The ranks to profile, comma separated. + + Returns: + A Nemo2 training recipe. + """ + # Start from the Nemo standard recipe. + pretrain = llama31_8b.pretrain_recipe(performance_mode=True) + + num_nodes = 1 + num_gpus_per_node = 8 + mbs = 2 + gbs = 256 + max_steps = 30 + tp_size = 1 + pp_size = 1 + cp_size = 1 + vp_size = 1 # Virtual Pipeline Parallelism + ep_size = 1 # Expert Parallelism + enable_cuda_graphs = False + compute_dtype = "bf16" + fp8_recipe = None # Not needed for bf16 + nccl_communicator_config_path = None + use_mcore_fsdp = False + use_fsdp_double_buffer = False + use_user_buffer_registration = False + use_sharp = False + keep_fsdp_fp8_transpose_cache = False + + pretrain = set_primary_perf_configs( + pretrain, + "pre_train", + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + mbs=mbs, + gbs=gbs, + max_steps=max_steps, + tp_size=tp_size, + pp_size=pp_size, + cp_size=cp_size, + vp_size=vp_size, + ep_size=ep_size, + enable_cuda_graphs=enable_cuda_graphs, + compute_dtype=compute_dtype, + fp8_recipe=fp8_recipe, + nccl_communicator_config_path=nccl_communicator_config_path, + use_mcore_fsdp=use_mcore_fsdp, + use_fsdp_double_buffer=use_fsdp_double_buffer, + use_user_buffer_registration=use_user_buffer_registration, + use_sharp=use_sharp, + keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache, + ) + + # Sequence 
Length (model and data) + pretrain.model.config.seq_length = 4096 + pretrain.data.seq_length = 4096 + + # Set the number of steps to 50 for a quicker benchmark. + pretrain.trainer.max_steps = 50 + + # Disable validation batches. + pretrain.trainer.limit_val_batches = 0.0 + pretrain.trainer.val_check_interval = 0 + + # Add the Nsys profiling callback if enabled. + if profile_enabled: + pretrain.trainer.callbacks.append( + run.Config( + NsysCallback, + start_step=profile_start_step, + end_step=profile_end_step, + ranks=[int(x) for x in profile_ranks.split(",")], + gen_shape=False, + ) + ) + + # Add the FLOPs measurement callback. + pretrain.trainer.callbacks.append( + run.Config( + FLOPsMeasurementCallback, + model_name="llama31-8b", + model_config=pretrain.model.config, + data_config=pretrain.data, + ) + ) + + # When `performance_mode` is enabled, the Megatron communication overlap + # callback is already added to the recipe. + # https://github.com/NVIDIA-NeMo/NeMo/blob/90a396a567ebb4e8c1c41e454dc00cb71f911317/nemo/collections/llm/recipes/llama31_8b.py#L231 + comm_overlap_callback_idx = get_comm_overlap_callback_idx( + pretrain.trainer.callbacks + ) + pretrain.trainer.callbacks[ + comm_overlap_callback_idx + ].tp_comm_bootstrap_backend = "nccl" + + # Disable checkpointing. + pretrain.log.ckpt = None + pretrain.trainer.enable_checkpointing = False + + # Log every step. 
+ pretrain.trainer.log_every_n_steps = 1 + + # Enable DLLogger + dllogger_config = run.Config( + DLLogger, + verbose=True, + stdout=True, + json_file="dllogger.json", + ) + pretrain.log.extra_loggers = [dllogger_config] + + return pretrain + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_factory=recipe) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/recipe_launch_command.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/recipe_launch_command.sh new file mode 100644 index 00000000..c4d8cb3b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install vishwasreddy-ubench-d5hr . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus8.py --set workload.image=nvcr.io/nvidia/nemo:25.11 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-d5hr \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml new file mode 100644 index 00000000..a1d54cee --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-job.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-job.yaml new file mode 100644 index 00000000..ae59e456 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-job.yaml @@ -0,0 +1,333 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 8 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 8}} +{{- $root := . 
-}} + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "500m" + gke-gcsfuse/memory-limit: "1Ti" + gke-gcsfuse/ephemeral-storage-limit: "2Ti" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i }}","network":"{{ $subnetwork }}"}{{ eq $i 9 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 8 }} + 
{"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 7 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ 
$gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index provided by the Indexed Job + # This is crucial for torch.distributed initialization. 
+ - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a4.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . }} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . 
}} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml new file mode 100644 index 00000000..7026e0f1 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-svc.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-svc.yaml new file mode 100644 index 00000000..7cfe220b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/values.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/values.yaml new file mode 100644 index 00000000..66fab5bd --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/values.yaml @@ -0,0 +1,33 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: llama3-1-8b-bf16-seq4096-gbs256-gpus8.py + configPath: /workload/configs/ + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NEMO_LAUNCH_SCRIPT + value: /workload/configs/llama3-1-8b-bf16-seq4096-gbs256-gpus8.py + gpus: 8 + image: nvcr.io/nvidia/nemo:25.11 From d591ebdee4d22d1049576a190e90a0b7d1a34ff0 Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 12:57:15 -0700 Subject: [PATCH 6/9] Remove '-1node' suffix from workload name --- .../1node-bf16-seq4096-gbs256/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md index 1f1e50c0..5c32f305 100644 --- a/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md +++ 
b/training/a4/llama3-1-8b/nemo-pretraining-gke/1node-bf16-seq4096-gbs256/README.md @@ -89,7 +89,7 @@ your client: ```bash cd $RECIPE_ROOT -export WORKLOAD_NAME=$USER-a4-llama3-1-8b-1node +export WORKLOAD_NAME=$USER-a4-llama3-1-8b helm install $WORKLOAD_NAME . -f values.yaml \ --set-file workload_launcher=launcher.sh \ --set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus8.py \ @@ -107,7 +107,7 @@ helm install $WORKLOAD_NAME . -f values.yaml \ ```bash cd $RECIPE_ROOT - export WORKLOAD_NAME=$USER-a4-llama3-1-8b-1node + export WORKLOAD_NAME=$USER-a4-llama3-1-8b helm install $WORKLOAD_NAME . -f values.yaml \ --set-file workload_launcher=launcher.sh \ --set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus8.py \ @@ -124,7 +124,7 @@ helm install $WORKLOAD_NAME . -f values.yaml \ To check the status of pods in your job, run the following command: ``` -kubectl get pods | grep $USER-a4-llama3-1-8b-1node +kubectl get pods | grep $USER-a4-llama3-1-8b ``` Replace the following: @@ -141,7 +141,7 @@ Information about the training job's progress, including crucial details such as loss, step count, and step time, is generated by the rank 0 process. This process runs on the pod whose name begins with `JOB_NAME_PREFIX-workload-0-0`. -For example: `$USER-a4-llama3-1-8b-1node-workload-0-0-s9zrv`. +For example: `$USER-a4-llama3-1-8b-workload-0-0-s9zrv`. ### Uninstall the Helm release @@ -149,5 +149,5 @@ You can delete the job and other resources created by the Helm chart. 
To uninstall Helm, run the following command from your client: ```bash -helm uninstall $USER-a4-llama3-1-8b-1node -``` \ No newline at end of file +helm uninstall $USER-a4-llama3-1-8b +``` From dd4dda79172fc4a9412b83c504e2a4511f789916 Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 20:07:02 +0000 Subject: [PATCH 7/9] feat: add 8-node configuration for seq4096 --- .../8node-bf16-seq4096-gbs256/Chart.yaml | 20 ++ .../8node-bf16-seq4096-gbs256/README.md | 153 ++++++++ .../8node-bf16-seq4096-gbs256/launcher.sh | 106 ++++++ .../llama3-1-8b-bf16-seq4096-gbs256-gpus64.py | 142 ++++++++ .../recipe_launch_command.sh | 1 + .../templates/workload-config-configmap.yaml | 28 ++ .../templates/workload-job.yaml | 333 ++++++++++++++++++ .../workload-launcher-configmap.yaml | 28 ++ .../templates/workload-svc.yaml | 22 ++ .../8node-bf16-seq4096-gbs256/values.yaml | 33 ++ 10 files changed, 866 insertions(+) create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/Chart.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/README.md create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/launcher.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus64.py create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/recipe_launch_command.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-job.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-svc.yaml create mode 100644 
training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/values.yaml diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/Chart.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/Chart.yaml new file mode 100644 index 00000000..af46c11a --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/README.md new file mode 100644 index 00000000..7449e4d7 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/README.md @@ -0,0 +1,153 @@ + +# Pretrain llama3-1-8b workloads on a4 GKE Node pools with Nvidia NeMo Framework + +This recipe outlines the steps for running a llama3-1-8b pretraining +workload on [a4 GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA NeMo framework](https://github.com/NVIDIA/nemo). 
+ +## Orchestration and deployment tools + +For this recipe, the following setup is used: + +- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +- Pretraining job configuration and deployment - A Helm chart is used to + configure and deploy the [Kubernetes Jobset](https://kubernetes.io/blog/2025/03/23/introducing-jobset) resource which manages the execution of the + [NeMo pretraining workload](https://github.com/NVIDIA/nemo). + +## Test environment + +This recipe has been optimized for and tested with the following configuration: + +- GKE cluster +Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4) +to create your a4 GKE cluster. + +## Training dataset + +This recipe uses a mock pretraining dataset provided by the NeMo framework. + +## Docker container image + +This recipe uses the following docker images: + +- `nvcr.io/nvidia/nemo:25.11` +- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0` + +## Run the recipe + +From your client workstation, complete the following steps: + +### Configure environment settings + +Set the environment variables to match your environment: + + ```bash + export PROJECT_ID= + export CLUSTER_REGION= + export CLUSTER_NAME= + export GCS_BUCKET= # Note: path should not be prefixed with gs:// + export KUEUE_NAME= + ``` + +Replace the following values: + + - ``: your Google Cloud project ID. + - ``: the region where your cluster is located. + - ``: the name of your GKE cluster. + - ``: the name of your Cloud Storage bucket. Don't include the `gs://` prefix. + - ``: the name of the Kueue local queue. The default queue created by the cluster toolkit is `a4`. Make sure to verify the name of the local queue in your cluster. + +Set the default project: + + ```bash + gcloud config set project $PROJECT_ID + ``` + +### Get the recipe + +Clone the `gpu-recipes` repository and set a reference to the recipe folder. 
+ +``` +git clone https://github.com/ai-hypercomputer/gpu-recipes.git +cd gpu-recipes +export REPO_ROOT=`git rev-parse --show-toplevel` +export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256 +cd $RECIPE_ROOT +``` + +### Get cluster credentials + +``` +gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION +``` + +### Configure and submit a pretraining job + +#### Using 8 nodes (64 gpus) bf16 precision +To execute the job with the default settings, run the following command from +your client: + +```bash +cd $RECIPE_ROOT +export WORKLOAD_NAME=$USER-a4-llama3-1-8b +helm install $WORKLOAD_NAME . -f values.yaml \ +--set-file workload_launcher=launcher.sh \ +--set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus64.py \ +--set workload.image=nvcr.io/nvidia/nemo:25.11 \ +--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ +--set volumes.gcsMounts[0].mountPath=/job-logs \ +--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ +--set queue=${KUEUE_NAME} +``` + +**Examples** + +- To set the number of training steps to 100, run the following command from + your client: + + ```bash + cd $RECIPE_ROOT + export WORKLOAD_NAME=$USER-a4-llama3-1-8b + helm install $WORKLOAD_NAME . -f values.yaml \ + --set-file workload_launcher=launcher.sh \ + --set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus64.py \ + --set workload.image=nvcr.io/nvidia/nemo:25.11 \ + --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ + --set volumes.gcsMounts[0].mountPath=/job-logs \ + --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ + --set queue=${KUEUE_NAME} \ + --set workload.arguments[0]="trainer.max_steps=100" + ``` + +### Monitor the job + +To check the status of pods in your job, run the following command: + +``` +kubectl get pods | grep $USER-a4-llama3-1-8b +``` + +Replace the following: + +- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4-llama3-1-8b. 
+ +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-a4-llama3-1-8b-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. To +uninstall Helm, run the following command from your client: + +```bash +helm uninstall $USER-a4-llama3-1-8b +``` \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/launcher.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/launcher.sh new file mode 100644 index 00000000..357d27a4 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/launcher.sh @@ -0,0 +1,106 @@ +usage() +{ +cat << EOF +usage: bash ./launcher.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. 
+EOF +} + +parse_args() { + while [ "$1" != "" ]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [ -z "${config_overrides}" ]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then + export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH" + ldconfig $LD_LIBRARY_PATH + echo "Added $LD_LIBRARY_PATH to ldconfig:" + ldconfig -p | grep libcuda | sed 's/^/ /' + echo "" +fi + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp ${TOKENIZER_PATH}/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +export HF_TOKEN="" + +# Export the nemo2 config to yaml. +python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=8 \ +trainer.devices=8 \ +${config_overrides} \ +--to-yaml exported_nemo_config.yaml + +# Create the nsys directory. 
+mkdir -p ${explicit_log_dir}/nsys + +OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \ +/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \ +-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \ +--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \ +--wait all \ +torchrun \ +--nproc-per-node="8" \ +--nnodes="${NNODES}" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=8 \ +trainer.devices=8 \ +${config_overrides} + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py + cp dllogger.json ${ARTIFACT_DIR}/dllogger.json + cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus64.py b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus64.py new file mode 100644 index 00000000..7b4acbba --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/llama3-1-8b-bf16-seq4096-gbs256-gpus64.py @@ -0,0 +1,142 @@ +"""Nemo2 pretraining recipe for Llama 3.1 8B model.""" + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama31_8b +from nemo.lightning.pytorch.callbacks import NsysCallback +from nemo.lightning.pytorch.callbacks.flops_callback import 
FLOPsMeasurementCallback +from nemo.utils.loggers.dllogger import DLLogger +import nemo_run as run +from scripts.performance.helpers import ( + set_primary_perf_configs, +) +from scripts.performance.utils import get_comm_overlap_callback_idx + + +def recipe( + profile_enabled: bool = False, + profile_start_step: int = 0, + profile_end_step: int = 0, + profile_ranks: str = "0", +) -> run.Partial: + """Returns a Nemo2 training recipe for Llama 3.1 8B model. + + Args: + profile_enabled: Whether to enable Nsys profiling. + profile_start_step: The step to start profiling. + profile_end_step: The step to end profiling. + profile_ranks: The ranks to profile, comma separated. + + Returns: + A Nemo2 training recipe. + """ + # Start from the Nemo standard recipe. + pretrain = llama31_8b.pretrain_recipe(performance_mode=True) + + num_nodes = 8 + num_gpus_per_node = 8 + mbs = 4 + gbs = 256 + max_steps = 30 + tp_size = 1 + pp_size = 1 + cp_size = 1 + vp_size = 1 # Virtual Pipeline Parallelism + ep_size = 1 # Expert Parallelism + enable_cuda_graphs = False + compute_dtype = "bf16" + fp8_recipe = None # Not needed for bf16 + nccl_communicator_config_path = None + use_mcore_fsdp = False + use_fsdp_double_buffer = False + use_user_buffer_registration = False + use_sharp = False + keep_fsdp_fp8_transpose_cache = False + + pretrain = set_primary_perf_configs( + pretrain, + "pre_train", + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + mbs=mbs, + gbs=gbs, + max_steps=max_steps, + tp_size=tp_size, + pp_size=pp_size, + cp_size=cp_size, + vp_size=vp_size, + ep_size=ep_size, + enable_cuda_graphs=enable_cuda_graphs, + compute_dtype=compute_dtype, + fp8_recipe=fp8_recipe, + nccl_communicator_config_path=nccl_communicator_config_path, + use_mcore_fsdp=use_mcore_fsdp, + use_fsdp_double_buffer=use_fsdp_double_buffer, + use_user_buffer_registration=use_user_buffer_registration, + use_sharp=use_sharp, + keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache, + ) + + # Sequence 
Length (model and data) + pretrain.model.config.seq_length = 4096 + pretrain.data.seq_length = 4096 + + # Set the number of steps to 50 for a quicker benchmark. + pretrain.trainer.max_steps = 50 + + # Disable validation batches. + pretrain.trainer.limit_val_batches = 0.0 + pretrain.trainer.val_check_interval = 0 + + # Add the Nsys profiling callback if enabled. + if profile_enabled: + pretrain.trainer.callbacks.append( + run.Config( + NsysCallback, + start_step=profile_start_step, + end_step=profile_end_step, + ranks=[int(x) for x in profile_ranks.split(",")], + gen_shape=False, + ) + ) + + # Add the FLOPs measurement callback. + pretrain.trainer.callbacks.append( + run.Config( + FLOPsMeasurementCallback, + model_name="llama31-8b", + model_config=pretrain.model.config, + data_config=pretrain.data, + ) + ) + + # When `performance_mode` is enabled, the Megatron communication overlap + # callback is already added to the recipe. + # https://github.com/NVIDIA-NeMo/NeMo/blob/90a396a567ebb4e8c1c41e454dc00cb71f911317/nemo/collections/llm/recipes/llama31_8b.py#L231 + comm_overlap_callback_idx = get_comm_overlap_callback_idx( + pretrain.trainer.callbacks + ) + pretrain.trainer.callbacks[ + comm_overlap_callback_idx + ].tp_comm_bootstrap_backend = "nccl" + + # Disable checkpointing. + pretrain.log.ckpt = None + pretrain.trainer.enable_checkpointing = False + + # Log every step. 
+ pretrain.trainer.log_every_n_steps = 1 + + # Enable DLLogger + dllogger_config = run.Config( + DLLogger, + verbose=True, + stdout=True, + json_file="dllogger.json", + ) + pretrain.log.extra_loggers = [dllogger_config] + + return pretrain + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_factory=recipe) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/recipe_launch_command.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/recipe_launch_command.sh new file mode 100644 index 00000000..f01800fc --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install vishwasreddy-ubench-48bq . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-8b-bf16-seq4096-gbs256-gpus64.py --set workload.image=nvcr.io/nvidia/nemo:25.11 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-48bq \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml new file mode 100644 index 00000000..a1d54cee --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-job.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-job.yaml new file mode 100644 index 00000000..ae59e456 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-job.yaml @@ -0,0 +1,333 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 8 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 8}} +{{- $root := . 
-}} + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "500m" + gke-gcsfuse/memory-limit: "1Ti" + gke-gcsfuse/ephemeral-storage-limit: "2Ti" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i }}","network":"{{ $subnetwork }}"}{{ eq $i 9 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 8 }} + 
{"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 7 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ 
$gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index provided by the Indexed Job + # This is crucial for torch.distributed initialization. 
+ - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a4.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . }} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . 
}} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml new file mode 100644 index 00000000..7026e0f1 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-svc.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-svc.yaml new file mode 100644 index 00000000..7cfe220b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/values.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/values.yaml new file mode 100644 index 00000000..7c5e36ae --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq4096-gbs256/values.yaml @@ -0,0 +1,33 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.0 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: llama3-1-8b-bf16-seq4096-gbs256-gpus64.py + configPath: /workload/configs/ + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NEMO_LAUNCH_SCRIPT + value: /workload/configs/llama3-1-8b-bf16-seq4096-gbs256-gpus64.py + gpus: 64 + image: nvcr.io/nvidia/nemo:25.11 From bf5372215aa8e042553e63c63456188594577a81 Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 20:20:01 +0000 Subject: [PATCH 8/9] feat: add 4-node bf16 recipe for llama3-1-8b --- .../4node-bf16-seq4096-gbs2048/Chart.yaml | 20 ++ .../4node-bf16-seq4096-gbs2048/README.md | 155 ++++++++ .../4node-bf16-seq4096-gbs2048/launcher.sh | 106 ++++++ .../llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py | 142 ++++++++ .../recipe_launch_command.sh | 1 + .../templates/workload-config-configmap.yaml | 28 ++ .../templates/workload-job.yaml | 333 ++++++++++++++++++ .../workload-launcher-configmap.yaml | 28 ++ .../templates/workload-svc.yaml | 22 ++ 
.../4node-bf16-seq4096-gbs2048/values.yaml | 33 ++ 10 files changed, 868 insertions(+) create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/Chart.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/README.md create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/launcher.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/recipe_launch_command.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-config-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-job.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-launcher-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-svc.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/values.yaml diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/Chart.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/Chart.yaml new file mode 100644 index 00000000..af46c11a --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/README.md new file mode 100644 index 00000000..adf30a26 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/README.md @@ -0,0 +1,155 @@ + +# Pretrain llama3-1-8b workloads on a4 GKE Node pools with Nvidia NeMo Framework + +This recipe outlines the steps for running a llama3-1-8b pretraining +workload on [a4 GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA NeMo framework](https://github.com/NVIDIA/nemo). + +## Orchestration and deployment tools + +For this recipe, the following setup is used: + +- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +- Pretraining job configuration and deployment - A Helm chart is used to + configure and deploy the [Kubernetes Jobset](https://kubernetes.io/blog/2025/03/23/introducing-jobset) resource which manages the execution of the + [NeMo pretraining workload](https://github.com/NVIDIA/nemo). + +## Test environment + +This recipe has been optimized for and tested with the following configuration: + +- GKE cluster +Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4) +to create your a4 GKE cluster. 
+
+## Training dataset
+
+This recipe uses a mock pretraining dataset provided by the NeMo framework.
+
+## Docker container image
+
+This recipe uses the following docker images:
+
+- `nvcr.io/nvidia/nemo:25.11`
+- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1`
+
+## Run the recipe
+
+From your client workstation, complete the following steps:
+
+### Configure environment settings
+
+Set the environment variables to match your environment:
+
+  ```bash
+  export PROJECT_ID=<PROJECT_ID>
+  export CLUSTER_REGION=<CLUSTER_REGION>
+  export CLUSTER_NAME=<CLUSTER_NAME>
+  export GCS_BUCKET=<GCS_BUCKET> # Note: path should not be prefixed with gs://
+  export KUEUE_NAME=<KUEUE_NAME>
+  export HF_TOKEN=<HF_TOKEN>
+  ```
+
+Replace the following values:
+
+  - `<PROJECT_ID>`: your Google Cloud project ID.
+  - `<CLUSTER_REGION>`: the region where your cluster is located.
+  - `<CLUSTER_NAME>`: the name of your GKE cluster.
+  - `<GCS_BUCKET>`: the name of your Cloud Storage bucket. Don't include the `gs://` prefix.
+  - `<KUEUE_NAME>`: the name of the Kueue local queue. The default queue created by the cluster toolkit is `a4`. Make sure to verify the name of the local queue in your cluster.
+  - `<HF_TOKEN>`: Your HuggingFace token.
+
+Set the default project:
+
+  ```bash
+  gcloud config set project $PROJECT_ID
+  ```
+
+### Get the recipe
+
+Clone the `gpu-recipes` repository and set a reference to the recipe folder.
+
+```
+git clone https://github.com/ai-hypercomputer/gpu-recipes.git
+cd gpu-recipes
+export REPO_ROOT=`git rev-parse --show-toplevel`
+export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048
+cd $RECIPE_ROOT
+```
+
+### Get cluster credentials
+
+```
+gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION
+```
+
+### Configure and submit a pretraining job
+
+#### Using 4 node (32 gpus) bf16 precision
+To execute the job with the default settings, run the following command from
+your client:
+
+```bash
+cd $RECIPE_ROOT
+export WORKLOAD_NAME=$USER-a4-llama3-1-8b
+helm install $WORKLOAD_NAME . 
-f values.yaml \ +--set-file workload_launcher=launcher.sh \ +--set-file workload_config=llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py \ +--set workload.image=nvcr.io/nvidia/nemo:25.11 \ +--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ +--set volumes.gcsMounts[0].mountPath=/job-logs \ +--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ +--set queue=${KUEUE_NAME} +``` + +**Examples** + +- To set the number of training steps to 100, run the following command from + your client: + + ```bash + cd $RECIPE_ROOT + export WORKLOAD_NAME=$USER-a4-llama3-1-8b + helm install $WORKLOAD_NAME . -f values.yaml \ + --set-file workload_launcher=launcher.sh \ + --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py \ + --set workload.image=nvcr.io/nvidia/nemo:25.11 \ + --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ + --set volumes.gcsMounts[0].mountPath=/job-logs \ + --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ + --set queue=${KUEUE_NAME} \ + --set workload.arguments[0]="trainer.max_steps=100" + ``` + +### Monitor the job + +To check the status of pods in your job, run the following command: + +``` +kubectl get pods | grep $USER-a4-llama3-1-8b +``` + +Replace the following: + +- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4-llama3-1-8b. + +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-a4-llama3-1-8b-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. 
To +uninstall Helm, run the following command from your client: + +```bash +helm uninstall $USER-a4-llama3-1-8b +``` diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/launcher.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/launcher.sh new file mode 100644 index 00000000..f9c58c29 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/launcher.sh @@ -0,0 +1,106 @@ +usage() +{ +cat << EOF +usage: bash ./launcher.sh [config-override [config-override ...]] +config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000. +EOF +} + +parse_args() { + while [ "$1" != "" ]; do + case $(grep -o "=" <<< "$1" | wc -l) in + 1 ) + config_overrides+=("$1") + ;; + * ) + echo "Invalid config override: $1" + usage + exit 1 + esac + shift + done + config_overrides="${config_overrides[*]}" +} + +config_overrides=() +parse_args "$@" + +if [ -z "${config_overrides}" ]; then + echo "No NeMo config overrides specified" +else + echo "NeMo config overrides:" + echo " ${config_overrides}" +fi + +if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then + export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH" + ldconfig $LD_LIBRARY_PATH + echo "Added $LD_LIBRARY_PATH to ldconfig:" + ldconfig -p | grep libcuda | sed 's/^/ /' + echo "" +fi + +if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then + explicit_log_dir=${EXPLICIT_LOG_DIR} +else + explicit_log_dir=workload_logs +fi +echo "Logging to ${explicit_log_dir}" + +if [[ -n "${TOKENIZER_PATH}" ]]; then + echo "Getting tokenizer files" + cp ${TOKENIZER_PATH}/* . + echo "" +fi + +echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes" + + +pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger + +export HF_TOKEN="" + +# Export the nemo2 config to yaml. 
+python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=4 \ +trainer.devices=8 \ +${config_overrides} \ +--to-yaml exported_nemo_config.yaml + +# Create the nsys directory. +mkdir -p ${explicit_log_dir}/nsys + +OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \ +/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \ +-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \ +--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \ +--wait all \ +torchrun \ +--nproc-per-node="8" \ +--nnodes="${NNODES}" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=4 \ +trainer.devices=8 \ +${config_overrides} + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py + cp dllogger.json ${ARTIFACT_DIR}/dllogger.json + cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py new file mode 100644 index 00000000..e3b90cc1 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py @@ 
-0,0 +1,142 @@ +"""Nemo2 pretraining recipe for Llama 3.1 8B model.""" + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama31_8b +from nemo.lightning.pytorch.callbacks import NsysCallback +from nemo.lightning.pytorch.callbacks.flops_callback import FLOPsMeasurementCallback +from nemo.utils.loggers.dllogger import DLLogger +import nemo_run as run +from scripts.performance.helpers import ( + set_primary_perf_configs, +) +from scripts.performance.utils import get_comm_overlap_callback_idx + + +def recipe( + profile_enabled: bool = False, + profile_start_step: int = 0, + profile_end_step: int = 0, + profile_ranks: str = "0", +) -> run.Partial: + """Returns a Nemo2 training recipe for Llama 3.1 8B model. + + Args: + profile_enabled: Whether to enable Nsys profiling. + profile_start_step: The step to start profiling. + profile_end_step: The step to end profiling. + profile_ranks: The ranks to profile, comma separated. + + Returns: + A Nemo2 training recipe. + """ + # Start from the Nemo standard recipe. 
+ pretrain = llama31_8b.pretrain_recipe(performance_mode=True) + + num_nodes = 4 + num_gpus_per_node = 8 + mbs = 4 + gbs = 2048 + max_steps = 30 + tp_size = 1 + pp_size = 1 + cp_size = 1 + vp_size = 1 # Virtual Pipeline Parallelism + ep_size = 1 # Expert Parallelism + enable_cuda_graphs = False + compute_dtype = "bf16" + fp8_recipe = None # Not needed for bf16 + nccl_communicator_config_path = None + use_mcore_fsdp = False + use_fsdp_double_buffer = False + use_user_buffer_registration = False + use_sharp = False + keep_fsdp_fp8_transpose_cache = False + + pretrain = set_primary_perf_configs( + pretrain, + "pre_train", + num_nodes=num_nodes, + num_gpus_per_node=num_gpus_per_node, + mbs=mbs, + gbs=gbs, + max_steps=max_steps, + tp_size=tp_size, + pp_size=pp_size, + cp_size=cp_size, + vp_size=vp_size, + ep_size=ep_size, + enable_cuda_graphs=enable_cuda_graphs, + compute_dtype=compute_dtype, + fp8_recipe=fp8_recipe, + nccl_communicator_config_path=nccl_communicator_config_path, + use_mcore_fsdp=use_mcore_fsdp, + use_fsdp_double_buffer=use_fsdp_double_buffer, + use_user_buffer_registration=use_user_buffer_registration, + use_sharp=use_sharp, + keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache, + ) + + # Sequence Length (model and data) + pretrain.model.config.seq_length = 4096 + pretrain.data.seq_length = 4096 + + # Set the number of steps to 50 for a quicker benchmark. + pretrain.trainer.max_steps = 50 + + # Disable validation batches. + pretrain.trainer.limit_val_batches = 0.0 + pretrain.trainer.val_check_interval = 0 + + # Add the Nsys profiling callback if enabled. + if profile_enabled: + pretrain.trainer.callbacks.append( + run.Config( + NsysCallback, + start_step=profile_start_step, + end_step=profile_end_step, + ranks=[int(x) for x in profile_ranks.split(",")], + gen_shape=False, + ) + ) + + # Add the FLOPs measurement callback. 
+ pretrain.trainer.callbacks.append( + run.Config( + FLOPsMeasurementCallback, + model_name="llama31-8b", + model_config=pretrain.model.config, + data_config=pretrain.data, + ) + ) + + # When `performance_mode` is enabled, the Megatron communication overlap + # callback is already added to the recipe. + # https://github.com/NVIDIA-NeMo/NeMo/blob/90a396a567ebb4e8c1c41e454dc00cb71f911317/nemo/collections/llm/recipes/llama31_8b.py#L231 + comm_overlap_callback_idx = get_comm_overlap_callback_idx( + pretrain.trainer.callbacks + ) + pretrain.trainer.callbacks[ + comm_overlap_callback_idx + ].tp_comm_bootstrap_backend = "nccl" + + # Disable checkpointing. + pretrain.log.ckpt = None + pretrain.trainer.enable_checkpointing = False + + # Log every step. + pretrain.trainer.log_every_n_steps = 1 + + # Enable DLLogger + dllogger_config = run.Config( + DLLogger, + verbose=True, + stdout=True, + json_file="dllogger.json", + ) + pretrain.log.extra_loggers = [dllogger_config] + + return pretrain + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_factory=recipe) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/recipe_launch_command.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/recipe_launch_command.sh new file mode 100644 index 00000000..88294653 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install vishwasreddy-ubench-squa . 
-f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py --set workload.image=nvcr.io/nvidia/nemo:25.11 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-squa \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-config-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-config-configmap.yaml new file mode 100644 index 00000000..a1d54cee --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-job.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-job.yaml new file mode 100644 index 00000000..ae59e456 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-job.yaml @@ -0,0 +1,333 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 8 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 8}} +{{- $root := . 
-}} + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "500m" + gke-gcsfuse/memory-limit: "1Ti" + gke-gcsfuse/ephemeral-storage-limit: "2Ti" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i }}","network":"{{ $subnetwork }}"}{{ eq $i 9 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 8 }} + 
{"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 7 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ 
$gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index provided by the Indexed Job + # This is crucial for torch.distributed initialization. 
+ - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a4.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . }} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . 
}} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-launcher-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-launcher-configmap.yaml new file mode 100644 index 00000000..7026e0f1 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-svc.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-svc.yaml new file mode 100644 index 00000000..7cfe220b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/values.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/values.yaml new file mode 100644 index 00000000..9d6140b2 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/4node-bf16-seq4096-gbs2048/values.yaml @@ -0,0 +1,33 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py + configPath: /workload/configs/ + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NEMO_LAUNCH_SCRIPT + value: /workload/configs/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py + gpus: 32 + image: nvcr.io/nvidia/nemo:25.11 From e96e9087c24cd8562fde9eae134625b51d4fa542 Mon Sep 17 00:00:00 2001 From: Vishwas Reddy Date: Thu, 19 Mar 2026 21:10:12 +0000 Subject: [PATCH 9/9] feat: add 8-node bf16 recipe for seq8192 gbs2048 --- .../8node-bf16-seq8192-gbs2048/Chart.yaml | 20 ++ .../8node-bf16-seq8192-gbs2048/README.md | 155 ++++++++ .../8node-bf16-seq8192-gbs2048/launcher.sh | 106 ++++++ .../llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py | 145 ++++++++ .../recipe_launch_command.sh | 1 + .../templates/workload-config-configmap.yaml | 28 ++ .../templates/workload-job.yaml | 333 ++++++++++++++++++ .../workload-launcher-configmap.yaml | 28 ++ .../templates/workload-svc.yaml | 22 ++ 
.../8node-bf16-seq8192-gbs2048/values.yaml | 33 ++ 10 files changed, 871 insertions(+) create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/Chart.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/README.md create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/launcher.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/recipe_launch_command.sh create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-config-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-job.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-launcher-configmap.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-svc.yaml create mode 100644 training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/values.yaml diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/Chart.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/Chart.yaml new file mode 100644 index 00000000..af46c11a --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4_jobset_workload +description: a4_jobset_workload +type: application +version: 0.1.0 +appVersion: "1.16.0" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/README.md b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/README.md new file mode 100644 index 00000000..23454e49 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/README.md @@ -0,0 +1,155 @@ + +# Pretrain llama3-1-8b workloads on a4 GKE Node pools with Nvidia NeMo Framework + +This recipe outlines the steps for running a llama3-1-8b pretraining +workload on [a4 GKE Node pools](https://cloud.google.com/kubernetes-engine) by using the +[NVIDIA NeMo framework](https://github.com/NVIDIA/nemo). + +## Orchestration and deployment tools + +For this recipe, the following setup is used: + +- Orchestration - [Google Kubernetes Engine (GKE)](https://cloud.google.com/kubernetes-engine) +- Pretraining job configuration and deployment - A Helm chart is used to + configure and deploy the [Kubernetes Jobset](https://kubernetes.io/blog/2025/03/23/introducing-jobset) resource which manages the execution of the + [NeMo pretraining workload](https://github.com/NVIDIA/nemo). + +## Test environment + +This recipe has been optimized for and tested with the following configuration: + +- GKE cluster +Please follow Cluster Toolkit [instructions](https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/examples/gke-a4) +to create your a4 GKE cluster. 
+
+## Training dataset
+
+This recipe uses a mock pretraining dataset provided by the NeMo framework.
+
+## Docker container image
+
+This recipe uses the following docker images:
+
+- `nvcr.io/nvidia/nemo:25.11`
+- `us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1`
+
+## Run the recipe
+
+From your client workstation, complete the following steps:
+
+### Configure environment settings
+
+Set the environment variables to match your environment:
+
+ ```bash
+ export PROJECT_ID=<PROJECT_ID>
+ export CLUSTER_REGION=<CLUSTER_REGION>
+ export CLUSTER_NAME=<CLUSTER_NAME>
+ export GCS_BUCKET=<GCS_BUCKET> # Note: path should not be prefixed with gs://
+ export KUEUE_NAME=<KUEUE_NAME>
+ export HF_TOKEN=<HF_TOKEN>
+ ```
+
+Replace the following values:
+
+ - `<PROJECT_ID>`: your Google Cloud project ID.
+ - `<CLUSTER_REGION>`: the region where your cluster is located.
+ - `<CLUSTER_NAME>`: the name of your GKE cluster.
+ - `<GCS_BUCKET>`: the name of your Cloud Storage bucket. Don't include the `gs://` prefix.
+ - `<KUEUE_NAME>`: the name of the Kueue local queue. The default queue created by the cluster toolkit is `a4`. Make sure to verify the name of the local queue in your cluster.
+ - `<HF_TOKEN>`: Your HuggingFace token.
+
+Set the default project:
+
+ ```bash
+ gcloud config set project $PROJECT_ID
+ ```
+
+### Get the recipe
+
+Clone the `gpu-recipes` repository and set a reference to the recipe folder.
+
+```
+git clone https://github.com/ai-hypercomputer/gpu-recipes.git
+cd gpu-recipes
+export REPO_ROOT=`git rev-parse --show-toplevel`
+export RECIPE_ROOT=$REPO_ROOT/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048
+cd $RECIPE_ROOT
+```
+
+### Get cluster credentials
+
+```
+gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION
+```
+
+### Configure and submit a pretraining job
+
+#### Using 8 node (64 gpus) bf16 precision
+To execute the job with the default settings, run the following command from
+your client:
+
+```bash
+cd $RECIPE_ROOT
+export WORKLOAD_NAME=$USER-a4-llama3-1-8b
+helm install $WORKLOAD_NAME . 
-f values.yaml \ +--set-file workload_launcher=launcher.sh \ +--set-file workload_config=llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py \ +--set workload.image=nvcr.io/nvidia/nemo:25.11 \ +--set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ +--set volumes.gcsMounts[0].mountPath=/job-logs \ +--set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ +--set queue=${KUEUE_NAME} +``` + +**Examples** + +- To set the number of training steps to 100, run the following command from + your client: + + ```bash + cd $RECIPE_ROOT + export WORKLOAD_NAME=$USER-a4-llama3-1-8b + helm install $WORKLOAD_NAME . -f values.yaml \ + --set-file workload_launcher=launcher.sh \ + --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py \ + --set workload.image=nvcr.io/nvidia/nemo:25.11 \ + --set volumes.gcsMounts[0].bucketName=${GCS_BUCKET} \ + --set volumes.gcsMounts[0].mountPath=/job-logs \ + --set workload.envs[0].value=/job-logs/$WORKLOAD_NAME \ + --set queue=${KUEUE_NAME} \ + --set workload.arguments[0]="trainer.max_steps=100" + ``` + +### Monitor the job + +To check the status of pods in your job, run the following command: + +``` +kubectl get pods | grep $USER-a4-llama3-1-8b +``` + +Replace the following: + +- JOB_NAME_PREFIX - your job name prefix. For example $USER-a4-llama3-1-8b. + +To get the logs for one of the pods, run the following command: + +``` +kubectl logs POD_NAME +``` + +Information about the training job's progress, including crucial details such as +loss, step count, and step time, is generated by the rank 0 process. +This process runs on the pod whose name begins with +`JOB_NAME_PREFIX-workload-0-0`. +For example: `$USER-a4-llama3-1-8b-workload-0-0-s9zrv`. + +### Uninstall the Helm release + +You can delete the job and other resources created by the Helm chart. 
To
+uninstall Helm, run the following command from your client:
+
+```bash
+helm uninstall $USER-a4-llama3-1-8b
+```
diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/launcher.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/launcher.sh
new file mode 100644
index 00000000..19ec7a17
--- /dev/null
+++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/launcher.sh
@@ -0,0 +1,106 @@
+usage()
+{
+cat << EOF
+usage: bash ./launcher.sh [config-override [config-override ...]]
+config-override (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000.
+EOF
+}
+
+parse_args() {
+ while [ "$1" != "" ]; do
+ case $(grep -o "=" <<< "$1" | wc -l) in
+ 1 )
+ config_overrides+=("$1")
+ ;;
+ * )
+ echo "Invalid config override: $1"
+ usage
+ exit 1
+ esac
+ shift
+ done
+ config_overrides="${config_overrides[*]}"
+}
+
+config_overrides=()
+parse_args "$@"
+
+if [ -z "${config_overrides}" ]; then
+ echo "No NeMo config overrides specified"
+else
+ echo "NeMo config overrides:"
+ echo " ${config_overrides}"
+fi
+
+if [[ -n "${NCCL_PLUGIN_PATH}" ]]; then
+ export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH"
+ ldconfig $LD_LIBRARY_PATH
+ echo "Added $LD_LIBRARY_PATH to ldconfig:"
+ ldconfig -p | grep libcuda | sed 's/^/ /'
+ echo ""
+fi
+
+if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then
+ explicit_log_dir=${EXPLICIT_LOG_DIR}
+else
+ explicit_log_dir=workload_logs
+fi
+echo "Logging to ${explicit_log_dir}"
+
+if [[ -n "${TOKENIZER_PATH}" ]]; then
+ echo "Getting tokenizer files"
+ cp ${TOKENIZER_PATH}/* .
+ echo ""
+fi
+
+echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of $NNODES nodes"
+
+
+pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger
+
+export HF_TOKEN="${HF_TOKEN:-}" # Keep any token supplied via the environment; never hardcode secrets here.
+
+# Export the nemo2 config to yaml.
+python ${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=8 \ +trainer.devices=8 \ +${config_overrides} \ +--to-yaml exported_nemo_config.yaml + +# Create the nsys directory. +mkdir -p ${explicit_log_dir}/nsys + +OMP_NUM_THREADS=12 NSYS_CONFIG_DIRECTIVES="AgentLaunchTimeoutSec=240;AppLaunchTimeoutSec=240" TORCH_NCCL_ENABLE_MONITORING=0 \ +/usr/local/bin/nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi --capture-range-end=stop \ +-o ${explicit_log_dir}/nsys/noderank-${JOB_COMPLETION_INDEX} \ +--session-new "nemo-rank${JOB_COMPLETION_INDEX}"-$RANDOM \ +--wait all \ +torchrun \ +--nproc-per-node="8" \ +--nnodes="${NNODES}" \ +--node_rank="${JOB_COMPLETION_INDEX}" \ +--rdzv_id="${JOB_IDENTIFIER}" \ +--master_addr="${MASTER_ADDR}" \ +--master_port="${MASTER_PORT}" \ +${NEMO_LAUNCH_SCRIPT} --factory "recipe()" \ +trainer.num_nodes="$NNODES" \ +log.explicit_log_dir="${explicit_log_dir}" \ +trainer.max_steps=30 \ +trainer.num_nodes=8 \ +trainer.devices=8 \ +${config_overrides} + +if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then + mkdir -p ${ARTIFACT_DIR} + cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/ + cp ${NEMO_LAUNCH_SCRIPT} ${ARTIFACT_DIR}/run-cli.py + cp dllogger.json ${ARTIFACT_DIR}/dllogger.json + cp exported_nemo_config.yaml ${ARTIFACT_DIR}/nemo-configuration.yaml + env > ${ARTIFACT_DIR}/environ.txt + ls ${ARTIFACT_DIR} +fi +echo "Training completed" +echo "Pod on $(hostname --fqdn) is exiting" \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py new file mode 100644 index 00000000..a5fd7524 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py @@ 
-0,0 +1,145 @@ +"""Nemo2 pretraining recipe for Llama 3.1 8B model.""" + +from nemo.collections import llm +from nemo.collections.llm.recipes import llama31_8b +from nemo.lightning.pytorch.callbacks import NsysCallback +from nemo.lightning.pytorch.callbacks.flops_callback import FLOPsMeasurementCallback +from nemo.utils.loggers.dllogger import DLLogger +import nemo_run as run +from scripts.performance.helpers import ( + set_primary_perf_configs, +) +from scripts.performance.utils import get_comm_overlap_callback_idx + + +def recipe( + profile_enabled: bool = False, + profile_start_step: int = 0, + profile_end_step: int = 0, + profile_ranks: str = "0", +) -> run.Partial: + """Returns a Nemo2 training recipe for Llama 3.1 8B model. + + Args: + profile_enabled: Whether to enable Nsys profiling. + profile_start_step: The step to start profiling. + profile_end_step: The step to end profiling. + profile_ranks: The ranks to profile, comma separated. + + Returns: + A Nemo2 training recipe. + """ + # Start from the Nemo standard recipe. 
+ pretrain = llama31_8b.pretrain_recipe(performance_mode=True)
+
+ num_nodes = 8
+ num_gpus_per_node = 8
+ mbs = 1
+ gbs = 2048
+ max_steps = 30
+ tp_size = 1
+ pp_size = 1
+ cp_size = 1
+ vp_size = 1 # Virtual Pipeline Parallelism
+ ep_size = 1 # Expert Parallelism
+ enable_cuda_graphs = False
+ compute_dtype = "bf16"
+ fp8_recipe = None # Not needed for bf16
+ nccl_communicator_config_path = None
+ use_mcore_fsdp = False
+ use_fsdp_double_buffer = False
+ use_user_buffer_registration = False
+ use_sharp = False
+ keep_fsdp_fp8_transpose_cache = False
+
+ pretrain = set_primary_perf_configs(
+ pretrain,
+ "pre_train",
+ num_nodes=num_nodes,
+ num_gpus_per_node=num_gpus_per_node,
+ mbs=mbs,
+ gbs=gbs,
+ max_steps=max_steps,
+ tp_size=tp_size,
+ pp_size=pp_size,
+ cp_size=cp_size,
+ vp_size=vp_size,
+ ep_size=ep_size,
+ enable_cuda_graphs=enable_cuda_graphs,
+ compute_dtype=compute_dtype,
+ fp8_recipe=fp8_recipe,
+ nccl_communicator_config_path=nccl_communicator_config_path,
+ use_mcore_fsdp=use_mcore_fsdp,
+ use_fsdp_double_buffer=use_fsdp_double_buffer,
+ use_user_buffer_registration=use_user_buffer_registration,
+ use_sharp=use_sharp,
+ keep_fsdp_fp8_transpose_cache=keep_fsdp_fp8_transpose_cache,
+ )
+
+ # Sequence Length (model and data)
+ pretrain.model.config.seq_length = 8192
+ pretrain.data.seq_length = 8192
+
+ # Enable activation checkpointing to save memory
+ pretrain.model.config.recompute_granularity = "selective"
+
+ # Set max_steps for trainer
+ pretrain.trainer.max_steps = max_steps
+
+ # Disable validation batches.
+ pretrain.trainer.limit_val_batches = 0.0
+ pretrain.trainer.val_check_interval = 0
+
+ # Add the Nsys profiling callback if enabled.
+ if profile_enabled:
+ pretrain.trainer.callbacks.append(
+ run.Config(
+ NsysCallback,
+ start_step=profile_start_step,
+ end_step=profile_end_step,
+ ranks=[int(x) for x in profile_ranks.split(",")],
+ gen_shape=False,
+ )
+ )
+
+ # Add the FLOPs measurement callback. 
+ pretrain.trainer.callbacks.append( + run.Config( + FLOPsMeasurementCallback, + model_name="llama31-8b", + model_config=pretrain.model.config, + data_config=pretrain.data, + ) + ) + + # When `performance_mode` is enabled, the Megatron communication overlap + # callback is already added to the recipe. + # https://github.com/NVIDIA-NeMo/NeMo/blob/90a396a567ebb4e8c1c41e454dc00cb71f911317/nemo/collections/llm/recipes/llama31_8b.py#L231 + comm_overlap_callback_idx = get_comm_overlap_callback_idx( + pretrain.trainer.callbacks + ) + pretrain.trainer.callbacks[ + comm_overlap_callback_idx + ].tp_comm_bootstrap_backend = "nccl" + + # Disable checkpointing. + pretrain.log.ckpt = None + pretrain.trainer.enable_checkpointing = False + + # Log every step. + pretrain.trainer.log_every_n_steps = 1 + + # Enable DLLogger + dllogger_config = run.Config( + DLLogger, + verbose=True, + stdout=True, + json_file="dllogger.json", + ) + pretrain.log.extra_loggers = [dllogger_config] + + return pretrain + + +if __name__ == "__main__": + run.cli.main(llm.pretrain, default_factory=recipe) diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/recipe_launch_command.sh b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/recipe_launch_command.sh new file mode 100644 index 00000000..35c3e1b8 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/recipe_launch_command.sh @@ -0,0 +1 @@ +helm install vishwasreddy-ubench-46fe . 
-f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py --set workload.image=nvcr.io/nvidia/nemo:25.11 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/vishwasreddy-ubench-46fe \ No newline at end of file diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-config-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-config-configmap.yaml new file mode 100644 index 00000000..a1d54cee --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-config-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +{{- if .Values.workload.configFile }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-config" +data: + workload-configuration: |- +{{- if .Values.workload_config }} +{{ .Values.workload_config | nindent 4 }} +{{- else }} +{{ "config: null" | nindent 4 }} +{{- end }} +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-job.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-job.yaml new file mode 100644 index 00000000..ae59e456 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-job.yaml @@ -0,0 +1,333 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{$timestamp := now | date "2006-01-02-15-04-05"}} +{{$jobSuffix := randAlphaNum 4 | lower}} +{{$jobuuid := uuidv4}} +{{$nodes := div .Values.workload.gpus 8 | max 1}} +{{$gpusPerNode := min .Values.workload.gpus 8}} +{{- $root := . 
-}} + +apiVersion: jobset.x-k8s.io/v1alpha2 +kind: JobSet +metadata: + name: "{{ .Release.Name }}" + namespace: default + labels: + {{- if $root.Values.queue }} + kueue.x-k8s.io/queue-name: "{{ $root.Values.queue }}" + {{- end }} +spec: + {{- if $root.Values.queue }} + suspend: true + {{- end }} + failurePolicy: + maxRestarts: {{ default 0 $root.Values.workload.max_workload_restarts }} + replicatedJobs: + - name: workload + replicas: 1 + template: + spec: + parallelism: {{ $nodes }} + completions: {{ $nodes }} + backoffLimit: 0 + completionMode: Indexed + activeDeadlineSeconds: 14400 # 4 hours (4 * 60 * 60) + ttlSecondsAfterFinished: 43200 # 12 hours (12 * 60 * 60) + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: workload + {{- if $root.Values.volumes.gcsVolumes }} + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "500m" + gke-gcsfuse/memory-limit: "1Ti" + gke-gcsfuse/ephemeral-storage-limit: "2Ti" + {{- end }} + {{- if $root.Values.volumes.psVolumes }} + gke-parallelstore/volumes: "true" + gke-parallelstore/cpu-limit: "0" + gke-parallelstore/memory-limit: "0" + {{- end }} + {{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }} + {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }} + {{- end }} + {{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }} + provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}" + {{- end }} + {{- if not $root.Values.network.hostNetwork }} + networking.gke.io/default-interface: "eth0" + networking.gke.io/interfaces: | + {{- if $root.Values.network.subnetworks }} + [ + {{- range $i, $subnetwork := $root.Values.network.subnetworks }} + {"interfaceName":"eth{{ $i }}","network":"{{ $subnetwork }}"}{{ eq $i 9 | ternary "" ","}} + {{- end }} + ] + {{- else }} + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth1","network":"gvnic-1"}, + {{- range $i := until 8 }} + 
{"interfaceName":"eth{{ add 2 $i }}","network":"rdma-{{ $i }}"}{{ eq $i 7 | ternary "" ","}} + {{- end }} + ] + {{- end }} + {{- end }} + spec: + {{- if $root.Values.network.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + subdomain: "{{.Release.Name}}" + restartPolicy: Never + {{- if $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "In" + values: + {{- range $hostname := $root.Values.targetNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + {{- if $root.Values.avoidNodes }} + {{- if not $root.Values.targetNodes }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + {{- end }} + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: "NotIn" + values: + {{- range $hostname := $root.Values.avoidNodes }} + - {{ $hostname }} + {{- end }} + {{- end }} + tolerations: + - operator: "Exists" + key: nvidia.com/gpu + - operator: "Exists" + key: cloud.google.com/impending-node-termination + + volumes: + {{ if $root.Values.network.gibVersion }} + - name: gib + emptyDir: {} + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + configMap: + name: "{{.Release.Name}}-config" + items: + - key: workload-configuration + path: {{ $root.Values.workload.configFile | default "workload-configuration" }} + {{- end }} + + - name: workload-launcher + configMap: + name: "{{.Release.Name}}-launcher" + + - name: shared-memory + emptyDir: + medium: "Memory" + sizeLimit: 250Gi + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + persistentVolumeClaim: + claimName: "{{ $pvc.claimName }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: "{{ 
$gcs.bucketName }}" + {{- if $gcs.mountOptions }} + mountOptions: "{{ $gcs.mountOptions }}" + {{- end }} + {{- end}} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + hostPath: + path: /mnt/stateful_partition/kube-ephemeral-ssd + {{- end }} + + initContainers: + {{ if $root.Values.network.gibVersion }} + - name: nccl-plugin-installer + image: {{ $root.Values.network.gibVersion }} + imagePullPolicy: Always + args: + - | + set -ex + /scripts/container_entry.sh install --install-nccl + cp -R /var/lib/gib/lib64/. /target/usr/local/gib/lib64 + cp -R /var/lib/gib/. /target/usr/local/gib + command: + - /bin/sh + - -c + volumeMounts: + - mountPath: /target/usr/local/gib + name: gib + {{ end}} + + containers: + {{- if $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-sidecar + image: {{ $root.Values.workload.gcsSidecarImage }} + - name: gke-gcsfuse-metadata-prefetch + image: {{ $root.Values.workload.gcsSidecarImage }} + {{- end }} + {{- if $root.Values.workload.psSidecarImage }} + - name: gke-parallelstore-sidecar + image: {{ $root.Values.workload.psSidecarImage }} + {{- end }} + + - name: workload + image: "{{ $root.Values.workload.image }}" + imagePullPolicy: Always + {{- if $root.Values.network.hostNetwork }} + securityContext: + privileged: true + {{- end }} + env: + - name: JOB_IDENTIFIER + value: "{{ .Release.Name }}-{{ $timestamp }}" + - name: JOB_TIMESTAMP + value: "{{ $timestamp }}" + - name: JOB_UUID + value: "{{ $jobuuid }}" + - name: JOB_ORCHESTRATOR + value: "gke" + # Add RANK based on the pod's index provided by the Indexed Job + # This is crucial for torch.distributed initialization. 
+ - name: JOB_COMPLETION_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + - name: RANK_0_FQDN + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: HOSTNAME_PREFIX + value: "{{.Release.Name}}-workload-" + - name: DOMAIN_NAME + value: "{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_ADDR + value: "{{.Release.Name}}-workload-0-0.{{.Release.Name}}.default.svc.cluster.local" + - name: MASTER_PORT + value: "6002" + - name: WORLD_SIZE + value: "{{ $root.Values.workload.gpus }}" + - name: NNODES + value: "{{ $nodes }}" + - name: GPUS_PER_NODE + value: "{{ $gpusPerNode }}" + + - name: NCCL_PLUGIN_PATH + value: /usr/local/gib/lib64 + + {{ if $root.Values.network.gibVersion }} + - name: NCCL_INIT_SCRIPT + value: "/usr/local/gib/scripts/set_nccl_env.sh" + {{ end }} + + {{ if $root.Values.network.ncclSettings }} + {{- toYaml .Values.network.ncclSettings | nindent 14 }} + {{ end }} + + {{ if $root.Values.workload.envs }} + {{- toYaml .Values.workload.envs | nindent 14 }} + {{ end }} + + command: + - bash + - -c + - | + echo "Pod on $(hostname --fqdn) is running" + echo "Pod is assigned job index of $JOB_COMPLETION_INDEX" + + if [[ -n "${NCCL_INIT_SCRIPT}" ]]; then + echo "Running NCCL init script: ${NCCL_INIT_SCRIPT}" + source ${NCCL_INIT_SCRIPT} + fi + + # Overriding NCCL_SOCKET_IFNAME definition + export NCCL_SOCKET_IFNAME="eth0,eth1" + export NCCL_TUNER_CONFIG_PATH=/usr/local/gib/configs/tuner_config_a4.txtpb + + echo "Launching workload with the following arguments:" + {{- range $root.Values.workload.defaultArguments }} + echo " {{ . }}" + {{- end }} + {{- range $root.Values.workload.arguments }} + echo " {{ . }}" + {{- end }} + echo "" + + sleep 10 + + bash /workload/launcher/launch-workload.sh \ + {{- range $root.Values.workload.defaultArguments }} + {{ . }} \ + {{- end }} + {{- range $root.Values.workload.arguments }} + {{ . 
}} \ + {{- end }} + + + volumeMounts: + {{ if $root.Values.network.gibVersion }} + - name: gib + mountPath: /usr/local/gib + {{ end }} + + {{- if $root.Values.workload.configFile }} + - name: workload-configuration + mountPath: {{ $root.Values.workload.configPath | default "/workload/configs" }} + {{- end }} + + - name: workload-launcher + mountPath: /workload/launcher + + - name: shared-memory + mountPath: /dev/shm + + {{- range $pvc := $root.Values.volumes.pvcMounts }} + - name: "{{ $pvc.claimName }}" + mountPath: "{{ $pvc.mountPath }}" + {{- end }} + + {{- range $gcs := $root.Values.volumes.gcsMounts }} + - name: "{{ $gcs.bucketName }}" + mountPath: "{{ $gcs.mountPath }}" + {{- end }} + + {{- if $root.Values.volumes.ssdMountPath }} + - name: local-ssd + mountPath: "{{ $root.Values.volumes.ssdMountPath }}" + {{- end }} + + resources: + limits: + nvidia.com/gpu: {{ $gpusPerNode }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-launcher-configmap.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-launcher-configmap.yaml new file mode 100644 index 00000000..7026e0f1 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# yamllint disable +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-svc.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-svc.yaml new file mode 100644 index 00000000..7cfe220b --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/templates/workload-svc.yaml @@ -0,0 +1,22 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + name: "{{ .Release.Name }}" +spec: + clusterIP: None + selector: + jobset.sigs.k8s.io/jobset-name: "{{ .Release.Name }}" diff --git a/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/values.yaml b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/values.yaml new file mode 100644 index 00000000..4b7c1b79 --- /dev/null +++ b/training/a4/llama3-1-8b/nemo-pretraining-gke/8node-bf16-seq8192-gbs2048/values.yaml @@ -0,0 +1,33 @@ +dwsSettings: + maxRunDurationSeconds: null +network: + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1 + hostNetwork: true + ncclSettings: + - name: NCCL_DEBUG + value: WARN + subnetworks[]: null +queue: null +tasSettings: + topologyRequest: + kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname +volumes: + gcsMounts: + - bucketName: null + mountPath: null + gcsVolumes: true + psVolumes: false +workload: + arguments[]: null + configFile: llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py + configPath: /workload/configs/ + defaultArguments[]: null + envs: + - name: ARTIFACT_DIR + value: null + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NEMO_LAUNCH_SCRIPT + value: /workload/configs/llama3-1-8b-bf16-seq8192-gbs1024-gpus8.py + gpus: 64 + image: nvcr.io/nvidia/nemo:25.11