Commit 415897e

Merge pull request #138 from AI-Hypercomputer/rishabh/qwen
add qwen on slurm
2 parents 14184a9 + bd14735 commit 415897e

4 files changed

Lines changed: 301 additions & 1 deletion

File tree

training/a3ultra/llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024/recipe/README.md
training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/README.md
training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/launch_script.sh
training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/sbatch_script.sh

training/a3ultra/llama3-1-70b/megatron-bridge-pretraining-slurm/4node-FP8CS-GBS1024/recipe/README.md

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ Set the environment variables to match your environment:
 export PROJECT_ID=<PROJECT_ID>
 export CLUSTER_REGION=<CLUSTER_REGION>
 export CLUSTER_NAME=<CLUSTER_NAME>
-gcloud compute ssh $CLUSTER_NAME --project supercomputer-testing --zone $CLUSTER_REGION -- -o Hostname=nic0.$CLUSTER_NAME.$CLUSTER_REGION.c.$PROJECT_ID.internal.gcpnode.com
+gcloud compute ssh $CLUSTER_NAME --project <project-name> --zone $CLUSTER_REGION -- -o Hostname=nic0.$CLUSTER_NAME.$CLUSTER_REGION.c.$PROJECT_ID.internal.gcpnode.com
 
 ```
 
training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/README.md

Lines changed: 100 additions & 0 deletions
<!-- mdformat global-off -->
# Pretrain Qwen 3 235B workloads on A4 Slurm Cluster with NVIDIA Megatron-Bridge

This recipe outlines the steps for running a Qwen 3 235B pretraining workload on [Google Cloud A4 Slurm clusters](https://docs.cloud.google.com/ai-hypercomputer/docs/create/create-slurm-cluster) by using [NVIDIA Megatron-Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge).

## Orchestration and deployment tools

For this recipe, the following setup is used:

- Orchestration - [Slurm Workload Manager](https://slurm.schedmd.com/)
- Deployment - [Cluster Toolkit](https://cloud.google.com/cluster-toolkit/docs/overview)

## Test environment

This recipe has been optimized for and tested with the following configuration:

- A4 Slurm cluster (16 nodes, 128 GPUs)
- Machine type: `a4-highgpu-8g`
- Lustre filesystem

Follow the instructions in the [Cluster Toolkit A4 example README](https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/machine-learning/a4-highgpu-8g) to provision an A4 Slurm cluster.

## Docker container image

This recipe uses the following container image:

- `nvcr.io/nvidia/nemo:25.11`

## Run the recipe
### Configure environment settings

Set the environment variables to match your environment:

```bash
export PROJECT_ID=<PROJECT_ID>
export CLUSTER_REGION=<CLUSTER_REGION>
export CLUSTER_NAME=<CLUSTER_NAME>
gcloud compute ssh $CLUSTER_NAME --project <project-name> --zone $CLUSTER_REGION -- -o Hostname=nic0.$CLUSTER_NAME.$CLUSTER_REGION.c.$PROJECT_ID.internal.gcpnode.com
```

Replace the following values:

- `<PROJECT_ID>`: your Google Cloud project ID.
- `<CLUSTER_REGION>`: the region where your cluster is located.
- `<CLUSTER_NAME>`: the name of your Slurm cluster.
- `<project-name>`: the name of your Google Cloud project.

Set the default project:

```bash
gcloud config set project $PROJECT_ID
```
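
To confirm the active project, you can optionally read the setting back:

```bash
gcloud config get-value project
```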

From your cluster login node, complete the following steps:

### Get the recipe

Clone the `gpu-recipes` repository and set a reference to the recipe folder.

```bash
git clone https://github.com/ai-hypercomputer/gpu-recipes.git
cd gpu-recipes
export REPO_ROOT=`git rev-parse --show-toplevel`
export RECIPE_ROOT=$REPO_ROOT/training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe
cd $RECIPE_ROOT
```

### Submit a pretraining job

```bash
# Set your HF_TOKEN inside launch_script.sh.
export HF_TOKEN="YOUR_HF_TOKEN"  # Replace with your Hugging Face token.

cd ..
sbatch ./recipe/sbatch_script.sh
```
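
If submission succeeds, `sbatch` replies with the ID of the new job; keep it for the monitoring and cancellation commands below:

```
Submitted batch job <JOB_ID>
```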

### Monitor the job

To check the status of your job, run the following command:

```bash
squeue --me
```

To get the logs for the job, run the following command:

```bash
tail -f slurm-<JOB_ID>.out
```
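
After the job leaves the queue, `sacct` can summarize its outcome; a minimal sketch, with `<JOB_ID>` being the ID printed by `sbatch`:

```bash
# Show the job's final state, exit code, and elapsed time.
sacct -j <JOB_ID> --format=JobID,JobName%30,State,ExitCode,Elapsed
```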

### Cancel the job

```bash
scancel -u $USER
```
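
Note that `scancel -u $USER` cancels all of your queued and running jobs. To stop only the pretraining job, pass its job ID instead:

```bash
scancel <JOB_ID>
```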

training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/launch_script.sh

Lines changed: 148 additions & 0 deletions
#!/bin/bash

usage()
{
cat << EOF
usage: bash ./launch_script.sh [config-override [config-override ...]]
       config-override  (Optional) A NeMo configuration override. E.g. trainer.max_steps=10000.
EOF
}

parse_args() {
  while [[ "$1" != "" ]]; do
    case $(grep -o "=" <<< "$1" | wc -l) in
      1 )
        config_overrides+=("$1")
        ;;
      * )
        echo "Invalid config override: $1"
        usage
        exit 1
    esac
    shift
  done
  config_overrides="${config_overrides[*]}"
}

config_overrides=()
parse_args "$@"

if [[ -z "${config_overrides[*]}" ]]; then
  echo "No NeMo config overrides specified"
else
  echo "NeMo config overrides:"
  echo "  ${config_overrides}"
fi
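
# Example invocation with overrides (each override must contain exactly one "="):
#   bash ./launch_script.sh trainer.max_steps=10000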

# Make the CUDA compat libraries and the NCCL plugin visible to the dynamic linker.
export LD_LIBRARY_PATH="/usr/local/cuda/compat/lib:$NCCL_PLUGIN_PATH:$LD_LIBRARY_PATH"
ldconfig "$LD_LIBRARY_PATH"
echo "Added $LD_LIBRARY_PATH to ldconfig:"
ldconfig -p | grep libcuda | sed 's/^/  /'
echo ""

if [[ -n "${EXPLICIT_LOG_DIR}" ]]; then
  explicit_log_dir=${EXPLICIT_LOG_DIR}
else
  # Default to an absolute path so later `cd`s do not change where logs land.
  explicit_log_dir=$(pwd)/workload_logs
fi
echo "Logging to ${explicit_log_dir}"

if [[ -n "${TOKENIZER_PATH}" ]]; then
  echo "Getting tokenizer files"
  cp "${TOKENIZER_PATH}"/* .
  echo ""
fi

echo "Launching Torch distributed on node rank $JOB_COMPLETION_INDEX out of $NNODES nodes"

pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger

# Create the log output directories.
mkdir -p "${explicit_log_dir}/nsys" "${explicit_log_dir}/${JOB_IDENTIFIER}"

# Collect version diagnostics into a single JSON line.
kv="\"kernel_version\": \"$(uname --kernel-release)\""
if command -v nvidia-smi &> /dev/null; then
  cuda_v=$(nvidia-smi -q -x | grep -Po '(?<=<cuda_version>).*(?=</cuda_version>)' || true)
  driver_v=$(nvidia-smi -q -x | grep -Po '(?<=<driver_version>).*(?=</driver_version>)' || true)
  vbios_v=$(nvidia-smi -q -x | grep -Po '(?<=<vbios_version>).*(?=</vbios_version>)' | head -n1 || true)
  kv="${kv}, \"cuda_version\": \"${cuda_v}\""
  kv="${kv}, \"driver_version\": \"${driver_v}\""
  kv="${kv}, \"vbios_version\": \"${vbios_v}\""
fi
echo "VERSION_DIAGNOSTICS: {${kv}}"

export HF_TOKEN="<HF_TOKEN>"  # Replace with your Hugging Face token.

# Pin Megatron-Bridge to a known-good commit.
cd /opt
rm -rf Megatron-Bridge
git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
cd Megatron-Bridge
git checkout 7695d4acbfac19353d20e456509117efe4733d6b
ls
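
# The per-rank command below is what each torchrun worker executes: numactl
# binds each local rank's CPU and memory to its NUMA node (ranks 0-3 to node 0,
# ranks 4-7 to node 1), nsys wraps training in a profiling session that only
# records between cudaProfilerStart/Stop (capture-range=cudaProfilerApi), and
# the Megatron-Bridge performance script runs the Qwen3 235B recipe.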
worker_command=$(cat <<- EOM
if [ "\$RANK" -eq "0" ]; then
  echo "Worker 0 is stalling for a few seconds..." ;
  sleep 3 ;
  echo "The detected environment within worker rank 0 is:" ;
  env | sed 's/^/  /' ;
fi ;

cd /opt/Megatron-Bridge ;

numactl \
  --cpunodebind=\$((LOCAL_RANK/4)) \
  --membind=\$((LOCAL_RANK/4)) nsys profile \
  -t nvtx,cuda \
  --cuda-event-trace=false \
  --sample=none \
  --capture-range=cudaProfilerApi \
  --capture-range-end=stop \
  --kill none \
  -o "${explicit_log_dir}/$JOB_IDENTIFIER/rank-\$RANK" \
  --force-overwrite true \
  --session-new "nsys-\$RANDOM-\$RANK" \
  nice -10 \
  python scripts/performance/run_script.py \
    --gpu b200 \
    --model_family_name qwen \
    --model_recipe_name qwen3_235b_a22b \
    --gpus_per_node 8 \
    --num_gpus 128 \
    --seq_length 4096 \
    --compute_dtype bf16 \
    --global_batch_size 4096 \
    --tensor_model_parallel_size 1 \
    --pipeline_model_parallel_size 8 \
    --virtual_pipeline_model_parallel_size 4 \
    --expert_model_parallel_size 8 \
    --expert_tensor_parallel_size 1 \
    --moe_a2a_overlap True \
    --max_steps 30
EOM
)

echo "$worker_command" > worker_command.sh
chmod 777 worker_command.sh
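
# One torchrun launcher per node: 8 local workers rendezvous with their peers
# at MASTER_ADDR:MASTER_PORT; the node rank comes from JOB_COMPLETION_INDEX,
# which the sbatch wrapper derives from SLURM_NODEID.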
torchrun \
  --nproc-per-node="8" \
  --nnodes="16" \
  --node_rank="${JOB_COMPLETION_INDEX}" \
  --rdzv_id="${JOB_IDENTIFIER}" \
  --master_addr="${MASTER_ADDR}" \
  --master_port="${MASTER_PORT}" \
  --no-python bash worker_command.sh

# Node 0 copies the logs and an environment snapshot to the artifact directory.
if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
  mkdir -p "${ARTIFACT_DIR}"
  cp -r "${explicit_log_dir}"/* "${ARTIFACT_DIR}/"
  env > "${ARTIFACT_DIR}/environ.txt"
  ls "${ARTIFACT_DIR}"
fi
echo "Training completed"
echo "Node $(hostname --fqdn) is exiting"
training/a4/qwen3-235b-a22b/megatron-bridge-pretraining-slurm/16node-BF16-GBS4096/recipe/sbatch_script.sh

Lines changed: 52 additions & 0 deletions
#!/bin/bash
#SBATCH --job-name=qwen3_235b_bf16_b200_128gpus-8jje
#SBATCH --nodes=16
#SBATCH --ntasks-per-node=1
#SBATCH --gres=gpu:8
#SBATCH --mem=0

# Exit early on failures.
set -e

# Validate that the recipe location is set up correctly.
# The recipe is expected to be in the "recipe" folder inside the current working directory.
RECIPE_DIR="$(pwd)/recipe"
LAUNCH_SCRIPT="${RECIPE_DIR}/launch_script.sh"
if [[ ! -f "${LAUNCH_SCRIPT}" ]]; then
  echo "Error: recipe is not located correctly. It is expected to be in the 'recipe' folder inside the current working directory; the launch script was not found there." >&2
  exit 1
fi
chmod +x "${LAUNCH_SCRIPT}"

# Import the container image with enroot if it has not been imported already.
export ENROOT_CONFIG_PATH=${HOME}/.config/enroot
ORIG_IMAGE=nvcr.io#nvidia/nemo:25.11
SQSH_IMAGE_PATH=${RECIPE_DIR}/sqsh/nvcr.io_nvidia_nemo:25.11
if [[ ! -f "${SQSH_IMAGE_PATH}" ]]; then
  mkdir -p "$(dirname "${SQSH_IMAGE_PATH}")"
  echo "Importing $ORIG_IMAGE to ${SQSH_IMAGE_PATH}"
  enroot import --output "${SQSH_IMAGE_PATH}" -- "docker://${ORIG_IMAGE}"
fi

# Use the first node in the allocation as the rendezvous master.
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
master_port=29500

ARTIFACT_DIR_HOME="/home/$USER/job_artifacts/${SLURM_JOB_ID}"
mkdir -p "$ARTIFACT_DIR_HOME"

export NNODES=$SLURM_NNODES
export MASTER_ADDR=$master_addr
export MASTER_PORT=$master_port
export ARTIFACT_DIR=/artifacts
export JOB_NAME=qwen3_235b_bf16_b200_128gpus-8jje
export JOB_IDENTIFIER=qwen3_235b_bf16_b200_128gpus-8jje
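
# Launch one task per node inside the container (Slurm with the pyxis/enroot
# plugin provides the --container-* flags). SLURM_NODEID is re-exported as
# JOB_COMPLETION_INDEX so launch_script.sh can compute its torchrun node rank.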
srun --container-image="$SQSH_IMAGE_PATH" \
  --container-mounts="${RECIPE_DIR}:/recipe:mkdir,${ARTIFACT_DIR_HOME}:${ARTIFACT_DIR}:mkdir" \
  --container-workdir=/recipe \
  --container-writable \
  bash -c 'export JOB_COMPLETION_INDEX=$SLURM_NODEID; ./launch_script.sh'