From b6127bf969fddebd9b94e720e7f38cccbf06ff78 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka Date: Fri, 18 Apr 2025 16:23:38 +0000 Subject: [PATCH 1/4] update description of versions for deepcompile --- benchmarks/deepcompile/README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/benchmarks/deepcompile/README.md b/benchmarks/deepcompile/README.md index 16a825199..440f2093b 100644 --- a/benchmarks/deepcompile/README.md +++ b/benchmarks/deepcompile/README.md @@ -10,7 +10,10 @@ We tested the scripts with Python 3.10.12 and CUDA 12.4. In addition, you need to install the following: - PyTorch v2.6.0 -- DeepSpeed (v0.16.6 or newer) + - For the Mixtral model, we recommend using PyTorch v2.7.0 (currently the final release candidate) due to an issue with `torch.where`. + - See details [here](https://github.com/pytorch/pytorch/issues/149278) +- DeepSpeed (v0.16.6 or newer) + - As DeepCompile is under active development, we recommend using the latest version or installing from source. - transformers - accelerate - datasets v3.1 @@ -24,6 +27,12 @@ pip3 install transformers datasets==3.1 accelerate # Install DeepSpeed pip install deepspeed +# Or install the latest revision of DeepSpeed from source +# git clone https://github.com/deepspeedai/DeepSpeed +# cd DeepSpeed +# pip install . +# cd .. 
+ # Clone this repository git clone https://github.com/deepspeedai/DeepSpeedExamples cd benchmarks/deepcompile From 0e0f3092ef55c3afecbb5f80c66db4388241e6ec Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka Date: Sun, 20 Apr 2025 03:08:20 +0000 Subject: [PATCH 2/4] fix deepcompile benchmark script Signed-off-by: Masahiro Tanaka --- benchmarks/deepcompile/run.sh | 5 ++--- benchmarks/deepcompile/run_bench.sh | 2 ++ benchmarks/deepcompile/run_bench_acc.sh | 2 ++ benchmarks/deepcompile/run_bench_lm.py | 8 +------- benchmarks/deepcompile/run_multinode.sh | 16 ++++++++++++++-- 5 files changed, 21 insertions(+), 12 deletions(-) diff --git a/benchmarks/deepcompile/run.sh b/benchmarks/deepcompile/run.sh index 57da03193..78b289123 100644 --- a/benchmarks/deepcompile/run.sh +++ b/benchmarks/deepcompile/run.sh @@ -1,7 +1,6 @@ #!/bin/bash - -NUM_NODES=${NUM_NODES:-$(wc -l < /job/hostfile)} +NUM_NODES=${NUM_NODES:-1} NGPUS_PER_NODE=${NGPUS_PER_NODE:-$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)} NUM_PROCESSES=$((${NUM_NODES} * ${NGPUS_PER_NODE})) @@ -217,7 +216,7 @@ echo "Logging to ${LOG_FILE}" ${HOME}/.local/bin/accelerate launch --main_process_ip ${HOST_IP} --main_process_port 12345 \ --num_machines ${NUM_NODES} --num_processes ${NUM_PROCESSES} --machine_rank ${MACHINE_RANK} \ --config_file configs/config.yaml \ -run_acc_lm.py \ +run_bench_lm.py \ --model_name "${MODEL}" \ --zero_stage ${ZERO_STAGE} \ ${GAS_OPTS} \ diff --git a/benchmarks/deepcompile/run_bench.sh b/benchmarks/deepcompile/run_bench.sh index 174e34951..78c5df473 100644 --- a/benchmarks/deepcompile/run_bench.sh +++ b/benchmarks/deepcompile/run_bench.sh @@ -6,6 +6,8 @@ DC_OPTS="--compile --deepcompile" ACC_OPTS="--gradient-accumulation-steps 1" AC_OPTS="--activation-checkpointing" +export NUM_NODES=${NUM_NODES:-4} + MODEL="meta-llama/Meta-Llama-3-70B-Instruct" BATCH_SIZE_OPTS=(1 2 4) SEQ_LENGTH_OPTS=(512 1024 2048) diff --git a/benchmarks/deepcompile/run_bench_acc.sh 
b/benchmarks/deepcompile/run_bench_acc.sh index a3b66844d..7c4e81815 100644 --- a/benchmarks/deepcompile/run_bench_acc.sh +++ b/benchmarks/deepcompile/run_bench_acc.sh @@ -5,6 +5,8 @@ COMPILE_OPTS="--compile" N3Z_OPTS="--compile --deepcompile" AC_OPTS="--activation-checkpointing" +export NUM_NODES=${NUM_NODES:-4} + MODEL="meta-llama/Meta-Llama-3-70B-Instruct" BATCH_SIZE_OPTS=(1) SEQ_LENGTH_OPTS=(1024) diff --git a/benchmarks/deepcompile/run_bench_lm.py b/benchmarks/deepcompile/run_bench_lm.py index f175d84d7..bb96fa68b 100644 --- a/benchmarks/deepcompile/run_bench_lm.py +++ b/benchmarks/deepcompile/run_bench_lm.py @@ -15,8 +15,6 @@ from datasets.utils.logging import disable_progress_bar -from patch_phi3_moe import patch_phi3moe - def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--model_name", type=str, default="meta-llama/Llama-2-7b-hf") @@ -98,16 +96,13 @@ def main(): model = AutoModelForCausalLM.from_pretrained(model_weight_path, trust_remote_code=True) else: if args.num_layers > 0: - model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + model_config = AutoConfig.from_pretrained(model_name, attn_implementation=args.attn_impl, trust_remote_code=True) print(f"num_hidden_layers: {model_config.num_hidden_layers} -> {args.num_layers}") model_config.num_hidden_layers = args.num_layers model = AutoModelForCausalLM.from_config(model_config, trust_remote_code=True) else: model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True) - if patch_phi3moe(model) and accelerator.is_main_process: - print("Patched Phi-3.5-MoE model") - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) if args.save_weights and accelerator.is_main_process: @@ -149,7 +144,6 @@ def tokenize_function(examples): torch._dynamo.config.capture_dynamic_output_shape_ops = True torch._dynamo.config.capture_scalar_outputs = True - if is_deepspeed: if args.compile: schedule = make_schedule(args.passes.split(","), 
warmup=5) if args.passes else None diff --git a/benchmarks/deepcompile/run_multinode.sh b/benchmarks/deepcompile/run_multinode.sh index 6f3feba9a..92d30839d 100644 --- a/benchmarks/deepcompile/run_multinode.sh +++ b/benchmarks/deepcompile/run_multinode.sh @@ -4,11 +4,23 @@ echo $* SCRIPT_DIR=$(dirname $(realpath $0)) HOST_IP=$(hostname -i) -NUM_NODES=${NUM_NODES:-$(wc -l < /job/hostfile)} +NUM_NODES=${NUM_NODES:-1} + +# verify that NUM_NODES is a positive integer +if ! [[ "$NUM_NODES" =~ ^[1-9][0-9]*$ ]]; then + echo "Error: NUM_NODES must be a positive integer" + exit 1 +fi + +# check if NUM_NODES ==1 or hostfile_n${NUM_NODES} exists +if [ ! -f hostfile_n${NUM_NODES} ] && [ "${NUM_NODES}" != "1" ]; then + echo "Error: hostfile_n${NUM_NODES} does not exist" + exit 1 +fi if [ "${NUM_NODES}" == "1" ]; then # avoid dependency on pdsh when possible cd ${SCRIPT_DIR}; bash ./run.sh --host-ip ${HOST_IP} $* else - ds_ssh -f hostfile_n${NUM_NODES} "cd ${SCRIPT_DIR}; bash ./run.sh --host-ip ${HOST_IP} $*" + ds_ssh -f hostfile_n${NUM_NODES} "cd ${SCRIPT_DIR}; NUM_NODES=${NUM_NODES} bash ./run.sh --host-ip ${HOST_IP} $*" fi From 9d881c1f790893db62fd806863ca069728389e02 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka Date: Sun, 20 Apr 2025 03:31:53 +0000 Subject: [PATCH 3/4] fix benchmark for z1 Signed-off-by: Masahiro Tanaka --- benchmarks/deepcompile/run_bench_lm.py | 2 +- benchmarks/deepcompile/run_bench_z1.sh | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/deepcompile/run_bench_lm.py b/benchmarks/deepcompile/run_bench_lm.py index bb96fa68b..9d669afa1 100644 --- a/benchmarks/deepcompile/run_bench_lm.py +++ b/benchmarks/deepcompile/run_bench_lm.py @@ -72,7 +72,7 @@ def main(): args = get_args() print(args) - if "offload_adam_states" in args.passes: + if args.passes is not None and "offload_adam_states" in args.passes: os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128' if args.deterministic: diff --git 
a/benchmarks/deepcompile/run_bench_z1.sh b/benchmarks/deepcompile/run_bench_z1.sh index b5491e3fc..ba61cd5e1 100644 --- a/benchmarks/deepcompile/run_bench_z1.sh +++ b/benchmarks/deepcompile/run_bench_z1.sh @@ -6,6 +6,8 @@ DC_OPTS="--compile --deepcompile" ACC_OPTS="--gradient-accumulation-steps 1" AC_OPTS="--activation-checkpointing" +export NUM_NODES=${NUM_NODES:-4} + MODEL="meta-llama/Meta-Llama-3-8B-Instruct" BATCH_SIZE_OPTS=(1 2 4) SEQ_LENGTH_OPTS=(512 1024 2048) From ddf0d506e08c18113efdae015580f308346023ee Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka Date: Sun, 20 Apr 2025 03:57:02 +0000 Subject: [PATCH 4/4] add options for deepcompile bench Signed-off-by: Masahiro Tanaka --- benchmarks/deepcompile/run.sh | 8 ++++++++ benchmarks/deepcompile/run_bench_lm.py | 10 +++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/benchmarks/deepcompile/run.sh b/benchmarks/deepcompile/run.sh index 78b289123..f5e8662d5 100644 --- a/benchmarks/deepcompile/run.sh +++ b/benchmarks/deepcompile/run.sh @@ -94,6 +94,14 @@ while [[ $# -gt 0 ]]; do EXTRA_OPTS="${EXTRA_OPTS} --num_layers $2" shift 2 ;; + --attn-impl) + EXTRA_OPTS="${EXTRA_OPTS} --attn_impl $2" + shift 2 + ;; + --eval) + EXTRA_OPTS="${EXTRA_OPTS} --eval" + shift + ;; --debug-log) DEBUG_LOG=1 shift diff --git a/benchmarks/deepcompile/run_bench_lm.py b/benchmarks/deepcompile/run_bench_lm.py index 9d669afa1..567fd2715 100644 --- a/benchmarks/deepcompile/run_bench_lm.py +++ b/benchmarks/deepcompile/run_bench_lm.py @@ -25,6 +25,7 @@ def get_args(): parser.add_argument("--max_grad_norm", type=float, default=1.0) parser.add_argument("--gradient_accumulation_steps", type=int, default=1) parser.add_argument("--activation_checkpointing", action="store_true") + parser.add_argument("--eval", action="store_true") parser.add_argument("--dataset_name", type=str, default="timdettmers/openassistant-guanaco") parser.add_argument("--num_layers", type=int, default=0) parser.add_argument("--attn_impl", type=str, 
default="spda") @@ -179,10 +180,13 @@ def tokenize_function(examples): on_trace_ready=torch.profiler.tensorboard_trace_handler(prof_dir), ) if do_profile else nullcontext() - # Training loop - model.train() - global_step = 0 + # Training + if args.eval: + model.eval() + else: + model.train() + global_step = 0 iter_times = [] # See https://github.com/microsoft/DeepSpeed/issues/6793