diff --git a/benchmarks/deepcompile/run.sh b/benchmarks/deepcompile/run.sh
index 57da03193..f5e8662d5 100644
--- a/benchmarks/deepcompile/run.sh
+++ b/benchmarks/deepcompile/run.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 
-
-NUM_NODES=${NUM_NODES:-$(wc -l < /job/hostfile)}
+NUM_NODES=${NUM_NODES:-1}
 NGPUS_PER_NODE=${NGPUS_PER_NODE:-$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)}
 NUM_PROCESSES=$((${NUM_NODES} * ${NGPUS_PER_NODE}))
 
@@ -95,6 +94,14 @@ while [[ $# -gt 0 ]]; do
             EXTRA_OPTS="${EXTRA_OPTS} --num_layers $2"
             shift 2
             ;;
+        --attn-impl)
+            EXTRA_OPTS="${EXTRA_OPTS} --attn_impl $2"
+            shift 2
+            ;;
+        --eval)
+            EXTRA_OPTS="${EXTRA_OPTS} --eval"
+            shift
+            ;;
         --debug-log)
             DEBUG_LOG=1
             shift
@@ -217,7 +224,7 @@ echo "Logging to ${LOG_FILE}"
 ${HOME}/.local/bin/accelerate launch --main_process_ip ${HOST_IP} --main_process_port 12345 \
 --num_machines ${NUM_NODES} --num_processes ${NUM_PROCESSES} --machine_rank ${MACHINE_RANK} \
 --config_file configs/config.yaml \
-run_acc_lm.py \
+run_bench_lm.py \
 --model_name "${MODEL}" \
 --zero_stage ${ZERO_STAGE} \
 ${GAS_OPTS} \
diff --git a/benchmarks/deepcompile/run_bench.sh b/benchmarks/deepcompile/run_bench.sh
index 174e34951..78c5df473 100644
--- a/benchmarks/deepcompile/run_bench.sh
+++ b/benchmarks/deepcompile/run_bench.sh
@@ -6,6 +6,8 @@ DC_OPTS="--compile --deepcompile"
 ACC_OPTS="--gradient-accumulation-steps 1"
 AC_OPTS="--activation-checkpointing"
 
+export NUM_NODES=${NUM_NODES:-4}
+
 MODEL="meta-llama/Meta-Llama-3-70B-Instruct"
 BATCH_SIZE_OPTS=(1 2 4)
 SEQ_LENGTH_OPTS=(512 1024 2048)
diff --git a/benchmarks/deepcompile/run_bench_acc.sh b/benchmarks/deepcompile/run_bench_acc.sh
index a3b66844d..7c4e81815 100644
--- a/benchmarks/deepcompile/run_bench_acc.sh
+++ b/benchmarks/deepcompile/run_bench_acc.sh
@@ -5,6 +5,8 @@ COMPILE_OPTS="--compile"
 N3Z_OPTS="--compile --deepcompile"
 AC_OPTS="--activation-checkpointing"
 
+export NUM_NODES=${NUM_NODES:-4}
+
 MODEL="meta-llama/Meta-Llama-3-70B-Instruct"
 BATCH_SIZE_OPTS=(1)
 SEQ_LENGTH_OPTS=(1024)
diff --git a/benchmarks/deepcompile/run_bench_lm.py b/benchmarks/deepcompile/run_bench_lm.py
index f175d84d7..567fd2715 100644
--- a/benchmarks/deepcompile/run_bench_lm.py
+++ b/benchmarks/deepcompile/run_bench_lm.py
@@ -15,8 +15,6 @@
 
 from datasets.utils.logging import disable_progress_bar
 
-from patch_phi3_moe import patch_phi3moe
-
 def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model_name", type=str, default="meta-llama/Llama-2-7b-hf")
@@ -27,6 +25,7 @@ def get_args():
     parser.add_argument("--max_grad_norm", type=float, default=1.0)
     parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
     parser.add_argument("--activation_checkpointing", action="store_true")
+    parser.add_argument("--eval", action="store_true")
     parser.add_argument("--dataset_name", type=str, default="timdettmers/openassistant-guanaco")
     parser.add_argument("--num_layers", type=int, default=0)
     parser.add_argument("--attn_impl", type=str, default="spda")
@@ -74,7 +73,7 @@ def main():
     args = get_args()
     print(args)
 
-    if "offload_adam_states" in args.passes:
+    if args.passes is not None and "offload_adam_states" in args.passes:
         os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
 
     if args.deterministic:
@@ -98,16 +97,13 @@ def main():
         model = AutoModelForCausalLM.from_pretrained(model_weight_path, trust_remote_code=True)
     else:
         if args.num_layers > 0:
-            model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+            model_config = AutoConfig.from_pretrained(model_name, attn_implementation=args.attn_impl, trust_remote_code=True)
             print(f"num_hidden_layers: {model_config.num_hidden_layers} -> {args.num_layers}")
             model_config.num_hidden_layers = args.num_layers
             model = AutoModelForCausalLM.from_config(model_config, trust_remote_code=True)
         else:
             model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
 
-    if patch_phi3moe(model) and accelerator.is_main_process:
-        print("Patched Phi-3.5-MoE model")
-
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
     if args.save_weights and accelerator.is_main_process:
@@ -149,7 +145,6 @@ def tokenize_function(examples):
         torch._dynamo.config.capture_dynamic_output_shape_ops = True
         torch._dynamo.config.capture_scalar_outputs = True
 
-
     if is_deepspeed:
         if args.compile:
             schedule = make_schedule(args.passes.split(","), warmup=5) if args.passes else None
@@ -185,10 +180,13 @@ def tokenize_function(examples):
         on_trace_ready=torch.profiler.tensorboard_trace_handler(prof_dir),
     ) if do_profile else nullcontext()
 
-    # Training loop
-    model.train()
-    global_step = 0
+    # Select eval or training mode
+    if args.eval:
+        model.eval()
+    else:
+        model.train()
 
+    global_step = 0
     iter_times = []
 
     # See https://github.com/microsoft/DeepSpeed/issues/6793
diff --git a/benchmarks/deepcompile/run_bench_z1.sh b/benchmarks/deepcompile/run_bench_z1.sh
index b5491e3fc..ba61cd5e1 100644
--- a/benchmarks/deepcompile/run_bench_z1.sh
+++ b/benchmarks/deepcompile/run_bench_z1.sh
@@ -6,6 +6,8 @@ DC_OPTS="--compile --deepcompile"
 ACC_OPTS="--gradient-accumulation-steps 1"
 AC_OPTS="--activation-checkpointing"
 
+export NUM_NODES=${NUM_NODES:-4}
+
 MODEL="meta-llama/Meta-Llama-3-8B-Instruct"
 BATCH_SIZE_OPTS=(1 2 4)
 SEQ_LENGTH_OPTS=(512 1024 2048)
diff --git a/benchmarks/deepcompile/run_multinode.sh b/benchmarks/deepcompile/run_multinode.sh
index 6f3feba9a..92d30839d 100644
--- a/benchmarks/deepcompile/run_multinode.sh
+++ b/benchmarks/deepcompile/run_multinode.sh
@@ -4,11 +4,23 @@ echo $*
 
 SCRIPT_DIR=$(dirname $(realpath $0))
 HOST_IP=$(hostname -i)
-NUM_NODES=${NUM_NODES:-$(wc -l < /job/hostfile)}
+NUM_NODES=${NUM_NODES:-1}
+
+# verify that NUM_NODES is a positive integer
+if ! [[ "$NUM_NODES" =~ ^[1-9][0-9]*$ ]]; then
+    echo "Error: NUM_NODES must be a positive integer"
+    exit 1
+fi
+
+# multi-node runs (NUM_NODES != 1) require hostfile_n${NUM_NODES} to exist
+if [ ! -f hostfile_n${NUM_NODES} ] && [ "${NUM_NODES}" != "1" ]; then
+    echo "Error: hostfile_n${NUM_NODES} does not exist"
+    exit 1
+fi
 
 if [ "${NUM_NODES}" == "1" ]; then
     # avoid dependency on pdsh when possible
     cd ${SCRIPT_DIR}; bash ./run.sh --host-ip ${HOST_IP} $*
 else
-    ds_ssh -f hostfile_n${NUM_NODES} "cd ${SCRIPT_DIR}; bash ./run.sh --host-ip ${HOST_IP} $*"
+    ds_ssh -f hostfile_n${NUM_NODES} "cd ${SCRIPT_DIR}; NUM_NODES=${NUM_NODES} bash ./run.sh --host-ip ${HOST_IP} $*"
 fi