From b6127bf969fddebd9b94e720e7f38cccbf06ff78 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka Date: Fri, 18 Apr 2025 16:23:38 +0000 Subject: [PATCH 1/4] update description of versions for deepcompile --- benchmarks/deepcompile/README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/benchmarks/deepcompile/README.md b/benchmarks/deepcompile/README.md index 16a825199..440f2093b 100644 --- a/benchmarks/deepcompile/README.md +++ b/benchmarks/deepcompile/README.md @@ -10,7 +10,10 @@ We tested the scripts with Python 3.10.12 and CUDA 12.4. In addition, you need to install the following: - PyTorch v2.6.0 -- DeepSpeed (v0.16.6 or newer) + - For the Mixtral model, we recommend using PyTorch v2.7.0 (currently the final release candidate) due to an issue with `torch.where`. + - See details [here](https://github.com/pytorch/pytorch/issues/149278) +- DeepSpeed (v0.16.6 or newer) + - As DeepCompile is under active development, we recommend using the latest version or installing from source. - transformers - accelerate - datasets v3.1 @@ -24,6 +27,12 @@ pip3 install transformers datasets==3.1 accelerate # Install DeepSpeed pip install deepspeed +# Or install the latest revision of DeepSpeed from source +# git clone https://github.com/deepspeedai/DeepSpeed +# cd DeepSpeed +# pip install . +# cd .. 
+ # Clone this repository git clone https://github.com/deepspeedai/DeepSpeedExamples cd benchmarks/deepcompile From 0e0f3092ef55c3afecbb5f80c66db4388241e6ec Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka Date: Sun, 20 Apr 2025 03:08:20 +0000 Subject: [PATCH 2/4] fix deepcompile benchmark script Signed-off-by: Masahiro Tanaka --- benchmarks/deepcompile/run.sh | 5 ++--- benchmarks/deepcompile/run_bench.sh | 2 ++ benchmarks/deepcompile/run_bench_acc.sh | 2 ++ benchmarks/deepcompile/run_bench_lm.py | 8 +------- benchmarks/deepcompile/run_multinode.sh | 16 ++++++++++++++-- 5 files changed, 21 insertions(+), 12 deletions(-) diff --git a/benchmarks/deepcompile/run.sh b/benchmarks/deepcompile/run.sh index 57da03193..78b289123 100644 --- a/benchmarks/deepcompile/run.sh +++ b/benchmarks/deepcompile/run.sh @@ -1,7 +1,6 @@ #!/bin/bash - -NUM_NODES=${NUM_NODES:-$(wc -l < /job/hostfile)} +NUM_NODES=${NUM_NODES:-1} NGPUS_PER_NODE=${NGPUS_PER_NODE:-$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)} NUM_PROCESSES=$((${NUM_NODES} * ${NGPUS_PER_NODE})) @@ -217,7 +216,7 @@ echo "Logging to ${LOG_FILE}" ${HOME}/.local/bin/accelerate launch --main_process_ip ${HOST_IP} --main_process_port 12345 \ --num_machines ${NUM_NODES} --num_processes ${NUM_PROCESSES} --machine_rank ${MACHINE_RANK} \ --config_file configs/config.yaml \ -run_acc_lm.py \ +run_bench_lm.py \ --model_name "${MODEL}" \ --zero_stage ${ZERO_STAGE} \ ${GAS_OPTS} \ diff --git a/benchmarks/deepcompile/run_bench.sh b/benchmarks/deepcompile/run_bench.sh index 174e34951..78c5df473 100644 --- a/benchmarks/deepcompile/run_bench.sh +++ b/benchmarks/deepcompile/run_bench.sh @@ -6,6 +6,8 @@ DC_OPTS="--compile --deepcompile" ACC_OPTS="--gradient-accumulation-steps 1" AC_OPTS="--activation-checkpointing" +export NUM_NODES=${NUM_NODES:-4} + MODEL="meta-llama/Meta-Llama-3-70B-Instruct" BATCH_SIZE_OPTS=(1 2 4) SEQ_LENGTH_OPTS=(512 1024 2048) diff --git a/benchmarks/deepcompile/run_bench_acc.sh 
b/benchmarks/deepcompile/run_bench_acc.sh index a3b66844d..7c4e81815 100644 --- a/benchmarks/deepcompile/run_bench_acc.sh +++ b/benchmarks/deepcompile/run_bench_acc.sh @@ -5,6 +5,8 @@ COMPILE_OPTS="--compile" N3Z_OPTS="--compile --deepcompile" AC_OPTS="--activation-checkpointing" +export NUM_NODES=${NUM_NODES:-4} + MODEL="meta-llama/Meta-Llama-3-70B-Instruct" BATCH_SIZE_OPTS=(1) SEQ_LENGTH_OPTS=(1024) diff --git a/benchmarks/deepcompile/run_bench_lm.py b/benchmarks/deepcompile/run_bench_lm.py index f175d84d7..bb96fa68b 100644 --- a/benchmarks/deepcompile/run_bench_lm.py +++ b/benchmarks/deepcompile/run_bench_lm.py @@ -15,8 +15,6 @@ from datasets.utils.logging import disable_progress_bar -from patch_phi3_moe import patch_phi3moe - def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--model_name", type=str, default="meta-llama/Llama-2-7b-hf") @@ -98,16 +96,13 @@ def main(): model = AutoModelForCausalLM.from_pretrained(model_weight_path, trust_remote_code=True) else: if args.num_layers > 0: - model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + model_config = AutoConfig.from_pretrained(model_name, attn_implementation=args.attn_impl, trust_remote_code=True) print(f"num_hidden_layers: {model_config.num_hidden_layers} -> {args.num_layers}") model_config.num_hidden_layers = args.num_layers model = AutoModelForCausalLM.from_config(model_config, trust_remote_code=True) else: model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True) - if patch_phi3moe(model) and accelerator.is_main_process: - print("Patched Phi-3.5-MoE model") - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) if args.save_weights and accelerator.is_main_process: @@ -149,7 +144,6 @@ def tokenize_function(examples): torch._dynamo.config.capture_dynamic_output_shape_ops = True torch._dynamo.config.capture_scalar_outputs = True - if is_deepspeed: if args.compile: schedule = make_schedule(args.passes.split(","), 
warmup=5) if args.passes else None diff --git a/benchmarks/deepcompile/run_multinode.sh b/benchmarks/deepcompile/run_multinode.sh index 6f3feba9a..92d30839d 100644 --- a/benchmarks/deepcompile/run_multinode.sh +++ b/benchmarks/deepcompile/run_multinode.sh @@ -4,11 +4,23 @@ echo $* SCRIPT_DIR=$(dirname $(realpath $0)) HOST_IP=$(hostname -i) -NUM_NODES=${NUM_NODES:-$(wc -l < /job/hostfile)} +NUM_NODES=${NUM_NODES:-1} + +# verify that NUM_NODES is a positive integer +if ! [[ "$NUM_NODES" =~ ^[1-9][0-9]*$ ]]; then + echo "Error: NUM_NODES must be a positive integer" + exit 1 +fi + +# check if NUM_NODES ==1 or hostfile_n${NUM_NODES} exists +if [ ! -f hostfile_n${NUM_NODES} ] && [ "${NUM_NODES}" != "1" ]; then + echo "Error: hostfile_n${NUM_NODES} does not exist" + exit 1 +fi if [ "${NUM_NODES}" == "1" ]; then # avoid dependency on pdsh when possible cd ${SCRIPT_DIR}; bash ./run.sh --host-ip ${HOST_IP} $* else - ds_ssh -f hostfile_n${NUM_NODES} "cd ${SCRIPT_DIR}; bash ./run.sh --host-ip ${HOST_IP} $*" + ds_ssh -f hostfile_n${NUM_NODES} "cd ${SCRIPT_DIR}; NUM_NODES=${NUM_NODES} bash ./run.sh --host-ip ${HOST_IP} $*" fi From 9d881c1f790893db62fd806863ca069728389e02 Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka Date: Sun, 20 Apr 2025 03:31:53 +0000 Subject: [PATCH 3/4] fix benchmark for z1 Signed-off-by: Masahiro Tanaka --- benchmarks/deepcompile/run_bench_lm.py | 2 +- benchmarks/deepcompile/run_bench_z1.sh | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/deepcompile/run_bench_lm.py b/benchmarks/deepcompile/run_bench_lm.py index bb96fa68b..9d669afa1 100644 --- a/benchmarks/deepcompile/run_bench_lm.py +++ b/benchmarks/deepcompile/run_bench_lm.py @@ -72,7 +72,7 @@ def main(): args = get_args() print(args) - if "offload_adam_states" in args.passes: + if args.passes is not None and "offload_adam_states" in args.passes: os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128' if args.deterministic: diff --git 
a/benchmarks/deepcompile/run_bench_z1.sh b/benchmarks/deepcompile/run_bench_z1.sh index b5491e3fc..ba61cd5e1 100644 --- a/benchmarks/deepcompile/run_bench_z1.sh +++ b/benchmarks/deepcompile/run_bench_z1.sh @@ -6,6 +6,8 @@ DC_OPTS="--compile --deepcompile" ACC_OPTS="--gradient-accumulation-steps 1" AC_OPTS="--activation-checkpointing" +export NUM_NODES=${NUM_NODES:-4} + MODEL="meta-llama/Meta-Llama-3-8B-Instruct" BATCH_SIZE_OPTS=(1 2 4) SEQ_LENGTH_OPTS=(512 1024 2048) From ddf0d506e08c18113efdae015580f308346023ee Mon Sep 17 00:00:00 2001 From: Masahiro Tanaka Date: Sun, 20 Apr 2025 03:57:02 +0000 Subject: [PATCH 4/4] add options for deepcompile bench Signed-off-by: Masahiro Tanaka --- benchmarks/deepcompile/run.sh | 8 ++++++++ benchmarks/deepcompile/run_bench_lm.py | 10 +++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/benchmarks/deepcompile/run.sh b/benchmarks/deepcompile/run.sh index 78b289123..f5e8662d5 100644 --- a/benchmarks/deepcompile/run.sh +++ b/benchmarks/deepcompile/run.sh @@ -94,6 +94,14 @@ while [[ $# -gt 0 ]]; do EXTRA_OPTS="${EXTRA_OPTS} --num_layers $2" shift 2 ;; + --attn-impl) + EXTRA_OPTS="${EXTRA_OPTS} --attn_impl $2" + shift 2 + ;; + --eval) + EXTRA_OPTS="${EXTRA_OPTS} --eval" + shift + ;; --debug-log) DEBUG_LOG=1 shift diff --git a/benchmarks/deepcompile/run_bench_lm.py b/benchmarks/deepcompile/run_bench_lm.py index 9d669afa1..567fd2715 100644 --- a/benchmarks/deepcompile/run_bench_lm.py +++ b/benchmarks/deepcompile/run_bench_lm.py @@ -25,6 +25,7 @@ def get_args(): parser.add_argument("--max_grad_norm", type=float, default=1.0) parser.add_argument("--gradient_accumulation_steps", type=int, default=1) parser.add_argument("--activation_checkpointing", action="store_true") + parser.add_argument("--eval", action="store_true") parser.add_argument("--dataset_name", type=str, default="timdettmers/openassistant-guanaco") parser.add_argument("--num_layers", type=int, default=0) parser.add_argument("--attn_impl", type=str, 
default="spda") @@ -179,10 +180,13 @@ def tokenize_function(examples): on_trace_ready=torch.profiler.tensorboard_trace_handler(prof_dir), ) if do_profile else nullcontext() - # Training loop - model.train() - global_step = 0 + # Training + if args.eval: + model.eval() + else: + model.train() + global_step = 0 iter_times = [] # See https://github.com/microsoft/DeepSpeed/issues/6793