diff --git a/benchmarks/deepcompile/run.sh b/benchmarks/deepcompile/run.sh index 57da03193..f5e8662d5 100644 --- a/benchmarks/deepcompile/run.sh +++ b/benchmarks/deepcompile/run.sh @@ -1,7 +1,6 @@ #!/bin/bash - -NUM_NODES=${NUM_NODES:-$(wc -l < /job/hostfile)} +NUM_NODES=${NUM_NODES:-1} NGPUS_PER_NODE=${NGPUS_PER_NODE:-$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)} NUM_PROCESSES=$((${NUM_NODES} * ${NGPUS_PER_NODE})) @@ -95,6 +94,14 @@ while [[ $# -gt 0 ]]; do EXTRA_OPTS="${EXTRA_OPTS} --num_layers $2" shift 2 ;; + --attn-impl) + EXTRA_OPTS="${EXTRA_OPTS} --attn_impl $2" + shift 2 + ;; + --eval) + EXTRA_OPTS="${EXTRA_OPTS} --eval" + shift + ;; --debug-log) DEBUG_LOG=1 shift @@ -217,7 +224,7 @@ echo "Logging to ${LOG_FILE}" ${HOME}/.local/bin/accelerate launch --main_process_ip ${HOST_IP} --main_process_port 12345 \ --num_machines ${NUM_NODES} --num_processes ${NUM_PROCESSES} --machine_rank ${MACHINE_RANK} \ --config_file configs/config.yaml \ -run_acc_lm.py \ +run_bench_lm.py \ --model_name "${MODEL}" \ --zero_stage ${ZERO_STAGE} \ ${GAS_OPTS} \ diff --git a/benchmarks/deepcompile/run_bench.sh b/benchmarks/deepcompile/run_bench.sh index 174e34951..78c5df473 100644 --- a/benchmarks/deepcompile/run_bench.sh +++ b/benchmarks/deepcompile/run_bench.sh @@ -6,6 +6,8 @@ DC_OPTS="--compile --deepcompile" ACC_OPTS="--gradient-accumulation-steps 1" AC_OPTS="--activation-checkpointing" +export NUM_NODES=${NUM_NODES:-4} + MODEL="meta-llama/Meta-Llama-3-70B-Instruct" BATCH_SIZE_OPTS=(1 2 4) SEQ_LENGTH_OPTS=(512 1024 2048) diff --git a/benchmarks/deepcompile/run_bench_acc.sh b/benchmarks/deepcompile/run_bench_acc.sh index a3b66844d..7c4e81815 100644 --- a/benchmarks/deepcompile/run_bench_acc.sh +++ b/benchmarks/deepcompile/run_bench_acc.sh @@ -5,6 +5,8 @@ COMPILE_OPTS="--compile" N3Z_OPTS="--compile --deepcompile" AC_OPTS="--activation-checkpointing" +export NUM_NODES=${NUM_NODES:-4} + MODEL="meta-llama/Meta-Llama-3-70B-Instruct" BATCH_SIZE_OPTS=(1) SEQ_LENGTH_OPTS=(1024) diff --git a/benchmarks/deepcompile/run_bench_lm.py b/benchmarks/deepcompile/run_bench_lm.py index f175d84d7..567fd2715 100644 --- a/benchmarks/deepcompile/run_bench_lm.py +++ b/benchmarks/deepcompile/run_bench_lm.py @@ -15,8 +15,6 @@ from datasets.utils.logging import disable_progress_bar -from patch_phi3_moe import patch_phi3moe - def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--model_name", type=str, default="meta-llama/Llama-2-7b-hf") @@ -27,6 +25,7 @@ def get_args(): parser.add_argument("--max_grad_norm", type=float, default=1.0) parser.add_argument("--gradient_accumulation_steps", type=int, default=1) parser.add_argument("--activation_checkpointing", action="store_true") + parser.add_argument("--eval", action="store_true") parser.add_argument("--dataset_name", type=str, default="timdettmers/openassistant-guanaco") parser.add_argument("--num_layers", type=int, default=0) parser.add_argument("--attn_impl", type=str, default="spda") @@ -74,7 +73,7 @@ def main(): args = get_args() print(args) - if "offload_adam_states" in args.passes: + if args.passes is not None and "offload_adam_states" in args.passes: os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128' if args.deterministic: @@ -98,16 +97,13 @@ def main(): model = AutoModelForCausalLM.from_pretrained(model_weight_path, trust_remote_code=True) else: if args.num_layers > 0: - model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + model_config = AutoConfig.from_pretrained(model_name, attn_implementation=args.attn_impl, trust_remote_code=True) print(f"num_hidden_layers: {model_config.num_hidden_layers} -> {args.num_layers}") model_config.num_hidden_layers = args.num_layers model = AutoModelForCausalLM.from_config(model_config, trust_remote_code=True) else: model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True) - if patch_phi3moe(model) and accelerator.is_main_process: - print("Patched Phi-3.5-MoE model") - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) if args.save_weights and accelerator.is_main_process: @@ -149,7 +145,6 @@ def tokenize_function(examples): torch._dynamo.config.capture_dynamic_output_shape_ops = True torch._dynamo.config.capture_scalar_outputs = True - if is_deepspeed: if args.compile: schedule = make_schedule(args.passes.split(","), warmup=5) if args.passes else None @@ -185,10 +180,13 @@ def tokenize_function(examples): on_trace_ready=torch.profiler.tensorboard_trace_handler(prof_dir), ) if do_profile else nullcontext() - # Training loop - model.train() - global_step = 0 + # Training + if args.eval: + model.eval() + else: + model.train() + global_step = 0 iter_times = [] # See https://github.com/microsoft/DeepSpeed/issues/6793 diff --git a/benchmarks/deepcompile/run_bench_z1.sh b/benchmarks/deepcompile/run_bench_z1.sh index b5491e3fc..ba61cd5e1 100644 --- a/benchmarks/deepcompile/run_bench_z1.sh +++ b/benchmarks/deepcompile/run_bench_z1.sh @@ -6,6 +6,8 @@ DC_OPTS="--compile --deepcompile" ACC_OPTS="--gradient-accumulation-steps 1" AC_OPTS="--activation-checkpointing" +export NUM_NODES=${NUM_NODES:-4} + MODEL="meta-llama/Meta-Llama-3-8B-Instruct" BATCH_SIZE_OPTS=(1 2 4) SEQ_LENGTH_OPTS=(512 1024 2048) diff --git a/benchmarks/deepcompile/run_multinode.sh b/benchmarks/deepcompile/run_multinode.sh index 6f3feba9a..92d30839d 100644 --- a/benchmarks/deepcompile/run_multinode.sh +++ b/benchmarks/deepcompile/run_multinode.sh @@ -4,11 +4,23 @@ echo $* SCRIPT_DIR=$(dirname $(realpath $0)) HOST_IP=$(hostname -i) -NUM_NODES=${NUM_NODES:-$(wc -l < /job/hostfile)} +NUM_NODES=${NUM_NODES:-1} + +# verify that NUM_NODES is a positive integer +if ! [[ "$NUM_NODES" =~ ^[1-9][0-9]*$ ]]; then + echo "Error: NUM_NODES must be a positive integer" + exit 1 +fi + +# check if NUM_NODES ==1 or hostfile_n${NUM_NODES} exists +if [ ! -f hostfile_n${NUM_NODES} ] && [ "${NUM_NODES}" != "1" ]; then + echo "Error: hostfile_n${NUM_NODES} does not exist" + exit 1 +fi if [ "${NUM_NODES}" == "1" ]; then # avoid dependency on pdsh when possible cd ${SCRIPT_DIR}; bash ./run.sh --host-ip ${HOST_IP} $* else - ds_ssh -f hostfile_n${NUM_NODES} "cd ${SCRIPT_DIR}; bash ./run.sh --host-ip ${HOST_IP} $*" + ds_ssh -f hostfile_n${NUM_NODES} "cd ${SCRIPT_DIR}; NUM_NODES=${NUM_NODES} bash ./run.sh --host-ip ${HOST_IP} $*" fi