Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions benchmarks/deepcompile/run.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/bin/bash


NUM_NODES=${NUM_NODES:-$(wc -l < /job/hostfile)}
NUM_NODES=${NUM_NODES:-1}
NGPUS_PER_NODE=${NGPUS_PER_NODE:-$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)}
NUM_PROCESSES=$((${NUM_NODES} * ${NGPUS_PER_NODE}))

Expand Down Expand Up @@ -95,6 +94,14 @@ while [[ $# -gt 0 ]]; do
EXTRA_OPTS="${EXTRA_OPTS} --num_layers $2"
shift 2
;;
--attn-impl)
EXTRA_OPTS="${EXTRA_OPTS} --attn_impl $2"
shift 2
;;
--eval)
EXTRA_OPTS="${EXTRA_OPTS} --eval"
shift
;;
--debug-log)
DEBUG_LOG=1
shift
Expand Down Expand Up @@ -217,7 +224,7 @@ echo "Logging to ${LOG_FILE}"
${HOME}/.local/bin/accelerate launch --main_process_ip ${HOST_IP} --main_process_port 12345 \
--num_machines ${NUM_NODES} --num_processes ${NUM_PROCESSES} --machine_rank ${MACHINE_RANK} \
--config_file configs/config.yaml \
run_acc_lm.py \
run_bench_lm.py \
--model_name "${MODEL}" \
--zero_stage ${ZERO_STAGE} \
${GAS_OPTS} \
Expand Down
2 changes: 2 additions & 0 deletions benchmarks/deepcompile/run_bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ DC_OPTS="--compile --deepcompile"
ACC_OPTS="--gradient-accumulation-steps 1"
AC_OPTS="--activation-checkpointing"

export NUM_NODES=${NUM_NODES:-4}

MODEL="meta-llama/Meta-Llama-3-70B-Instruct"
BATCH_SIZE_OPTS=(1 2 4)
SEQ_LENGTH_OPTS=(512 1024 2048)
Expand Down
2 changes: 2 additions & 0 deletions benchmarks/deepcompile/run_bench_acc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ COMPILE_OPTS="--compile"
N3Z_OPTS="--compile --deepcompile"
AC_OPTS="--activation-checkpointing"

export NUM_NODES=${NUM_NODES:-4}

MODEL="meta-llama/Meta-Llama-3-70B-Instruct"
BATCH_SIZE_OPTS=(1)
SEQ_LENGTH_OPTS=(1024)
Expand Down
20 changes: 9 additions & 11 deletions benchmarks/deepcompile/run_bench_lm.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@

from datasets.utils.logging import disable_progress_bar

from patch_phi3_moe import patch_phi3moe

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default="meta-llama/Llama-2-7b-hf")
Expand All @@ -27,6 +25,7 @@ def get_args():
parser.add_argument("--max_grad_norm", type=float, default=1.0)
parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
parser.add_argument("--activation_checkpointing", action="store_true")
parser.add_argument("--eval", action="store_true")
parser.add_argument("--dataset_name", type=str, default="timdettmers/openassistant-guanaco")
parser.add_argument("--num_layers", type=int, default=0)
parser.add_argument("--attn_impl", type=str, default="spda")
Expand Down Expand Up @@ -74,7 +73,7 @@ def main():
args = get_args()
print(args)

if "offload_adam_states" in args.passes:
if args.passes is not None and "offload_adam_states" in args.passes:
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

if args.deterministic:
Expand All @@ -98,16 +97,13 @@ def main():
model = AutoModelForCausalLM.from_pretrained(model_weight_path, trust_remote_code=True)
else:
if args.num_layers > 0:
model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
model_config = AutoConfig.from_pretrained(model_name, attn_implementation=args.attn_impl, trust_remote_code=True)
print(f"num_hidden_layers: {model_config.num_hidden_layers} -> {args.num_layers}")
model_config.num_hidden_layers = args.num_layers
model = AutoModelForCausalLM.from_config(model_config, trust_remote_code=True)
else:
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

if patch_phi3moe(model) and accelerator.is_main_process:
print("Patched Phi-3.5-MoE model")

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

if args.save_weights and accelerator.is_main_process:
Expand Down Expand Up @@ -149,7 +145,6 @@ def tokenize_function(examples):
torch._dynamo.config.capture_dynamic_output_shape_ops = True
torch._dynamo.config.capture_scalar_outputs = True


if is_deepspeed:
if args.compile:
schedule = make_schedule(args.passes.split(","), warmup=5) if args.passes else None
Expand Down Expand Up @@ -185,10 +180,13 @@ def tokenize_function(examples):
on_trace_ready=torch.profiler.tensorboard_trace_handler(prof_dir),
) if do_profile else nullcontext()

# Training loop
model.train()
global_step = 0
# Training
if args.eval:
model.eval()
else:
model.train()

global_step = 0
iter_times = []

# See https://github.com/microsoft/DeepSpeed/issues/6793
Expand Down
2 changes: 2 additions & 0 deletions benchmarks/deepcompile/run_bench_z1.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ DC_OPTS="--compile --deepcompile"
ACC_OPTS="--gradient-accumulation-steps 1"
AC_OPTS="--activation-checkpointing"

export NUM_NODES=${NUM_NODES:-4}

MODEL="meta-llama/Meta-Llama-3-8B-Instruct"
BATCH_SIZE_OPTS=(1 2 4)
SEQ_LENGTH_OPTS=(512 1024 2048)
Expand Down
16 changes: 14 additions & 2 deletions benchmarks/deepcompile/run_multinode.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,23 @@ echo $*

SCRIPT_DIR=$(dirname $(realpath $0))
HOST_IP=$(hostname -i)
NUM_NODES=${NUM_NODES:-$(wc -l < /job/hostfile)}
NUM_NODES=${NUM_NODES:-1}

# Validate NUM_NODES before it is used to build the hostfile name and to
# choose between the local and the multi-node launch path.
# The regex accepts only decimal digits with a non-zero leading digit, so
# empty values, "0", negative numbers, and values such as "04" are rejected.
if ! [[ "$NUM_NODES" =~ ^[1-9][0-9]*$ ]]; then
echo "Error: NUM_NODES must be a positive integer"
exit 1
fi

# A hostfile is only required for multi-node runs: abort when NUM_NODES is
# not 1 and hostfile_n${NUM_NODES} is missing. The path is relative, so the
# file is looked up in the current working directory.
if [ ! -f hostfile_n${NUM_NODES} ] && [ "${NUM_NODES}" != "1" ]; then
echo "Error: hostfile_n${NUM_NODES} does not exist"
exit 1
fi

if [ "${NUM_NODES}" == "1" ]; then
# avoid dependency on pdsh when possible
cd ${SCRIPT_DIR}; bash ./run.sh --host-ip ${HOST_IP} $*
else
ds_ssh -f hostfile_n${NUM_NODES} "cd ${SCRIPT_DIR}; bash ./run.sh --host-ip ${HOST_IP} $*"
ds_ssh -f hostfile_n${NUM_NODES} "cd ${SCRIPT_DIR}; NUM_NODES=${NUM_NODES} bash ./run.sh --host-ip ${HOST_IP} $*"
fi