Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 22 additions & 25 deletions bitbrain/finetune/run.sh
Original file line number Diff line number Diff line change
@@ -1,53 +1,50 @@
##########################
# LLaMA-Factory SFT start script: environment setup
##########################
# Commented-out alternative launch (plain python on GPUs 5,6), kept for reference:
#CUDA_VISIBLE_DEVICES=5,6 python ../LLaMA-Factory/src/train.py \

# Placeholder: replace with the path of the local LLaMA-Factory checkout.
# The launch command below runs "$LLaMA_PATH/src/train.py".
export LLaMA_PATH="llama_factory本地路径"
# Placeholder: replace with the root directory for training outputs.
# Not exported; it is only expanded by this script when building --output_dir.
OUTPUT_DIR="输出路径"

#export CUDA_DEVICE_ORDER=PCI_BUS_ID
# GPUs made visible to the training processes (four devices, matching
# --nproc_per_node 4 in the launch command).
export CUDA_VISIBLE_DEVICES=0,1,2,3
# NCCL peer-to-peer level; per the NCCL documentation NVL restricts P2P to
# GPUs connected through NVLink.
export NCCL_P2P_LEVEL=NVL
# PyTorch CUDA caching-allocator option (expandable segments).
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# Optional: a dataset mixing strategy can be added to the launch command, e.g.
#   --mix_strategy interleave_over \
#   --interleave_probs 0.1,0.35,0.2,0.2,0.1,0.05 \

# Optional: custom evaluation datasets can be added, e.g.
#   --eval_dataset ceval,cmmlu \


# Launch full-parameter supervised fine-tuning (--stage sft,
# --finetuning_type full) with LLaMA-Factory's train.py, using torchrun with
# 4 processes on GPUs 0-3.
#
# Reads:
#   LLaMA_PATH  - LLaMA-Factory checkout; train.py is taken from its src/ dir.
#   OUTPUT_DIR  - root directory under which this run's outputs are written.
#
# Before running, replace the quoted placeholder values of
# --model_name_or_path, --dataset_dir, --swanlab_project, --run_name and the
# sub-directory component of --output_dir.
#
# Maintenance notes:
#   * Every continuation backslash MUST be preceded by a space. The shell
#     removes backslash-newline outright, so "sdpa\" followed by a new line
#     would be joined with the next flag into a single broken argument.
#   * Each flag is passed exactly once. Where this script previously carried
#     two values for the same flag, the later value is the one kept here.
#   * NOTE(review): --enable_liger_kernel presumably needs the liger-kernel
#     package installed; drop the flag if it is not available - confirm.
#   * Batch sizing: per-device batch 4 x gradient accumulation 16 x 4
#     processes (256 sequences per optimizer step, assuming standard
#     data-parallel training).
FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node 4 "${LLaMA_PATH}/src/train.py" \
  --stage sft \
  --do_train \
  --model_name_or_path "模型路径" \
  --cutoff_len 2048 \
  --dataset_dir "数据集路径" \
  --dataset chinese_instruct,deepctrl_200k,ultrachat_200k,code_feedback_custom \
  --overwrite_cache \
  --max_samples 5000000 \
  --enable_liger_kernel True \
  --packing False \
  --use_swanlab true \
  --report_to swanlab \
  --swanlab_project "swanlab项目名称" \
  --run_name "运行名称" \
  --preprocessing_num_workers 30 \
  --template qwen \
  --finetuning_type full \
  --output_dir "${OUTPUT_DIR}/本次运行保存子路径" \
  --overwrite_output_dir \
  --per_device_train_batch_size 4 \
  --per_device_eval_batch_size 4 \
  --do_eval \
  --val_size 100 \
  --eval_strategy steps \
  --eval_steps 1000 \
  --flash_attn fa2 \
  --gradient_accumulation_steps 16 \
  --lr_scheduler_type cosine \
  --warmup_ratio 0.0125 \
  --max_grad_norm 1.0 \
  --logging_steps 10 \
  --save_steps 5000 \
  --learning_rate 2e-5 \
  --weight_decay 0.01 \
  --num_train_epochs 3.0 \
  --plot_loss \
  --bf16

# To resume an interrupted run, append " \" after --bf16 above and add the
# following flag as the last line of the command (fill in the run
# sub-directory and checkpoint number):
#   --resume_from_checkpoint "${OUTPUT_DIR}/<run-subdir>/checkpoint-xxxx"