-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathtrain_1.3B_multi_node.sh
More file actions
60 lines (50 loc) · 2.11 KB
/
train_1.3B_multi_node.sh
File metadata and controls
60 lines (50 loc) · 2.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/bin/bash
# Launch multi-node distributed training of the T2V 1.3B model via torchrun
# (c10d rendezvous). Run this script once on every participating machine,
# passing that machine's rank as the first argument.
#
# Usage:
#   bash train_1.3B_multi_node.sh <NODE_RANK>
#     NODE_RANK - 0-based rank of the current node (0 = master node)
set -euo pipefail

# --- User-configurable section ---
readonly NNODES=2                   # Total number of machines
readonly NPROC_PER_NODE=8           # Number of GPUs per machine
readonly MASTER_ADDR="10.48.49.79"  # IP address of the master node (must be accessible by all nodes)
readonly MASTER_PORT=29500          # Port on the master node
NODE_RANK="${1:-}"                  # Rank of the current node (starting from 0), passed via command-line argument

# --- Argument check ---
if [[ -z "$NODE_RANK" ]]; then
  # Diagnostics go to stderr so they don't pollute any captured stdout.
  echo "Error: Please provide the node rank (NODE_RANK) as the first argument." >&2
  echo "Usage: bash train_1.3B_multi_node.sh 0 (on the master node)" >&2
  echo "       bash train_1.3B_multi_node.sh 1 (on the first worker node)" >&2
  exit 1
fi
# Fail fast on a non-numeric or out-of-range rank instead of letting torchrun
# hang or error deep inside the rendezvous.
if ! [[ "$NODE_RANK" =~ ^[0-9]+$ ]] || (( NODE_RANK >= NNODES )); then
  echo "Error: NODE_RANK must be an integer in [0, $((NNODES - 1))], got '$NODE_RANK'." >&2
  exit 1
fi

# --- Print configuration ---
echo "========================================"
echo "Starting multi-node training..."
echo "Total number of nodes (NNODES): $NNODES"
echo "Processes per node (NPROC_PER_NODE): $NPROC_PER_NODE"
echo "Master node address (MASTER_ADDR): $MASTER_ADDR:$MASTER_PORT"
echo "Current node rank (NODE_RANK): $NODE_RANK"
echo "========================================"

# if resume from the pretrained model
readonly RESUME_CKPT="checkpoints/download_models/1.3B/MultiShotMaster_model_1.3B.safetensors"
# The checkpoint is passed to train.py unconditionally below, so verify it
# exists up front rather than failing minutes later inside the trainer.
if [[ ! -f "$RESUME_CKPT" ]]; then
  echo "Error: resume checkpoint not found: $RESUME_CKPT" >&2
  exit 1
fi

TIMESTAMP=$(date +%Y%m%d_%H%M)
readonly OUTPUT_DIR="models/train/1.3B/${TIMESTAMP}-T2V-1.3B"
echo "Training output will be saved to: ${OUTPUT_DIR}"

# --- Build and launch the torchrun command ---
# All expansions quoted; rendezvous settings must match on every node.
torchrun \
  --nnodes="$NNODES" \
  --nproc_per_node="$NPROC_PER_NODE" \
  --rdzv_id=job_123 \
  --rdzv_backend=c10d \
  --rdzv_endpoint="$MASTER_ADDR:$MASTER_PORT" \
  --node_rank="$NODE_RANK" \
  train.py \
  --dataset_metadata_path toy_cases/train_multishot.csv \
  --height 480 \
  --width 832 \
  --dataset_repeat 1 \
  --learning_rate 1e-5 \
  --num_epochs 100 \
  --remove_prefix_in_ckpt "pipe.dit." \
  --output_path "$OUTPUT_DIR" \
  --trainable_models "dit" \
  --dataset_num_workers 8 \
  --save_steps 500 \
  --wan_version "t2v-1.3B" \
  --resume_ckpt "$RESUME_CKPT" \
  --load_path_json "checkpoints/model_configs/model_path_1.3B.json"