Skip to content

Commit 4a29028

Browse files
committed
Multi-node GPU with ray
1 parent c9b47e2 commit 4a29028

5 files changed

Lines changed: 205 additions & 16 deletions

File tree

EIANN/network.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def __init__(self, layer_config, projection_config, learning_rate=None, optimize
168168
if hasattr(pre_pop, 'image_dim') and pre_pop.image_dim is not None:
169169
projection = Conv2DProjection(pre_pop, post_pop, device='cpu', **projection_kwargs)
170170
else:
171-
projection = Projection(pre_pop, post_pop, device='cpu', **projection_kwargs) # TODO: move this to self.device?
171+
projection = Projection(pre_pop, post_pop, device='cpu', **projection_kwargs) # cannot initialize on self.device --> accuracy goes down
172172
post_pop.append_projection(projection)
173173
post_pop.incoming_projections[projection.name] = projection
174174
pre_pop.outgoing_projections[projection.name] = projection

EIANN/simulate/jobscripts/run_EIANN_gpu_bridges_mnist_ray.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#SBATCH --partition=GPU-shared
88
#SBATCH --gres=gpu:v100-32:3
99
#SBATCH --mem=80G
10-
#SBATCH --cpus-per-task=4
10+
#SBATCH --cpus-per-task=15
1111
#SBATCH --time=02:00:00
1212
#SBATCH -A bio240068p
1313
#SBATCH --mail-user=yc1376@scarletmail.rutgers.edu
@@ -22,7 +22,7 @@ conda activate eiann
2222
cd ~/EIANN
2323

2424
# Start Ray (local mode)
25-
ray start --head --num-gpus=3 --num-cpus=12
25+
ray start --head --num-gpus=3 --num-cpus=15
2626

2727
python EIANN/simulate/run_EIANN_mnist_ray.py \
2828
--network-config-file-name=20231129_EIANN_2_hidden_mnist_bpDale_relu_SGD_config_G_complete_optimized.yaml \
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
#!/bin/bash -l
#SBATCH -J eiann_gpu_mnist_ray
#SBATCH -o /ocean/projects/bio240068p/chennawa/logs/EIANN/eiann_gpu_mnist_ray.%j.o
#SBATCH -e /ocean/projects/bio240068p/chennawa/logs/EIANN/eiann_gpu_mnist_ray.%j.e
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --partition=GPU
#SBATCH --gres=gpu:v100-32:8
#SBATCH --mem=80G
#SBATCH --cpus-per-task=15
#SBATCH --time=02:00:00
#SBATCH -A bio240068p
#SBATCH --mail-user=yc1376@scarletmail.rutgers.edu
#SBATCH --mail-type=ALL

module purge
module load cuda/12.4.0

source /opt/packages/anaconda3-2024.10-1/etc/profile.d/conda.sh
conda activate eiann

cd ~/EIANN

# --- RAY CLUSTER LAUNCH ---
# Launches one Ray head on the first allocated node and one Ray worker on each
# remaining node, then runs the driver script against the resulting cluster.

# Derive per-node resources from the SBATCH request so the three places that
# previously hard-coded "15" and "8" cannot drift out of sync with the header.
cpus_per_node=${SLURM_CPUS_PER_TASK}
gpus_per_node=${SLURM_GPUS_ON_NODE}

# 1. Get the list of nodes and the head node IP
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

head_node=${nodes_array[0]}
# Get the IP address of the head node. NOTE: `hostname --ip-address` can
# return several space-separated addresses (e.g. an IPv6 plus an IPv4 entry);
# keep a single IPv4 address so "$ip_head" is a well-formed host:port.
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
if [[ "$head_node_ip" == *" "* ]]; then
    IFS=' ' read -ra ADDR <<< "$head_node_ip"
    if [[ ${#ADDR[0]} -gt 16 ]]; then
        # First entry looks like IPv6; fall back to the next address.
        head_node_ip=${ADDR[1]}
    else
        head_node_ip=${ADDR[0]}
    fi
    echo "Multiple IP addresses detected; using $head_node_ip"
fi

port=6379
ip_head=$head_node_ip:$port
export ip_head
echo "Head node IP: $head_node_ip"

# 2. Start the Ray Head Node
# Use --block (and "&") so the process stays alive in the background
echo "Starting Head node on $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" \
    ray start --head --node-ip-address="$head_node_ip" --port=$port \
    --num-cpus="$cpus_per_node" --num-gpus="$gpus_per_node" --block &

# 3. Start Ray Worker Nodes
# Loop over the rest of the nodes (starting from index 1)
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i=1; i<=worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting Worker node on $node_i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        ray start --address "$ip_head" \
        --num-cpus="$cpus_per_node" --num-gpus="$gpus_per_node" --block &
done

# 4. Wait for cluster to initialize
sleep 20

# 5. Export the address so ray.init() in python finds the cluster
export RAY_ADDRESS=$ip_head

# --- RUN SCRIPT ---

python EIANN/simulate/run_EIANN_mnist_ray.py \
    --network-config-file-name=20231129_EIANN_2_hidden_mnist_bpDale_relu_SGD_config_G_complete_optimized.yaml \
    --data-dir=/ocean/projects/bio250022p/$USER/data/EIANN \
    --num-seeds=32


# To submit:
# cd ~/EIANN/EIANN/simulate/jobscripts
# sbatch run_EIANN_gpu_multi_node_bridges_mnist_ray.sh

# See logs:
# cd /ocean/projects/bio240068p/$USER/logs/EIANN

EIANN/simulate/run_EIANN_mnist_ray.py

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,7 @@ def train_eiann(config):
9393
@click.option('--network-config-file-name', required=True, type=str, help="Network config file name")
9494
@click.option('--data-dir', default="../data/mnist", type=str, help="Directory for MNIST data")
9595
@click.option('--num-seeds', default=5, type=int, help="Number of different seeds to try")
96-
@click.option('--num-gpus', default=2, type=int, help="Number of GPUs to use")
97-
def main(network_config_file_name, data_dir, num_seeds, num_gpus):
96+
def main(network_config_file_name, data_dir, num_seeds):
9897

9998
overall_start_time = time.time()
10099

@@ -110,10 +109,8 @@ def main(network_config_file_name, data_dir, num_seeds, num_gpus):
110109
for i in range(num_seeds)
111110
]
112111

113-
# TODO: try across different GPU nodes, perhaps with MPI
114-
115112
tuner = tune.Tuner(
116-
tune.with_resources(train_eiann, resources={"cpu": 1, "gpu": 0.5}),
113+
tune.with_resources(train_eiann, resources={"cpu": 0, "gpu": 0.5}),
117114
param_space=tune.grid_search(param_space),
118115
run_config=RunConfig(name="eiann_mnist_parallel_ray")
119116
)
@@ -134,15 +131,48 @@ def main(network_config_file_name, data_dir, num_seeds, num_gpus):
134131

135132
# interact -p GPU-shared -N 1 --gres=gpu:v100-32:3 -t 01:00:00
136133

134+
135+
# ===== Single-node GPU runs =====
136+
137137
# bp Dale:
138-
# baseline: 1 cpu 0.5 gpu per seed (GPU-shared)
139-
# 492.76 sec
138+
# 36191138: 1 cpu 0.5 gpu per seed, request 12 cpus (GPU-shared)
139+
# 346.74 seconds
140+
141+
# 36191154: 1 cpu 0.5 gpu per seed, request 6 cpus (GPU-shared)
142+
# 382.83 seconds
143+
144+
# 36191200: 2 cpu 0.5 gpu per seed, request 12 cpus (GPU-shared)
145+
# 370.17 seconds
146+
147+
# 36191244: 1 cpu 0.25 gpu per seed, request 12 cpus (GPU-shared)
148+
# 397.13 seconds
149+
150+
# 36191248: 1 cpu 1 gpu per seed, request 12 cpus (GPU-shared)
151+
# 688.59 seconds -> did not request enough gpus -> retried with 36192557
152+
153+
# 36192288: 1 cpu 0.5 gpu per seed (12 seeds), request 12 cpus (GPU-shared)
154+
# 722.51 seconds -> sequential
155+
156+
# 36192294: 0 cpu 0.5 gpu per seed, request 12 cpus (GPU-shared)
157+
# 360.29 seconds
158+
159+
# 36192324: 1 cpu 0.2 gpu per seed, request 6 cpus (GPU-shared)
160+
# error
161+
162+
# 36192511: 0 cpu 0.5 gpu per seed, request 15 cpus (GPU-shared)
163+
# 356.43 seconds
164+
165+
# 36192512: 0 cpu 0.5 gpu per seed (12 seeds), request 15 cpus (GPU-shared)
166+
# 733.16 seconds
167+
168+
# 36192557: 1 cpu 1 gpu per seed, request 15 cpus 5 gpus (GPU)
169+
# error for some
140170

141-
# 36148795 - ray, 4 cpu 0.5 gpu per seed (GPU-shared)
142-
# 474.12 sec
171+
# ===== Multi-node GPU runs =====
143172

144-
# 36148818 - ray, 1 cpu 0.25 gpu per seed (GPU-shared)
145-
# 641.27 sec
173+
# bp Dale:
174+
# 36193643: 2 GPU nodes, 0 cpu 0.5 gpu per seed (32 seeds), request 15 cpus 16 gpus (GPU), with ray head only
175+
# 770.49 seconds -> only used 8 gpus
146176

147-
# 36148859 - ray, 1 cpu 1 gpu per seed (GPU)
148-
# 382.98 sec
177+
# 36193950: 2 GPU nodes, 0 cpu 0.5 gpu per seed (32 seeds), request 8,8 cpus 8,7 gpus (GPU), with ray head+worker
178+
# 384.80 seconds
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import os
2+
import time
3+
import torch
4+
import random
5+
import numpy as np
6+
import ray
7+
import click
8+
9+
import EIANN.utils as ut
10+
11+
@ray.remote(num_gpus=0.5, num_cpus=1)
def run_seed(seed_idx, network_config_file_name, data_dir):
    """Train one EIANN network for a single seed as a Ray task.

    Each seed index is mapped to a distinct (network_seed, data_seed) pair so
    that runs are reproducible and independent across tasks.

    :param seed_idx: int; index of this run, used to derive both seeds
    :param network_config_file_name: str; YAML file name under EIANN/network_config/mnist
    :param data_dir: str; directory containing (or receiving) the MNIST data
    :return: dict with validation accuracy/loss, wall-clock run time, and the seeds used
    """
    network_seed = 66049 + seed_idx
    data_seed = 257 + seed_idx

    # Seed every RNG source up front; data-related generators get their own seed.
    ut.set_all_seeds(seed=network_seed)
    torch.cuda.manual_seed_all(network_seed)
    np.random.seed(data_seed)
    random.seed(data_seed)

    train_loader, val_loader, _test_loader, _ = ut.get_MNIST_dataloaders(
        data_dir=data_dir, data_seed=data_seed)

    # Resolve the config path relative to the repo root (two levels above this file).
    this_dir = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.dirname(os.path.dirname(this_dir))
    config_file_path = os.path.join(
        repo_root, "EIANN", "network_config", "mnist", network_config_file_name)

    # Ray sets CUDA_VISIBLE_DEVICES for the task, so "cuda" picks the assigned GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = ut.build_EIANN_from_config(config_file_path, network_seed=network_seed, device=device)

    train_start = time.time()
    net.train(
        train_loader,
        val_loader,
        epochs=1,
        samples_per_epoch=20_000,
        val_interval=(0, -1, 100),
        store_history=False
    )
    elapsed = time.time() - train_start

    # Report the metrics from the final validation step.
    summary = {
        "seed_idx": seed_idx,
        "val_accuracy": float(net.val_accuracy_history[-1]),
        "val_loss": float(net.val_loss_history[-1]),
        "run_time": elapsed,
        "network_seed": network_seed,
        "data_seed": data_seed,
    }
    return summary
50+
51+
@click.command()
@click.option('--network-config-file-name', required=True, type=str, help="Network config file name")
@click.option('--data-dir', default="../data/mnist", type=str, help="Directory for MNIST data")
@click.option('--num-seeds', default=5, type=int, help="Number of different seeds to try")
def main(network_config_file_name, data_dir, num_seeds):
    """Fan out `num_seeds` training runs as Ray tasks and summarize the results.

    Relies on ray.init() picking up an existing cluster (e.g. via RAY_ADDRESS)
    or starting a local one. Results are printed as each task completes.
    """
    t0 = time.time()

    ray.init()
    futures = [run_seed.remote(idx, network_config_file_name, data_dir)
               for idx in range(num_seeds)]

    # Collect results in completion order rather than submission order so
    # progress is visible as soon as any task finishes.
    results = []
    remaining = list(futures)
    while remaining:
        done, remaining = ray.wait(remaining, num_returns=1)
        out = ray.get(done[0])
        print("Finished seed", out["seed_idx"], "acc", out["val_accuracy"])
        results.append(out)

    print(f"Overall time for {num_seeds} seeds: {time.time() - t0:.2f} seconds")

    print("Summary:")
    for res in results:
        print(f"Seed {res['seed_idx']}: Val Acc = {res['val_accuracy']}, Val Loss = {res['val_loss']}, Time = {res['run_time']:.2f} seconds")


if __name__ == "__main__":
    main()
79+
80+
# bp Dale:
81+
# 36192337: 1 cpu 0.2 gpu per seed, request 6 cpus (GPU-shared)
82+
# 436.63 seconds
83+
84+
# 36192343: 1 cpu 0.5 gpu per seed, request 6 cpus (GPU-shared)
85+
# 389.19 seconds

0 commit comments

Comments
 (0)