Skip to content

Commit 4a29028

Browse files
committed
Multi-node GPU with ray
1 parent c9b47e2 commit 4a29028

5 files changed

Lines changed: 205 additions & 16 deletions

File tree

EIANN/network.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def __init__(self, layer_config, projection_config, learning_rate=None, optimize
168168
if hasattr(pre_pop, 'image_dim') and pre_pop.image_dim is not None:
169169
projection = Conv2DProjection(pre_pop, post_pop, device='cpu', **projection_kwargs)
170170
else:
171-
projection = Projection(pre_pop, post_pop, device='cpu', **projection_kwargs) # TODO: move this to self.device?
171+
projection = Projection(pre_pop, post_pop, device='cpu', **projection_kwargs) # cannot initialize on self.device --> accuracy goes down
172172
post_pop.append_projection(projection)
173173
post_pop.incoming_projections[projection.name] = projection
174174
pre_pop.outgoing_projections[projection.name] = projection

EIANN/simulate/jobscripts/run_EIANN_gpu_bridges_mnist_ray.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#SBATCH --partition=GPU-shared
88
#SBATCH --gres=gpu:v100-32:3
99
#SBATCH --mem=80G
10-
#SBATCH --cpus-per-task=4
10+
#SBATCH --cpus-per-task=15
1111
#SBATCH --time=02:00:00
1212
#SBATCH -A bio240068p
1313
#SBATCH --mail-user=yc1376@scarletmail.rutgers.edu
@@ -22,7 +22,7 @@ conda activate eiann
2222
cd ~/EIANN
2323

2424
# Start Ray (local mode)
25-
ray start --head --num-gpus=3 --num-cpus=12
25+
ray start --head --num-gpus=3 --num-cpus=15
2626

2727
python EIANN/simulate/run_EIANN_mnist_ray.py \
2828
--network-config-file-name=20231129_EIANN_2_hidden_mnist_bpDale_relu_SGD_config_G_complete_optimized.yaml \
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
#!/bin/bash -l
#SBATCH -J eiann_gpu_mnist_ray
#SBATCH -o /ocean/projects/bio240068p/chennawa/logs/EIANN/eiann_gpu_mnist_ray.%j.o
#SBATCH -e /ocean/projects/bio240068p/chennawa/logs/EIANN/eiann_gpu_mnist_ray.%j.e
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --partition=GPU
#SBATCH --gres=gpu:v100-32:8
#SBATCH --mem=80G
#SBATCH --cpus-per-task=15
#SBATCH --time=02:00:00
#SBATCH -A bio240068p
#SBATCH --mail-user=yc1376@scarletmail.rutgers.edu
#SBATCH --mail-type=ALL

module purge
module load cuda/12.4.0

source /opt/packages/anaconda3-2024.10-1/etc/profile.d/conda.sh
conda activate eiann

cd ~/EIANN

# --- RAY CLUSTER LAUNCH ---
# Launches one Ray head on the first allocated node and one Ray worker on each
# remaining node, then runs the driver script against the resulting cluster.

# Derive per-node resources from the SBATCH request so the three places that
# previously hard-coded "15" and "8" cannot drift out of sync with the header.
cpus_per_node=${SLURM_CPUS_PER_TASK}
gpus_per_node=${SLURM_GPUS_ON_NODE}

# 1. Get the list of nodes and the head node IP
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

head_node=${nodes_array[0]}
# Get the IP address of the head node. NOTE: `hostname --ip-address` can
# return several space-separated addresses (e.g. an IPv6 plus an IPv4 entry);
# keep a single IPv4 address so "$ip_head" is a well-formed host:port.
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
if [[ "$head_node_ip" == *" "* ]]; then
    IFS=' ' read -ra ADDR <<< "$head_node_ip"
    if [[ ${#ADDR[0]} -gt 16 ]]; then
        # First entry looks like IPv6; fall back to the next address.
        head_node_ip=${ADDR[1]}
    else
        head_node_ip=${ADDR[0]}
    fi
    echo "Multiple IP addresses detected; using $head_node_ip"
fi

port=6379
ip_head=$head_node_ip:$port
export ip_head
echo "Head node IP: $head_node_ip"

# 2. Start the Ray Head Node
# Use --block (and "&") so the process stays alive in the background
echo "Starting Head node on $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" \
    ray start --head --node-ip-address="$head_node_ip" --port=$port \
    --num-cpus="$cpus_per_node" --num-gpus="$gpus_per_node" --block &

# 3. Start Ray Worker Nodes
# Loop over the rest of the nodes (starting from index 1)
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i=1; i<=worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting Worker node on $node_i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        ray start --address "$ip_head" \
        --num-cpus="$cpus_per_node" --num-gpus="$gpus_per_node" --block &
done

# 4. Wait for cluster to initialize
sleep 20

# 5. Export the address so ray.init() in python finds the cluster
export RAY_ADDRESS=$ip_head

# --- RUN SCRIPT ---

python EIANN/simulate/run_EIANN_mnist_ray.py \
    --network-config-file-name=20231129_EIANN_2_hidden_mnist_bpDale_relu_SGD_config_G_complete_optimized.yaml \
    --data-dir=/ocean/projects/bio250022p/$USER/data/EIANN \
    --num-seeds=32


# To submit:
# cd ~/EIANN/EIANN/simulate/jobscripts
# sbatch run_EIANN_gpu_multi_node_bridges_mnist_ray.sh

# See logs:
# cd /ocean/projects/bio240068p/$USER/logs/EIANN

EIANN/simulate/run_EIANN_mnist_ray.py

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,7 @@ def train_eiann(config):
9393
@click.option('--network-config-file-name', required=True, type=str, help="Network config file name")
9494
@click.option('--data-dir', default="../data/mnist", type=str, help="Directory for MNIST data")
9595
@click.option('--num-seeds', default=5, type=int, help="Number of different seeds to try")
96-
@click.option('--num-gpus', default=2, type=int, help="Number of GPUs to use")
97-
def main(network_config_file_name, data_dir, num_seeds, num_gpus):
96+
def main(network_config_file_name, data_dir, num_seeds):
9897

9998
overall_start_time = time.time()
10099

@@ -110,10 +109,8 @@ def main(network_config_file_name, data_dir, num_seeds, num_gpus):
110109
for i in range(num_seeds)
111110
]
112111

113-
# TODO: try across different GPU nodes, perhaps with MPI
114-
115112
tuner = tune.Tuner(
116-
tune.with_resources(train_eiann, resources={"cpu": 1, "gpu": 0.5}),
113+
tune.with_resources(train_eiann, resources={"cpu": 0, "gpu": 0.5}),
117114
param_space=tune.grid_search(param_space),
118115
run_config=RunConfig(name="eiann_mnist_parallel_ray")
119116
)
@@ -134,15 +131,48 @@ def main(network_config_file_name, data_dir, num_seeds, num_gpus):
134131

135132
# interact -p GPU-shared -N 1 --gres=gpu:v100-32:3 -t 01:00:00
136133

134+
135+
# ===== Single-node GPU runs =====
136+
137137
# bp Dale:
138-
# baseline: 1 cpu 0.5 gpu per seed (GPU-shared)
139-
# 492.76 sec
138+
# 36191138: 1 cpu 0.5 gpu per seed, request 12 cpus (GPU-shared)
139+
# 346.74 seconds
140+
141+
# 36191154: 1 cpu 0.5 gpu per seed, request 6 cpus (GPU-shared)
142+
# 382.83 seconds
143+
144+
# 36191200: 2 cpu 0.5 gpu per seed, request 12 cpus (GPU-shared)
145+
# 370.17 seconds
146+
147+
# 36191244: 1 cpu 0.25 gpu per seed, request 12 cpus (GPU-shared)
148+
# 397.13 seconds
149+
150+
# 36191248: 1 cpu 1 gpu per seed, request 12 cpus (GPU-shared)
151+
# 688.59 seconds -> did not request enough gpus -> retried with 36192557
152+
153+
# 36192288: 1 cpu 0.5 gpu per seed (12 seeds), request 12 cpus (GPU-shared)
154+
# 722.51 seconds -> sequential
155+
156+
# 36192294: 0 cpu 0.5 gpu per seed, request 12 cpus (GPU-shared)
157+
# 360.29 seconds
158+
159+
# 36192324: 1 cpu 0.2 gpu per seed, request 6 cpus (GPU-shared)
160+
# error
161+
162+
# 36192511: 0 cpu 0.5 gpu per seed, request 15 cpus (GPU-shared)
163+
# 356.43 seconds
164+
165+
# 36192512: 0 cpu 0.5 gpu per seed (12 seeds), request 15 cpus (GPU-shared)
166+
# 733.16 seconds
167+
168+
# 36192557: 1 cpu 1 gpu per seed, request 15 cpus 5 gpus (GPU)
169+
# error for some
140170

141-
# 36148795 - ray, 4 cpu 0.5 gpu per seed (GPU-shared)
142-
# 474.12 sec
171+
# ===== Multi-node GPU runs =====
143172

144-
# 36148818 - ray, 1 cpu 0.25 gpu per seed (GPU-shared)
145-
# 641.27 sec
173+
# bp Dale:
174+
# 36193643: 2 GPU nodes, 0 cpu 0.5 gpu per seed (32 seeds), request 15 cpus 16 gpus (GPU), with ray head only
175+
# 770.49 seconds -> only used 8 gpus
146176

147-
# 36148859 - ray, 1 cpu 1 gpu per seed (GPU)
148-
# 382.98 sec
177+
# 36193950: 2 GPU nodes, 0 cpu 0.5 gpu per seed (32 seeds), request 8,8 cpus 8,7 gpus (GPU), with ray head+worker
178+
# 384.80 seconds
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import os
2+
import time
3+
import torch
4+
import random
5+
import numpy as np
6+
import ray
7+
import click
8+
9+
import EIANN.utils as ut
10+
11+
@ray.remote(num_gpus=0.5, num_cpus=1)
def run_seed(seed_idx, network_config_file_name, data_dir):
    """Train one EIANN network for a single seed as a Ray task.

    Each seed index is mapped to a distinct (network_seed, data_seed) pair so
    that runs are reproducible and independent across tasks.

    :param seed_idx: int; index of this run, used to derive both seeds
    :param network_config_file_name: str; YAML file name under EIANN/network_config/mnist
    :param data_dir: str; directory containing (or receiving) the MNIST data
    :return: dict with validation accuracy/loss, wall-clock run time, and the seeds used
    """
    network_seed = 66049 + seed_idx
    data_seed = 257 + seed_idx

    # Seed every RNG source up front; data-related generators get their own seed.
    ut.set_all_seeds(seed=network_seed)
    torch.cuda.manual_seed_all(network_seed)
    np.random.seed(data_seed)
    random.seed(data_seed)

    train_loader, val_loader, _test_loader, _ = ut.get_MNIST_dataloaders(
        data_dir=data_dir, data_seed=data_seed)

    # Resolve the config path relative to the repo root (two levels above this file).
    this_dir = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.dirname(os.path.dirname(this_dir))
    config_file_path = os.path.join(
        repo_root, "EIANN", "network_config", "mnist", network_config_file_name)

    # Ray sets CUDA_VISIBLE_DEVICES for the task, so "cuda" picks the assigned GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = ut.build_EIANN_from_config(config_file_path, network_seed=network_seed, device=device)

    train_start = time.time()
    net.train(
        train_loader,
        val_loader,
        epochs=1,
        samples_per_epoch=20_000,
        val_interval=(0, -1, 100),
        store_history=False
    )
    elapsed = time.time() - train_start

    # Report the metrics from the final validation step.
    summary = {
        "seed_idx": seed_idx,
        "val_accuracy": float(net.val_accuracy_history[-1]),
        "val_loss": float(net.val_loss_history[-1]),
        "run_time": elapsed,
        "network_seed": network_seed,
        "data_seed": data_seed,
    }
    return summary
50+
51+
@click.command()
@click.option('--network-config-file-name', required=True, type=str, help="Network config file name")
@click.option('--data-dir', default="../data/mnist", type=str, help="Directory for MNIST data")
@click.option('--num-seeds', default=5, type=int, help="Number of different seeds to try")
def main(network_config_file_name, data_dir, num_seeds):
    """Fan out `num_seeds` training runs as Ray tasks and summarize the results.

    Relies on ray.init() picking up an existing cluster (e.g. via RAY_ADDRESS)
    or starting a local one. Results are printed as each task completes.
    """
    t0 = time.time()

    ray.init()
    futures = [run_seed.remote(idx, network_config_file_name, data_dir)
               for idx in range(num_seeds)]

    # Collect results in completion order rather than submission order so
    # progress is visible as soon as any task finishes.
    results = []
    remaining = list(futures)
    while remaining:
        done, remaining = ray.wait(remaining, num_returns=1)
        out = ray.get(done[0])
        print("Finished seed", out["seed_idx"], "acc", out["val_accuracy"])
        results.append(out)

    print(f"Overall time for {num_seeds} seeds: {time.time() - t0:.2f} seconds")

    print("Summary:")
    for res in results:
        print(f"Seed {res['seed_idx']}: Val Acc = {res['val_accuracy']}, Val Loss = {res['val_loss']}, Time = {res['run_time']:.2f} seconds")


if __name__ == "__main__":
    main()
79+
80+
# bp Dale:
81+
# 36192337: 1 cpu 0.2 gpu per seed, request 6 cpus (GPU-shared)
82+
# 436.63 seconds
83+
84+
# 36192343: 1 cpu 0.5 gpu per seed, request 6 cpus (GPU-shared)
85+
# 389.19 seconds

0 commit comments

Comments
 (0)