-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdpython.py
More file actions
146 lines (120 loc) · 3.78 KB
/
Copy pathdpython.py
File metadata and controls
146 lines (120 loc) · 3.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# Change the path below to your own Python interpreter (for example
# D:\AIMOD\AIMOD\venv\Scripts\python.exe) instead of the one shown here.
# This interpreter runs the training script and `accelerate launch`, both
# locally and in the command line sent to the remote worker over SSH.
PYTHON = r"D:\AIMOD\AIMOD\venv\Scripts\python.exe"
import sys
import os
import json
import subprocess
import shutil
import time
# ================= CONFIG =================
# Change the file location below to your cluster.json location.
# The file is parsed as JSON and must provide the keys "master_ip",
# "worker_ip", "port" and "ssh_user".
CONFIG = r"D:\distributed_env\cluster.json"
# ==========================================
# ---------- SAFETY CHECK ----------
# Everything after the launcher's own name: the training script first,
# then the arguments that are forwarded to it untouched.
_cli_args = sys.argv[1:]
if not _cli_args:
    print("❌ ERROR: No training script provided")
    print("✅ Usage: dpython train.py [args]")
    sys.exit(1)

# Resolve the script to an absolute path so later launches do not depend on
# the current working directory.
SCRIPT, ARGS = os.path.abspath(_cli_args[0]), _cli_args[1:]

if not os.path.exists(SCRIPT):
    print(f"❌ ERROR: Script not found: {SCRIPT}")
    sys.exit(1)
# ---------- LOAD CONFIG ----------
# Read the cluster description. Any problem with the file is reported in the
# same "print an error and exit 1" style as the argument checks instead of
# surfacing as a raw traceback.
try:
    # "utf-8-sig" reads plain UTF-8 and also tolerates a leading BOM, which
    # some Windows editors add and which json.load would otherwise reject.
    with open(CONFIG, encoding="utf-8-sig") as f:
        cfg = json.load(f)
    MASTER_IP = cfg["master_ip"]
    WORKER_IP = cfg["worker_ip"]
    PORT = str(cfg["port"])  # passed on a command line, so it must be a string
    SSH_USER = cfg["ssh_user"]
except OSError as err:
    print(f"❌ ERROR: Cannot read cluster config {CONFIG}: {err}")
    sys.exit(1)
except ValueError as err:
    # Covers json.JSONDecodeError and undecodable bytes.
    print(f"❌ ERROR: Invalid JSON in cluster config {CONFIG}: {err}")
    sys.exit(1)
except (KeyError, TypeError) as err:
    # KeyError: a required key is absent; TypeError: top level is not an object.
    print(f"❌ ERROR: Cluster config {CONFIG} is missing a required setting: {err}")
    sys.exit(1)
# ---------- GPU HELPERS ----------
def get_gpu_stats(cmd):
    """Run a GPU query command and parse its CSV output.

    Args:
        cmd: Argument list handed to ``subprocess.check_output``. It is
            expected to print one line per GPU in the form
            ``name, utilization, memory_used, memory_total``.

    Returns:
        A list of ``(name, util, mem_used, mem_total)`` tuples of strings,
        one per parsable output line. An empty list is returned when the
        command cannot be started, exits non-zero, exceeds the 5 second
        timeout, or prints no parsable line.
    """
    try:
        out = subprocess.check_output(cmd, stderr=subprocess.DEVNULL, timeout=5)
    except (subprocess.SubprocessError, OSError, ValueError):
        # Best-effort probe: a missing binary, a failing command or a timeout
        # simply means "no GPU information available".
        return []

    stats = []
    for line in out.decode(errors="replace").splitlines():
        # Split from the right so a device name that itself contains ", "
        # stays intact in the first field.
        fields = line.strip().rsplit(", ", 3)
        if len(fields) != 4:
            # Skip blank or unrelated lines instead of discarding every GPU.
            continue
        stats.append(tuple(field.strip() for field in fields))
    return stats
def get_local_gpu_stats():
    """Query the GPUs of this machine through a local ``nvidia-smi`` call.

    Returns:
        Whatever ``get_gpu_stats`` returns for the query: a list of
        ``(name, util, mem_used, mem_total)`` tuples, empty on failure.
    """
    local_query = [
        "nvidia-smi",
        "--query-gpu=name,utilization.gpu,memory.used,memory.total",
        "--format=csv,noheader,nounits",
    ]
    return get_gpu_stats(local_query)
def get_remote_gpu_stats():
    """Query the worker machine's GPUs by running ``nvidia-smi`` over SSH.

    The SSH target is built from the ``SSH_USER`` and ``WORKER_IP`` settings.

    Returns:
        Whatever ``get_gpu_stats`` returns for the query: a list of
        ``(name, util, mem_used, mem_total)`` tuples, empty on failure.
    """
    ssh_target = f"{SSH_USER}@{WORKER_IP}"
    # The whole query is passed as one string; SSH runs it on the remote side.
    remote_query = (
        "nvidia-smi --query-gpu=name,utilization.gpu,memory.used,memory.total "
        "--format=csv,noheader,nounits"
    )
    return get_gpu_stats(["ssh", ssh_target, remote_query])
def remote_available():
    """Report whether ``nvidia-smi`` can be run on the worker over SSH.

    Returns:
        True when the SSH command finishes successfully within 5 seconds,
        False on any failure (connection error, non-zero exit, timeout, ...).
    """
    try:
        ssh_target = f"{SSH_USER}@{WORKER_IP}"
        # Only the exit status matters; the command output is discarded.
        subprocess.check_output(
            ["ssh", ssh_target, "nvidia-smi"],
            stderr=subprocess.DEVNULL,
            timeout=5,
        )
    except Exception:
        reachable = False
    else:
        reachable = True
    return reachable
# ---------- DISTRIBUTED CHILD PROCESS ----------
# When LOCAL_RANK is already set, run the training script directly and skip
# GPU probing and launching.
# NOTE(review): LOCAL_RANK is presumably set by the distributed launcher for
# each process it spawns - confirm.
if "LOCAL_RANK" in os.environ:
    child = subprocess.run([PYTHON, SCRIPT] + ARGS)
    # Propagate the script's exit status so a failed run is not reported as
    # success to whatever started this process.
    sys.exit(child.returncode)
# ---------- GPU STATUS ----------
def _print_gpu_table(title, gpus):
    """Print *title* followed by one indexed line per GPU stats tuple."""
    print(title)
    for index, (name, util, mem_used, mem_total) in enumerate(gpus):
        print(f" [{index}] {name} | Util {util}% | VRAM {mem_used}/{mem_total} MB")


print("\n================ GPU STATUS ================")

# A local NVIDIA GPU is mandatory; without one the launcher stops here.
local_stats = get_local_gpu_stats()
if not local_stats:
    print("❌ No local NVIDIA GPU found")
    sys.exit(1)
_print_gpu_table("🟢 LOCAL GPU:", local_stats)

# The remote machine is only used when it is reachable AND its stats query
# returns at least one GPU.
remote_stats = get_remote_gpu_stats() if remote_available() else []
use_remote = bool(remote_stats)

if use_remote:
    _print_gpu_table("\n🟣 REMOTE GPU:", remote_stats)
else:
    print("\n⚠️ REMOTE GPU NOT AVAILABLE")
    print("➡️ Falling back to LOCAL GPU ONLY")
print("============================================\n")
# ---------- SINGLE GPU FALLBACK ----------
# Without a usable remote GPU the script is run as a plain local process.
if not use_remote:
    print("[DPYTHON] Running single-GPU training\n")
    local_run = subprocess.run([PYTHON, SCRIPT] + ARGS)
    # Exit with the training script's own status instead of always 0, so
    # callers (shells, CI, schedulers) can detect a failed run.
    sys.exit(local_run.returncode)
# ---------- MULTI-GPU LAUNCH ----------
# Start `accelerate launch` twice for a two-machine run: rank 1 on the worker
# (through SSH) and rank 0 locally, both pointing at the same master IP/port.
print("[DPYTHON] Launching remote worker...")

# NOTE(review): this command is a plain space-joined string, so a script path
# or argument containing spaces or shell metacharacters is not quoted for the
# remote shell. The correct quoting depends on the worker's shell - confirm
# before passing such arguments.
worker_cmd = (
    f"{PYTHON} -m accelerate launch "
    f"--num_machines 2 "
    f"--machine_rank 1 "
    f"--main_process_ip {MASTER_IP} "
    f"--main_process_port {PORT} "
    f"{SCRIPT} {' '.join(ARGS)}"
)

# Keep the handle so the SSH client can be reaped once the master is done.
worker = subprocess.Popen([
    "ssh",
    f"{SSH_USER}@{WORKER_IP}",
    worker_cmd,
])

# Short pause between starting the worker and starting the master.
time.sleep(2)

print("[DPYTHON] Launching local master...\n")

try:
    master = subprocess.run([
        PYTHON, "-m", "accelerate", "launch",
        "--num_machines", "2",
        "--machine_rank", "0",
        "--main_process_ip", MASTER_IP,
        "--main_process_port", PORT,
        SCRIPT,
        *ARGS,
    ])
finally:
    # Do not leave the SSH client behind: give it a moment to finish on its
    # own, then terminate it if it is still running.
    if worker.poll() is None:
        try:
            worker.wait(timeout=30)
        except subprocess.TimeoutExpired:
            worker.terminate()
            worker.wait()

# Report the master's result instead of an unconditional success.
sys.exit(master.returncode)