forked from Misaka13514/setup-distributed-nix-builds
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaction.yml
More file actions
276 lines (246 loc) · 9.78 KB
/
action.yml
File metadata and controls
276 lines (246 loc) · 9.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
# Identity and branding of the action.
name: 'Nix Distributed Builder Setup'
description: >-
  Set up Nix, maximize runner disk space, and connect to Tailscale for
  distributed building
branding:
  icon: 'server'
  color: 'blue'
# Inputs consumed by the steps of this action.
inputs:
  # Credential handed to `tailscale up --authkey`.
  tailscale_authkey:
    description: 'Tailscale OAuth client secret or Auth Key'
    required: true
  # Node name handed to `tailscale up --hostname`.
  tailscale_hostname:
    description: 'Hostname to register with Tailscale'
    required: true
  # Value handed to `tailscale up --advertise-tags`.
  tailscale_tags:
    description: 'Tags to advertise to Tailscale (e.g. tag:nix-ci-builder)'
    required: true
  # Selects which role-specific steps run ("builder" or "coordinator").
  role:
    description: "Role of the current job: \"builder\" or \"coordinator\""
    required: true
    default: 'builder'
  # Only read by the coordinator steps.
  builders:
    description: 'Space separated list of full builder hostnames to wait for. Required if role is coordinator'
    required: false
    default: ''
  # Kept as a string; the scripts compare it numerically in seconds.
  builder_timeout:
    description: 'Maximum time (in seconds) the builder should wait before self-terminating'
    required: false
    default: '300'
  # Forwarded unchanged to the Nix installer step.
  extra_nix_config:
    description: 'Extra Nix configuration to append to /etc/nix/nix.conf'
    required: false
    default: ''
# Declares a composite action; the entries listed under `steps` are its steps.
runs:
using: "composite"
steps:
# Linux runners only: frees disk space via nothing-but-nix before Nix is installed.
- name: Maximize disk space (Nothing but Nix)
  uses: wimpysworld/nothing-but-nix@687c797a730352432950c707ab493fcc951818d7
  if: ${{ runner.os == 'Linux' }}
  with:
    hatchet-protocol: 'cleave'
# Installs Nix, forwarding the caller's extra nix.conf lines to the installer.
- name: Install Nix
  uses: cachix/install-nix-action@1ca7d21a94afc7c957383a2d217460d980de4934 # v31.10.1
  with:
    extra_nix_config: "${{ inputs.extra_nix_config }}"
# Enables Magic Nix Cache with FlakeHub turned off and an empty diagnostic endpoint.
- name: Setup Magic Nix Cache
  uses: DeterminateSystems/magic-nix-cache-action@565684385bcd71bad329742eefe8d12f2e765b39 # v13
  with:
    diagnostic-endpoint: ''
    use-flakehub: false
# Installs Tailscale from nixpkgs, starts the daemon in the background and
# brings the node up with Tailscale SSH enabled.
- name: Install and Start Tailscale from nixpkgs
  shell: bash
  env:
    AUTHKEY: "${{ inputs.tailscale_authkey }}"
    HOSTNAME: "${{ inputs.tailscale_hostname }}"
    TAGS: "${{ inputs.tailscale_tags }}"
    # Exported to tailscaled below so that SSH sessions get a PATH containing
    # the Nix profile directories.
    TAILSCALE_SSH_DEFAULT_PATH: "/var/root/.nix-profile/bin:/nix/var/nix/profiles/default/bin:/run/current-system/sw/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
  run: |
    nix profile add nixpkgs#tailscale

    # The daemon runs as root in the background; its output goes to
    # /tmp/tailscaled.log.
    sudo env TAILSCALE_SSH_DEFAULT_PATH="${TAILSCALE_SSH_DEFAULT_PATH}" $(which tailscaled) \
      > /tmp/tailscaled.log 2>&1 &

    # Fixed pause before the client talks to the daemon.
    sleep 5

    # Join the tailnet as an ephemeral, pre-authorized, tagged node.
    sudo $(which tailscale) up \
      --hostname="${HOSTNAME}" \
      --advertise-tags="${TAGS}" \
      --authkey="${AUTHKEY}?ephemeral=true&preauthorized=true" \
      --accept-routes \
      --ssh
# Builder only: writes /tmp/teardown.sh, which logs this node out of Tailscale.
- name: Builder - Setup Teardown Script
  shell: bash
  if: inputs.role == 'builder'
  run: |
    # Quoted delimiter: the script body is written verbatim, so $(which ...)
    # is resolved when the teardown script runs, not now.
    cat > /tmp/teardown.sh << 'TEARDOWN'
    #!/bin/bash
    echo "Builder: Logging out of Tailscale to clear ephemeral node..."
    sudo $(which tailscale) logout || true
    TEARDOWN
    chmod +x /tmp/teardown.sh
# Builder only: passes /tmp/teardown.sh to run-and-post-run as its `post`
# script (presumably executed in the job's post phase - confirm in that action's docs).
- name: Builder - Register Teardown
  uses: gacts/run-and-post-run@81b6ce503cde93862cec047c54652e45c5dca991 # v1.4.3
  if: inputs.role == 'builder'
  with:
    post: '/tmp/teardown.sh'
# Builder only: keeps the job alive so the machine stays reachable, until
# /tmp/builder_done appears (exit 0) or the timeout elapses (exit 1).
# Once /tmp/builder_claimed appears the timeout is restarted at 6 hours.
- name: Builder - Idle Wait
  if: ${{ inputs.role == 'builder' }}
  shell: bash
  env:
    BUILDER_TIMEOUT: ${{ inputs.builder_timeout }}
    # Passed through the environment instead of being expanded inside the
    # script text, so the hostname can never be interpreted as shell code.
    BUILDER_HOSTNAME: ${{ inputs.tailscale_hostname }}
  run: |
    # A non-numeric timeout would make the `-ge` test below fail with an error
    # on every iteration, so the idle timeout would never fire. Fail fast.
    case "${BUILDER_TIMEOUT}" in
      '' | *[!0-9]*)
        echo "ERROR: builder_timeout must be a whole number of seconds (got '${BUILDER_TIMEOUT}')."
        exit 1
        ;;
    esac

    echo "Builder is online!"
    echo "To terminate this builder manually, run:"
    echo "$ ssh root@${BUILDER_HOSTNAME} \"touch /tmp/builder_done\""

    # Background reporter: prints CPU, memory and /nix disk usage every 60 s.
    # Each probe is best-effort (`|| true`) so that one failing pipeline cannot
    # end the reporter when the shell runs with errexit/pipefail (as GitHub's
    # `shell: bash` normally does).
    (
      while true; do
        echo "--- System Stats ---"
        if [ "$(uname)" = "Linux" ]; then
          top -bn1 | grep "Cpu(s)" | awk '{print "CPU Usage: " $2 + $4 "%"}' || true
          free -m | grep Mem | awk '{print "Memory Usage: " $3"/"$2 "MB"}' || true
        else
          top -l 1 | grep -E "^(CPU usage|PhysMem):" || true
        fi
        df -h /nix | tail -1 | awk '{print "Disk Space (/nix): " $3"/"$2 " ("$5")"}' || true
        echo "--------------------"
        sleep 60
      done
    ) &
    STATS_PID=$!
    # Stop the reporter on every exit path; the exit status is left untouched.
    trap 'kill "$STATS_PID" 2>/dev/null || true' EXIT

    echo "Waiting for jobs..."
    START_TIME=$(date +%s)
    IS_CLAIMED=false
    while true; do
      CURRENT_TIME=$(date +%s)
      ELAPSED=$((CURRENT_TIME - START_TIME))

      # First sighting of the claim marker: restart the clock with a 6 h budget.
      if [ "$IS_CLAIMED" = false ] && [ -f "/tmp/builder_claimed" ]; then
        echo "Builder has been claimed! Extending timeout to 6 hours."
        IS_CLAIMED=true
        BUILDER_TIMEOUT=21600
        START_TIME=$CURRENT_TIME
      fi

      if [ "$ELAPSED" -ge "$BUILDER_TIMEOUT" ]; then
        echo "Timeout reached. Shutting down."
        exit 1
      fi

      if [ -f "/tmp/builder_done" ]; then
        echo "Received termination successful signal! Shutting down gracefully."
        exit 0
      fi

      sleep 10
    done
# Coordinator only: generates two scripts.
#   /usr/local/bin/stop-nix-builders - touches /tmp/builder_done on every
#     listed builder over SSH (up to 5 attempts each), at most once per job.
#   /tmp/teardown.sh - runs stop-nix-builders, then logs out of Tailscale.
- name: Coordinator - Setup Teardown Script
  if: ${{ inputs.role == 'coordinator' }}
  shell: bash
  env:
    INPUT_BUILDERS: ${{ inputs.builders }}
  run: |
    {
      echo '#!/bin/bash'
      echo "if [ -f /tmp/.builders_stopped ]; then echo 'Builders already stopped.'; exit 0; fi"
      # %q shell-quotes the list, so quotes, `$(...)` or backticks in the input
      # end up as literal data in the generated script instead of code.
      printf 'BUILDERS=%q\n' "${INPUT_BUILDERS}"
      # Quoted delimiter: everything below is written verbatim and only
      # evaluated when stop-nix-builders runs.
      cat << 'EOF'
    echo "Coordinator: Tearing down builders gracefully..."
    for hostname in $BUILDERS; do
      echo "Initiating teardown for $hostname..."
      SUCCESS=false
      for i in {1..5}; do
        if sudo ssh -o ConnectTimeout=10 "root@$hostname" "touch /tmp/builder_done" 2>/dev/null; then
          echo "Successfully sent teardown signal to $hostname."
          SUCCESS=true
          break
        else
          echo "Attempt $i failed to signal $hostname. Retrying in 3 seconds..."
          sleep 3
        fi
      done
      if [ "$SUCCESS" != true ]; then
        echo "WARNING: Could not signal $hostname after 5 attempts; it will keep running until its own timeout expires."
      fi
    done
    touch /tmp/.builders_stopped
    EOF
    } | sudo tee /usr/local/bin/stop-nix-builders
    sudo chmod +x /usr/local/bin/stop-nix-builders

    cat << 'EOF' > /tmp/teardown.sh
    #!/bin/bash
    /usr/local/bin/stop-nix-builders
    echo "Coordinator: Logging out of Tailscale..."
    sudo $(which tailscale) logout || true
    EOF
    chmod +x /tmp/teardown.sh
# Coordinator only: passes /tmp/teardown.sh to run-and-post-run as its `post`
# script (presumably executed in the job's post phase - confirm in that action's docs).
- name: Coordinator - Register Teardown
  uses: gacts/run-and-post-run@81b6ce503cde93862cec047c54652e45c5dca991 # v1.4.3
  if: inputs.role == 'coordinator'
  with:
    post: '/tmp/teardown.sh'
# Coordinator only:
#   1. waits until every listed builder shows up in `tailscale status` and
#      answers ping, pinning its IP in /etc/hosts;
#   2. adds per-host SSH client settings, queries each builder's Nix system and
#      core count, marks it as claimed and registers it in /etc/nix/machines;
#   3. points nix.conf at that machines file and restarts the Nix daemon.
- name: Coordinator - Wait for builders and Setup
  if: ${{ inputs.role == 'coordinator' }}
  env:
    INPUT_BUILDERS: ${{ inputs.builders }}
    BUILDER_TIMEOUT: ${{ inputs.builder_timeout }}
  shell: bash
  run: |
    if [ -z "${INPUT_BUILDERS//[[:space:]]/}" ]; then
      echo "::warning::No builders were listed for the coordinator; no remote builders will be registered."
    fi

    # `ping -W` takes seconds on Linux (iputils) but milliseconds on macOS.
    if [ "$(uname)" = "Darwin" ]; then
      PING_WAIT=1000
    else
      PING_WAIT=1
    fi

    # 1. Wait for all builders
    for hostname in $INPUT_BUILDERS; do
      echo "Waiting for $hostname to appear on Tailscale network..."
      # Allow one minute beyond the builders' own idle timeout.
      TIMEOUT=$((BUILDER_TIMEOUT + 60))
      TIMER=0
      while true; do
        # Exact match on the name column: the hostname is not treated as a
        # regex, and "builder-1" cannot match "builder-1-2". Only rows whose
        # first column looks like an IPv4 address are accepted.
        TS_IP=$(sudo "$(which tailscale)" status \
          | awk -v host="$hostname" \
              '$2 == host && $1 ~ /^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$/ { print $1; exit }' \
          || true)
        if [ -n "$TS_IP" ]; then
          echo "Resolved IP $TS_IP from Tailscale status for $hostname!"
          echo "$TS_IP $hostname" | sudo tee -a /etc/hosts
          break
        fi
        if [ "$TIMER" -ge "$TIMEOUT" ]; then
          echo "Timeout waiting for $hostname."
          exit 1
        fi
        sleep 5
        TIMER=$((TIMER+5))
        echo "Waiting for $hostname... ($TIMER/$TIMEOUT s)"
      done

      # The ping phase shares TIMER with the discovery phase above, so both
      # together are bounded by TIMEOUT.
      echo "Checking connectivity to $hostname..."
      until ping -c 1 -W "$PING_WAIT" "$hostname" >/dev/null 2>&1; do
        if [ "$TIMER" -ge "$TIMEOUT" ]; then
          echo "Timeout checking connectivity to $hostname."
          exit 1
        fi
        sleep 2
        TIMER=$((TIMER+2))
      done
      echo "$hostname is online and reachable!"
    done

    # 2. Setup Remote Builder in Nix
    # Unquoted delimiter on purpose: $hostname is expanded into each Host entry.
    for hostname in $INPUT_BUILDERS; do
      cat << EOF | sudo tee -a /etc/ssh/ssh_config
    Host $hostname
      StrictHostKeyChecking no
      UserKnownHostsFile /dev/null
      BatchMode yes
      LogLevel ERROR
      Compression no
      Ciphers aes128-gcm@openssh.com
    EOF
    done

    for hostname in $INPUT_BUILDERS; do
      echo "Testing Nix connection to $hostname..."
      # `|| true` keeps a failed SSH/Nix query from aborting the script under
      # errexit before the explicit error message below can be printed.
      SYS_INFO=$(sudo ssh "root@$hostname" "nix eval --raw --impure --expr 'builtins.currentSystem' && echo -n ' ' && (nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)" || true)
      SYSTEM=$(echo "$SYS_INFO" | awk '{print $1}')
      CORES=$(echo "$SYS_INFO" | awk '{print $2}')
      if [ -z "$SYSTEM" ]; then
        echo "ERROR: Failed to query Nix system on $hostname!"
        exit 1
      fi
      # Guard the arithmetic below; 4 matches the remote command's own fallback.
      case "$CORES" in
        '' | *[!0-9]*)
          echo "WARNING: Could not determine core count on $hostname; assuming 4."
          CORES=4
          ;;
      esac

      echo "Claiming builder $hostname ($SYSTEM with $CORES cores)..."
      # Best-effort: the marker only extends the builder's idle timeout.
      sudo ssh "root@$hostname" "touch /tmp/builder_claimed" || true

      if [[ "$SYSTEM" == *"linux"* ]]; then
        FEATURES="kvm,nixos-test,big-parallel,benchmark"
      else
        FEATURES="big-parallel,benchmark"
      fi

      # Advertise twice the reported core count as the max-jobs field.
      OPT_CORES=$((CORES * 2))
      echo "ssh-ng://root@$hostname?compress=false $SYSTEM - $OPT_CORES 1 $FEATURES" | sudo tee -a /etc/nix/machines
    done

    echo "builders = @/etc/nix/machines" | sudo tee -a /etc/nix/nix.conf
    echo "builders-use-substitutes = true" | sudo tee -a /etc/nix/nix.conf

    # Restart the Nix daemon so it picks up the new configuration.
    if [ "$(uname)" = "Darwin" ]; then
      sudo launchctl stop org.nixos.nix-daemon || true
      sudo launchctl start org.nixos.nix-daemon || true
    else
      sudo pkill nix-daemon || true
      sudo systemctl restart nix-daemon.service || true
    fi