-
Notifications
You must be signed in to change notification settings - Fork 28
Expand file tree
/
Copy pathspin-node.sh
More file actions
executable file
·952 lines (841 loc) · 37.4 KB
/
spin-node.sh
File metadata and controls
executable file
·952 lines (841 loc) · 37.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
#!/bin/bash
# set -e
# Preamble: record the invocation directory and the script's own directory,
# keep a copy of the raw arguments, then load options from parse-env.sh.

# Directory the script was invoked from.
currentDir=$(pwd)

# Directory holding this script. "$0" is quoted so a path containing
# whitespace is passed to dirname as a single argument. A bare "." is
# replaced by the invocation directory.
scriptDir=$(dirname "$0")
if [ "$scriptDir" == "." ]; then
  scriptDir="$currentDir"
fi

# Save original args before parse-env.sh shifts them
_original_args="$*"

# 0. parse env and args
source "$(dirname "$0")/parse-env.sh"
# Helper function to check if core dumps should be enabled for a node.
# Accepts: "all", exact node names (zeam_0), or client types (zeam), supplied
# as a comma-separated list in $coreDumps. Whitespace around list entries is
# ignored, so "zeam_0, ream" behaves like "zeam_0,ream".
# Globals:   coreDumps (read)
# Arguments: $1 - node name, e.g. "zeam_0"
# Returns:   0 when core dumps apply to the node, 1 otherwise
should_enable_core_dumps() {
  local node_name="$1"
  local client_type="${node_name%%_*}" # Extract client type (e.g., "zeam" from "zeam_0")
  # Locals keep the helper from clobbering same-named variables in the caller.
  local -a dump_targets
  local target

  [ -z "$coreDumps" ] && return 1
  [ "$coreDumps" = "all" ] && return 0

  IFS=',' read -r -a dump_targets <<< "$coreDumps"
  for target in "${dump_targets[@]}"; do
    # Strip leading and trailing whitespace from the list entry.
    target="${target#"${target%%[![:space:]]*}"}"
    target="${target%"${target##*[![:space:]]}"}"
    # Exact node name match or client type match
    if [ "$target" = "$node_name" ] || [ "$target" = "$client_type" ]; then
      return 0
    fi
  done
  return 1
}
# yq is a hard requirement: the script uses it to read the validator config
# (deployment mode, node names). Bail out early with install hints if absent.
command -v yq > /dev/null 2>&1 || {
  echo "Error: yq is required but not installed. Please install yq first."
  echo "On macOS: brew install yq"
  echo "On Linux: https://github.com/mikefarah/yq#install"
  exit 1
}
# Resolve the initial validator config path: an unset value or the
# "genesis_bootnode" sentinel selects the file inside $configDir, anything
# else is taken as an explicit path.
case "$validatorConfig" in
  "" | genesis_bootnode)
    validator_config_file="$configDir/validator-config.yaml"
    ;;
  *)
    validator_config_file="$validatorConfig"
    ;;
esac
# Resolve deployment_mode. Precedence:
#   1. command-line value ($deploymentMode)
#   2. .deployment_mode in the validator config file ("local" when the key is absent)
#   3. "local" when the config file does not exist
if [ -n "$deploymentMode" ]; then
  deployment_mode="$deploymentMode"
  echo "Using deployment mode from command line: $deployment_mode"
elif [ -f "$validator_config_file" ]; then
  deployment_mode=$(yq eval '.deployment_mode // "local"' "$validator_config_file")
  echo "Using deployment mode from config file: $deployment_mode"
else
  deployment_mode="local"
  echo "Using default deployment mode: $deployment_mode"
fi
# Ansible deployments without an explicit validator config use the
# ansible-devnet tree next to this script: configDir, dataDir and the validator
# config path are all redirected there.
# This must happen BEFORE set-up.sh so genesis generation uses the correct directory
if [ "$deployment_mode" == "ansible" ]; then
  if [ -z "$validatorConfig" ] || [ "$validatorConfig" == "genesis_bootnode" ]; then
    configDir="$scriptDir/ansible-devnet/genesis"
    dataDir="$scriptDir/ansible-devnet/data"
    validator_config_file="$configDir/validator-config.yaml"
    echo "Using Ansible deployment: configDir=$configDir, validator config=$validator_config_file"
  fi
fi
# Set up logging if --logs flag is enabled
# When enabled this block:
#   - creates $scriptDir/tmp and appends a START line to tmp/devnet.log
#   - registers an EXIT trap that appends the matching END line with the run duration
#   - redirects stdout and stderr of the rest of the script through tee into a
#     timestamped run log
#   - copies the validator config in use next to the run log
if [ "$enableLogs" == "true" ]; then
_log_dir="$scriptDir/tmp"
mkdir -p "$_log_dir"
# Epoch seconds at start; the EXIT trap subtracts this to compute the duration.
_log_start=$(date -u +%s)
# UTC timestamp (dd-mm-YYYY-HH-MM) shared by the run log and the config copy names.
_ts=$(date -u '+%d-%m-%Y-%H-%M')
# File name prefixes depend on the deployment mode.
if [ "$deployment_mode" == "ansible" ]; then
_log_prefix="ansible-run"
_config_prefix="ansible"
else
_log_prefix="local-run"
_config_prefix="local"
fi
_log_file="$_log_dir/${_log_prefix}-${_ts}.log"
echo "$(date -u '+%Y-%m-%d %H:%M:%S') START spin-node.sh $_original_args" >> "$_log_dir/devnet.log"
# Quoting note: $_log_file and $_log_dir sit in double-quoted segments, so they
# are expanded now, when the trap is installed. The date calls and the duration
# arithmetic sit in single-quoted segments, so they are evaluated when the trap fires.
trap 'echo "$(date -u '\''+%Y-%m-%d %H:%M:%S'\'') END spin-node.sh ($(( $(date -u +%s) - _log_start ))s) -> '"$_log_file"'" >> "'"$_log_dir"'/devnet.log"' EXIT
# From here on everything written to stdout or stderr is also appended to the run log.
exec > >(tee -a "$_log_file") 2>&1
echo "Logging to $_log_file"
# Copy validator config with timestamped name matching the run log
# (the name gains a "-replace" marker when --replace-with is in effect)
if [ -n "$replaceWith" ]; then
_config_copy="$_log_dir/${_config_prefix}-${networkName}-validator-config-replace-${_ts}.yaml"
else
_config_copy="$_log_dir/${_config_prefix}-${networkName}-validator-config-${_ts}.yaml"
fi
# NOTE(review): the cp result is not checked; the message below is printed even if the copy fails.
cp "$validator_config_file" "$_config_copy"
echo "Validator config copied to $_config_copy"
fi
# If --subnets N is specified, expand the validator config template into a new
# file with N nodes per client (same IP, unique incremented ports and keys).
# This must run after configDir/validator_config_file are resolved so the
# generated file lands in the correct genesis directory.

# Returns 0 when $1 is a plain decimal integer in the supported range 1..5.
_is_valid_subnet_count() {
  [[ "$1" =~ ^[0-9]+$ ]] || return 1
  # stderr is silenced because `[` errors out on values too large for a shell
  # integer; that error status is treated as "invalid".
  [ "$1" -ge 1 ] 2> /dev/null && [ "$1" -le 5 ] 2> /dev/null
}

# Any non-empty value is validated. Previously non-numeric, zero or negative
# values failed the outer numeric guard and were silently ignored, so the
# error below could only fire for values above 5.
if [ -n "$subnets" ]; then
  if ! _is_valid_subnet_count "$subnets"; then
    echo "Error: --subnets requires an integer between 1 and 5, got: $subnets"
    exit 1
  fi
  if ! command -v python3 &> /dev/null; then
    echo "Error: python3 is required to generate the subnet config."
    exit 1
  fi

  expanded_config="${configDir}/validator-config-subnets-${subnets}.yaml"
  [ "$dryRun" == "true" ] && echo "[DRY RUN] Generating subnet config preview (no deployment will occur)"
  echo "Generating subnet config ($subnets subnet(s) per client) → $expanded_config"
  if ! python3 "$scriptDir/generate-subnet-config.py" \
    "$validator_config_file" "$subnets" "$expanded_config"; then
    echo "❌ Failed to generate subnet config."
    exit 1
  fi

  # All later steps read the expanded file instead of the template.
  validator_config_file="$expanded_config"
  echo "Using expanded config: $validator_config_file"
fi
# Handle --prepare mode: verify and install required software on all remote servers.
# Must run after deployment_mode is resolved but before genesis setup.
# The block always terminates the script: exit 0 on success, exit 1 on any error.
if [ "$prepareMode" == "true" ]; then
  if [ "$deployment_mode" != "ansible" ]; then
    echo "Error: --prepare can only be used in ansible mode."
    echo "Set deployment_mode: ansible in your validator-config.yaml or pass --deploymentMode ansible"
    exit 1
  fi

  # Reject flags that have no meaning in prepare mode.
  # Each entry is "<variable name>:<flag shown to the user>"; a flag is
  # reported when its variable is non-empty.
  ignored_flags=()
  for _flag_spec in \
    "node:--node" \
    "cleanData:--cleanData" \
    "generateGenesis:--generateGenesis" \
    "FORCE_KEYGEN_FLAG:--forceKeyGen" \
    "stopNodes:--stop" \
    "restartClient:--restart-client" \
    "checkpointSyncUrl:--checkpoint-sync-url" \
    "dockerTag:--tag" \
    "aggregatorNode:--aggregator" \
    "coreDumps:--coreDumps" \
    "enableMetrics:--metrics" \
    "popupTerminal:--popupTerminal" \
    "dockerWithSudo:--dockerWithSudo" \
    "skipLeanpoint:--skip-leanpoint" \
    "skipNemo:--skip-nemo"; do
    _flag_var="${_flag_spec%%:*}"
    if [ -n "${!_flag_var}" ]; then
      ignored_flags+=("${_flag_spec#*:}")
    fi
  done
  # --validatorConfig only counts when it differs from the default sentinel.
  if [ -n "$validatorConfig" ] && [ "$validatorConfig" != "genesis_bootnode" ]; then
    ignored_flags+=("--validatorConfig")
  fi

  if [ ${#ignored_flags[@]} -gt 0 ]; then
    echo ""
    echo "╔══════════════════════════════════════════════════════════════╗"
    echo "║ ❌ ERROR ║"
    echo "╠══════════════════════════════════════════════════════════════╣"
    echo "║ --prepare does not accept the following flag(s): ║"
    for _rejected in "${ignored_flags[@]}"; do
      printf "║ %-60s║\n" "• $_rejected"
    done
    echo "╠══════════════════════════════════════════════════════════════╣"
    echo "║ Allowed flags with --prepare: ║"
    echo "║ • --sshKey / --private-key ║"
    echo "║ • --useRoot ║"
    echo "║ • --deploymentMode ansible ║"
    echo "╚══════════════════════════════════════════════════════════════╝"
    echo ""
    exit 1
  fi

  if ! command -v ansible-playbook &> /dev/null; then
    echo "Error: ansible-playbook is not installed."
    echo "Install Ansible: brew install ansible (macOS) or pip install ansible"
    exit 1
  fi

  if [ "$dryRun" == "true" ]; then
    echo "[DRY RUN] Would prepare remote servers — running Ansible with --check --diff"
  else
    echo "Preparing remote servers (verifying and installing required software)..."
  fi

  # Positional arguments for run-ansible.sh; slots that do not apply to the
  # "prepare" action are passed as empty strings to keep the positions aligned.
  _prepare_args=(
    "$configDir" "" "" "" "$validator_config_file" "$sshKeyFile" "$useRoot"
    "prepare" "" "" "" "$dryRun" "" "$networkName"
  )
  if ! "$scriptDir/run-ansible.sh" "${_prepare_args[@]}"; then
    echo "❌ Server preparation failed."
    exit 1
  fi

  _prepare_done_msg="✅ All remote servers are prepared."
  if [ "$dryRun" == "true" ]; then
    _prepare_done_msg="✅ Dry-run complete — no changes were made."
  fi
  echo "$_prepare_done_msg"
  exit 0
fi
#1. setup genesis params and run genesis generator
# In dry-run mode set-up.sh is not sourced; node_setup then falls back to
# "docker" unless it is already set, because the local spin loop reads it.
if [ "$dryRun" == "true" ]; then
  echo "[DRY RUN] Skipping genesis generation (set-up.sh would run here)"
  node_setup="${node_setup:-docker}" # ensure local-loop variable has a default
else
  # "$0" is quoted so a script path containing whitespace still resolves.
  source "$(dirname "$0")/set-up.sh"
  # ✅ Genesis generator implemented using PK's eth-beacon-genesis tool
  # Generates: validators.yaml, nodes.yaml, genesis.json, genesis.ssz, and .key files
fi
# 2. collect the nodes that the user has asked us to spin and perform setup
# The list of known node names comes from .validators[].name in the validator
# config; a missing file or an empty list aborts the script.
if [ ! -f "$validator_config_file" ]; then
  echo "Error: Validator config file not found at $validator_config_file"
  if [ "$deployment_mode" == "ansible" ]; then
    echo "Please create ansible-devnet/genesis/validator-config.yaml for Ansible deployments"
  fi
  nodes=()
  exit 1
fi

# Use yq to extract node names from validator config
nodes=($(yq eval '.validators[].name' "$validator_config_file"))
if [ ${#nodes[@]} -eq 0 ]; then
  echo "Error: No validators found in $validator_config_file"
  exit 1
fi

echo "Detected nodes: ${nodes[@]}"
# nodes=("zeam_0" "ream_0" "qlean_0")

# Filled in by the node-selection step further down.
spin_nodes=()
restart_with_checkpoint_sync=false
# Aggregator selection — one randomly chosen aggregator per subnet.
#
# Skipped entirely for --restart-client: restarting a single node must not
# disturb the existing isAggregator assignments for the rest of the network.
#
# Subnet membership is read from the explicit 'subnet:' field in the config,
# which generate-subnet-config.py writes when --subnets N is used.
# Nodes without a 'subnet' field (standard single-subnet configs) all
# default to subnet 0 regardless of their name suffix.
#
# When --aggregator is specified, that node is used as the aggregator for
# its own subnet; all other subnets still get a random selection.
# Helper: print the subnet index recorded for a node in the validator config.
# Globals:   validator_config_file (read)
# Arguments: $1 - node name as it appears in .validators[].name
# Outputs:   the node's .subnet value on stdout, or 0 when the field is absent
_node_subnet() {
  local node_name="$1"
  local subnet_query=".validators[] | select(.name == \"${node_name}\") | .subnet // 0"
  yq eval "$subnet_query" "$validator_config_file"
}
# Aggregator selection: make exactly one node per subnet carry
# isAggregator: true in the validator config.
# Skipped for --restart-client. In dry-run mode the selection is still computed
# (for the summary) but the config file is not modified and not verified.
if [ -n "$restartClient" ]; then
echo "Note: skipping aggregator selection — --restart-client retains existing isAggregator assignments."
# Empty summary: the banner printed after this block is suppressed.
_aggregator_summary=()
else
# If --aggregator was given, validate it exists before doing anything else.
if [ -n "$aggregatorNode" ]; then
aggregator_found=false
for available_node in "${nodes[@]}"; do
if [[ "$aggregatorNode" == "$available_node" ]]; then
aggregator_found=true
break
fi
done
if [[ "$aggregator_found" == false ]]; then
echo "Error: Specified aggregator '$aggregatorNode' not found in validator config"
echo "Available nodes: ${nodes[@]}"
exit 1
fi
fi
# Collect unique subnet indices from the 'subnet' field (0 when absent).
_subnet_indices=()
for _node in "${nodes[@]}"; do
_subnet_indices+=("$(_node_subnet "$_node")")
done
# sort -un: numeric sort with duplicates removed.
_unique_subnets=($(printf '%s\n' "${_subnet_indices[@]}" | sort -un))
echo "Detected ${#_unique_subnets[@]} subnet(s): ${_unique_subnets[*]}"
# Snapshot which nodes already have isAggregator: true before we reset anything.
# This lets us honour manual edits in the YAML when no --aggregator flag was passed.
# Uses dynamic variable names (_preset_agg_<subnet>) for bash 3.2 compatibility
# (bash 3.2 ships with macOS and does not support declare -A).
for _node in "${nodes[@]}"; do
_is_agg=$(yq eval ".validators[] | select(.name == \"$_node\") | .isAggregator" "$validator_config_file")
if [[ "$_is_agg" == "true" ]]; then
_sn="$(_node_subnet "$_node")"
_varname="_preset_agg_${_sn}"
# Keep the first preset aggregator found per subnet.
# ${!_varname} reads the variable whose name is stored in _varname;
# printf -v writes to it.
[[ -z "${!_varname:-}" ]] && printf -v "$_varname" '%s' "$_node"
fi
done
# Reset every node's isAggregator flag (skipped in dry-run).
if [ "$dryRun" != "true" ]; then
yq eval -i '.validators[].isAggregator = false' "$validator_config_file"
fi
# Select one aggregator per subnet and set the flag.
# Priority: 1) --aggregator CLI flag 2) pre-existing isAggregator: true 3) random
_aggregator_summary=()
for _subnet_idx in "${_unique_subnets[@]}"; do
# Nodes that belong to the subnet currently being processed.
_subnet_nodes=()
for _node in "${nodes[@]}"; do
[[ "$(_node_subnet "$_node")" == "$_subnet_idx" ]] && _subnet_nodes+=("$_node")
done
_selected_agg=""
if [ -n "$aggregatorNode" ] && [[ "$(_node_subnet "$aggregatorNode")" == "$_subnet_idx" ]]; then
# 1. Explicit --aggregator flag.
_selected_agg="$aggregatorNode"
# The elif condition is a command list: the assignment to _pv runs first and
# the [ ... ] test that follows decides the branch.
elif _pv="_preset_agg_${_subnet_idx}"; [ -n "${!_pv:-}" ]; then
# 2. A node had isAggregator: true in the config — respect the manual choice.
_preset="${!_pv}"
# Validate the preset node is still in the active nodes list.
_preset_valid=false
for _n in "${_subnet_nodes[@]}"; do
[[ "$_n" == "$_preset" ]] && _preset_valid=true && break
done
if [[ "$_preset_valid" == "true" ]]; then
_selected_agg="$_preset"
else
# Preset node no longer exists — fall back to random and warn.
echo "Warning: preset aggregator '$_preset' for subnet $_subnet_idx is not in the active node list; selecting randomly." >&2
_selected_agg="${_subnet_nodes[$((RANDOM % ${#_subnet_nodes[@]}))]}"
fi
else
# 3. No preference set — pick randomly.
_selected_agg="${_subnet_nodes[$((RANDOM % ${#_subnet_nodes[@]}))]}"
fi
# Persist the choice in the config (skipped in dry-run).
if [ "$dryRun" != "true" ]; then
yq eval -i "(.validators[] | select(.name == \"$_selected_agg\") | .isAggregator) = true" "$validator_config_file"
fi
_aggregator_summary+=("subnet $_subnet_idx → $_selected_agg")
done
# Verify the invariant: exactly 1 aggregator per subnet (skipped in dry-run).
# The flags are re-read from the file, so this checks what was actually written.
if [ "$dryRun" != "true" ]; then
_verify_failed=false
for _subnet_idx in "${_unique_subnets[@]}"; do
_agg_count=0
for _node in "${nodes[@]}"; do
if [[ "$(_node_subnet "$_node")" == "$_subnet_idx" ]]; then
_is_agg=$(yq eval ".validators[] | select(.name == \"$_node\") | .isAggregator" "$validator_config_file")
[[ "$_is_agg" == "true" ]] && _agg_count=$((_agg_count + 1))
fi
done
if [ "$_agg_count" -ne 1 ]; then
echo "Error: subnet $_subnet_idx has $_agg_count aggregator(s) — expected exactly 1" >&2
_verify_failed=true
fi
done
# All offending subnets are reported before aborting.
if [ "$_verify_failed" == "true" ]; then
echo "Aggregator invariant check failed. Aborting." >&2
exit 1
fi
fi
fi # end: aggregator selection (skipped for --restart-client)
# Print a prominent aggregator summary banner (only when aggregator selection ran).
# One framed row is printed per entry of _aggregator_summary.
if ((${#_aggregator_summary[@]} > 0)); then
  printf '%s\n' \
    "" \
    "╔══════════════════════════════════════════════════════════════╗" \
    "║ 🗳 Aggregator Selection ║" \
    "╠══════════════════════════════════════════════════════════════╣"
  # printf re-applies the format to every remaining argument.
  printf "║ %-60s║\n" "${_aggregator_summary[@]}"
  printf '%s\n' \
    "╚══════════════════════════════════════════════════════════════╝" \
    ""
fi
# Resolve which nodes to act on (spin_nodes) and set node_present when the
# request is usable. Three cases:
#   --restart-client a,b   restart those nodes with checkpoint sync (optionally
#                          swapping client implementations via --replace-with)
#   --node all             every node from the validator config
#   --node a,b / "a b"     the listed nodes, each validated against the config
# When --restart-client is specified, use it as the node list and enable checkpoint sync mode
if [[ -n "$restartClient" ]]; then
echo "Note: --restart-client is only used with --checkpoint-sync-url (default: https://leanpoint.leanroadmap.org/lean/v0/states/finalized)"
restart_with_checkpoint_sync=true
# Skip genesis when restarting with checkpoint sync (we're syncing from remote)
# NOTE(review): this assigns the non-empty string "false"; later checks of the
# form [ -n "$generateGenesis" ] treat it as set — confirm that is intended.
generateGenesis=false
# Parse comma-separated client names
IFS=',' read -r -a requested_nodes <<< "$restartClient"
for requested_node in "${requested_nodes[@]}"; do
requested_node=$(echo "$requested_node" | xargs) # trim whitespace
node_found=false
for available_node in "${nodes[@]}"; do
if [[ "$requested_node" == "$available_node" ]]; then
spin_nodes+=("$available_node")
node_found=true
break
fi
done
if [[ "$node_found" == false ]]; then
echo "Error: Node '$requested_node' not found in validator config"
echo "Available nodes: ${nodes[@]}"
exit 1
fi
done
echo "Restarting with checkpoint sync: ${spin_nodes[*]} from $checkpointSyncUrl"
cleanData=true # Clear data when restarting with checkpoint sync
node_present=true
# --- Handle --replace-with: swap client implementations ---
# Uses parallel arrays (bash 3.x compatible, no associative arrays)
# Entry i of --replace-with is the new name for entry i of --restart-client.
if [[ -n "$replaceWith" ]]; then
IFS=',' read -r -a replace_nodes <<< "$replaceWith"
# Build replacement pairs as parallel arrays
replace_old_names=()
replace_new_names=()
has_replacements=false
i=0
# NOTE(review): old_name comes from requested_nodes, whose elements were not
# trimmed above (only the loop-local copy was), so "a, b" yields " b" here.
for old_name in "${requested_nodes[@]}"; do
new_name=""
if [ $i -lt ${#replace_nodes[@]} ]; then
new_name=$(echo "${replace_nodes[$i]}" | xargs) # trim whitespace
fi
# A pair is recorded only when a replacement exists and actually differs.
if [ -n "$new_name" ] && [ "$new_name" != "$old_name" ]; then
replace_old_names+=("$old_name")
replace_new_names+=("$new_name")
has_replacements=true
echo "Will replace: $old_name → $new_name"
fi
i=$((i + 1))
done
# Warn about extra --replace-with entries beyond --restart-client count
if [ ${#replace_nodes[@]} -gt ${#requested_nodes[@]} ]; then
echo "Warning: --replace-with has more entries (${#replace_nodes[@]}) than --restart-client (${#requested_nodes[@]}). Extra entries ignored."
fi
if [ "$has_replacements" = true ]; then
# 1. Stop old containers and clean data BEFORE config changes (inventory still resolves to old names)
echo "Stopping old containers and cleaning data before replacement..."
for idx in "${!replace_old_names[@]}"; do
old_name="${replace_old_names[$idx]}"
if [ "$deployment_mode" == "ansible" ]; then
echo "Stopping $old_name and cleaning remote data via Ansible..."
# A failed remote stop is reported but does not abort the replacement.
"$scriptDir/run-ansible.sh" "$configDir" "$old_name" "true" "$validatorConfig" "$validator_config_file" "$sshKeyFile" "$useRoot" "stop" "" "true" "" "" "" "$networkName" || {
echo "Warning: Failed to stop $old_name via Ansible, continuing..."
}
else
echo "Stopping local container $old_name..."
# Errors from docker rm (e.g. container absent) are deliberately ignored.
if [ -n "$dockerWithSudo" ]; then
sudo docker rm -f "$old_name" 2>/dev/null || true
else
docker rm -f "$old_name" 2>/dev/null || true
fi
# Remove old local data directory (different clients have different data structures)
old_data_dir="$dataDir/$old_name"
if [ -d "$old_data_dir" ]; then
rm -rf "$old_data_dir"
echo " Removed data dir $old_name"
fi
fi
done
# 2. Update config files (rename old → new)
for idx in "${!replace_old_names[@]}"; do
old_name="${replace_old_names[$idx]}"
new_name="${replace_new_names[$idx]}"
echo "Updating config files: $old_name → $new_name"
# validator-config.yaml: rename .validators[].name
yq eval -i "(.validators[] | select(.name == \"$old_name\") | .name) = \"$new_name\"" "$validator_config_file"
# validators.yaml: rename top-level key (two steps: copy then delete, single expression doesn't work)
# NOTE(review): the names are spliced unquoted into the yq path (.name), so
# this presumably relies on node names being plain identifiers — confirm.
validators_file="$configDir/validators.yaml"
if [ -f "$validators_file" ]; then
yq eval -i ".$new_name = .$old_name" "$validators_file"
yq eval -i "del(.$old_name)" "$validators_file"
fi
# annotated_validators.yaml: rename top-level key
annotated_file="$configDir/annotated_validators.yaml"
if [ -f "$annotated_file" ]; then
yq eval -i ".$new_name = .$old_name" "$annotated_file"
yq eval -i "del(.$old_name)" "$annotated_file"
fi
# Rename key file (overwrite if destination exists from a previous run)
if [ -f "$configDir/$old_name.key" ]; then
mv -f "$configDir/$old_name.key" "$configDir/$new_name.key"
echo " Renamed $old_name.key → $new_name.key"
fi
done
# 3. Update spin_nodes array with new names
for i in "${!spin_nodes[@]}"; do
old="${spin_nodes[$i]}"
for idx in "${!replace_old_names[@]}"; do
if [ "$old" = "${replace_old_names[$idx]}" ]; then
spin_nodes[$i]="${replace_new_names[$idx]}"
break
fi
done
done
# Re-read nodes from updated config (needed for aggregator and downstream logic)
nodes=($(yq eval '.validators[].name' "$validator_config_file"))
# Ensure inventory is regenerated on next run-ansible.sh call
# (the stop call may have regenerated it with old names)
touch "$validator_config_file"
echo "Updated spin_nodes: ${spin_nodes[*]}"
echo "Config files updated successfully."
fi
fi
# Parse comma-separated or space-separated node names or handle single node/all
elif [[ "$node" == "all" ]]; then
# Spin all nodes
spin_nodes=("${nodes[@]}")
node_present=true
else
# Handle both comma-separated and space-separated node names
if [[ "$node" == *","* ]]; then
IFS=',' read -r -a requested_nodes <<< "$node"
else
IFS=' ' read -r -a requested_nodes <<< "$node"
fi
# Check each requested node against available nodes
# An unknown name aborts immediately; node_present stays unset only when no
# node was requested at all.
for requested_node in "${requested_nodes[@]}"; do
node_found=false
for available_node in "${nodes[@]}"; do
if [[ "$requested_node" == "$available_node" ]]; then
spin_nodes+=("$available_node")
node_present=true
node_found=true
break
fi
done
if [[ "$node_found" == false ]]; then
echo "Error: Node '$requested_node' not found in validator config"
echo "Available nodes: ${nodes[@]}"
exit 1
fi
done
fi
# Abort when node selection produced nothing usable (node_present was never set).
# The exit status is explicitly non-zero: a bare `exit` would return the status
# of the preceding echo, i.e. 0, and report this error path as success.
if [ -z "$node_present" ]; then
  echo "invalid specified node, options =${nodes[@]} all, exiting."
  exit 1
fi
# Returns 0 when the given generateGenesis value means genesis regeneration was
# really requested. The --restart-client path assigns the literal string
# "false", which is non-empty, so a plain -n test would wrongly report true.
# Arguments: $1 - current value of generateGenesis (may be empty)
_nemo_should_reset_db() {
  [ -n "$1" ] && [ "$1" != "false" ]
}

# Check deployment mode and route to ansible if needed
# In ansible mode this block always ends the script: exit 0 after a successful
# stop or deployment, exit 1 when run-ansible.sh fails.
if [ "$deployment_mode" == "ansible" ]; then
  # Validate Ansible prerequisites before routing to Ansible deployment
  echo "Validating Ansible prerequisites..."

  # Check if Ansible is installed
  if ! command -v ansible-playbook &> /dev/null; then
    echo "Error: ansible-playbook is not installed."
    echo "Install Ansible:"
    echo " macOS: brew install ansible"
    echo " Ubuntu: sudo apt-get install ansible"
    echo " pip: pip install ansible"
    exit 1
  fi

  # Check if docker collection is available
  # stderr of ansible-galaxy (the noisy side of the pipe) is what gets silenced;
  # grep -q prints nothing by itself.
  if ! ansible-galaxy collection list 2> /dev/null | grep -q "community.docker"; then
    echo "Warning: community.docker collection not found. Installing..."
    ansible-galaxy collection install community.docker
  fi
  echo "✅ Ansible prerequisites validated"

  # Determine node list for Ansible: use restartClient/spin_nodes when restarting, else $node
  if [[ "$restart_with_checkpoint_sync" == "true" ]]; then
    # The subshell keeps the IFS change local while joining with commas.
    ansible_node_arg=$(
      IFS=','
      echo "${spin_nodes[*]}"
    )
  else
    ansible_node_arg="$node"
  fi

  # Determine skip_genesis for Ansible (true when restarting with checkpoint sync)
  # deploy-nodes.yml syncs config files to the target host, so copy-genesis to all hosts is not needed
  ansible_skip_genesis="false"
  [[ "$restart_with_checkpoint_sync" == "true" ]] && ansible_skip_genesis="true"

  # Determine checkpoint_sync_url for Ansible (when restarting with checkpoint sync)
  ansible_checkpoint_url=""
  [[ "$restart_with_checkpoint_sync" == "true" ]] && [[ -n "$checkpointSyncUrl" ]] && ansible_checkpoint_url="$checkpointSyncUrl"

  # Handle stop action
  if [ -n "$stopNodes" ] && [ "$stopNodes" == "true" ]; then
    echo "Stopping nodes via Ansible..."
    if ! "$scriptDir/run-ansible.sh" "$configDir" "$ansible_node_arg" "$cleanData" "$validatorConfig" "$validator_config_file" "$sshKeyFile" "$useRoot" "stop" "$coreDumps" "$ansible_skip_genesis" "" "$dryRun" "" "$networkName"; then
      echo "❌ Ansible stop operation failed. Exiting."
      exit 1
    fi
    exit 0
  fi

  # When --replace-with already cleaned data in the stop step, don't pass clean_data to deploy
  # (the old node name no longer exists in config, so clean-node-data.yml would fail resolving it)
  ansible_clean_data="$cleanData"
  [[ "${has_replacements:-false}" = "true" ]] && ansible_clean_data=""

  # Call separate Ansible execution script
  # If Ansible deployment fails, exit immediately (don't fall through to local deployment)
  if [ "$dryRun" == "true" ]; then
    echo "[DRY RUN] Would deploy via Ansible — running playbook with --check --diff"
  fi
  ansible_sync_all_hosts=""
  [[ "${has_replacements:-false}" = "true" ]] && ansible_sync_all_hosts="true"
  if ! "$scriptDir/run-ansible.sh" "$configDir" "$ansible_node_arg" "$ansible_clean_data" "$validatorConfig" "$validator_config_file" "$sshKeyFile" "$useRoot" "" "$coreDumps" "$ansible_skip_genesis" "$ansible_checkpoint_url" "$dryRun" "$ansible_sync_all_hosts" "$networkName"; then
    echo "❌ Ansible deployment failed. Exiting."
    exit 1
  fi

  # Leanpoint sync runs unless --skip-leanpoint was given; on a checkpoint-sync
  # restart it only runs when node names were replaced.
  if [ -z "$skipLeanpoint" ] && { [ "$restart_with_checkpoint_sync" != "true" ] || [ "${has_replacements:-false}" = "true" ]; }; then
    # Sync leanpoint upstreams to tooling server and restart remote container (no 5th arg = remote)
    if ! "$scriptDir/sync-leanpoint-upstreams.sh" "$validator_config_file" "$scriptDir" "$sshKeyFile" "$useRoot"; then
      echo "Warning: leanpoint sync failed. If the tooling server requires a specific SSH key, run with: --sshKey <path-to-key>"
    fi
  fi

  if [ -z "$skipNemo" ]; then
    # Reset the Nemo DB only when genesis regeneration was actually requested.
    _nemo_reset_db=0
    _nemo_should_reset_db "$generateGenesis" && _nemo_reset_db=1
    if ! NEMO_RESET_DB="$_nemo_reset_db" "$scriptDir/sync-nemo-tooling.sh" "$validator_config_file" "$scriptDir" "$sshKeyFile" "$useRoot"; then
      echo "Warning: Nemo tooling sync failed. Pass --sshKey <path-to-key> if the tooling server requires it, or use --skip-nemo to skip."
    fi
  fi

  # Push genesis time metric to Pushgateway if available
  _pushgateway_url="${PUSHGATEWAY_URL:-http://46.225.10.32:9091}"
  _genesis_config="$configDir/config.yaml"
  if [ -f "$_genesis_config" ]; then
    _genesis_time=$(grep "GENESIS_TIME:" "$_genesis_config" | awk '{print $2}')
    if [ -n "$_genesis_time" ]; then
      # Best-effort push; --max-time bounds the wait so an unreachable
      # Pushgateway cannot stall the end of the run.
      echo "lean_genesis_time $_genesis_time" | curl -s --max-time 10 --data-binary @- \
        "$_pushgateway_url/metrics/job/lean-quickstart/network/$networkName" ||
        echo "Warning: Failed to push lean_genesis_time to Pushgateway."
    fi
  fi

  # Ansible deployment succeeded, exit normally
  exit 0
fi
# Handle stop action for local deployment
# Removes the selected node containers, optionally tears down the metrics
# compose stack, removes the leanpoint and nemo containers, then exits 0.
if [ "$stopNodes" == "true" ]; then
  echo "Stopping local nodes..."

  # Load nodes from validator config file
  if [ ! -f "$validator_config_file" ]; then
    echo "Error: Validator config file not found at $validator_config_file"
    exit 1
  fi
  nodes=($(yq eval '.validators[].name' "$validator_config_file"))

  # Docker invocation, prefixed with sudo when --dockerWithSudo is set.
  _docker=(docker)
  if [ -n "$dockerWithSudo" ]; then
    _docker=(sudo docker)
  fi

  # Determine which nodes to stop
  case "$node" in
    all)
      stop_nodes=("${nodes[@]}")
      ;;
    *,*)
      IFS=',' read -r -a requested_nodes <<< "$node"
      stop_nodes=("${requested_nodes[@]}")
      ;;
    *)
      IFS=' ' read -r -a requested_nodes <<< "$node"
      stop_nodes=("${requested_nodes[@]}")
      ;;
  esac

  # Stop Docker containers
  for node_name in "${stop_nodes[@]}"; do
    echo "Stopping $node_name..."
    "${_docker[@]}" rm -f "$node_name" 2>/dev/null || echo " Container $node_name not found or already stopped"
  done

  # Stop metrics stack if --metrics flag was passed
  if [ "$enableMetrics" == "true" ]; then
    echo "Stopping metrics stack..."
    metricsDir="$scriptDir/metrics"
    "${_docker[@]}" compose -f "$metricsDir/docker-compose-metrics.yaml" down 2>/dev/null || echo " Metrics stack not running or already stopped"
  fi

  # Stop local leanpoint and nemo containers if running
  for _tool_container in leanpoint nemo; do
    "${_docker[@]}" rm -f "$_tool_container" 2>/dev/null || echo " Container $_tool_container not found or already stopped"
  done

  echo "✅ Local nodes stopped successfully!"
  exit 0
fi
# 3. run clients (local deployment)
# Quoted so a data directory path containing whitespace or glob characters is
# created as a single directory.
mkdir -p "$dataDir"

# Detect OS and set appropriate terminal command
# popupTerminalCmd stays empty on macOS and when no known terminal is found;
# otherwise it holds "<terminal> <flag that introduces the command to run>".
popupTerminalCmd=""
if [[ "$OSTYPE" == "darwin"* ]]; then
  # macOS - don't use popup terminal by default, just run in background
  popupTerminalCmd=""
elif [[ "$OSTYPE" == "linux"* ]]; then
  # Linux try a list of common terminals in order of preference
  for term in x-terminal-emulator gnome-terminal konsole xfce4-terminal kitty alacritty lxterminal lxqt-terminal mate-terminal terminator xterm; do
    if command -v "$term" &> /dev/null; then
      # Most terminals accept `--` as "end of options" before the command
      case "$term" in
        gnome-terminal | xfce4-terminal | konsole | lxterminal | lxqt-terminal | terminator | alacritty | kitty)
          popupTerminalCmd="$term --"
          ;;
        xterm | mate-terminal | x-terminal-emulator)
          popupTerminalCmd="$term -e"
          ;;
        *)
          popupTerminalCmd="$term"
          ;;
      esac
      # First installed terminal wins.
      break
    fi
  done
fi
# Launch every selected node locally, either as a binary or as a docker
# container, in the background. One entry per node is appended to
# spinned_pids (0 in dry-run mode).
spinned_pids=()
for item in "${spin_nodes[@]}"; do
# extract client config FIRST before printing
# The client name is the part of the node name before the first "_".
IFS='_' read -r -a elements <<< "$item"
client="${elements[0]}"
echo -e "\n\nspining $item: client=$client (mode=$node_setup)"
# Horizontal rule as wide as the terminal.
printf '%*s' $(tput cols) | tr ' ' '-'
echo
# When restarting with checkpoint sync, stop existing container first
if [[ "$restart_with_checkpoint_sync" == "true" ]]; then
echo "Stopping existing container $item..."
if [ -n "$dockerWithSudo" ]; then
sudo docker rm -f "$item" 2>/dev/null || true
else
docker rm -f "$item" 2>/dev/null || true
fi
fi
# create and/or cleanup datadirs
itemDataDir="$dataDir/$item"
mkdir -p $itemDataDir
if [ -n "$cleanData" ]; then
# The command is built as a string so it can be echoed and optionally
# prefixed with sudo; the glob after the quoted path is expanded by eval.
cmd="rm -rf \"$itemDataDir\"/*"
if [ -n "$dockerWithSudo" ]; then
cmd="sudo $cmd"
fi
echo "$cmd"
eval "$cmd"
fi
# parse validator-config.yaml for $item to load args values
# NOTE(review): no directory prefix, so `source` resolves this via PATH and
# then the current directory rather than $scriptDir — confirm the script is
# always run from its own directory.
source parse-vc.sh
# export checkpoint_sync_url for client-cmd scripts when restarting with checkpoint sync
if [[ "$restart_with_checkpoint_sync" == "true" ]] && [[ -n "$checkpointSyncUrl" ]]; then
export checkpoint_sync_url="$checkpointSyncUrl"
else
unset checkpoint_sync_url 2>/dev/null || true
fi
# get client specific cmd and its mode (docker, binary)
# presumably this script sets node_binary / node_docker (and possibly
# node_setup), which are read below — verify against client-cmds/*.sh
sourceCmd="source client-cmds/$client-cmd.sh"
echo "$sourceCmd"
eval $sourceCmd
# spin nodes
if [ "$node_setup" == "binary" ]
then
# Add core dump support if enabled for this node
if should_enable_core_dumps "$item"; then
execCmd="ulimit -c unlimited && $node_binary"
echo "Core dumps enabled for $item (binary mode)"
else
execCmd="$node_binary"
fi
else
# Extract image name from node_docker (find word containing ':' which is the image:tag)
docker_image=$(echo "$node_docker" | grep -oE '[^ ]+:[^ ]+' | head -1)
# Pull image first
# A failed pull is tolerated; the run below uses --pull=never.
if [ -n "$dockerWithSudo" ]; then
sudo docker pull "$docker_image" || true
else
docker pull "$docker_image" || true
fi
execCmd="docker run --rm --pull=never"
if [ -n "$dockerWithSudo" ]
then
execCmd="sudo $execCmd"
fi;
# Use --network host for peer-to-peer communication to work
# On macOS Docker Desktop, containers share the VM's network stack, allowing them
# to reach each other via 127.0.0.1 (as configured in nodes.yaml ENR records).
# Note: Port mapping (-p) doesn't work with --network host, so metrics endpoints
# are not directly accessible from the macOS host. Use 'docker exec' to access them.
# Add core dump support if enabled for this node
# --init: forwards signals and reaps zombies (required for core dumps)
# --workdir /data: dumps land in the mounted volume
if should_enable_core_dumps "$item"; then
execCmd="$execCmd --init --ulimit core=-1 --workdir /data"
echo "Core dumps enabled for $item (dumps will be written to $dataDir/$item/)"
fi
# Config is mounted at /config and the node's data directory at /data.
execCmd="$execCmd --name $item --network host \
-v $configDir:/config \
-v $dataDir/$item:/data \
$node_docker"
fi;
# With --popupTerminal the command is handed to the detected terminal emulator
# (popupTerminalCmd may be empty, in which case the command is unchanged apart
# from a leading space).
if [ -n "$popupTerminal" ]
then
execCmd="$popupTerminalCmd $execCmd"
fi;
if [ "$dryRun" == "true" ]; then
echo "[DRY RUN] Would execute: $execCmd"
pid=0
else
echo "$execCmd"
# execCmd is a flat string, so it is re-parsed by eval; paths containing
# whitespace would be split here.
eval "$execCmd" &
pid=$!
fi
spinned_pids+=($pid)
done;
# 4. Metrics stack (Prometheus + Grafana): only started when enableMetrics is "true"
if [[ "$enableMetrics" == "true" ]]; then
  echo -e "\n\nStarting metrics stack (Prometheus + Grafana)..."
  printf '%*s' $(tput cols) | tr ' ' '-'
  echo

  metricsDir="$scriptDir/metrics"

  # Generate prometheus.yml from validator-config.yaml
  "$scriptDir/generate-prometheus-config.sh" "$validator_config_file" "$metricsDir/prometheus"

  # Bring the metrics containers up, going through sudo when dockerWithSudo is set
  _metrics_compose=(docker compose)
  if [[ -n "$dockerWithSudo" ]]; then
    _metrics_compose=(sudo docker compose)
  fi
  "${_metrics_compose[@]}" -f "$metricsDir/docker-compose-metrics.yaml" up -d

  # Tell the user where the dashboards are
  printf '%s\n' \
    "" \
    "📊 Metrics stack started:" \
    " Prometheus: http://localhost:9090" \
    " Grafana: http://localhost:3000" \
    ""
fi
# Leanpoint deployment via sync-leanpoint-upstreams.sh (local devnet, or sync
# to the tooling server with Ansible).
# Runs only when skipLeanpoint is empty AND either this is not a checkpoint-sync
# restart or has_replacements is "true".
# local_leanpoint_deployed becomes 1 only if the sync script succeeds.
local_leanpoint_deployed=0
if [[ -z "$skipLeanpoint" && ( "$restart_with_checkpoint_sync" != "true" || "${has_replacements:-false}" == "true" ) ]]; then
  if ! "$scriptDir/sync-leanpoint-upstreams.sh" "$validator_config_file" "$scriptDir" "$sshKeyFile" "$useRoot" "$dataDir"; then
    echo "Warning: leanpoint deploy failed. For remote sync, pass --sshKey <path-to-key> if the tooling server requires it."
  else
    local_leanpoint_deployed=1
  fi
fi
# Nemo explorer deployment via sync-nemo-tooling.sh (same tooling server via
# Ansible, or local Docker). Skipped entirely when skipNemo is non-empty.
# NEMO_RESET_DB is passed as 1 only when generateGenesis is non-empty.
# local_nemo_deployed becomes 1 only if the sync script succeeds.
local_nemo_deployed=0
if [[ -z "$skipNemo" ]]; then
  _nemo_reset_db=0
  if [[ -n "$generateGenesis" ]]; then
    _nemo_reset_db=1
  fi
  if ! NEMO_RESET_DB="$_nemo_reset_db" "$scriptDir/sync-nemo-tooling.sh" "$validator_config_file" "$scriptDir" "$sshKeyFile" "$useRoot" "$dataDir"; then
    echo "Warning: Nemo deploy failed. Pass --sshKey if needed, or --skip-nemo to skip."
  else
    local_nemo_deployed=1
  fi
fi
# Snapshot the node names and launcher PIDs as space-separated strings;
# cleanup() reads these two variables.
container_names="${spin_nodes[*]}"
process_ids="${spinned_pids[*]}"

#######################################
# Tear down what this run started: node containers, the local leanpoint and
# nemo containers (when they were deployed), the launcher processes, and the
# metrics stack (when enabled).
# Globals (read): container_names, process_ids, dockerWithSudo,
#                 local_leanpoint_deployed, local_nemo_deployed,
#                 enableMetrics, scriptDir
# Arguments: none
# Outputs:   progress messages and the commands being run, on stdout
#######################################
cleanup() {
  local docker_cmd=(docker)
  local names=()
  local pids=()
  local live_pids=()
  local pid
  local metricsDir

  # All docker invocations go through sudo when dockerWithSudo is set
  if [ -n "$dockerWithSudo" ]; then
    docker_cmd=(sudo docker)
  fi

  echo -e "\n\ncleaning up"
  printf '%*s' $(tput cols) | tr ' ' '-'
  echo

  # try for docker containers
  # (skipped when there are no names: `docker rm -f` without arguments is an error)
  read -r -a names <<< "$container_names"
  if [ ${#names[@]} -gt 0 ]; then
    echo "${docker_cmd[*]} rm -f ${names[*]}"
    "${docker_cmd[@]}" rm -f "${names[@]}"
  fi

  if [ "${local_leanpoint_deployed:-0}" = "1" ]; then
    "${docker_cmd[@]}" rm -f leanpoint 2>/dev/null || true
  fi

  if [ "${local_nemo_deployed:-0}" = "1" ]; then
    "${docker_cmd[@]}" rm -f nemo 2>/dev/null || true
  fi

  # try for process ids
  # Keep only positive decimal PIDs. In particular 0 must never reach `kill`:
  # PID 0 addresses every process in the current process group, this script
  # included.
  read -r -a pids <<< "$process_ids"
  for pid in "${pids[@]}"; do
    case "$pid" in
      ''|0*|*[!0-9]*) continue ;;
    esac
    live_pids+=("$pid")
  done
  if [ ${#live_pids[@]} -gt 0 ]; then
    echo "kill -9 ${live_pids[*]}"
    kill -9 "${live_pids[@]}"
  fi

  # Stop metrics stack if it was started
  if [ "$enableMetrics" == "true" ]; then
    echo "Stopping metrics stack..."
    metricsDir="$scriptDir/metrics"
    "${docker_cmd[@]}" compose -f "$metricsDir/docker-compose-metrics.yaml" down 2>/dev/null || true
  fi
}
# On SIGINT/SIGTERM: clean up once and leave. Without the explicit exit the
# script would resume after the interrupted `wait`, fall through to the final
# `cleanup` below and tear everything down a second time. Exit status 0 is
# what the script ended with on this path before.
trap 'echo exit signal received; cleanup; exit 0' SIGINT SIGTERM

echo -e "\n\nwaiting for nodes to exit"
printf '%*s' $(tput cols) | tr ' ' '-'
# terminate the divider line, as every other divider in this script does
echo
echo "press Ctrl+C to exit and cleanup..."

# Wait for background processes - use a compatible approach for all shells
if [ ${#spinned_pids[@]} -gt 0 ]; then
  for pid in "${spinned_pids[@]}"; do
    # best effort: a launcher that has already been reaped is not an error
    wait "$pid" 2>/dev/null || true
  done
else
  # Fallback: wait for any background job
  wait
fi

# All nodes exited on their own: normal-path teardown
cleanup