Skip to content

Commit 4ea09a7

Browse files
fix: stagger estate push to stop CI thundering-herd (#147)
Phase 1 of sync-all-parallel.exs pushed every owned repo via a single Task.async_stream with no pacing, so a full-estate sync fired thousands of GitHub Actions runs near-simultaneously and saturated the account hosted-runner concurrency cap (~34% of estate CI left transiently queued). Process repos in batches (default 25, --batch-size / SYNC_BATCH_SIZE) with a paced pause between batches (default 45s + 0-5s jitter, --batch-pause / SYNC_BATCH_PAUSE_SEC). Intra-batch concurrency and per-repo error handling are unchanged; every repo is still processed. Pacing is skipped on --dry-run and --no-throttle. Co-authored-by: hyperpolymath <hyperpolymath@users.noreply.github.com> Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
1 parent e366109 commit 4ea09a7

1 file changed

Lines changed: 97 additions & 3 deletions

File tree

scripts/sync-all-parallel.exs

Lines changed: 97 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,23 @@
1818
# --report PATH Write report to file
1919
# --commit-all MSG Auto-commit all dirty repos with given message
2020
# --push Auto-push after commit (with --commit-all)
21+
# --batch-size N Repos processed per throttled batch (default: 25,
22+
# env SYNC_BATCH_SIZE). Set 0 to disable batching.
23+
# --batch-pause SEC Seconds to pause between batches (default: 45,
24+
# env SYNC_BATCH_PAUSE_SEC). A small random jitter is
25+
# added on top to de-correlate CI trigger waves.
26+
# --no-throttle Disable inter-batch pacing entirely (one big stream,
27+
# legacy behaviour). Implied by --dry-run (no pushes).
28+
#
29+
# THROTTLING / THUNDERING-HERD CONTROL
30+
# Phase 1 pushes to every owned repo. Each push fires that repo's GitHub
31+
# Actions workflows. Pushing the whole estate (~355 repos) in one tight
32+
# window saturates the account-wide hosted-runner concurrency cap, leaving
33+
# a large fraction of estate CI transiently `queued`. To avoid this we
34+
# process repos in batches of --batch-size and pause --batch-pause seconds
35+
# (plus jitter) between batches, so CI trigger waves are spread over time.
36+
# Within a batch, the original --concurrency parallelism is unchanged, and
37+
# a failure in one repo never aborts the batch or the run.
2138

2239
defmodule SyncAll do
2340
@owned_prefixes [
@@ -31,6 +48,19 @@ defmodule SyncAll do
3148

3249
@ignored_dirs ~w[logs monitoring .git-private-farm]
3350

51+
# Throttling defaults. batch_size 25 + 45s pause keeps the per-minute
52+
# workflow-trigger rate well under the account hosted-runner cap while
53+
# still syncing the whole estate in a bounded time. Env vars let ops tune
54+
# without editing the script; CLI flags override env.
55+
@default_batch_size (case Integer.parse(System.get_env("SYNC_BATCH_SIZE") || "") do
56+
{n, _} when n >= 0 -> n
57+
_ -> 25
58+
end)
59+
@default_batch_pause_sec (case Integer.parse(System.get_env("SYNC_BATCH_PAUSE_SEC") || "") do
60+
{n, _} when n >= 0 -> n
61+
_ -> 45
62+
end)
63+
3464
defstruct [
3565
:repos_dir,
3666
:dry_run,
@@ -40,6 +70,9 @@ defmodule SyncAll do
4070
:report_file,
4171
:commit_msg,
4272
:auto_push,
73+
:batch_size,
74+
:batch_pause_sec,
75+
:throttle,
4376
max_depth: 3
4477
]
4578

@@ -51,6 +84,15 @@ defmodule SyncAll do
5184
IO.puts(" Repos dir: #{config.repos_dir}")
5285
IO.puts(" Concurrency: #{config.concurrency}")
5386
IO.puts(" Dry run: #{config.dry_run}")
87+
88+
cond do
89+
not throttling_active?(config) ->
90+
IO.puts(" Throttle: off (no inter-batch pacing)")
91+
92+
true ->
93+
IO.puts(" Throttle: batches of #{config.batch_size}, ~#{config.batch_pause_sec}s pause (+jitter)")
94+
end
95+
5496
if config.commit_msg, do: IO.puts(" Commit msg: #{config.commit_msg}")
5597

5698
start = System.monotonic_time(:millisecond)
@@ -107,7 +149,10 @@ defmodule SyncAll do
107149
do_fetch: true,
108150
report_file: nil,
109151
commit_msg: nil,
110-
auto_push: false
152+
auto_push: false,
153+
batch_size: @default_batch_size,
154+
batch_pause_sec: @default_batch_pause_sec,
155+
throttle: true
111156
})
112157
end
113158

@@ -121,6 +166,9 @@ defmodule SyncAll do
121166
defp parse_args(["--report", path | rest], config), do: parse_args(rest, %{config | report_file: path})
122167
defp parse_args(["--commit-all", msg | rest], config), do: parse_args(rest, %{config | commit_msg: msg})
123168
defp parse_args(["--depth", n | rest], config), do: parse_args(rest, %{config | max_depth: String.to_integer(n)})
169+
defp parse_args(["--batch-size", n | rest], config), do: parse_args(rest, %{config | batch_size: String.to_integer(n)})
170+
defp parse_args(["--batch-pause", n | rest], config), do: parse_args(rest, %{config | batch_pause_sec: String.to_integer(n)})
171+
defp parse_args(["--no-throttle" | rest], config), do: parse_args(rest, %{config | throttle: false})
124172

125173
defp parse_args([arg | rest], config) do
126174
if File.dir?(arg), do: parse_args(rest, %{config | repos_dir: arg}), else: parse_args(rest, config)
@@ -188,13 +236,60 @@ defmodule SyncAll do
188236

189237
# --- Phase 1: Parallel Sync ---
190238

239+
# Throttling is only meaningful when we will actually push (pushes are
240+
# what trigger remote CI). A dry run pushes nothing, so pacing it just
241+
# wastes wall-clock — disable it there as well as on explicit --no-throttle
242+
# or a non-positive batch size.
243+
defp throttling_active?(config) do
244+
config.throttle and not config.dry_run and config.batch_size > 0
245+
end
246+
191247
defp phase1_parallel(repos, config) do
192248
total = length(repos)
193249
counter = :counters.new(1, [:atomics])
194250

195251
IO.puts("\n\e[1m=== Phase 1: Parallel Fetch/Pull/Push (#{config.concurrency} workers) ===\e[0m")
196252

197-
repos
253+
batches =
254+
if throttling_active?(config) do
255+
Enum.chunk_every(repos, config.batch_size)
256+
else
257+
[repos]
258+
end
259+
260+
nbatches = length(batches)
261+
262+
if nbatches > 1 do
263+
IO.puts(
264+
"\e[2mThrottled: #{nbatches} batch(es) of up to #{config.batch_size}, " <>
265+
"~#{config.batch_pause_sec}s (+jitter) between batches\e[0m"
266+
)
267+
end
268+
269+
batches
270+
|> Enum.with_index(1)
271+
|> Enum.flat_map(fn {batch, idx} ->
272+
results = run_batch(batch, config, counter, total)
273+
274+
# Pause AFTER every batch except the last, so the next wave of
275+
# workflow triggers lands once the previous wave has had time to
276+
# start draining the runner queue. Jitter de-correlates repeated runs.
277+
if idx < nbatches do
278+
jitter_ms = :rand.uniform(5_000)
279+
pause_ms = config.batch_pause_sec * 1_000 + jitter_ms
280+
IO.puts("\n\e[2m Batch #{idx}/#{nbatches} done — pausing #{div(pause_ms, 1000)}s before next batch…\e[0m")
281+
Process.sleep(pause_ms)
282+
end
283+
284+
results
285+
end)
286+
|> tap(fn _ -> IO.puts("") end) # Clear progress line
287+
end
288+
289+
# Run one batch at the configured concurrency. A crash/timeout in one
290+
# repo is mapped to an error result and never aborts the batch or run.
291+
defp run_batch(batch, config, counter, total) do
292+
batch
198293
|> Task.async_stream(
199294
fn repo -> sync_one_repo(repo, config, counter, total) end,
200295
max_concurrency: config.concurrency,
@@ -205,7 +300,6 @@ defmodule SyncAll do
205300
{:ok, result} -> result
206301
{:exit, _reason} -> %{repo: "?", dirty: false, fetched: false, pulled: false, pushed: false, diverged: false, error: "timeout"}
207302
end)
208-
|> tap(fn _ -> IO.puts("") end) # Clear progress line
209303
end
210304

211305
defp sync_one_repo(repo, config, counter, total) do

0 commit comments

Comments
 (0)