1818# --report PATH Write report to file
1919# --commit-all MSG Auto-commit all dirty repos with given message
2020# --push Auto-push after commit (with --commit-all)
21+ # --batch-size N Repos processed per throttled batch (default: 25,
22+ # env SYNC_BATCH_SIZE). Set 0 to disable batching.
23+ # --batch-pause SEC Seconds to pause between batches (default: 45,
24+ # env SYNC_BATCH_PAUSE_SEC). A small random jitter is
25+ # added on top to de-correlate CI trigger waves.
26+ # --no-throttle Disable inter-batch pacing entirely (one big stream,
27+ # legacy behaviour). Implied by --dry-run (no pushes).
28+ #
29+ # THROTTLING / THUNDERING-HERD CONTROL
30+ # Phase 1 pushes to every owned repo. Each push fires that repo's GitHub
31+ # Actions workflows. Pushing the whole estate (~355 repos) in one tight
32+ # window saturates the account-wide hosted-runner concurrency cap, leaving
33+ # a large fraction of estate CI transiently `queued`. To avoid this we
34+ # process repos in batches of --batch-size and pause --batch-pause seconds
35+ # (plus jitter) between batches, so CI trigger waves are spread over time.
36+ # Within a batch, the original --concurrency parallelism is unchanged, and
37+ # a failure in one repo never aborts the batch or the run.
2138
2239defmodule SyncAll do
2340 @ owned_prefixes [
@@ -31,6 +48,19 @@ defmodule SyncAll do
3148
3249 @ ignored_dirs ~w[ logs monitoring .git-private-farm]
3350
51+ # Throttling defaults. batch_size 25 + 45s pause keeps the per-minute
52+ # workflow-trigger rate well under the account hosted-runner cap while
53+ # still syncing the whole estate in a bounded time. Env vars let ops tune
54+ # without editing the script; CLI flags override env.
55+ @ default_batch_size ( case Integer . parse ( System . get_env ( "SYNC_BATCH_SIZE" ) || "" ) do
56+ { n , _ } when n >= 0 -> n
57+ _ -> 25
58+ end )
59+ @ default_batch_pause_sec ( case Integer . parse ( System . get_env ( "SYNC_BATCH_PAUSE_SEC" ) || "" ) do
60+ { n , _ } when n >= 0 -> n
61+ _ -> 45
62+ end )
63+
3464 defstruct [
3565 :repos_dir ,
3666 :dry_run ,
@@ -40,6 +70,9 @@ defmodule SyncAll do
4070 :report_file ,
4171 :commit_msg ,
4272 :auto_push ,
73+ :batch_size ,
74+ :batch_pause_sec ,
75+ :throttle ,
4376 max_depth: 3
4477 ]
4578
@@ -51,6 +84,15 @@ defmodule SyncAll do
5184 IO . puts ( " Repos dir: #{ config . repos_dir } " )
5285 IO . puts ( " Concurrency: #{ config . concurrency } " )
5386 IO . puts ( " Dry run: #{ config . dry_run } " )
87+
88+ cond do
89+ not throttling_active? ( config ) ->
90+ IO . puts ( " Throttle: off (no inter-batch pacing)" )
91+
92+ true ->
93+ IO . puts ( " Throttle: batches of #{ config . batch_size } , ~#{ config . batch_pause_sec } s pause (+jitter)" )
94+ end
95+
5496 if config . commit_msg , do: IO . puts ( " Commit msg: #{ config . commit_msg } " )
5597
5698 start = System . monotonic_time ( :millisecond )
@@ -107,7 +149,10 @@ defmodule SyncAll do
107149 do_fetch: true ,
108150 report_file: nil ,
109151 commit_msg: nil ,
110- auto_push: false
152+ auto_push: false ,
153+ batch_size: @ default_batch_size ,
154+ batch_pause_sec: @ default_batch_pause_sec ,
155+ throttle: true
111156 } )
112157 end
113158
@@ -121,6 +166,9 @@ defmodule SyncAll do
121166 defp parse_args ( [ "--report" , path | rest ] , config ) , do: parse_args ( rest , % { config | report_file: path } )
122167 defp parse_args ( [ "--commit-all" , msg | rest ] , config ) , do: parse_args ( rest , % { config | commit_msg: msg } )
123168 defp parse_args ( [ "--depth" , n | rest ] , config ) , do: parse_args ( rest , % { config | max_depth: String . to_integer ( n ) } )
169+ defp parse_args ( [ "--batch-size" , n | rest ] , config ) , do: parse_args ( rest , % { config | batch_size: String . to_integer ( n ) } )
170+ defp parse_args ( [ "--batch-pause" , n | rest ] , config ) , do: parse_args ( rest , % { config | batch_pause_sec: String . to_integer ( n ) } )
171+ defp parse_args ( [ "--no-throttle" | rest ] , config ) , do: parse_args ( rest , % { config | throttle: false } )
124172
125173 defp parse_args ( [ arg | rest ] , config ) do
126174 if File . dir? ( arg ) , do: parse_args ( rest , % { config | repos_dir: arg } ) , else: parse_args ( rest , config )
@@ -188,13 +236,60 @@ defmodule SyncAll do
188236
189237 # --- Phase 1: Parallel Sync ---
190238
239+ # Throttling is only meaningful when we will actually push (pushes are
240+ # what trigger remote CI). A dry run pushes nothing, so pacing it just
241+ # wastes wall-clock — disable it there as well as on explicit --no-throttle
242+ # or a non-positive batch size.
243+ defp throttling_active? ( config ) do
244+ config . throttle and not config . dry_run and config . batch_size > 0
245+ end
246+
191247 defp phase1_parallel ( repos , config ) do
192248 total = length ( repos )
193249 counter = :counters . new ( 1 , [ :atomics ] )
194250
195251 IO . puts ( "\n \e [1m=== Phase 1: Parallel Fetch/Pull/Push (#{ config . concurrency } workers) ===\e [0m" )
196252
197- repos
253+ batches =
254+ if throttling_active? ( config ) do
255+ Enum . chunk_every ( repos , config . batch_size )
256+ else
257+ [ repos ]
258+ end
259+
260+ nbatches = length ( batches )
261+
262+ if nbatches > 1 do
263+ IO . puts (
264+ "\e [2mThrottled: #{ nbatches } batch(es) of up to #{ config . batch_size } , " <>
265+ "~#{ config . batch_pause_sec } s (+jitter) between batches\e [0m"
266+ )
267+ end
268+
269+ batches
270+ |> Enum . with_index ( 1 )
271+ |> Enum . flat_map ( fn { batch , idx } ->
272+ results = run_batch ( batch , config , counter , total )
273+
274+ # Pause AFTER every batch except the last, so the next wave of
275+ # workflow triggers lands once the previous wave has had time to
276+ # start draining the runner queue. Jitter de-correlates repeated runs.
277+ if idx < nbatches do
278+ jitter_ms = :rand . uniform ( 5_000 )
279+ pause_ms = config . batch_pause_sec * 1_000 + jitter_ms
280+ IO . puts ( "\n \e [2m Batch #{ idx } /#{ nbatches } done — pausing #{ div ( pause_ms , 1000 ) } s before next batch…\e [0m" )
281+ Process . sleep ( pause_ms )
282+ end
283+
284+ results
285+ end )
286+ |> tap ( fn _ -> IO . puts ( "" ) end ) # Clear progress line
287+ end
288+
289+ # Run one batch at the configured concurrency. A crash/timeout in one
290+ # repo is mapped to an error result and never aborts the batch or run.
291+ defp run_batch ( batch , config , counter , total ) do
292+ batch
198293 |> Task . async_stream (
199294 fn repo -> sync_one_repo ( repo , config , counter , total ) end ,
200295 max_concurrency: config . concurrency ,
@@ -205,7 +300,6 @@ defmodule SyncAll do
205300 { :ok , result } -> result
206301 { :exit , _reason } -> % { repo: "?" , dirty: false , fetched: false , pulled: false , pushed: false , diverged: false , error: "timeout" }
207302 end )
208- |> tap ( fn _ -> IO . puts ( "" ) end ) # Clear progress line
209303 end
210304
211305 defp sync_one_repo ( repo , config , counter , total ) do
0 commit comments