From 70ea80f905b39a45cfb22faece608fa35f505e11 Mon Sep 17 00:00:00 2001 From: David <12414531+DavidBellamy@users.noreply.github.com> Date: Sat, 18 Apr 2026 15:57:25 -0700 Subject: [PATCH] rollout: move base_port default from 15000 to 17500 Miles allocates SGLang engine ports (server, nccl, dist_init, dp_attention, engine_info_bootstrap) starting at base_port. The previous 15000 default fully overlapped Mooncake's RPC handshake range (rpc_min_port=15000..rpc_max_port=17000, from mooncake-transfer-engine/include/config.h). On PD-disaggregation runs, Mooncake's TransferEngine starts before SGLang's Uvicorn on each engine, so it can grab a port inside the miles-allocated range first. When sglang then tries to bind its HTTP server on the same port it gets EADDRINUSE and the engine dies. Downstream symptom: RolloutManager silently waits forever for the missing engine; no train_step fires. Secondary observable (noisy but benign): miles' _wait_server_healthy 2s poll (GET /health_generate) against the dead port lands on whichever process grabbed it (usually a sibling engine's mooncake handshake listener), producing log spam: readString: too large length from socket: 7018130145941931335 SocketHandShakePlugin: failed to receive handshake message, malformed json format ... json string length: 0, json string content: where 7018130145941931335, read as 8 little-endian ASCII bytes, decodes to 'GET /hea'. 17500 satisfies all three constraints (two existing, one new): - < 32768 (below ephemeral range) - > 10002 with margin (well above the low end of Ray's 10002-19999 range, where races occur) - > 17000 (NEW: clear of mooncake RPC range) No behavior change for callers that pass base_port explicitly. Observed on LLM360/RL360 jobs 1564764 (port 15082) and 1565161 (port 15079). 
--- miles/ray/rollout.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/miles/ray/rollout.py b/miles/ray/rollout.py index 2a75d492b9..08c894e304 100644 --- a/miles/ray/rollout.py +++ b/miles/ray/rollout.py @@ -169,7 +169,7 @@ def start_engines(self, port_cursors: dict[int, int] | None = None) -> tuple[lis args=self.args, rollout_engines=rollout_engines ) else: - base_port = max(port_cursors.values()) if port_cursors else 15000 + base_port = max(port_cursors.values()) if port_cursors else 17500 addr_and_ports, port_cursors = _allocate_rollout_engine_addr_and_ports_normal( args=self.args, rollout_engines=rollout_engines, @@ -814,7 +814,7 @@ def _allocate_rollout_engine_addr_and_ports_normal( worker_type="regular", num_gpus_per_engine=None, rank_offset=0, - base_port=15000, + base_port=17500, ): # get ports # there are 4 ports we need to allocate @@ -842,8 +842,17 @@ def _allocate_rollout_engine_addr_and_ports_normal( num_engines_on_this_node = num_engines_per_node - (local_rank % num_engines_per_node) def get_addr_and_ports(engine, node_idx): - # use small ports to prevent ephemeral port between 32768 and 65536. - # also, ray uses port 10002-19999, thus we avoid near-10002 to avoid racing condition + # Port range constraints (all must hold): + # - < 32768 to stay clear of the ephemeral port range (32768-65535) + # - > 10002 but away from 10002 to avoid races with Ray (10002-19999) + # - > 17000 to stay clear of Mooncake's RPC handshake range + # (rpc_min_port=15000..rpc_max_port=17000 from + # mooncake-transfer-engine/include/config.h). Prior default of + # 15000 fully overlapped mooncake and caused intermittent + # EADDRINUSE when mooncake's TransferEngine grabbed a port + # inside the miles-allocated range before sglang's Uvicorn + # could bind it. + # 17500+ satisfies all three. start_port = node_port_cursor.get(node_idx, base_port) def port(consecutive=1):