From 70ea80f905b39a45cfb22faece608fa35f505e11 Mon Sep 17 00:00:00 2001
From: David <12414531+DavidBellamy@users.noreply.github.com>
Date: Sat, 18 Apr 2026 15:57:25 -0700
Subject: [PATCH] rollout: move base_port default from 15000 to 17500

Miles allocates SGLang engine ports (server, nccl, dist_init, dp_attention,
engine_info_bootstrap) starting at base_port. The previous 15000 default
fully overlapped Mooncake's RPC handshake range
(rpc_min_port=15000..rpc_max_port=17000, from
mooncake-transfer-engine/include/config.h). On PD-disaggregation runs,
Mooncake's TransferEngine starts before SGLang's Uvicorn on each engine,
so it can grab a port inside the miles-allocated range first. When
SGLang then tries to bind its HTTP server to the same port, it gets
EADDRINUSE and the engine dies. Downstream symptom: RolloutManager
silently waits forever for the missing engine; no train_step fires.

Secondary symptom (noisy but benign): miles' _wait_server_healthy polls
GET /health_generate every 2s; against the dead engine's port, each poll
lands on whichever process grabbed that port (usually a sibling engine's
Mooncake handshake listener), producing log spam:

  readString: too large length from socket: 7018130145941931335
  SocketHandShakePlugin: failed to receive handshake message,
  malformed json format ... json string length: 0, json string content:

where 7018130145941931335, read as 8 little-endian bytes, is the ASCII
string 'GET /hea' (the start of the health-check request line).

17500 satisfies all three constraints (two existing, one new):
- < 32768 (below the ephemeral range)
- > 10002 with margin (Ray uses 10002-19999; ports near 10002 race with it)
- > 17000 (NEW: clear of the Mooncake RPC range)

No behavior change for callers that pass base_port explicitly.

Observed on LLM360/RL360 jobs 1564764 (port 15082) and 1565161
(port 15079).
---
 miles/ray/rollout.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/miles/ray/rollout.py b/miles/ray/rollout.py
index 2a75d492b9..08c894e304 100644
--- a/miles/ray/rollout.py
+++ b/miles/ray/rollout.py
@@ -169,7 +169,7 @@ def start_engines(self, port_cursors: dict[int, int] | None = None) -> tuple[lis
                 args=self.args, rollout_engines=rollout_engines
             )
         else:
-            base_port = max(port_cursors.values()) if port_cursors else 15000
+            base_port = max(port_cursors.values()) if port_cursors else 17500
             addr_and_ports, port_cursors = _allocate_rollout_engine_addr_and_ports_normal(
                 args=self.args,
                 rollout_engines=rollout_engines,
@@ -814,7 +814,7 @@ def _allocate_rollout_engine_addr_and_ports_normal(
     worker_type="regular",
     num_gpus_per_engine=None,
     rank_offset=0,
-    base_port=15000,
+    base_port=17500,
 ):
     # get ports
     # there are 4 ports we need to allocate
@@ -842,8 +842,17 @@ def _allocate_rollout_engine_addr_and_ports_normal(
         num_engines_on_this_node = num_engines_per_node - (local_rank % num_engines_per_node)
 
         def get_addr_and_ports(engine, node_idx):
-            # use small ports to prevent ephemeral port between 32768 and 65536.
-            # also, ray uses port 10002-19999, thus we avoid near-10002 to avoid racing condition
+            # Port range constraints (all must hold):
+            #   - < 32768 to stay clear of the ephemeral port range (32768-65535)
+            #   - > 10002 with margin, to avoid races with Ray (which uses 10002-19999)
+            #   - > 17000 to stay clear of Mooncake's RPC handshake range
+            #     (rpc_min_port=15000..rpc_max_port=17000 from
+            #     mooncake-transfer-engine/include/config.h). Prior default of
+            #     15000 fully overlapped mooncake and caused intermittent
+            #     EADDRINUSE when mooncake's TransferEngine grabbed a port
+            #     inside the miles-allocated range before sglang's Uvicorn
+            #     could bind it.
+            # 17500+ satisfies all three.
             start_port = node_port_cursor.get(node_idx, base_port)
 
             def port(consecutive=1):