From a916031de0731b043bd2ee81c2f2429f32650318 Mon Sep 17 00:00:00 2001 From: Shawn Xie Date: Thu, 21 May 2026 07:08:37 +0000 Subject: [PATCH 1/4] fix(onboard): fail fast in preflight when all dashboard ports are occupied (#3953) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `findAvailableDashboardPort` already raises the right error ("All dashboard ports in range 18789-18799 are occupied …") when every port is held, but it only runs late in onboarding — after preflight, gateway start, and inference selection have already had side effects. A user with all 11 dashboard ports held by external processes (python3, etc.) saw onboard proceed all the way to the inference menu before any port-related failure surfaced, contradicting the contract in the docs / test plan. Add a narrow `preflightDashboardPortRangeAvailability()` helper to `src/lib/onboard/dashboard-port.ts` and call it from `preflight()` right before it returns the GPU detection. The helper only checks host bindings (it does not need OpenShell forward state, so it is safe before gateway start). When every port in `DASHBOARD_PORT_RANGE_START..END` is bound, it prints the same canonical message and exits 1. To keep `src/lib/onboard.ts` net-neutral per the `onboard-entrypoint-budget` workflow, the unused `findDashboardForwardOwner` import / re-export is dropped (no callers outside the colocated unit test, which imports it directly from `./onboard/dashboard-port`). Net delta on `onboard.ts`: +2/-2. Test plan - Added 3 unit tests in `src/lib/onboard/dashboard-port.test.ts` with injected `isPortBoundOnHost`/`process.exit` stubs: * exits 1 with the canonical message when every port in the range is bound; stderr lists each port → non-OpenShell host listener. * returns without exiting when 10 of 11 are bound and 1 is free. * returns without exiting when no port is bound. - `npx vitest run src/lib/onboard/dashboard-port.test.ts` — 11/11 pass. - Manual end-to-end on Ubuntu 24.04 x86_64: * Occupy 18789-18799 with `python3 -m http.server` placeholders. * `nemoclaw onboard --non-interactive ...` now exits inside `[1/8] Preflight checks` with `All dashboard ports in range 18789-18799 are occupied:` followed by one line per occupied port. `[2/8] Starting OpenShell gateway` and `[3/8] Configuring inference` are never printed. Signed-off-by: Shawn Xie --- src/lib/onboard.ts | 4 +-- src/lib/onboard/dashboard-port.test.ts | 46 ++++++++++++++++++++++++++ src/lib/onboard/dashboard-port.ts | 41 +++++++++++++++++++++++ 3 files changed, 89 insertions(+), 2 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 0971d9f616..b71ff7495f 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -293,9 +293,9 @@ const tiers: typeof import("./policy/tiers") = require("./policy/tiers"); const { ensureUsageNoticeConsent } = require("./onboard/usage-notice"); const { findAvailableDashboardPort, - findDashboardForwardOwner, getOccupiedPorts, isLiveForwardStatus, + preflightDashboardPortRangeAvailability, } = require("./onboard/dashboard-port") as typeof import("./onboard/dashboard-port"); const { destroyGatewayForReuse } = require("./onboard/gateway-cleanup") as typeof import("./onboard/gateway-cleanup"); const { verifyGatewayContainerRunning } = @@ -3841,6 +3841,7 @@ async function preflight( } } + preflightDashboardPortRangeAvailability(); // #3953 — fail fast on dashboard-port exhaustion before step 2 return gpu; } @@ -10249,7 +10250,6 @@ module.exports = { startGateway, findAvailableDashboardPort, - findDashboardForwardOwner, startGatewayForRecovery, openshellArgv, runCaptureOpenshell, diff --git a/src/lib/onboard/dashboard-port.test.ts b/src/lib/onboard/dashboard-port.test.ts index c70fe68d1a..10e99b7bcc 100644 --- a/src/lib/onboard/dashboard-port.test.ts +++ b/src/lib/onboard/dashboard-port.test.ts @@ -8,6 +8,7 @@ import { describe, it } from "vitest"; import { findAvailableDashboardPort, findDashboardForwardOwner, + preflightDashboardPortRangeAvailability, } from "../../../dist/lib/onboard/dashboard-port"; describe("findDashboardForwardOwner", () => { @@ -93,3 +94,48 @@ describe("findAvailableDashboardPort port-conflict detection (#3260)", () => { assert.deepEqual(seen, [18789, 18790]); }); }); + +describe("preflightDashboardPortRangeAvailability (#3953)", () => { + const allBound = (_p: number) => true; + const noneBound = (_p: number) => false; + const someBound = (...bound: number[]) => { + const set = new Set(bound); + return (p: number) => set.has(p); + }; + + it("exits 1 with the canonical message when every port in the range is bound", () => { + let exitCode: number | undefined; + const exitFn = ((code?: number) => { + exitCode = code; + throw new Error(`__exit_${code ?? 0}__`); + }) as (code?: number) => never; + const stderrChunks: string[] = []; + const origError = console.error; + console.error = (msg: string) => { stderrChunks.push(msg); }; + try { + assert.throws(() => preflightDashboardPortRangeAvailability(allBound, exitFn), /__exit_1__/); + } finally { + console.error = origError; + } + assert.equal(exitCode, 1); + const combined = stderrChunks.join("\n"); + assert.match(combined, /All dashboard ports in range 18789-18799 are occupied:/); + assert.match(combined, / 18789 → non-OpenShell host listener/); + assert.match(combined, / 18799 → non-OpenShell host listener/); + assert.match(combined, /--control-ui-port /); + }); + + it("returns without exiting when at least one port in the range is free", () => { + // Even if 10 of 11 ports are bound, the one free port short-circuits success. + const bound = someBound(18789, 18790, 18791, 18792, 18793, 18794, 18795, 18796, 18797, 18798); + preflightDashboardPortRangeAvailability(bound, (() => { + throw new Error("exitFn must not be called when a port is free"); + }) as (code?: number) => never); + }); + + it("returns without exiting when no port is bound", () => { + preflightDashboardPortRangeAvailability(noneBound, (() => { + throw new Error("exitFn must not be called when no port is bound"); + }) as (code?: number) => never); + }); +}); diff --git a/src/lib/onboard/dashboard-port.ts b/src/lib/onboard/dashboard-port.ts index 1f6803ae1b..5919eaa191 100644 --- a/src/lib/onboard/dashboard-port.ts +++ b/src/lib/onboard/dashboard-port.ts @@ -189,3 +189,44 @@ export function findAvailableDashboardPort( `Free a sandbox or use --control-ui-port with a port outside this range.`, ); } + +/** + * Preflight scan of the dashboard port range. If every port in + * [DASHBOARD_PORT_RANGE_START, DASHBOARD_PORT_RANGE_END] is bound on + * the host, print the same "All dashboard ports in range … are + * occupied" error that `findAvailableDashboardPort` would eventually + * raise during sandbox creation and exit non-zero. Calling this from + * `preflight()` surfaces the failure before any side effects (gateway + * start, inference setup), matching the contract reporters expect + * (#3953). + * + * Intentionally narrower than `findAvailableDashboardPort`: it does not + * consult OpenShell forward state, never reserves a port, and treats + * every bound port as a non-OpenShell listener. That is sound here — + * if every port is bound, the host either has no free port for a new + * sandbox OR every port already serves an existing sandbox dashboard; + * both cases require operator intervention via `--control-ui-port`. + * + * The `exitFn` and `isPortBoundCheck` parameters are dependency + * injection seams for unit tests; production callers use the defaults. + */ +export function preflightDashboardPortRangeAvailability( + isPortBoundCheck: (port: number) => boolean = isPortBoundOnHost, + exitFn: (code?: number) => never = process.exit as (code?: number) => never, +): void { + const ports = Array.from( + { length: DASHBOARD_PORT_RANGE_END - DASHBOARD_PORT_RANGE_START + 1 }, + (_, i) => DASHBOARD_PORT_RANGE_START + i, + ); + const bound: number[] = []; + for (const p of ports) { + if (!isPortBoundCheck(p)) return; + bound.push(p); + } + const lines = bound.map((p) => ` ${p} → non-OpenShell host listener`).join("\n"); + console.error( + ` All dashboard ports in range ${DASHBOARD_PORT_RANGE_START}-${DASHBOARD_PORT_RANGE_END} are occupied:\n${lines}\n` + + ` Free a sandbox or use --control-ui-port with a port outside this range.`, + ); + exitFn(1); +} From 299271171c82d660f364b9fb8365c0cf385806be Mon Sep 17 00:00:00 2001 From: Shawn Xie Date: Fri, 22 May 2026 01:44:20 +0000 Subject: [PATCH 2/4] fix(onboard): skip dashboard port-range preflight when --control-ui-port is set Addresses CodeRabbit on #3980. The previous commit called preflightDashboardPortRangeAvailability() unconditionally at the end of preflight(), which broke the explicit --control-ui-port path: a user who passes `--control-ui-port 3000` to escape the default range would still hit the "All dashboard ports in range 18789-18799 are occupied" exit, even though that range is no longer the one we'll use. Guard the call on `_preflightDashboardPort === null` so the new fail-fast only applies to the auto-allocation path. The explicit-port path is already validated separately above (lines ~3670-3700, behind the same `dashboardPortToCheck` flag). Verified end-to-end: with 18789-18799 held by python3 placeholders, `nemoclaw onboard --control-ui-port 23000` no longer prints the range-exhaustion error in preflight; without the flag, the existing fail-fast behavior is unchanged. onboard.ts diff stays at +2/-2 net 0, within the entrypoint budget. Signed-off-by: Shawn Xie --- src/lib/onboard.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index b71ff7495f..cd5ad599ac 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -3841,7 +3841,7 @@ async function preflight( } } - preflightDashboardPortRangeAvailability(); // #3953 — fail fast on dashboard-port exhaustion before step 2 + if (_preflightDashboardPort === null) preflightDashboardPortRangeAvailability(); // #3953 — skip when --control-ui-port already gates the explicit port return gpu; } From ec7e315f506f0be68f891c37e2d0c7bdd61150ae Mon Sep 17 00:00:00 2001 From: Shawn Xie Date: Fri, 22 May 2026 02:04:46 +0000 Subject: [PATCH 3/4] fix(onboard): treat NEMOCLAW_DASHBOARD_PORT as preflight override and gate resume path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeRabbit pointed out two follow-ups on PR #3980's fail-fast preflight: 1. The env override NEMOCLAW_DASHBOARD_PORT pins an explicit dashboard port the same way --control-ui-port does, but only --control-ui-port was disabling the new range-exhaustion fail-fast. Operators pinning 18900 with the default 18789-18799 range full would still exit on preflight even though their port was outside the range. Seed _preflightDashboardPort from the resolved env-override too. 2. The new fail-fast guard ran in the normal preflight path only; the --resume path reuses cached preflight and would walk past an exhausted range, only failing later in dashboard allocation after sandbox work has started — exactly the late failure this PR is trying to prevent. Mirror the guard in the resume branch. Signed-off-by: Shawn Xie --- src/lib/onboard.ts | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 43ceec284c..7e572433d7 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -8978,7 +8978,11 @@ async function onboard(opts: OnboardOptions = {}): Promise { NON_INTERACTIVE = opts.nonInteractive || process.env.NEMOCLAW_NON_INTERACTIVE === "1"; RECREATE_SANDBOX = opts.recreateSandbox || process.env.NEMOCLAW_RECREATE_SANDBOX === "1"; AUTO_YES = opts.autoYes === true || process.env.NEMOCLAW_YES === "1"; - _preflightDashboardPort = opts.controlUiPort || null; + // #3953 — treat NEMOCLAW_DASHBOARD_PORT the same as --control-ui-port for + // preflight gating: when the operator pins an explicit port via the env var, + // the dashboard-range fail-fast must not exit on range exhaustion. + const envDashboardPort = process.env.NEMOCLAW_DASHBOARD_PORT != null ? DASHBOARD_PORT : null; + _preflightDashboardPort = opts.controlUiPort ?? envDashboardPort ?? null; delete process.env.OPENSHELL_GATEWAY; const resume = opts.resume === true; const fresh = opts.fresh === true; @@ -9347,6 +9351,11 @@ async function onboard(opts: OnboardOptions = {}): Promise { !resumeSandboxGpuConfig.sandboxGpuEnabled; assertCdiNvidiaGpuSpecPresent(assessHost(), resumeOptedOutGpuPassthrough); validateSandboxGpuPreflight(resumeSandboxGpuConfig); + // #3953 — resume reuses the cached preflight result and would otherwise + // skip the dashboard-range fail-fast added at the normal preflight path; + // mirror it here so `--resume` doesn't walk past an exhausted range and + // fail later in dashboard allocation after sandbox work has started. + if (_preflightDashboardPort === null) preflightDashboardPortRangeAvailability(); } else { startRecordedStep("preflight"); gpu = await preflight({ ...opts, optedOutGpuPassthrough: opts.noGpu === true }); From a0cc4305b16ccab1c5e1e1e43e8c58d811ebe5b1 Mon Sep 17 00:00:00 2001 From: Shawn Xie Date: Fri, 22 May 2026 02:16:09 +0000 Subject: [PATCH 4/4] fix(onboard): shrink dashboard-port preflight changes under budget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the verbose 4-line env-override comment block with a single inline ternary on the assignment line, and pack the resume guard onto the existing validateSandboxGpuPreflight call. Semantically identical to ec7e315f5; just brings src/lib/onboard.ts back under the onboard-entrypoint-budget gate (additions ≤ deletions). Signed-off-by: Shawn Xie --- src/lib/onboard.ts | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 7e572433d7..b7273304d6 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -8978,11 +8978,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { NON_INTERACTIVE = opts.nonInteractive || process.env.NEMOCLAW_NON_INTERACTIVE === "1"; RECREATE_SANDBOX = opts.recreateSandbox || process.env.NEMOCLAW_RECREATE_SANDBOX === "1"; AUTO_YES = opts.autoYes === true || process.env.NEMOCLAW_YES === "1"; - // #3953 — treat NEMOCLAW_DASHBOARD_PORT the same as --control-ui-port for - // preflight gating: when the operator pins an explicit port via the env var, - // the dashboard-range fail-fast must not exit on range exhaustion. - const envDashboardPort = process.env.NEMOCLAW_DASHBOARD_PORT != null ? DASHBOARD_PORT : null; - _preflightDashboardPort = opts.controlUiPort ?? envDashboardPort ?? null; + _preflightDashboardPort = opts.controlUiPort ?? (process.env.NEMOCLAW_DASHBOARD_PORT != null ? DASHBOARD_PORT : null); delete process.env.OPENSHELL_GATEWAY; const resume = opts.resume === true; const fresh = opts.fresh === true; @@ -9350,12 +9346,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { (opts.gpu !== true && session?.gpuPassthrough === false) || !resumeSandboxGpuConfig.sandboxGpuEnabled; assertCdiNvidiaGpuSpecPresent(assessHost(), resumeOptedOutGpuPassthrough); - validateSandboxGpuPreflight(resumeSandboxGpuConfig); - // #3953 — resume reuses the cached preflight result and would otherwise - // skip the dashboard-range fail-fast added at the normal preflight path; - // mirror it here so `--resume` doesn't walk past an exhausted range and - // fail later in dashboard allocation after sandbox work has started. - if (_preflightDashboardPort === null) preflightDashboardPortRangeAvailability(); + validateSandboxGpuPreflight(resumeSandboxGpuConfig); if (_preflightDashboardPort === null) preflightDashboardPortRangeAvailability(); // #3953 — resume must mirror non-resume fail-fast } else { startRecordedStep("preflight"); gpu = await preflight({ ...opts, optedOutGpuPassthrough: opts.noGpu === true });