Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/reference/commands.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,8 @@ Use that line to distinguish a healthy backend from a broken proxy path that the

For cloud-only providers, the output omits the NIM status line unless a NIM container is registered or an unexpected NIM container is running.

When the sandbox's recorded driver is `docker` and the host Docker daemon is not reachable, the command prints `Failure layer: docker_unreachable — Docker daemon is not reachable.` as the first line of stdout, suppresses the host-side `Inference` probe (which otherwise hits the remote provider directly and is misleading when the local stack is down), and exits with a non-zero status.

If the sandbox or gateway cannot be verified, the command exits non-zero instead of reporting healthy inference from stale registry state.
Gateway and dashboard health checks treat HTTP `401` from device auth as a live service, not as an offline gateway.

Expand Down
4 changes: 4 additions & 0 deletions src/lib/actions/sandbox/gateway-failure-classifier.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ function defaultDockerInfo(): boolean {
return dockerInfo({ ignoreError: true, timeout: DOCKER_TIMEOUT_MS }).length > 0;
}

/**
 * Reports whether the host Docker daemon answers the default probe.
 *
 * Exported wrapper around `defaultDockerInfo` so callers outside this module
 * can reuse the same reachability check.
 *
 * @returns `true` when `defaultDockerInfo` reports success, otherwise `false`.
 */
export function isDockerDaemonReachable(): boolean {
  const daemonResponded = defaultDockerInfo();
  return daemonResponded;
}

function dockerContainerListed(container: string, allFlag: boolean): boolean {
const args = ["ps"];
if (allFlag) args.push("-a");
Expand Down
38 changes: 37 additions & 1 deletion src/lib/actions/sandbox/status.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
import { describe, expect, it } from "vitest";

import type { ProviderHealthProbeOptions } from "../../../../dist/lib/inference/health";
import { getSandboxStatusInferenceHealth } from "../../../../dist/lib/actions/sandbox/status";
import {
getSandboxStatusInferenceHealth,
isDockerDaemonUnreachableForStatus,
} from "../../../../dist/lib/actions/sandbox/status";

describe("sandbox status inference health", () => {
it("passes the current model with the current provider", () => {
Expand Down Expand Up @@ -50,3 +53,36 @@ describe("sandbox status inference health", () => {
expect(called).toBe(false);
});
});

describe("isDockerDaemonUnreachableForStatus", () => {
  // Probe stubs that stand in for the real Docker reachability check.
  const daemonReachable = () => true;
  const daemonUnreachable = () => false;

  // Builds a minimal registry-entry fixture that only carries the fields the
  // helper under test is given here; call sites cast it to the entry type.
  const sandboxWithDriver = (openshellDriver: string) => ({ name: "alpha", openshellDriver });

  it("returns false when sandbox entry is null", () => {
    const result = isDockerDaemonUnreachableForStatus(null, daemonUnreachable);
    expect(result).toBe(false);
  });

  it("returns false when the openshell driver is not docker", () => {
    const result = isDockerDaemonUnreachableForStatus(
      sandboxWithDriver("vm") as never,
      daemonUnreachable,
    );
    expect(result).toBe(false);
  });

  it("returns true when driver is docker and the probe reports unreachable", () => {
    const result = isDockerDaemonUnreachableForStatus(
      sandboxWithDriver("docker") as never,
      daemonUnreachable,
    );
    expect(result).toBe(true);
  });

  it("returns false when driver is docker and the probe reports reachable", () => {
    const result = isDockerDaemonUnreachableForStatus(
      sandboxWithDriver("docker") as never,
      daemonReachable,
    );
    expect(result).toBe(false);
  });
});
59 changes: 48 additions & 11 deletions src/lib/actions/sandbox/status.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,11 @@ import {
getActiveSandboxSessions,
} from "../../state/sandbox-session";
import { getSandboxDockerHealth } from "./docker-health";
import { classifyGatewayFailure, getLayerHeader } from "./gateway-failure-classifier";
import {
classifyGatewayFailure,
getLayerHeader,
isDockerDaemonReachable,
} from "./gateway-failure-classifier";
import type { SandboxGatewayState } from "./gateway-state";
import {
getReconciledSandboxGatewayState,
Expand Down Expand Up @@ -106,15 +110,42 @@ function maybeEnsureHermesToolGatewayBroker(sb: registry.SandboxEntry | null): v
}
}

async function printGatewayFailureLayerHeader(sandboxName: string): Promise<void> {
/**
 * Classifies the gateway failure for a sandbox and logs the matching
 * failure-layer header.
 *
 * @param sandboxName - Sandbox whose gateway failure is classified.
 * @param alreadyPrintedDockerUnreachable - When `true`, a `docker_unreachable`
 *   classification is not logged again; every other layer is still logged.
 */
async function printGatewayFailureLayerHeader(
  sandboxName: string,
  alreadyPrintedDockerUnreachable = false,
): Promise<void> {
  const { layer } = await classifyGatewayFailure(sandboxName);
  // Only the docker_unreachable header can be a repeat of what the caller
  // already printed; anything else is new information.
  const isRepeatedDockerHeader = alreadyPrintedDockerUnreachable && layer === "docker_unreachable";
  if (!isRepeatedDockerHeader) {
    console.log(` ${getLayerHeader(layer)}`);
  }
}

/** Synchronous check that returns `true` when the Docker daemon is reachable. */
type DockerInfoProbe = () => boolean;

/**
 * Decides whether the Docker daemon should be reported as unreachable for the
 * given sandbox entry.
 *
 * The probe is only invoked when the entry exists and its `openshellDriver`
 * is `"docker"`; for a missing entry or any other driver the result is
 * `false` without probing.
 *
 * @param sb - Registry entry for the sandbox, or `null` when there is none.
 * @param probe - Reachability check; defaults to `isDockerDaemonReachable`.
 * @returns `true` only for a docker-driver entry whose probe returns `false`.
 */
export function isDockerDaemonUnreachableForStatus(
  sb: registry.SandboxEntry | null,
  probe: DockerInfoProbe = isDockerDaemonReachable,
): boolean {
  const usesDockerDriver = sb ? sb.openshellDriver === "docker" : false;
  if (usesDockerDriver) {
    return !probe();
  }
  return false;
}

// eslint-disable-next-line complexity
export async function showSandboxStatus(sandboxName: string): Promise<void> {
const sb = registry.getSandbox(sandboxName);
maybeEnsureHermesToolGatewayBroker(sb);
// When the host Docker daemon is stopped on a docker-driver sandbox, the
// cached sandbox metadata renders as a "healthy" report even though the
// local container stack is down, and the host-side Inference probe hits the
// remote provider directly so it falsely shows healthy too. Probe the
// daemon once upfront so the failure-layer header is the first thing the
// user sees and the misleading probe is suppressed.
const dockerUnreachable = isDockerDaemonUnreachableForStatus(sb);
if (dockerUnreachable) {
console.log(getLayerHeader("docker_unreachable"));
process.exitCode = 1;
}
// #2666: never let an unexpected throw from the gateway probe (e.g. openshell
// hanging when its container is stopped and the published port is held by a
// foreign listener) suppress the sandbox header. The downstream switch
Expand Down Expand Up @@ -154,11 +185,17 @@ export async function showSandboxStatus(sandboxName: string): Promise<void> {
liveResult && !isCommandTimeout(liveResult) ? parseGatewayInference(liveResult.output) : null;
const currentModel = (live && live.model) || (sb && sb.model) || "unknown";
const currentProvider = (live && live.provider) || (sb && sb.provider) || "unknown";
const inferenceHealth = getSandboxStatusInferenceHealth(
lookup.state === "present",
currentProvider,
currentModel,
);
// When docker is unreachable on a docker-driver sandbox, host-side probes
// misrepresent the sandbox state — the remote-provider reachability check
// doesn't go through the local stack. Suppress the probe so the output
// doesn't conflict with the failure-layer header.
const inferenceHealth = dockerUnreachable
? null
: getSandboxStatusInferenceHealth(
lookup.state === "present",
currentProvider,
currentModel,
);
// #3265 optional 3rd line: probe the full inference chain (openclaw gateway
// → auth proxy → backend) from inside the sandbox so a broken hop the
// host-side probes can't see still surfaces in `status`.
Expand Down Expand Up @@ -294,7 +331,7 @@ export async function showSandboxStatus(sandboxName: string): Promise<void> {
if (guard.state === "connected_other") {
printWrongGatewayActiveGuidance(sandboxName, guard.activeGateway, console.log);
} else {
await printGatewayFailureLayerHeader(sandboxName);
await printGatewayFailureLayerHeader(sandboxName, dockerUnreachable);
printGatewayLifecycleHint(guard.status || "", sandboxName, console.log);
}
} else {
Expand Down Expand Up @@ -328,7 +365,7 @@ export async function showSandboxStatus(sandboxName: string): Promise<void> {
process.exit(1);
} else if (lookup.state === "gateway_unreachable_after_restart") {
console.log("");
await printGatewayFailureLayerHeader(sandboxName);
await printGatewayFailureLayerHeader(sandboxName, dockerUnreachable);
console.log(
` Sandbox '${sandboxName}' may still exist, but the selected ${CLI_DISPLAY_NAME} gateway is still refusing connections after restart.`,
);
Expand All @@ -344,7 +381,7 @@ export async function showSandboxStatus(sandboxName: string): Promise<void> {
process.exit(1);
} else if (lookup.state === "gateway_missing_after_restart") {
console.log("");
await printGatewayFailureLayerHeader(sandboxName);
await printGatewayFailureLayerHeader(sandboxName, dockerUnreachable);
console.log(
` Sandbox '${sandboxName}' may still exist locally, but the ${CLI_DISPLAY_NAME} gateway is no longer configured after restart/rebuild.`,
);
Expand All @@ -364,7 +401,7 @@ export async function showSandboxStatus(sandboxName: string): Promise<void> {
if (lookup.output) {
console.log(lookup.output);
}
await printGatewayFailureLayerHeader(sandboxName);
await printGatewayFailureLayerHeader(sandboxName, dockerUnreachable);
printGatewayLifecycleHint(lookup.output, sandboxName, console.log);
process.exit(1);
}
Expand Down
114 changes: 114 additions & 0 deletions test/cli.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -943,6 +943,120 @@ describe("CLI dispatch", () => {
});
});

// Docker-driver sandbox with a `docker` stub that always exits 1 and an
// `openshell` stub that reports a connected gateway. Expects exit code 1, the
// docker_unreachable header as the very first output, printed exactly once and
// before the sandbox line, and no "Inference: healthy" line.
it("sandbox <name> status surfaces docker_unreachable header and suppresses stale Inference probe", () => {
  const tmpPrefix = path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-docker-unreachable-");
  const home = fs.mkdtempSync(tmpPrefix);
  const localBin = path.join(home, "bin");
  fs.mkdirSync(localBin, { recursive: true });

  const registryEntry = {
    provider: "openai-api",
    model: "gpt-4o-mini",
    openshellDriver: "docker",
  } as unknown as Partial<SandboxEntry>;
  writeSandboxRegistry(home, "alpha", registryEntry);

  // Fake `docker` that fails every invocation.
  const dockerStub = ["#!/usr/bin/env bash", "exit 1"].join("\n");
  fs.writeFileSync(path.join(localBin, "docker"), dockerStub, { mode: 0o755 });

  // Fake `openshell` that answers the inference, status, and gateway-info queries.
  const openshellStub = [
    "#!/usr/bin/env bash",
    'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then',
    " echo 'Gateway inference:'",
    " echo ' Provider: openai-api'",
    " echo ' Model: gpt-4o-mini'",
    " exit 0",
    "fi",
    'if [ "$1" = "status" ]; then',
    " echo 'Gateway: nemoclaw'",
    " echo 'Status: Connected'",
    " exit 0",
    "fi",
    'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then',
    " echo 'Gateway: nemoclaw'",
    " exit 0",
    "fi",
    "exit 0",
  ].join("\n");
  fs.writeFileSync(path.join(localBin, "openshell"), openshellStub, { mode: 0o755 });

  const env = {
    HOME: home,
    PATH: `${localBin}:${process.env.PATH || ""}`,
  };
  const r = runWithEnv("alpha status", env);

  expect(r.code).toBe(1);

  const startsWithHeader = r.out.startsWith(
    "Failure layer: docker_unreachable — Docker daemon is not reachable.",
  );
  expect(startsWithHeader).toBe(true);
  expect(r.out).not.toContain("Inference: healthy");

  // The header must come before the sandbox line.
  const headerIdx = r.out.indexOf("Failure layer: docker_unreachable");
  const sandboxIdx = r.out.indexOf("Sandbox: alpha");
  expect(headerIdx).toBeGreaterThanOrEqual(0);
  expect(sandboxIdx).toBeGreaterThan(headerIdx);

  // The header must not be repeated later in the output.
  const headerMatches = r.out.match(/Failure layer: docker_unreachable/g) || [];
  expect(headerMatches.length).toBe(1);
});

// Same failing `docker` stub as the docker-driver case, but the registry entry
// records the "vm" driver. Expects exit code 0, no docker_unreachable header,
// and the usual sandbox, provider, model, and Inference lines.
it("sandbox <name> status preserves Inference probe and exits 0 when openshellDriver is not docker", () => {
  const tmpPrefix = path.join(os.tmpdir(), "nemoclaw-cli-sandbox-status-non-docker-driver-");
  const home = fs.mkdtempSync(tmpPrefix);
  const localBin = path.join(home, "bin");
  fs.mkdirSync(localBin, { recursive: true });

  const registryEntry = {
    provider: "openai-api",
    model: "gpt-4o-mini",
    openshellDriver: "vm",
  } as unknown as Partial<SandboxEntry>;
  writeSandboxRegistry(home, "alpha", registryEntry);

  // Fake `docker` that fails every invocation.
  const dockerStub = ["#!/usr/bin/env bash", "exit 1"].join("\n");
  fs.writeFileSync(path.join(localBin, "docker"), dockerStub, { mode: 0o755 });

  // Fake `openshell` that answers the inference, status, and gateway-info queries.
  const openshellStub = [
    "#!/usr/bin/env bash",
    'if [ "$1" = "inference" ] && [ "$2" = "get" ]; then',
    " echo 'Gateway inference:'",
    " echo ' Provider: openai-api'",
    " echo ' Model: gpt-4o-mini'",
    " exit 0",
    "fi",
    'if [ "$1" = "status" ]; then',
    " echo 'Gateway: nemoclaw'",
    " echo 'Status: Connected'",
    " exit 0",
    "fi",
    'if [ "$1" = "gateway" ] && [ "$2" = "info" ]; then',
    " echo 'Gateway: nemoclaw'",
    " exit 0",
    "fi",
    "exit 0",
  ].join("\n");
  fs.writeFileSync(path.join(localBin, "openshell"), openshellStub, { mode: 0o755 });

  const env = {
    HOME: home,
    PATH: `${localBin}:${process.env.PATH || ""}`,
  };
  const r = runWithEnv("alpha status", env);

  expect(r.code).toBe(0);
  expect(r.out).not.toContain("Failure layer: docker_unreachable");

  const expectedLines = ["Sandbox: alpha", "Provider: openai-api", "Model: gpt-4o-mini"];
  for (const line of expectedLines) {
    expect(r.out).toContain(line);
  }
  expect(r.out).toMatch(/Inference:/);
});

it("status rejects unknown flags through current dispatch path", () => {
const r = run("status --bogus");
expect(r.code).toBe(2);
Expand Down
Loading