Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 14 additions & 27 deletions src/lib/actions/sandbox/snapshot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@

import fs from "node:fs";
import path from "node:path";

import { CLI_NAME } from "../../cli/branding";
import { dockerCapture, dockerInspect } from "../../adapters/docker";
import { stripAnsi } from "../../adapters/openshell/client";
import { parseLiveSandboxNames } from "../../runtime-recovery";
import { ROOT, run, shellQuote, validateName } from "../../runner";
import { captureOpenshell, getOpenshellBinary } from "../../adapters/openshell/runtime";
import { CLI_NAME } from "../../cli/branding";
import * as policies from "../../policy";
import { ROOT, run, shellQuote, validateName } from "../../runner";
import { parseLiveSandboxNames } from "../../runtime-recovery";
import { isGatewayHealthy } from "../../state/gateway";
import type { SandboxEntry } from "../../state/registry";
import * as registry from "../../state/registry";
import type { SandboxEntry } from "../../state/registry";
import * as sandboxState from "../../state/sandbox";

const useColor = !process.env.NO_COLOR && !!process.stdout.isTTY;
Expand Down Expand Up @@ -204,33 +205,19 @@ async function autoCreateSandboxFromSource(
console.log(` ${G}\u2713${R} Sandbox '${dstName}' created`);
}

// Docker/VM-driver sandboxes do not expose the legacy cluster container, so
// verify gateway health through OpenShell metadata instead.
function probeGatewayMetadataHealth(): boolean {
// Returns true only when the gateway Docker container is confirmed running.
// `openshell sandbox list` reads a local registry and exits 0 even when the
// gateway is stopped (#2673), so we probe the container directly instead.
function probeDockerDriverGatewayRunning(): boolean {
const status = captureOpenshell(["status"], { ignoreError: true, timeout: 10000 });
const namedGatewayInfo = captureOpenshell(["gateway", "info", "-g", NEMOCLAW_GATEWAY_NAME], {
ignoreError: true,
timeout: 10000,
});
const activeGatewayInfo = captureOpenshell(["gateway", "info"], {
ignoreError: true,
timeout: 10000,
});
return isGatewayHealthy(
status.output || "",
namedGatewayInfo.output || "",
activeGatewayInfo.output || "",
);
}

function usesGatewayMetadataProbe(driver: string | null | undefined): boolean {
return driver === "docker" || driver === "vm";
const clean = stripAnsi(status.output || "");
return status.status === 0 && /^\s*Status:\s*Connected\b/im.test(clean);
}

function probeGatewayRunning(sandboxName?: string): boolean {
const entry = sandboxName ? registry.getSandbox(sandboxName) : null;
if (usesGatewayMetadataProbe(entry?.openshellDriver)) {
return probeGatewayMetadataHealth();
if (entry?.openshellDriver === "docker") {
return probeDockerDriverGatewayRunning();
}
const container = `openshell-cluster-${NEMOCLAW_GATEWAY_NAME}`;
const result = dockerInspect(
Expand Down
104 changes: 31 additions & 73 deletions test/snapshot-gateway-guard.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,90 +51,58 @@ function runCli(args: string, env: Record<string, string | undefined> = {}): Cli
* This setup reproduces the exact failure mode from #2673: openshell returns
* exit 0 with stale data, so the old isLive.status guard never fires.
*/
function writeExecutable(filePath: string, lines: string[]): void {
fs.writeFileSync(filePath, ["#!/bin/sh", ...lines].join("\n"), { mode: 0o755 });
}
function makeStoppedGatewayEnv(prefix: string): Record<string, string> {
const home = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
const localBin = path.join(home, "bin");
fs.mkdirSync(localBin, { recursive: true });

function writeSandboxRegistry(
home: string,
sandboxName: string,
entry: Record<string, unknown> = {},
): void {
const registryDir = path.join(home, ".nemoclaw");
fs.mkdirSync(registryDir, { recursive: true });
fs.writeFileSync(
path.join(registryDir, "sandboxes.json"),
JSON.stringify({
sandboxes: {
[sandboxName]: {
name: sandboxName,
alpha: {
name: "alpha",
model: "test-model",
provider: "nvidia-prod",
gpuEnabled: false,
policies: [],
...entry,
},
},
defaultSandbox: sandboxName,
defaultSandbox: "alpha",
}),
{ mode: 0o600 },
);
}

function makeStoppedGatewayEnv(prefix: string): Record<string, string> {
const home = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
const localBin = path.join(home, "bin");
fs.mkdirSync(localBin, { recursive: true });
writeSandboxRegistry(home, "alpha");

// openshell lies: sandbox list exits 0 and lists alpha as Ready even though
// the gateway container is down (reads stale local registry/cache).
writeExecutable(path.join(localBin, "openshell"), [
'if [ "$1" = "sandbox" ] && [ "$2" = "list" ]; then',
' printf "NAME STATUS\\nalpha Ready\\n"',
" exit 0",
"fi",
"exit 0",
]);
fs.writeFileSync(
path.join(localBin, "openshell"),
[
"#!/bin/sh",
'if [ "$1" = "sandbox" ] && [ "$2" = "list" ]; then',
' printf "NAME STATUS\\nalpha Ready\\n"',
" exit 0",
"fi",
"exit 0",
].join("\n"),
{ mode: 0o755 },
);

// docker inspect: returns "false" for State.Running (gateway stopped).
writeExecutable(path.join(localBin, "docker"), [
'if [ "$1" = "inspect" ]; then',
' echo "false"',
" exit 0",
"fi",
"exit 0",
]);

return {
HOME: home,
PATH: `${localBin}:${process.env.PATH ?? ""}`,
};
}

function makeHealthyVmGatewayEnv(prefix: string): Record<string, string> {
const home = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
const localBin = path.join(home, "bin");
fs.mkdirSync(localBin, { recursive: true });
writeSandboxRegistry(home, "alpha", { openshellDriver: "vm" });

// VM-driver snapshots should trust gateway metadata, not the legacy cluster
// container probe.
writeExecutable(path.join(localBin, "openshell"), [
'case "$1 $2" in',
' "gateway info") printf "Gateway Info\\n\\nGateway: nemoclaw\\nGateway endpoint: https://127.0.0.1:8080/\\n"; exit 0 ;;',
' "sandbox list") printf "NAME STATUS\\nalpha Ready\\n"; exit 0 ;;',
' "sandbox ssh-config") printf "Host openshell-alpha\\n HostName 127.0.0.1\\n User sandbox\\n"; exit 0 ;;',
"esac",
'if [ "$1" = "status" ]; then exit 0; fi',
"exit 0",
]);

writeExecutable(path.join(localBin, "ssh"), ["exit 0"]);
writeExecutable(path.join(localBin, "docker"), [
'if [ "$1" = "inspect" ]; then echo "false"; exit 0; fi',
"exit 0",
]);
fs.writeFileSync(
path.join(localBin, "docker"),
[
"#!/bin/sh",
'if [ "$1" = "inspect" ]; then',
' echo "false"',
" exit 0",
"fi",
"exit 0",
].join("\n"),
{ mode: 0o755 },
);

return {
HOME: home,
Expand All @@ -157,13 +125,3 @@ describe("snapshot gateway guard (#2673)", () => {
expect(r.out).toContain("Failed to query live sandbox state");
});
});

describe("snapshot VM-driver gateway guard", () => {
it("snapshot create accepts healthy macOS VM-driver gateways without legacy cluster container", () => {
const env = makeHealthyVmGatewayEnv("nemoclaw-snap-vm-gw-create-");
const r = runCli("alpha snapshot create --name baseline", env);
expect(r.code).toBe(0);
expect(r.out).toContain("Snapshot v1 name=baseline created");
expect(r.out).not.toContain("Failed to query live sandbox state");
});
});
Loading