From 74db9b7532bb9de910eb08884789f2f0c1fd7f6c Mon Sep 17 00:00:00 2001 From: Julie Yaunches Date: Wed, 20 May 2026 16:58:56 -0400 Subject: [PATCH] Revert "fix(snapshot): use gateway metadata for VM-driver health checks (#3784)" This reverts commit 36491d2fdab5f575c0e4b1a799346247a6742242. Signed-off-by: Julie Yaunches --- src/lib/actions/sandbox/snapshot.ts | 41 ++++------- test/snapshot-gateway-guard.test.ts | 104 +++++++++------------------- 2 files changed, 45 insertions(+), 100 deletions(-) diff --git a/src/lib/actions/sandbox/snapshot.ts b/src/lib/actions/sandbox/snapshot.ts index a6a3693744..86f4314b6d 100644 --- a/src/lib/actions/sandbox/snapshot.ts +++ b/src/lib/actions/sandbox/snapshot.ts @@ -4,15 +4,16 @@ import fs from "node:fs"; import path from "node:path"; + +import { CLI_NAME } from "../../cli/branding"; import { dockerCapture, dockerInspect } from "../../adapters/docker"; +import { stripAnsi } from "../../adapters/openshell/client"; +import { parseLiveSandboxNames } from "../../runtime-recovery"; +import { ROOT, run, shellQuote, validateName } from "../../runner"; import { captureOpenshell, getOpenshellBinary } from "../../adapters/openshell/runtime"; -import { CLI_NAME } from "../../cli/branding"; import * as policies from "../../policy"; -import { ROOT, run, shellQuote, validateName } from "../../runner"; -import { parseLiveSandboxNames } from "../../runtime-recovery"; -import { isGatewayHealthy } from "../../state/gateway"; -import type { SandboxEntry } from "../../state/registry"; import * as registry from "../../state/registry"; +import type { SandboxEntry } from "../../state/registry"; import * as sandboxState from "../../state/sandbox"; const useColor = !process.env.NO_COLOR && !!process.stdout.isTTY; @@ -204,33 +205,19 @@ async function autoCreateSandboxFromSource( console.log(` ${G}\u2713${R} Sandbox '${dstName}' created`); } -// Docker/VM-driver sandboxes do not expose the legacy cluster container, so -// verify gateway health through OpenShell metadata instead. -function probeGatewayMetadataHealth(): boolean { +// Returns true only when the gateway Docker container is confirmed running. +// `openshell sandbox list` reads a local registry and exits 0 even when the +// gateway is stopped (#2673), so we probe the container directly instead. +function probeDockerDriverGatewayRunning(): boolean { const status = captureOpenshell(["status"], { ignoreError: true, timeout: 10000 }); - const namedGatewayInfo = captureOpenshell(["gateway", "info", "-g", NEMOCLAW_GATEWAY_NAME], { - ignoreError: true, - timeout: 10000, - }); - const activeGatewayInfo = captureOpenshell(["gateway", "info"], { - ignoreError: true, - timeout: 10000, - }); - return isGatewayHealthy( - status.output || "", - namedGatewayInfo.output || "", - activeGatewayInfo.output || "", - ); -} - -function usesGatewayMetadataProbe(driver: string | null | undefined): boolean { - return driver === "docker" || driver === "vm"; + const clean = stripAnsi(status.output || ""); + return status.status === 0 && /^\s*Status:\s*Connected\b/im.test(clean); } function probeGatewayRunning(sandboxName?: string): boolean { const entry = sandboxName ? registry.getSandbox(sandboxName) : null; - if (usesGatewayMetadataProbe(entry?.openshellDriver)) { - return probeGatewayMetadataHealth(); + if (entry?.openshellDriver === "docker") { + return probeDockerDriverGatewayRunning(); } const container = `openshell-cluster-${NEMOCLAW_GATEWAY_NAME}`; const result = dockerInspect( diff --git a/test/snapshot-gateway-guard.test.ts b/test/snapshot-gateway-guard.test.ts index 0342734360..928ff5d79a 100644 --- a/test/snapshot-gateway-guard.test.ts +++ b/test/snapshot-gateway-guard.test.ts @@ -51,90 +51,58 @@ function runCli(args: string, env: Record = {}): Cli * This setup reproduces the exact failure mode from #2673: openshell returns * exit 0 with stale data, so the old isLive.status guard never fires. */ -function writeExecutable(filePath: string, lines: string[]): void { - fs.writeFileSync(filePath, ["#!/bin/sh", ...lines].join("\n"), { mode: 0o755 }); -} +function makeStoppedGatewayEnv(prefix: string): Record { + const home = fs.mkdtempSync(path.join(os.tmpdir(), prefix)); + const localBin = path.join(home, "bin"); + fs.mkdirSync(localBin, { recursive: true }); -function writeSandboxRegistry( - home: string, - sandboxName: string, - entry: Record = {}, -): void { const registryDir = path.join(home, ".nemoclaw"); fs.mkdirSync(registryDir, { recursive: true }); fs.writeFileSync( path.join(registryDir, "sandboxes.json"), JSON.stringify({ sandboxes: { - [sandboxName]: { - name: sandboxName, + alpha: { + name: "alpha", model: "test-model", provider: "nvidia-prod", gpuEnabled: false, policies: [], - ...entry, }, }, - defaultSandbox: sandboxName, + defaultSandbox: "alpha", }), { mode: 0o600 }, ); -} - -function makeStoppedGatewayEnv(prefix: string): Record { - const home = fs.mkdtempSync(path.join(os.tmpdir(), prefix)); - const localBin = path.join(home, "bin"); - fs.mkdirSync(localBin, { recursive: true }); - writeSandboxRegistry(home, "alpha"); // openshell lies: sandbox list exits 0 and lists alpha as Ready even though // the gateway container is down (reads stale local registry/cache). - writeExecutable(path.join(localBin, "openshell"), [ - 'if [ "$1" = "sandbox" ] && [ "$2" = "list" ]; then', - ' printf "NAME STATUS\\nalpha Ready\\n"', - " exit 0", - "fi", - "exit 0", - ]); + fs.writeFileSync( + path.join(localBin, "openshell"), + [ + "#!/bin/sh", + 'if [ "$1" = "sandbox" ] && [ "$2" = "list" ]; then', + ' printf "NAME STATUS\\nalpha Ready\\n"', + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); // docker inspect: returns "false" for State.Running (gateway stopped). - writeExecutable(path.join(localBin, "docker"), [ - 'if [ "$1" = "inspect" ]; then', - ' echo "false"', - " exit 0", - "fi", - "exit 0", - ]); - - return { - HOME: home, - PATH: `${localBin}:${process.env.PATH ?? ""}`, - }; -} - -function makeHealthyVmGatewayEnv(prefix: string): Record { - const home = fs.mkdtempSync(path.join(os.tmpdir(), prefix)); - const localBin = path.join(home, "bin"); - fs.mkdirSync(localBin, { recursive: true }); - writeSandboxRegistry(home, "alpha", { openshellDriver: "vm" }); - - // VM-driver snapshots should trust gateway metadata, not the legacy cluster - // container probe. - writeExecutable(path.join(localBin, "openshell"), [ - 'case "$1 $2" in', - ' "gateway info") printf "Gateway Info\\n\\nGateway: nemoclaw\\nGateway endpoint: https://127.0.0.1:8080/\\n"; exit 0 ;;', - ' "sandbox list") printf "NAME STATUS\\nalpha Ready\\n"; exit 0 ;;', - ' "sandbox ssh-config") printf "Host openshell-alpha\\n HostName 127.0.0.1\\n User sandbox\\n"; exit 0 ;;', - "esac", - 'if [ "$1" = "status" ]; then exit 0; fi', - "exit 0", - ]); - - writeExecutable(path.join(localBin, "ssh"), ["exit 0"]); - writeExecutable(path.join(localBin, "docker"), [ - 'if [ "$1" = "inspect" ]; then echo "false"; exit 0; fi', - "exit 0", - ]); + fs.writeFileSync( + path.join(localBin, "docker"), + [ + "#!/bin/sh", + 'if [ "$1" = "inspect" ]; then', + ' echo "false"', + " exit 0", + "fi", + "exit 0", + ].join("\n"), + { mode: 0o755 }, + ); return { HOME: home, @@ -157,13 +125,3 @@ describe("snapshot gateway guard (#2673)", () => { expect(r.out).toContain("Failed to query live sandbox state"); }); }); - -describe("snapshot VM-driver gateway guard", () => { - it("snapshot create accepts healthy macOS VM-driver gateways without legacy cluster container", () => { - const env = makeHealthyVmGatewayEnv("nemoclaw-snap-vm-gw-create-"); - const r = runCli("alpha snapshot create --name baseline", env); - expect(r.code).toBe(0); - expect(r.out).toContain("Snapshot v1 name=baseline created"); - expect(r.out).not.toContain("Failed to query live sandbox state"); - }); -});