diff --git a/docs/reference/architecture.mdx b/docs/reference/architecture.mdx index 91067b2c3a..d1ec4189c8 100644 --- a/docs/reference/architecture.mdx +++ b/docs/reference/architecture.mdx @@ -77,8 +77,11 @@ graph LR The logical diagram above shows how components relate. This section shows what actually runs where on the host. NemoClaw's default Docker-driver topology does not place the sandbox in an embedded k3s cluster. -On Linux and Apple Silicon macOS, NemoClaw starts the OpenShell Docker-driver gateway and creates the sandbox as a Docker container. -The gateway normally runs as a host process; Linux hosts that need the gateway compatibility patch may run the same gateway binary inside a small container. +On Linux, NemoClaw configures and restarts the package-managed OpenShell gateway user service when it is installed, then creates the sandbox as a Docker container. +NemoClaw treats that service as authoritative only when `systemctl --user show openshell-gateway` reports a package/vendor unit path and an `openshell-gateway` `ExecStart`. +Per-user units, partial units, and user-manager or bus outages do not take over gateway ownership; NemoClaw falls back to the standalone gateway process used by earlier installs. +That compatibility fallback remains until supported upgrade paths no longer include pre-service OpenShell installs and the package-managed handoff has direct nightly coverage. +On Apple Silicon macOS, NemoClaw starts the OpenShell Docker-driver gateway and creates the sandbox as a Docker container. In both Docker-driver modes, the sandbox is a Docker container, not a Kubernetes pod. Legacy non-Docker-driver installs still use the k3s-based gateway path; the diagram below shows the standard Docker-driver topology. 
diff --git a/docs/reference/commands.mdx b/docs/reference/commands.mdx index d1a45c5a4e..40aabc4f66 100644 --- a/docs/reference/commands.mdx +++ b/docs/reference/commands.mdx @@ -1253,7 +1253,7 @@ Earlier releases only stopped `openshell forward` processes, so those orphans ac For Local Ollama setups, uninstall also stops matching Ollama auth proxy processes before deleting `~/.nemoclaw` state so stale proxy listeners do not block a later reinstall. -On Linux, uninstall removes `~/.local/state/nemoclaw`, which contains Docker-driver gateway PID files, SQLite data, audit logs, and VM-driver state. +On Linux, uninstall removes `~/.local/state/nemoclaw`, which contains Docker-driver gateway SQLite data, audit logs, VM-driver state, and standalone-fallback gateway PID files. | Flag | Effect | |---|---| @@ -1418,9 +1418,9 @@ These flags toggle optional behaviors during onboarding; set them before running | `NEMOCLAW_SANDBOX_GPU` | `auto`, `1`, or `0` | Controls sandbox GPU passthrough during onboarding. `auto` enables GPU passthrough when an NVIDIA GPU is detected, `1` requires GPU passthrough, and `0` forces CPU-only sandbox creation. | | `NEMOCLAW_SANDBOX_GPU_DEVICE` | OpenShell GPU device selector | Selects the GPU device passed with `openshell sandbox create --gpu-device`. Requires explicit sandbox GPU enablement with `NEMOCLAW_SANDBOX_GPU=1` (or `--sandbox-gpu` for CLI-driven onboarding); otherwise onboarding rejects the selector instead of treating it as an implicit opt-in. | | `NEMOCLAW_DOCKER_GPU_PATCH` | `0` to disable, anything else to keep the default | Controls the Linux Docker-driver GPU sandbox compatibility patch. Set to `0` only as an escape hatch when the patch fails and you need onboarding to continue without patching the GPU sandbox container. | -| `NEMOCLAW_OPENSHELL_GATEWAY_BIN` | path | Advanced override for the `openshell-gateway` binary used by the Linux Docker-driver gateway. Defaults to the binary next to `openshell`, then common install paths. 
| -| `NEMOCLAW_OPENSHELL_SANDBOX_BIN` | path | Advanced override for the `openshell-sandbox` binary passed to the Linux Docker-driver gateway supervisor. Defaults to the binary next to `openshell`, then common install paths. | -| `NEMOCLAW_OPENSHELL_GATEWAY_STATE_DIR` | path | Advanced override for the Linux Docker-driver gateway pid file and SQLite state directory. Defaults to `~/.local/state/nemoclaw/openshell-docker-gateway`. | +| `NEMOCLAW_OPENSHELL_GATEWAY_BIN` | path | Advanced override for the `openshell-gateway` binary used by the Linux Docker-driver standalone fallback. Defaults to the binary next to `openshell`, then common install paths. | +| `NEMOCLAW_OPENSHELL_SANDBOX_BIN` | path | Advanced override for the `openshell-sandbox` binary used by the Linux Docker-driver standalone fallback. Defaults to the binary next to `openshell`, then common install paths. | +| `NEMOCLAW_OPENSHELL_GATEWAY_STATE_DIR` | path | Advanced override for the Linux Docker-driver gateway SQLite state directory and standalone-fallback PID file. Defaults to `~/.local/state/nemoclaw/openshell-docker-gateway`. | | `NEMOCLAW_WECHAT_QUIET` | `1` to enable | Silences the `[wechat]` diagnostic lines printed during the host-side WeChat QR login (poll status, IDC redirects, swallowed gateway errors), which are visible by default while the experimental WeChat path stabilizes; set `1` once the flow is reliable in your environment. 
| ### Onboard Profiling Traces diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 504004f75f..cb0da8915e 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -460,8 +460,7 @@ const { isGatewayTcpReady } = require("./onboard/gateway-tcp-readiness") as typeof import("./onboard/gateway-tcp-readiness"); const { trackChildExit } = require("./onboard/child-exit-tracker") as typeof import("./onboard/child-exit-tracker"); -const { reportDockerDriverGatewayStartFailure } = - require("./onboard/docker-driver-gateway-failure") as typeof import("./onboard/docker-driver-gateway-failure"); +const { reportDockerDriverGatewayStartFailure } = require("./onboard/docker-driver-gateway-failure") as typeof import("./onboard/docker-driver-gateway-failure"); const dockerDriverGatewayEnv: typeof import("./onboard/docker-driver-gateway-env") = require("./onboard/docker-driver-gateway-env"); const { getDockerDriverGatewayEndpoint } = dockerDriverGatewayEnv; @@ -2410,20 +2409,21 @@ async function startGatewayWithOptions( } async function startDockerDriverGateway({ exitOnFailure = true, skipSandboxBridgeReachability = false }: { exitOnFailure?: boolean; skipSandboxBridgeReachability?: boolean } = {}): Promise { - dockerDriverGatewayEnv.writeDockerGatewayDebEnvOverride(() => getDockerDriverGatewayEnv()); const gatewayBin = resolveOpenShellGatewayBinary(); const openshellVersionOutput = runCaptureOpenshell(["--version"], { ignoreError: true, }); const gatewayEnv = getDockerDriverGatewayEnv(openshellVersionOutput); + dockerDriverGatewayEnv.writeDockerGatewayDebEnvOverride(() => gatewayEnv); const stateDir = getDockerDriverGatewayStateDir(); const runtimeIdentity = gatewayBin ? dockerDriverGatewayLaunch.buildDockerDriverGatewayRuntimeIdentity({ gatewayBin, gatewayEnv, stateDir, sandboxBin: resolveOpenShellSandboxBinary() }) : null; const gatewayLaunch = runtimeIdentity?.launch ?? null; const driftGatewayBin = runtimeIdentity?.driftGatewayBin ?? 
gatewayBin; const driftGatewayEnv = runtimeIdentity?.desiredEnv ?? gatewayEnv; const identityGatewayBin = runtimeIdentity?.identityGatewayBin ?? gatewayBin; - const { verifySandboxBridgeGatewayReachableOrExit } = - require("./onboard/gateway-sandbox-reachability") as typeof import("./onboard/gateway-sandbox-reachability"); + const { verifySandboxBridgeGatewayReachableOrExit } = require("./onboard/gateway-sandbox-reachability") as typeof import("./onboard/gateway-sandbox-reachability"); + + if (await dockerDriverGatewayEnv.startPackageManagedDockerDriverGateway({ clearDockerDriverGatewayRuntimeFiles, exitOnFailure, gatewayName: GATEWAY_NAME, registerDockerDriverGatewayEndpoint, runCaptureOpenshell, skipSandboxBridgeReachability, verifySandboxBridgeGatewayReachableOrExit })) return; const gatewayStatus = runCaptureOpenshell(["status"], { ignoreError: true }); const gwInfo = runCaptureOpenshell(["gateway", "info", "-g", GATEWAY_NAME], { diff --git a/src/lib/onboard/docker-driver-gateway-env.test.ts b/src/lib/onboard/docker-driver-gateway-env.test.ts index 21d82312ca..476281c3da 100644 --- a/src/lib/onboard/docker-driver-gateway-env.test.ts +++ b/src/lib/onboard/docker-driver-gateway-env.test.ts @@ -127,15 +127,16 @@ describe("writeDockerGatewayDebEnvOverride", () => { const existsSpy = vi .spyOn(fs, "existsSync") - .mockImplementation((candidate) => candidate === "/usr/bin/openshell-gateway"); + .mockImplementation((candidate) => candidate === "/usr/lib/systemd/user/openshell-gateway.service"); const homedirSpy = vi.spyOn(os, "homedir").mockReturnValue(tempHome); try { - writeDockerGatewayDebEnvOverride(() => ({ + const wrote = writeDockerGatewayDebEnvOverride(() => ({ OPENSHELL_BIND_ADDRESS: "127.0.0.1", - })); + }), { platform: "linux" }); const envFileContent = fs.readFileSync(envFile, "utf-8"); + expect(wrote).toBe(true); expect(fs.statSync(envDir).mode & 0o777).toBe(0o700); expect(fs.statSync(envFile).mode & 0o777).toBe(0o600); 
expect(envFileContent).toContain("KEEP_ME=1\n"); @@ -146,4 +147,28 @@ describe("writeDockerGatewayDebEnvOverride", () => { fs.rmSync(tempHome, { recursive: true, force: true }); } }); + + it("does not write service env for standalone gateway binaries", () => { + const tempHome = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-gateway-env-")); + const existsSpy = vi + .spyOn(fs, "existsSync") + .mockImplementation((candidate) => candidate === "/usr/bin/openshell-gateway"); + const homedirSpy = vi.spyOn(os, "homedir").mockReturnValue(tempHome); + + try { + const wrote = writeDockerGatewayDebEnvOverride( + () => ({ + OPENSHELL_BIND_ADDRESS: "127.0.0.1", + }), + { platform: "linux" }, + ); + + expect(wrote).toBe(false); + expect(fs.existsSync(path.join(tempHome, ".config", "openshell", "gateway.env"))).toBe(false); + } finally { + existsSpy.mockRestore(); + homedirSpy.mockRestore(); + fs.rmSync(tempHome, { recursive: true, force: true }); + } + }); }); diff --git a/src/lib/onboard/docker-driver-gateway-env.ts b/src/lib/onboard/docker-driver-gateway-env.ts index 5962d76106..c6f2442496 100644 --- a/src/lib/onboard/docker-driver-gateway-env.ts +++ b/src/lib/onboard/docker-driver-gateway-env.ts @@ -13,8 +13,10 @@ import { getGatewayHttpsEndpoint, } from "../core/gateway-address"; import { GATEWAY_PORT } from "../core/ports"; +import { hasOpenShellGatewayUserService } from "./docker-driver-gateway-service"; export { getGatewayHttpsEndpoint }; +export { startPackageManagedDockerDriverGateway } from "./docker-driver-gateway-service"; export const DOCKER_DRIVER_GATEWAY_RUNTIME_ENV_KEYS = [ "OPENSHELL_DRIVERS", @@ -133,13 +135,9 @@ function readTextFileIfPresent(filePath: string): string { export function writeDockerGatewayDebEnvOverride( getOverride: () => Record, -): void { - const servicePaths = [ - "/usr/bin/openshell-gateway", - "/usr/lib/systemd/user/openshell-gateway.service", - "/lib/systemd/user/openshell-gateway.service", - ]; - if (!servicePaths.some((candidate) => 
fs.existsSync(candidate))) return; + opts: Parameters[0] = {}, +): boolean { + if (!hasOpenShellGatewayUserService(opts)) return false; const override = getOverride(); const envDir = path.join(os.homedir(), ".config", "openshell"); const envFile = path.join(envDir, "gateway.env"); @@ -151,4 +149,5 @@ export function writeDockerGatewayDebEnvOverride( mode: 0o600, }); fs.chmodSync(envFile, 0o600); + return true; } diff --git a/src/lib/onboard/docker-driver-gateway-service.test.ts b/src/lib/onboard/docker-driver-gateway-service.test.ts new file mode 100644 index 0000000000..6b1b6e9e81 --- /dev/null +++ b/src/lib/onboard/docker-driver-gateway-service.test.ts @@ -0,0 +1,288 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, expect, it, vi } from "vitest"; + +import { + getOpenShellGatewayUserServicePaths, + hasOpenShellGatewayUserService, + startPackageManagedDockerDriverGateway, + startOpenShellGatewayUserService, + type SpawnSyncLikeResult, +} from "./docker-driver-gateway-service"; + +const STATUS_CONNECTED = ` +Server Status + +Gateway: nemoclaw +Server: https://127.0.0.1:8080/ +Connected +`; + +const GATEWAY_INFO = ` +Gateway Info + +Gateway: nemoclaw +Gateway endpoint: https://127.0.0.1:8080/ +`; + +function trustedShowOutput(fragmentPath = "/lib/systemd/user/openshell-gateway.service"): string { + return [ + `FragmentPath=${fragmentPath}`, + "ExecStart={ path=/usr/bin/openshell-gateway ; argv[]=/usr/bin/openshell-gateway ; }", + ].join("\n"); +} + +function spawnResult(status = 0, stderr = "", stdout = ""): SpawnSyncLikeResult { + return { + error: undefined, + status, + stderr, + stdout, + }; +} + +describe("docker-driver-gateway-service", () => { + it("detects the upstream OpenShell user service only on Linux", () => { + const existsSync = (candidate: string) => + candidate === "/usr/lib/systemd/user/openshell-gateway.service"; + + 
expect(hasOpenShellGatewayUserService({ existsSync, platform: "linux" })).toBe(true); + expect(hasOpenShellGatewayUserService({ existsSync, platform: "darwin" })).toBe(false); + expect(getOpenShellGatewayUserServicePaths()).toEqual([ + "/usr/local/lib/systemd/user/openshell-gateway.service", + "/usr/lib/systemd/user/openshell-gateway.service", + "/lib/systemd/user/openshell-gateway.service", + ]); + }); + + it("ignores stale per-user service units so standalone fallback remains available", () => { + const existsSync = vi.fn( + (candidate: string) => + candidate === "/home/nvidia/.config/systemd/user/openshell-gateway.service", + ); + + expect(hasOpenShellGatewayUserService({ existsSync, platform: "linux" })).toBe(false); + expect(existsSync.mock.calls.flat()).not.toContain( + "/home/nvidia/.config/systemd/user/openshell-gateway.service", + ); + }); + + it("restarts the upstream user service with systemctl --user after validating identity", () => { + const spawnSyncImpl = vi.fn((_command: string, args: string[]) => + args.includes("show") ? 
spawnResult(0, "", trustedShowOutput()) : spawnResult(), + ); + + const result = startOpenShellGatewayUserService({ + commandExists: (command) => command === "systemctl", + env: {}, + existsSync: (candidate) => candidate === "/lib/systemd/user/openshell-gateway.service", + platform: "linux", + spawnSyncImpl, + }); + + expect(result).toEqual({ attempted: true, fallbackAllowed: false, started: true }); + expect(spawnSyncImpl.mock.calls.map(([command, args]) => [command, args])).toEqual([ + ["systemctl", ["--user", "daemon-reload"]], + [ + "systemctl", + [ + "--user", + "show", + "openshell-gateway", + "--property=FragmentPath", + "--property=ExecStart", + ], + ], + ["systemctl", ["--user", "enable", "openshell-gateway"]], + ["systemctl", ["--user", "restart", "openshell-gateway"]], + ]); + }); + + it("allows standalone fallback when the user systemd manager is unavailable", () => { + const result = startOpenShellGatewayUserService({ + commandExists: () => true, + env: {}, + existsSync: () => true, + platform: "linux", + spawnSyncImpl: vi.fn((_command: string, args: string[]) => + args.includes("daemon-reload") + ? 
spawnResult(1, "Failed to connect to bus") + : spawnResult(), + ), + }); + + expect(result).toMatchObject({ + attempted: true, + fallbackAllowed: true, + started: false, + }); + expect(result.reason).toContain("Failed to connect to bus"); + }); + + it("allows standalone fallback when restart loses the user systemd manager", () => { + const result = startOpenShellGatewayUserService({ + commandExists: () => true, + env: {}, + existsSync: () => true, + platform: "linux", + spawnSyncImpl: vi.fn((_command: string, args: string[]) => { + if (args.includes("show")) return spawnResult(0, "", trustedShowOutput()); + if (args.includes("restart")) return spawnResult(1, "Failed to connect to bus"); + return spawnResult(); + }), + }); + + expect(result).toMatchObject({ + attempted: true, + fallbackAllowed: true, + started: false, + }); + expect(result.reason).toContain("Failed to connect to bus"); + }); + + it("does not silently fall back when the installed service fails to restart", () => { + const result = startOpenShellGatewayUserService({ + commandExists: () => true, + env: {}, + existsSync: () => true, + platform: "linux", + spawnSyncImpl: vi.fn((_command: string, args: string[]) => { + if (args.includes("show")) return spawnResult(0, "", trustedShowOutput()); + if (args.includes("restart")) return spawnResult(1, "Job failed"); + return spawnResult(); + }), + }); + + expect(result).toMatchObject({ + attempted: true, + fallbackAllowed: false, + started: false, + }); + expect(result.reason).toContain("Job failed"); + }); + + it("falls back instead of trusting an unverified service identity", () => { + const spawnSyncImpl = vi.fn((_command: string, args: string[]) => { + if (args.includes("show")) { + return spawnResult( + 0, + "", + [ + "FragmentPath=/home/nvidia/.config/systemd/user/openshell-gateway.service", + "ExecStart={ path=/usr/bin/openshell-gateway ; argv[]=/usr/bin/openshell-gateway ; }", + ].join("\n"), + ); + } + return spawnResult(); + }); + + const result = 
startOpenShellGatewayUserService({ + commandExists: () => true, + env: {}, + existsSync: () => true, + platform: "linux", + spawnSyncImpl, + }); + + expect(result).toMatchObject({ + attempted: true, + fallbackAllowed: true, + started: false, + }); + expect(result.reason).toContain("not the package-managed OpenShell gateway"); + expect(spawnSyncImpl.mock.calls.map(([, args]) => args.join(" "))).not.toContain( + "--user restart openshell-gateway", + ); + }); + + it("uses the package-managed service only after endpoint, metadata, and gRPC health are ready", async () => { + const events: string[] = []; + let registerCount = 0; + const registerDockerDriverGatewayEndpoint = vi.fn(() => { + events.push("register"); + registerCount += 1; + return registerCount >= 2; + }); + + await expect( + startPackageManagedDockerDriverGateway({ + clearDockerDriverGatewayRuntimeFiles: () => events.push("clear"), + exitOnFailure: false, + gatewayName: "nemoclaw", + hasOpenShellGatewayUserService: () => true, + healthPollCount: 3, + healthPollInterval: 0, + isDockerDriverGatewayReady: async () => { + events.push("ready"); + return true; + }, + registerDockerDriverGatewayEndpoint, + runCaptureOpenshell: (args) => (args[0] === "status" ? 
STATUS_CONNECTED : GATEWAY_INFO), + sleepSeconds: () => events.push("sleep"), + skipSandboxBridgeReachability: false, + startOpenShellGatewayUserService: () => ({ + attempted: true, + fallbackAllowed: false, + started: true, + }), + verifySandboxBridgeGatewayReachableOrExit: async () => { + events.push("verify"); + }, + }), + ).resolves.toBe(true); + + expect(events).toEqual(["register", "sleep", "register", "ready", "clear", "verify"]); + }); + + it("falls back to standalone when package-managed service startup is unavailable", async () => { + const registerDockerDriverGatewayEndpoint = vi.fn(() => true); + + await expect( + startPackageManagedDockerDriverGateway({ + clearDockerDriverGatewayRuntimeFiles: vi.fn(), + exitOnFailure: false, + gatewayName: "nemoclaw", + hasOpenShellGatewayUserService: () => true, + registerDockerDriverGatewayEndpoint, + runCaptureOpenshell: vi.fn(), + skipSandboxBridgeReachability: false, + startOpenShellGatewayUserService: () => ({ + attempted: true, + fallbackAllowed: true, + reason: "user manager unavailable", + started: false, + }), + verifySandboxBridgeGatewayReachableOrExit: vi.fn(), + }), + ).resolves.toBe(false); + + expect(registerDockerDriverGatewayEndpoint).not.toHaveBeenCalled(); + }); + + it("keeps standalone runtime breadcrumbs when service health never becomes ready", async () => { + const clearDockerDriverGatewayRuntimeFiles = vi.fn(); + + await expect( + startPackageManagedDockerDriverGateway({ + clearDockerDriverGatewayRuntimeFiles, + exitOnFailure: false, + gatewayName: "nemoclaw", + hasOpenShellGatewayUserService: () => true, + healthPollCount: 1, + isDockerDriverGatewayReady: async () => false, + registerDockerDriverGatewayEndpoint: () => true, + runCaptureOpenshell: (args) => (args[0] === "status" ? 
STATUS_CONNECTED : GATEWAY_INFO), + skipSandboxBridgeReachability: false, + startOpenShellGatewayUserService: () => ({ + attempted: true, + fallbackAllowed: false, + started: true, + }), + verifySandboxBridgeGatewayReachableOrExit: vi.fn(), + }), + ).rejects.toThrow("did not become healthy"); + + expect(clearDockerDriverGatewayRuntimeFiles).not.toHaveBeenCalled(); + }); +}); diff --git a/src/lib/onboard/docker-driver-gateway-service.ts b/src/lib/onboard/docker-driver-gateway-service.ts new file mode 100644 index 0000000000..2abe157bd3 --- /dev/null +++ b/src/lib/onboard/docker-driver-gateway-service.ts @@ -0,0 +1,312 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { spawnSync, type SpawnSyncOptions } from "node:child_process"; +import fs from "node:fs"; +import path from "node:path"; + +import { sleepSeconds } from "../core/wait"; +import { isGatewayHealthy } from "../state/gateway"; +import { envInt } from "./env"; +import { isDockerDriverGatewayHttpReady } from "./gateway-http-readiness"; + +export const OPENSHELL_GATEWAY_USER_SERVICE = "openshell-gateway"; + +export interface OpenShellGatewayUserServiceOptions { + commandExists?: (command: string) => boolean; + env?: NodeJS.ProcessEnv; + existsSync?: (filePath: string) => boolean; + platform?: NodeJS.Platform; + spawnSyncImpl?: SpawnSyncLike; +} + +export interface OpenShellGatewayUserServiceStartResult { + attempted: boolean; + fallbackAllowed: boolean; + reason?: string; + started: boolean; +} + +export interface SpawnSyncLikeResult { + error?: Error; + status: number | null; + stderr?: Buffer | string | null; + stdout?: Buffer | string | null; +} + +export type SpawnSyncLike = ( + command: string, + args: string[], + options?: SpawnSyncOptions, +) => SpawnSyncLikeResult; + +export interface PackageManagedDockerDriverGatewayOptions { + clearDockerDriverGatewayRuntimeFiles: () => void; + exitOnFailure: 
/**
 * Unit-file locations that are checked, in this order, when looking for the
 * OpenShell gateway user service. Only system-wide systemd user-unit
 * directories are listed; nothing under a user's home directory appears here.
 */
const OPENSHELL_GATEWAY_USER_SERVICE_UNIT_PATHS: readonly string[] = [
  "/usr/local/lib/systemd/user/openshell-gateway.service",
  "/usr/lib/systemd/user/openshell-gateway.service",
  "/lib/systemd/user/openshell-gateway.service",
];

/**
 * Lists the unit-file paths that count as an installed OpenShell gateway user service.
 *
 * @returns A fresh array on every call, so callers may mutate the result freely.
 */
export function getOpenShellGatewayUserServicePaths(): string[] {
  return [...OPENSHELL_GATEWAY_USER_SERVICE_UNIT_PATHS];
}
/**
 * Parses `KEY=VALUE` lines (as printed by `systemctl show`) into a lookup object.
 *
 * Each line is split at its first `=`, so values may themselves contain `=`.
 * Lines with no `=`, or with an empty key, are skipped. Values are trimmed;
 * keys are kept verbatim. When a key repeats, the last occurrence wins.
 *
 * @param output Raw command output; both LF and CRLF line endings are accepted.
 * @returns Map from property name to trimmed property value.
 */
function parseSystemctlShowProperties(output: string): Record<string, string> {
  return output.split(/\r?\n/).reduce<Record<string, string>>((parsed, row) => {
    const equalsAt = row.indexOf("=");
    // indexOf yields -1 for "no separator" and 0 for "empty key"; both are ignored.
    if (equalsAt > 0) {
      parsed[row.slice(0, equalsAt)] = row.slice(equalsAt + 1).trim();
    }
    return parsed;
  }, {});
}
/**
 * Runs a single `systemctl --user` step for the gateway service.
 *
 * @returns `null` when the step succeeded, otherwise the start result that
 *   reports the failure. Fallback is allowed only when the failure text looks
 *   like the user manager itself is unavailable.
 */
function runOpenShellGatewayUserServiceStep(
  args: string[],
  systemctl: Parameters<typeof runSystemctlUser>[1],
): OpenShellGatewayUserServiceStartResult | null {
  const outcome = runSystemctlUser(args, systemctl);
  if (outcome.ok) return null;
  return {
    attempted: true,
    fallbackAllowed: userManagerLooksUnavailable(outcome.reason ?? ""),
    reason: `systemctl --user ${args.join(" ")} failed: ${outcome.reason}`,
    started: false,
  };
}

/**
 * Enables and restarts the OpenShell gateway user service through `systemctl --user`.
 *
 * Steps run in this order: `daemon-reload`, identity check (`show`), `enable`,
 * `restart`. The first failing step ends the sequence.
 *
 * @param opts Optional overrides for platform, filesystem probe, environment,
 *   command lookup, and process spawning; each defaults to the real implementation.
 * @returns
 *   - `attempted: false` when the host is not Linux or no service unit is installed.
 *   - `attempted: true, started: false` when `systemctl` is missing, the service
 *     identity is not trusted, or a `systemctl` step fails. `fallbackAllowed` says
 *     whether the caller may use the standalone gateway instead.
 *   - `started: true` once `restart` has succeeded.
 */
export function startOpenShellGatewayUserService(
  opts: OpenShellGatewayUserServiceOptions = {},
): OpenShellGatewayUserServiceStartResult {
  const skipped = (reason: string): OpenShellGatewayUserServiceStartResult => ({
    attempted: false,
    fallbackAllowed: true,
    started: false,
    reason,
  });

  const platform = opts.platform ?? process.platform;
  if (platform !== "linux") return skipped("not a Linux host");

  const existsSync = opts.existsSync ?? fs.existsSync;
  if (!hasOpenShellGatewayUserService({ existsSync, platform })) {
    return skipped("service unit not installed");
  }

  const env = opts.env ?? process.env;
  const commandExists =
    opts.commandExists ?? ((command: string) => defaultCommandExists(command, env));
  if (!commandExists("systemctl")) {
    return {
      attempted: true,
      fallbackAllowed: true,
      started: false,
      reason: "systemctl is not available",
    };
  }

  const systemctl: Parameters<typeof runSystemctlUser>[1] = {
    env,
    spawnSyncImpl: opts.spawnSyncImpl ?? spawnSync,
  };

  // Reload before reading the unit identity, as the identity is queried from systemd.
  const reloadFailure = runOpenShellGatewayUserServiceStep(["daemon-reload"], systemctl);
  if (reloadFailure) return reloadFailure;

  // Never enable or restart a unit whose identity could not be verified.
  const identity = readTrustedOpenShellGatewayUserServiceIdentity(systemctl);
  if (!identity.ok) {
    return {
      attempted: true,
      fallbackAllowed: identity.fallbackAllowed,
      reason: identity.reason,
      started: false,
    };
  }

  for (const verb of ["enable", "restart"]) {
    const failure = runOpenShellGatewayUserServiceStep(
      [verb, OPENSHELL_GATEWAY_USER_SERVICE],
      systemctl,
    );
    if (failure) return failure;
  }

  return { attempted: true, fallbackAllowed: false, started: true };
}
envInt("NEMOCLAW_HEALTH_POLL_INTERVAL", 2); + for (let i = 0; i < pollCount; i += 1) { + if (!registerDockerDriverGatewayEndpoint()) { + if (i < pollCount - 1) sleepSecondsImpl(pollInterval); + continue; + } + const status = runCaptureOpenshell(["status"], { ignoreError: true }); + const namedInfo = runCaptureOpenshell(["gateway", "info", "-g", gatewayName], { + ignoreError: true, + }); + const currentInfo = runCaptureOpenshell(["gateway", "info"], { ignoreError: true }); + if (isGatewayHealthy(status, namedInfo, currentInfo) && (await isDockerDriverGatewayReady())) { + clearDockerDriverGatewayRuntimeFiles(); + await verifySandboxBridgeGatewayReachableOrExit(exitOnFailure, { + skip: skipSandboxBridgeReachability, + }); + console.log(" ✓ OpenShell gateway user service is healthy"); + return true; + } + if (i < pollCount - 1) sleepSecondsImpl(pollInterval); + } + + const message = "OpenShell gateway user service started but did not become healthy."; + console.error(` ${message}`); + console.error(" Check: systemctl --user status openshell-gateway"); + if (exitOnFailure) process.exit(1); + throw new Error(message); +} diff --git a/src/lib/onboard/gateway-tcp-readiness.ts b/src/lib/onboard/gateway-tcp-readiness.ts index 0a8a0e3bce..7071ebc033 100644 --- a/src/lib/onboard/gateway-tcp-readiness.ts +++ b/src/lib/onboard/gateway-tcp-readiness.ts @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 /** - * Host-level TCP readiness probe for the OpenShell Docker-driver gateway. + * Host-level TCP readiness probe for the standalone OpenShell Docker-driver gateway. * * Plain TCP connect to the local gateway endpoint — semantic-free, just asks * "is anyone listening?". Used by `startDockerDriverGateway` in `onboard.ts` @@ -21,8 +21,10 @@ * * There is a peer module `gateway-http-readiness.ts` (introduced by #3312) * that exposes `isGatewayHttpReady` — a stronger HTTP-level probe used on - * the K3s path. 
It cannot be reused on the Docker-driver path because the - * two gateway types expose different HTTP routes for the root path: + * the K3s path — and `isDockerDriverGatewayHttpReady`, the package-managed + * Docker-driver handoff probe for the gRPC health endpoint. The root-path + * probe cannot be reused on the Docker-driver path because the two gateway + * types expose different HTTP routes for the root path: * * - K3s gateway answers `GET /` with 200/401 via a dispatcher catch-all. * - Docker-driver gateway returns 404 for `GET /`; only routes under diff --git a/test/e2e/test-openshell-version-pin.sh b/test/e2e/test-openshell-version-pin.sh index 5fe7496fdb..dd4132ab4e 100755 --- a/test/e2e/test-openshell-version-pin.sh +++ b/test/e2e/test-openshell-version-pin.sh @@ -93,6 +93,29 @@ SH write_executable "$FAKE_BIN/gh" <<'SH' #!/usr/bin/env bash set -euo pipefail +write_asset() { + local asset_name="$1" + local asset_path="$2" + printf 'fake OpenShell release asset: %s\n' "$asset_name" >"$asset_path" +} +sha256_digest() { + if [ -x /usr/bin/sha256sum ]; then + /usr/bin/sha256sum "$1" | awk '{print $1}' + elif [ -x /bin/sha256sum ]; then + /bin/sha256sum "$1" | awk '{print $1}' + elif [ -x /usr/bin/shasum ]; then + /usr/bin/shasum -a 256 "$1" | awk '{print $1}' + else + exit 3 + fi +} +write_checksum() { + local checksum_file="$1" + local asset_name="$2" + local asset_path="$3" + [ -f "$asset_path" ] || write_asset "$asset_name" "$asset_path" + printf '%s %s\n' "$(sha256_digest "$asset_path")" "$asset_name" >"$checksum_file" +} if [ "${1:-}" = "release" ] && [ "${2:-}" = "download" ]; then tag="${3:-}" pattern="" @@ -109,16 +132,19 @@ if [ "${1:-}" = "release" ] && [ "${2:-}" = "download" ]; then mkdir -p "$dir" case "$pattern" in openshell-checksums-sha256.txt) - printf 'ignored openshell-x86_64-unknown-linux-musl.tar.gz\n' > "$dir/$pattern" + asset_name="openshell-x86_64-unknown-linux-musl.tar.gz" + write_checksum "$dir/$pattern" "$asset_name" "$dir/$asset_name" 
;; openshell-gateway-checksums-sha256.txt) - printf 'ignored openshell-gateway-x86_64-unknown-linux-gnu.tar.gz\n' > "$dir/$pattern" + asset_name="openshell-gateway-x86_64-unknown-linux-gnu.tar.gz" + write_checksum "$dir/$pattern" "$asset_name" "$dir/$asset_name" ;; openshell-sandbox-checksums-sha256.txt) - printf 'ignored openshell-sandbox-x86_64-unknown-linux-gnu.tar.gz\n' > "$dir/$pattern" + asset_name="openshell-sandbox-x86_64-unknown-linux-gnu.tar.gz" + write_checksum "$dir/$pattern" "$asset_name" "$dir/$asset_name" ;; *) - : > "$dir/$pattern" + write_asset "$pattern" "$dir/$pattern" ;; esac exit 0 @@ -129,6 +155,29 @@ SH write_executable "$FAKE_BIN/curl" <<'SH' #!/usr/bin/env bash set -euo pipefail +write_asset() { + local asset_name="$1" + local asset_path="$2" + printf 'fake OpenShell release asset: %s\n' "$asset_name" >"$asset_path" +} +sha256_digest() { + if [ -x /usr/bin/sha256sum ]; then + /usr/bin/sha256sum "$1" | awk '{print $1}' + elif [ -x /bin/sha256sum ]; then + /bin/sha256sum "$1" | awk '{print $1}' + elif [ -x /usr/bin/shasum ]; then + /usr/bin/shasum -a 256 "$1" | awk '{print $1}' + else + exit 3 + fi +} +write_checksum() { + local checksum_file="$1" + local asset_name="$2" + local asset_path="$3" + [ -f "$asset_path" ] || write_asset "$asset_name" "$asset_path" + printf '%s %s\n' "$(sha256_digest "$asset_path")" "$asset_name" >"$checksum_file" +} printf 'curl %s\n' "$*" >> "${DOWNLOAD_LOG:?}" out="" while [ "$#" -gt 0 ]; do @@ -141,16 +190,19 @@ done [ -n "$out" ] || exit 0 case "$(basename "$out")" in openshell-checksums-sha256.txt) - printf 'ignored openshell-x86_64-unknown-linux-musl.tar.gz\n' > "$out" + asset_name="openshell-x86_64-unknown-linux-musl.tar.gz" + write_checksum "$out" "$asset_name" "$(dirname "$out")/$asset_name" ;; openshell-gateway-checksums-sha256.txt) - printf 'ignored openshell-gateway-x86_64-unknown-linux-gnu.tar.gz\n' > "$out" + asset_name="openshell-gateway-x86_64-unknown-linux-gnu.tar.gz" + write_checksum "$out" 
"$asset_name" "$(dirname "$out")/$asset_name" ;; openshell-sandbox-checksums-sha256.txt) - printf 'ignored openshell-sandbox-x86_64-unknown-linux-gnu.tar.gz\n' > "$out" + asset_name="openshell-sandbox-x86_64-unknown-linux-gnu.tar.gz" + write_checksum "$out" "$asset_name" "$(dirname "$out")/$asset_name" ;; *) - : > "$out" + write_asset "$(basename "$out")" "$out" ;; esac SH