diff --git a/.github/workflows/e2e-scenarios.yaml b/.github/workflows/e2e-scenarios.yaml index 5f1dfc8b37..13da3bccfd 100644 --- a/.github/workflows/e2e-scenarios.yaml +++ b/.github/workflows/e2e-scenarios.yaml @@ -81,7 +81,6 @@ jobs: for raw in "${IDS[@]}"; do id="${raw//[[:space:]]/}" [ -n "${id}" ] || continue - npx tsx test/e2e-scenario/scenarios/run.ts --scenarios "${id}" --plan-only >/dev/null runner="${ROUTES[$id]:-}" if [ -z "${runner}" ]; then echo "::error::No runner route for scenario: ${id}" >&2 @@ -135,7 +134,7 @@ jobs: echo "::error::Invalid scenario input: ${SCENARIOS}" >&2 exit 1 fi - npx tsx test/e2e-scenario/scenarios/run.ts --scenarios "${SCENARIOS}" --dry-run + npx tsx test/e2e-scenario/scenarios/run.ts --scenarios "${SCENARIOS}" - name: Resolve workspace paths for WSL if: contains(inputs.scenarios || github.event.inputs.scenarios, 'wsl-repo-cloud-openclaw') @@ -299,7 +298,7 @@ jobs: export E2E_CONTEXT_DIR="`$workdir" npm ci --ignore-scripts set +e - npx tsx test/e2e-scenario/scenarios/run.ts --scenarios "`$scenarios" --dry-run + npx tsx test/e2e-scenario/scenarios/run.ts --scenarios "`$scenarios" status=`$? if [ -d "`$workdir/.e2e" ]; then rm -rf "`$checkout_dir/.e2e" @@ -335,14 +334,25 @@ jobs: uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: e2e-scenario-${{ inputs.scenarios || github.event.inputs.scenarios }} + # Explicit subpath list, NOT a blanket .e2e/ + hidden files. + # The framework redacts every byte that flows from spawned + # children into actions/*.log, logs/*.log, and onboard.log via + # orchestrators/redaction.ts::pipeRedacted. Anything outside + # the listed paths (notably the raw context.env file) is + # excluded so secret-bearing key=value lines cannot leak via + # the artifact even if a future helper writes there. + # Diagnostic dumps of context use e2e_context_dump, which + # redacts on emit (runtime/lib/context.sh). 
path: | .e2e/run-plan.json .e2e/plan.txt .e2e/environment.result.json .e2e/onboarding.result.json .e2e/runtime.result.json - .e2e/ + .e2e/actions/ + .e2e/logs/ + .e2e/onboard.log test/e2e/logs/ if-no-files-found: warn retention-days: 14 - include-hidden-files: true + include-hidden-files: false diff --git a/test/e2e-scenario/docs/README.md b/test/e2e-scenario/docs/README.md index 15ad01d88d..f4acc8eebe 100644 --- a/test/e2e-scenario/docs/README.md +++ b/test/e2e-scenario/docs/README.md @@ -32,24 +32,25 @@ test plan, expected state, and post-onboard suites. Test plans can also declare onboarding assertions that run after install/onboard and before expected-state validation. -Plan-only resolution accepts either an alias or a test plan ID: - -```bash -bash test/e2e-scenario/runtime/run-scenario.sh ubuntu-repo-cloud-openclaw --plan-only -bash test/e2e-scenario/runtime/run-scenario.sh ubuntu-repo-docker__cloud-nvidia-openclaw --plan-only -``` - ## How to run +The TypeScript runner is the only supported entrypoint. There is one +execution mode: live. There is no `--dry-run`, no `--validate-only`, no +fake-pass code path. Plan output is emitted as a side effect of the +live run. 
+ ```bash -bash test/e2e-scenario/runtime/run-scenario.sh --plan-only # resolve + print plan, no side effects -bash test/e2e-scenario/runtime/run-scenario.sh --dry-run # helpers short-circuit with trace -bash test/e2e-scenario/runtime/run-scenario.sh --validate-only # assume setup done; validate expected state -bash test/e2e-scenario/runtime/run-scenario.sh # full live run -bash test/e2e-scenario/runtime/run-suites.sh […] -bash test/e2e-scenario/runtime/coverage-report.sh # Markdown matrix of scenario × suite +npx tsx test/e2e-scenario/scenarios/run.ts --scenarios # live execution (the only mode) +npx tsx test/e2e-scenario/scenarios/run.ts --list # list canonical scenario ids +npx tsx test/e2e-scenario/scenarios/run.ts --emit-matrix # JSON registry payload for CI matrix fan-out +npx tsx test/e2e-scenario/scenarios/run.ts --scenarios --plan-only # local debug only; MUST NOT appear in any workflow +bash test/e2e-scenario/runtime/coverage-report.sh # Markdown matrix of scenario × suite ``` +The deprecated bash entrypoints `runtime/run-scenario.sh` and +`runtime/run-suites.sh` exist only as fail-fast stubs; they print a +pointer at `run.ts` and exit non-zero. + Override the runtime context dir with `E2E_CONTEXT_DIR=` (default `.e2e/`, gitignored). 
The scenario runner and suites communicate only through `$E2E_CONTEXT_DIR/context.env` — suites do not rediscover @@ -72,7 +73,8 @@ test/e2e/ assert/ # outcome assertions (inference, credentials, policy, messaging) smoke/ inference/ hermes/ platform/ security/ # suite scripts grouped by concern runtime/ # entry points + cross-cutting shared libs - run-scenario.sh / run-suites.sh / coverage-report.sh + run-scenario.sh / run-suites.sh # DEPRECATED fail-fast stubs (see above) + coverage-report.sh resolver/ # TypeScript: load, plan, validate, coverage (invoked via tsx) lib/ # shared shell helpers: context, env, cleanup, logging, artifacts, sandbox-teardown ``` @@ -89,7 +91,7 @@ three YAML files above, plus shell scripts under `validation_suites/assert/`, or `validation_suites//`. The schemas in [`../runtime/resolver/schema.ts`](../runtime/resolver/schema.ts) -describe the required shape; `run-scenario.sh --plan-only` +describe the required shape; `npx tsx test/e2e-scenario/scenarios/run.ts --scenarios --plan-only` validates your change without running anything destructive. 
When adding a suite assertion, emit or preserve a stable `PASS: ` / diff --git a/test/e2e-scenario/framework-tests/e2e-context-helper.test.ts b/test/e2e-scenario/framework-tests/e2e-context-helper.test.ts index 6a7c97959f..0134d6adc9 100644 --- a/test/e2e-scenario/framework-tests/e2e-context-helper.test.ts +++ b/test/e2e-scenario/framework-tests/e2e-context-helper.test.ts @@ -9,7 +9,6 @@ import path from "node:path"; const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); const CONTEXT_LIB = path.join(REPO_ROOT, "test/e2e-scenario/runtime/lib/context.sh"); -const RUN_SCENARIO = path.join(REPO_ROOT, "test/e2e-scenario/runtime/run-scenario.sh"); function runBash(script: string, env: Record = {}): SpawnSyncReturns { return spawnSync("bash", ["-c", script], { @@ -86,38 +85,4 @@ describe("E2E context helper (runtime/lib/context.sh)", () => { } }); - it("scenario_plan_execution_should_emit_context_under_dry_run", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-ctx-")); - try { - const r = spawnSync( - "bash", - [RUN_SCENARIO, "ubuntu-repo-cloud-openclaw", "--dry-run"], - { - env: { ...process.env, E2E_CONTEXT_DIR: tmp }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 
60_000), - cwd: REPO_ROOT, - }, - ); - expect(r.status, r.stderr).toBe(0); - const ctxPath = path.join(tmp, "context.env"); - expect(fs.existsSync(ctxPath), `context.env missing in ${tmp}`).toBe(true); - const ctx = fs.readFileSync(ctxPath, "utf8"); - for (const key of [ - "E2E_SCENARIO", - "E2E_PLATFORM_OS", - "E2E_INSTALL_METHOD", - "E2E_ONBOARDING_PATH", - "E2E_AGENT", - "E2E_PROVIDER", - "E2E_SANDBOX_NAME", - "E2E_GATEWAY_URL", - "E2E_INFERENCE_ROUTE", - ]) { - expect(ctx, `${key} missing from context.env`).toMatch(new RegExp(`^${key}=`, "m")); - } - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); }); diff --git a/test/e2e-scenario/framework-tests/e2e-expected-state-validator.test.ts b/test/e2e-scenario/framework-tests/e2e-expected-state-validator.test.ts deleted file mode 100644 index ba1f2b5f31..0000000000 --- a/test/e2e-scenario/framework-tests/e2e-expected-state-validator.test.ts +++ /dev/null @@ -1,235 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -import { describe, it, expect } from "vitest"; -import { spawnSync } from "node:child_process"; -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; - -import { - validateExpectedState, - type ProbeResults, -} from "../runtime/resolver/validator.ts"; -import type { ExpectedStateConfig, ResolvedSuite } from "../runtime/resolver/schema.ts"; - -const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); -const RUN_SCENARIO = path.join(REPO_ROOT, "test/e2e-scenario/runtime/run-scenario.sh"); - -function cloudOpenclawReady(): ExpectedStateConfig { - return { - cli: { installed: true }, - gateway: { expected: "present", health: "healthy" }, - sandbox: { expected: "present", status: "running", agent: "openclaw" }, - inference: { - expected: "available", - provider: "nvidia", - route: "inference-local", - mode: "gateway-routed", - }, - credentials: { expected: "present", storage: "gateway-managed" }, - }; -} - -function passingProbes(): ProbeResults { - return { - "cli.installed": true, - "gateway.health": "healthy", - "gateway.expected": "present", - "sandbox.status": "running", - "sandbox.expected": "present", - "sandbox.agent": "openclaw", - "inference.expected": "available", - "inference.provider": "nvidia", - "inference.route": "inference-local", - "inference.mode": "gateway-routed", - "credentials.expected": "present", - "credentials.storage": "gateway-managed", - }; -} - -describe("expected state validator", () => { - it("should_validate_matching_state", () => { - const report = validateExpectedState({ - stateId: "cloud-openclaw-ready", - state: cloudOpenclawReady(), - probes: passingProbes(), - suites: [], - }); - expect(report.ok).toBe(true); - expect(report.checks.every((c) => c.ok)).toBe(true); - }); - - it("should_fail_when_gateway_expected_but_unhealthy", () => { - const probes = passingProbes(); - probes["gateway.health"] = "unhealthy"; - const report = validateExpectedState({ - 
stateId: "cloud-openclaw-ready", - state: cloudOpenclawReady(), - probes, - suites: [], - }); - expect(report.ok).toBe(false); - const failing = report.checks.find((c) => c.key === "gateway.health"); - expect(failing?.ok).toBe(false); - expect(failing?.expected).toBe("healthy"); - expect(failing?.actual).toBe("unhealthy"); - }); - - it("should_fail_when_sandbox_expected_but_absent", () => { - const probes = passingProbes(); - probes["sandbox.status"] = "absent"; - probes["sandbox.expected"] = "absent"; - const report = validateExpectedState({ - stateId: "cloud-openclaw-ready", - state: cloudOpenclawReady(), - probes, - suites: [], - }); - expect(report.ok).toBe(false); - expect(report.checks.some((c) => c.key === "sandbox.status" && !c.ok)).toBe(true); - }); - - it("should_fail_when_suite_requires_state_unmet_at_runtime", () => { - // Expected state claims inference.expected=available, but the probe - // reports unavailable; the smoke suite happens to pass but an inference - // suite's requires_state should trigger a runtime failure before - // execution. - const state = cloudOpenclawReady(); - const probes = passingProbes(); - probes["inference.expected"] = "unavailable"; - const inferenceSuite: ResolvedSuite = { - id: "inference", - requires_state: { "inference.expected": "available" }, - steps: [{ id: "models-health", script: "suites/inference/cloud/00-models-health.sh" }], - }; - const report = validateExpectedState({ - stateId: "cloud-openclaw-ready", - state, - probes, - suites: [inferenceSuite], - }); - expect(report.ok).toBe(false); - const msg = report.checks - .filter((c) => !c.ok) - .map((c) => `${c.key}=${c.actual ?? ""} (wanted ${c.expected})`) - .join("; "); - expect(msg).toMatch(/inference\.expected/); - expect(msg).toMatch(/available/); - expect(msg).toMatch(/unavailable/); - // Should also reference the suite that made the requirement. 
- expect(report.checks.some((c) => c.suite === "inference" && !c.ok)).toBe(true); - }); -}); - -describe("runner_should_not_run_suites_when_expected_state_fails", () => { - it("runs expected-state validation and skips suites on failure", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-es-")); - try { - const trace = path.join(tmp, "trace.log"); - // Simulate gateway-unhealthy probe by setting an override env var. - const r = spawnSync( - "bash", - [RUN_SCENARIO, "ubuntu-repo-cloud-openclaw", "--dry-run"], - { - env: { - ...process.env, - E2E_CONTEXT_DIR: tmp, - E2E_TRACE_FILE: trace, - // validator reads these overrides in dry-run mode to fake probes - E2E_PROBE_OVERRIDE_GATEWAY_HEALTH: "unhealthy", - E2E_VALIDATE_EXPECTED_STATE: "1", - }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), - cwd: REPO_ROOT, - }, - ); - // Dry-run execution should now fail because the expected state - // validation runs and sees gateway.health=unhealthy. - expect(r.status).not.toBe(0); - // Validator must run (its report file should exist) but suites must not. - const reportPath = path.join(tmp, "expected-state-report.json"); - expect(fs.existsSync(reportPath), `missing ${reportPath}`).toBe(true); - const report = JSON.parse(fs.readFileSync(reportPath, "utf8")); - expect(report.ok).toBe(false); - expect(report.checks.some((c: { key: string; ok: boolean }) => c.key === "gateway.health" && !c.ok)).toBe(true); - // And the run's failure output should reference expected-state, not suites. 
- expect(`${r.stdout}${r.stderr}`).toMatch(/expected.state/i); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); -}); - -// ───────────────────────────────────────────────────────────────────────────── -// Phase 1.F — --validate-only flag on run-scenario.sh -// ───────────────────────────────────────────────────────────────────────────── - -describe("run-scenario --validate-only flag", () => { - it("runs only validator and emits probe results json on stdout without running install/onboard/suites", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-validate-only-")); - try { - const trace = path.join(tmp, "trace.log"); - // Pre-populate a context.env: --validate-only assumes setup has already run. - fs.writeFileSync( - path.join(tmp, "context.env"), - "E2E_SCENARIO=ubuntu-repo-cloud-openclaw\n", - ); - const r = spawnSync( - "bash", - [RUN_SCENARIO, "ubuntu-repo-cloud-openclaw", "--validate-only"], - { - env: { - ...process.env, - E2E_CONTEXT_DIR: tmp, - E2E_TRACE_FILE: trace, - // Supply probe overrides for every key the expected state needs. - E2E_PROBE_OVERRIDE_CLI_INSTALLED: "true", - E2E_PROBE_OVERRIDE_GATEWAY_EXPECTED: "present", - E2E_PROBE_OVERRIDE_GATEWAY_HEALTH: "healthy", - E2E_PROBE_OVERRIDE_SANDBOX_EXPECTED: "present", - E2E_PROBE_OVERRIDE_SANDBOX_STATUS: "running", - E2E_PROBE_OVERRIDE_SANDBOX_AGENT: "openclaw", - E2E_PROBE_OVERRIDE_INFERENCE_EXPECTED: "available", - E2E_PROBE_OVERRIDE_INFERENCE_PROVIDER: "nvidia", - E2E_PROBE_OVERRIDE_INFERENCE_ROUTE: "inference-local", - E2E_PROBE_OVERRIDE_INFERENCE_MODE: "gateway-routed", - E2E_PROBE_OVERRIDE_CREDENTIALS_EXPECTED: "present", - E2E_PROBE_OVERRIDE_CREDENTIALS_STORAGE: "gateway-managed", - E2E_PROBE_OVERRIDE_SECURITY_SHIELDS: "supported", - // `security.policy_engine` has an embedded underscore, which the - // E2E_PROBE_OVERRIDE_* convention cannot express. Use the - // JSON escape hatch for this one. 
- E2E_PROBE_OVERRIDES_JSON: JSON.stringify({ "security.policy_engine": "supported" }), - }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), - cwd: REPO_ROOT, - }, - ); - expect(r.status, r.stderr).toBe(0); - // Must NOT have traced install or onboard. - const contents = fs.existsSync(trace) ? fs.readFileSync(trace, "utf8") : ""; - expect(contents).not.toMatch(/install:/); - expect(contents).not.toMatch(/onboard:/); - // Must have emitted an expected-state-report.json (probe results). - const reportPath = path.join(tmp, "expected-state-report.json"); - expect(fs.existsSync(reportPath), `missing ${reportPath}`).toBe(true); - const report = JSON.parse(fs.readFileSync(reportPath, "utf8")); - expect(report.ok).toBe(true); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("is_mutually_exclusive_with_plan_only", () => { - const r = spawnSync( - "bash", - [RUN_SCENARIO, "ubuntu-repo-cloud-openclaw", "--validate-only", "--plan-only"], - { encoding: "utf8", timeout: 15_000, cwd: REPO_ROOT }, - ); - expect(r.status).not.toBe(0); - expect(r.stdout + r.stderr).toMatch(/mutually.exclusive|cannot.*both|--plan-only.*--validate-only|--validate-only.*--plan-only/i); - }); -}); diff --git a/test/e2e-scenario/framework-tests/e2e-lib-helpers.test.ts b/test/e2e-scenario/framework-tests/e2e-lib-helpers.test.ts index 1a5c1a8403..82862f5622 100644 --- a/test/e2e-scenario/framework-tests/e2e-lib-helpers.test.ts +++ b/test/e2e-scenario/framework-tests/e2e-lib-helpers.test.ts @@ -15,7 +15,6 @@ const ASSERT = path.join(VALIDATION_SUITES, "assert"); const REBUILD_UPGRADE_LIB = path.join(VALIDATION_SUITES, "lib/rebuild_upgrade.sh"); const FIXTURES = path.join(REPO_ROOT, "test/e2e-scenario/nemoclaw_scenarios/fixtures"); const INSTALL_DIR = path.join(REPO_ROOT, "test/e2e-scenario/nemoclaw_scenarios/install"); -const RUN_SCENARIO = path.join(REPO_ROOT, "test/e2e-scenario/runtime/run-scenario.sh"); function runBash(script: string, 
env: Record = {}): SpawnSyncReturns { return spawnSync("bash", ["-c", script], { @@ -61,51 +60,6 @@ describe("E2E shell helpers", () => { } }); - it("test_should_emit_plan_only_checks_without_live_infrastructure", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-inf-plan-")); - try { - const r = runBash( - ` - set -euo pipefail - . "${RUNTIME_LIB}/context.sh" - . "${VALIDATION_SUITES}/lib/inference_routing.sh" - e2e_context_init - e2e_context_set E2E_SANDBOX_NAME sandbox-1 - e2e_inference_routing_assert_chat_completion "post-onboard.inference-routing.inference-local-chat-completion" - `, - { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }, - ); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout).toContain("post-onboard.inference-routing.inference-local-chat-completion"); - expect(r.stdout).toMatch(/dry-run|plan/i); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("test_should_not_print_secret_values_in_helper_output", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-inf-secret-")); - try { - const r = runBash( - ` - set -euo pipefail - . "${RUNTIME_LIB}/context.sh" - . 
"${VALIDATION_SUITES}/lib/inference_routing.sh" - e2e_context_init - e2e_context_set E2E_SANDBOX_NAME sandbox-1 - e2e_context_set E2E_PROVIDER_API_KEY super-secret-test-token - e2e_inference_routing_assert_auth_proxy "post-onboard.ollama-auth-proxy.authenticated-request-accepted" "valid" - `, - { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }, - ); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout + r.stderr).not.toContain("super-secret-test-token"); - expect(r.stdout + r.stderr).toMatch(/REDACTED|dry-run|plan/i); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - it("security_policy_credentials_helper_should_load_with_context_library", () => { const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "spc-context-")); try { @@ -117,7 +71,7 @@ describe("E2E shell helpers", () => { spc_require_context E2E_SCENARIO E2E_PROVIDER echo "provider=$(spc_context_get E2E_PROVIDER)" `, - { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }, + { E2E_CONTEXT_DIR: tmp }, ); expect(r.status, r.stderr).toBe(0); expect(r.stdout).toContain("provider=nvidia"); @@ -136,7 +90,7 @@ describe("E2E shell helpers", () => { . "${VALIDATION_SUITES}/lib/security_policy_credentials.sh" spc_require_context E2E_PROVIDER `, - { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }, + { E2E_CONTEXT_DIR: tmp }, ); expect(r.status).not.toBe(0); expect(r.stderr).toContain("E2E_PROVIDER"); @@ -474,38 +428,6 @@ exit 0 } }); - it("scenario_dry_run_should_trace_helper_sequence_in_order", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-trace-")); - try { - const trace = path.join(tmp, "trace.log"); - const r = spawnSync( - "bash", - [RUN_SCENARIO, "ubuntu-repo-cloud-openclaw", "--dry-run"], - { - env: { - ...process.env, - E2E_CONTEXT_DIR: tmp, - E2E_TRACE_FILE: trace, - }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 
60_000), - cwd: REPO_ROOT, - }, - ); - expect(r.status, r.stderr).toBe(0); - expect(fs.existsSync(trace), "trace log missing").toBe(true); - const contents = fs.readFileSync(trace, "utf8"); - const order = ["env:noninteractive", "install:", "onboard:", "gateway:check", "sandbox:check"]; - let pos = 0; - for (const marker of order) { - const idx = contents.indexOf(marker, pos); - expect(idx, `trace missing marker in order: ${marker}\nfull:\n${contents}`).toBeGreaterThanOrEqual(0); - pos = idx + marker.length; - } - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); }); // ───────────────────────────────────────────────────────────────────────────── @@ -675,7 +597,9 @@ exec "$@" e2e_sandbox_exec sb1 -- false echo "rc=$?" `, - { PATH: `${bin}:${process.env.PATH}` }, + // Force the openshell-direct transport so the stubbed openshell + // (which has no `sandbox ssh-config` subcommand) is exercised. + { PATH: `${bin}:${process.env.PATH}`, E2E_SANDBOX_EXEC_VIA_OPENSHELL: "1" }, ); expect(r.stdout).toMatch(/rc=1/); } finally { @@ -683,21 +607,6 @@ exec "$@" } }); - it("sandbox_exec_should_dry_run_short_circuit_when_e2e_dry_run_set", () => { - // Use a PATH that has bash itself but no nemoclaw — dry-run must - // short-circuit before the CLI lookup. - const r = runBash( - ` - set -euo pipefail - . "${VALIDATION_SUITES}/sandbox-exec.sh" - e2e_sandbox_exec sb1 -- rm -rf / - `, - { E2E_DRY_RUN: "1", PATH: "/usr/bin:/bin" }, - ); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout + r.stderr).toMatch(/dry[- ]run/i); - }); - it("sandbox_exec_stdin_should_quote_args_safely_when_piped", () => { // Verify that $TOKEN is NOT expanded on the host side before being // delivered to the sandbox. We stub openshell to echo back stdin. @@ -717,7 +626,12 @@ exec "$@" . 
"${VALIDATION_SUITES}/sandbox-exec.sh" printf 'hello $TOKEN' | e2e_sandbox_exec_stdin sb1 -- cat `, - { PATH: `${bin}:${process.env.PATH}`, TOKEN: "SHOULD_NOT_EXPAND" }, + { + PATH: `${bin}:${process.env.PATH}`, + TOKEN: "SHOULD_NOT_EXPAND", + // Stub only handles the openshell-direct transport. + E2E_SANDBOX_EXEC_VIA_OPENSHELL: "1", + }, ); expect(r.status, r.stderr).toBe(0); expect(r.stdout).toContain("hello $TOKEN"); @@ -726,6 +640,111 @@ exec "$@" fs.rmSync(tmp, { recursive: true, force: true }); } }); + + it("sandbox_exec_should_prefer_ssh_config_transport_when_openshell_offers_one", () => { + // Verify the new default: when `openshell sandbox ssh-config ` + // succeeds, the wrapper routes through `ssh -F ` instead of + // `openshell sandbox exec`. + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-sbex-ssh-")); + try { + const bin = path.join(tmp, "bin"); + fs.mkdirSync(bin); + const trace = path.join(tmp, "ssh.trace"); + fs.writeFileSync( + path.join(bin, "openshell"), + `#!/usr/bin/env bash +set -euo pipefail +if [[ "$1" == "sandbox" && "$2" == "ssh-config" ]]; then + printf 'Host openshell-%s\\n HostName 127.0.0.1\\n Port 2222\\n User sandbox\\n' "$3" + exit 0 +fi +echo "unexpected openshell call: $*" >&2 +exit 99 +`, + { mode: 0o755 }, + ); + fs.writeFileSync( + path.join(bin, "ssh"), + `#!/usr/bin/env bash +set -euo pipefail +printf '%s\\n' "ssh-args:$*" >> "${trace}" +remote="\${@: -1}" +printf '%s\\n' "remote-cmd:\${remote}" >> "${trace}" +echo ok-from-ssh +exit 0 +`, + { mode: 0o755 }, + ); + const ctxDir = path.join(tmp, "ctx"); + fs.mkdirSync(ctxDir); + const r = runBash( + ` + set -euo pipefail + . 
"${VALIDATION_SUITES}/sandbox-exec.sh" + e2e_sandbox_exec sb1 -- echo hello + `, + { + PATH: `${bin}:${process.env.PATH}`, + E2E_CONTEXT_DIR: ctxDir, + }, + ); + expect(r.status, r.stderr).toBe(0); + expect(r.stdout).toContain("ok-from-ssh"); + const traceContents = fs.readFileSync(trace, "utf8"); + expect(traceContents).toMatch(/ssh-args:.*-F /); + expect(traceContents).toContain("openshell-sb1"); + expect(traceContents).toMatch(/remote-cmd:echo hello$/m); + const cfg = path.join(ctxDir, ".ssh-config-cache", "sb1.cfg"); + expect(fs.existsSync(cfg)).toBe(true); + } finally { + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + it("sandbox_exec_should_fall_back_to_openshell_when_ssh_config_unavailable", () => { + // If `openshell sandbox ssh-config` fails, the wrapper must fall + // back to `openshell sandbox exec`. + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-sbex-fb-")); + try { + const bin = path.join(tmp, "bin"); + fs.mkdirSync(bin); + fs.writeFileSync( + path.join(bin, "openshell"), + `#!/usr/bin/env bash +set -uo pipefail +if [[ "$1" == "sandbox" && "$2" == "ssh-config" ]]; then + exit 1 +fi +if [[ "$1" == "sandbox" && "$2" == "exec" ]]; then + shift 2 + while [[ "$#" -gt 0 && "$1" != "--" ]]; do shift; done + shift || true + exec "$@" +fi +exit 99 +`, + { mode: 0o755 }, + ); + const ctxDir = path.join(tmp, "ctx"); + fs.mkdirSync(ctxDir); + const r = runBash( + ` + set -euo pipefail + . 
"${VALIDATION_SUITES}/sandbox-exec.sh" + e2e_sandbox_exec sb1 -- echo fallback-ok + `, + { + PATH: `${bin}:${process.env.PATH}`, + E2E_CONTEXT_DIR: ctxDir, + }, + ); + expect(r.status, r.stderr).toBe(0); + expect(r.stdout).toContain("fallback-ok"); + expect(r.stderr).toMatch(/ssh-config unavailable for sb1/); + } finally { + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); }); // ───────────────────────────────────────────────────────────────────────────── @@ -968,53 +987,6 @@ describe("Issue #3810 messaging provider helper library", () => { }); }); -// ───────────────────────────────────────────────────────────────────────────── -// Phase 1.E — Install-method dispatcher splits -// ───────────────────────────────────────────────────────────────────────────── - -describe("Phase 1.E install dispatcher splits", () => { - function dispatchDryRun(profile: string): SpawnSyncReturns { - return runBash( - ` - set -euo pipefail - . "${INSTALL_DIR}/dispatch.sh" - e2e_install "${profile}" - `, - { E2E_DRY_RUN: "1" }, - ); - } - - it("install_should_dispatch_to_install_repo_helper_for_repo_current_profile", () => { - const r = dispatchDryRun("repo-current"); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout + r.stderr).toMatch(/install-repo/); - expect(r.stdout + r.stderr).not.toMatch(/install-curl|install-ollama|install-launchable/); - }); - - it("install_should_dispatch_to_install_curl_helper_for_public_installer_profile", () => { - const r = dispatchDryRun("public-installer"); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout + r.stderr).toMatch(/install-curl/); - expect(r.stdout + r.stderr).not.toMatch(/install-repo|install-ollama|install-launchable/); - }); - - it("install_should_dispatch_to_install_ollama_helper_for_ollama_profile", () => { - const r = dispatchDryRun("ollama"); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout + r.stderr).toMatch(/install-ollama/); - expect(r.stdout + 
r.stderr).not.toMatch(/install-repo|install-curl|install-launchable/); - }); - - it("install_should_dispatch_to_install_launchable_helper_for_launchable_profile", () => { - const r = dispatchDryRun("launchable"); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout + r.stderr).toMatch(/install-launchable/); - expect(r.stdout + r.stderr).not.toMatch(/install-repo|install-curl|install-ollama/); - }); -}); - - - describe("baseline onboarding validation helper", () => { it("baseline_helper_should_source_under_strict_shell_options", () => { const r = runBash(`set -euo pipefail; source "${VALIDATION_SUITES}/lib/baseline_onboarding.sh"`); @@ -1080,7 +1052,7 @@ describe("sandbox lifecycle validation helper", () => { try { const bin = path.join(tmp, "bin"); fs.mkdirSync(bin); fs.writeFileSync(path.join(bin, "timeout"), "#!/usr/bin/env bash\necho timed out >&2\nexit 124\n", { mode: 0o755 }); - const r = runBash(`set -e; unset E2E_DRY_RUN; . "${VALIDATION_SUITES}/lib/sandbox_lifecycle.sh"; sandbox_lifecycle_run_with_timeout 1 bash -c 'sleep 5'`, { PATH: `${bin}:${process.env.PATH}` }); + const r = runBash(`set -e; . 
"${VALIDATION_SUITES}/lib/sandbox_lifecycle.sh"; sandbox_lifecycle_run_with_timeout 1 bash -c 'sleep 5'`, { PATH: `${bin}:${process.env.PATH}` }); expect(r.status).toBe(124); expect(r.stderr).toMatch(/timed out/); } finally { fs.rmSync(tmp, { recursive: true, force: true }); } @@ -1093,7 +1065,7 @@ describe("sandbox lifecycle validation helper", () => { fs.writeFileSync(path.join(bin, "nemoclaw"), `#!/usr/bin/env bash case "$*" in list) echo sb1;; - "sb1 status") echo 'status running gateway healthy sandbox running';; + "sb1 status") printf ' Sandbox: sb1\\n Model: nvidia/x\\n OpenShell: 0.0.44\\n Policies: npm\\n';; "sb1 logs") echo logline;; *) echo "unexpected nemoclaw args: $*" >&2; exit 64;; esac diff --git a/test/e2e-scenario/framework-tests/e2e-phase-orchestrators.test.ts b/test/e2e-scenario/framework-tests/e2e-phase-orchestrators.test.ts index 497dac3387..c0f08fd23a 100644 --- a/test/e2e-scenario/framework-tests/e2e-phase-orchestrators.test.ts +++ b/test/e2e-scenario/framework-tests/e2e-phase-orchestrators.test.ts @@ -3,19 +3,39 @@ import { describe, expect, it } from "vitest"; import fs from "node:fs"; +import os from "node:os"; import path from "node:path"; import { HostCliClient } from "../scenarios/clients/host-cli.ts"; import { compileRunPlans } from "../scenarios/compiler.ts"; import { PhaseOrchestrator } from "../scenarios/orchestrators/phase.ts"; import { ScenarioRunner } from "../scenarios/orchestrators/runner.ts"; -import type { AssertionStep, PhaseName, PhaseResult, RunContext, RunPlanPhase } from "../scenarios/types.ts"; +import type { + AssertionStep, + PhaseAction, + PhaseName, + PhaseResult, + RunContext, + RunPlanPhase, +} from "../scenarios/types.ts"; -function fakeCtx(): RunContext { - return { contextDir: fs.mkdtempSync(path.join(process.cwd(), ".tmp-e2e-phase-")), dryRun: true }; +const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); + +function freshCtx(): RunContext { + return { contextDir: 
fs.mkdtempSync(path.join(os.tmpdir(), "e2e-phase-")) }; +} + +function shellStep(id: string, phase: PhaseName, ref: string, reliability?: AssertionStep["reliability"]): AssertionStep { + return { + id, + phase, + implementation: { kind: "shell", ref }, + evidencePath: `.e2e/assertions/${id}.log`, + reliability, + }; } -function fakeStep(id: string, phase: PhaseName, ref = "fake-pass"): AssertionStep { +function probeStep(id: string, phase: PhaseName, ref = "no-such-probe"): AssertionStep { return { id, phase, @@ -24,24 +44,69 @@ function fakeStep(id: string, phase: PhaseName, ref = "fake-pass"): AssertionSte }; } -function fakePhase(step: AssertionStep): RunPlanPhase { +function pendingStep(id: string, phase: PhaseName): AssertionStep { + return { + id, + phase, + implementation: { kind: "pending", ref: "not-yet" }, + }; +} + +function makePhase(steps: AssertionStep[]): RunPlanPhase { return { - name: step.phase, + name: steps[0].phase, actions: [], - assertionGroups: [{ id: `group.${step.id}`, phase: step.phase, migrationStatus: "complete", steps: [step] }], + assertionGroups: [{ id: `group.${steps[0].id}`, phase: steps[0].phase, migrationStatus: "complete", steps }], }; } -describe("phase orchestrators", () => { +function writeTempScript(dir: string, name: string, body: string): string { + const p = path.join(dir, name); + fs.writeFileSync(p, `#!/usr/bin/env bash\nset -euo pipefail\n${body}\n`, { mode: 0o755 }); + return p; +} + +function shellAction( + id: string, + phase: PhaseName, + scriptRef: string, + opts: { timeoutSeconds?: number; arg?: string } = {}, +): PhaseAction { + return { + id, + phase, + kind: "shell", + scriptRef, + arg: opts.arg, + timeoutSeconds: opts.timeoutSeconds, + }; +} + +function makePhaseWithActions( + phase: PhaseName, + actions: PhaseAction[], + steps: AssertionStep[], +): RunPlanPhase { + return { + name: phase, + actions, + assertionGroups: + steps.length > 0 + ? 
[{ id: `group.${steps[0].id}`, phase, migrationStatus: "complete", steps }] + : [], + }; +} + +describe("phase orchestrators - top-level delegation", () => { it("test_should_execute_phase_assertions_from_phase_orchestrators_not_top_level_runner", async () => { - const ctx = fakeCtx(); + const ctx = freshCtx(); try { const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); const calls: string[] = []; const fakeOrchestrator = (phase: PhaseName) => ({ run: async (_ctx: RunContext, runPhase: RunPlanPhase, _prior?: PhaseResult[]): Promise<PhaseResult> => { calls.push(runPhase.name); - return { phase, status: "passed", assertions: [] }; + return { phase, status: "passed", actions: [], assertions: [] }; }, }); const runner = new ScenarioRunner({ @@ -58,63 +123,729 @@ describe("phase orchestrators", () => { fs.rmSync(ctx.contextDir, { recursive: true, force: true }); } }); +}); - it("test_should_record_step_status_attempts_duration_classifier_and_evidence", async () => { - const ctx = fakeCtx(); +describe("phase orchestrators - real shell execution", () => { + it("shell_step_passes_when_script_exits_zero", async () => { + const ctx = freshCtx(); try { - const step = fakeStep("runtime.retry-pass", "runtime", "fake-retry-once-pass"); - step.reliability = { retry: { attempts: 2, on: ["gateway-transient"] } }; + const script = writeTempScript(ctx.contextDir, "ok.sh", "echo hello-from-real-shell"); + const ref = path.relative(REPO_ROOT, script); + const step = shellStep("runtime.real-pass", "runtime", ref); const orchestrator = new PhaseOrchestrator("runtime"); - const result = await orchestrator.run(ctx, fakePhase(step)); + const result = await orchestrator.run(ctx, makePhase([step])); expect(result.status).toBe("passed"); expect(result.assertions[0]).toEqual( - expect.objectContaining({ - id: "runtime.retry-pass", - status: "passed", - attempts: 2, - classifier: "gateway-transient", - evidence: ".e2e/assertions/runtime.retry-pass.json", - }), + expect.objectContaining({ id:
"runtime.real-pass", status: "passed", attempts: 1 }), ); - expect(result.assertions[0].durationMs).toBeGreaterThanOrEqual(0); + const log = fs.readFileSync(result.assertions[0].evidence!, "utf8"); + expect(log).toContain("hello-from-real-shell"); } finally { fs.rmSync(ctx.contextDir, { recursive: true, force: true }); } }); - it("test_should_enforce_timeout_and_retry_policy_in_orchestrator", async () => { - const ctx = fakeCtx(); + it("shell_step_fails_when_script_exits_nonzero_and_records_stderr_tail", async () => { + const ctx = freshCtx(); try { - const step = fakeStep("runtime.retry-fail", "runtime", "fake-always-transient"); - step.reliability = { timeoutSeconds: 1, retry: { attempts: 2, on: ["provider-transient"] } }; + const script = writeTempScript(ctx.contextDir, "fail.sh", 'echo "boom: real failure" >&2; exit 7'); + const ref = path.relative(REPO_ROOT, script); + const step = shellStep("runtime.real-fail", "runtime", ref); const orchestrator = new PhaseOrchestrator("runtime"); - const result = await orchestrator.run(ctx, fakePhase(step)); + const result = await orchestrator.run(ctx, makePhase([step])); expect(result.status).toBe("failed"); - expect(result.assertions[0]).toEqual( - expect.objectContaining({ - id: "runtime.retry-fail", - status: "failed", - attempts: 2, - classifier: "provider-transient", + expect(result.assertions[0].status).toBe("failed"); + expect(result.assertions[0].message).toMatch(/exit 7/); + expect(result.assertions[0].message).toMatch(/boom: real failure/); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("shell_step_times_out_via_orchestrator_policy_not_script", async () => { + const ctx = freshCtx(); + try { + const script = writeTempScript(ctx.contextDir, "slow.sh", "sleep 30"); + const ref = path.relative(REPO_ROOT, script); + const step = shellStep("runtime.real-timeout", "runtime", ref, { timeoutSeconds: 1 }); + const orchestrator = new PhaseOrchestrator("runtime"); + + const 
started = Date.now(); + const result = await orchestrator.run(ctx, makePhase([step])); + const elapsed = Date.now() - started; + + expect(result.status).toBe("failed"); + expect(result.assertions[0].message).toMatch(/exceeded 1s/); + expect(elapsed).toBeLessThan(15_000); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }, 20_000); + + it("shell_step_retries_on_classified_transient_then_passes", async () => { + const ctx = freshCtx(); + try { + const counterFile = path.join(ctx.contextDir, "counter"); + fs.writeFileSync(counterFile, "0"); + const script = writeTempScript( + ctx.contextDir, + "gateway-flaky.sh", + `n=$(cat "${counterFile}"); n=$((n+1)); echo "$n" > "${counterFile}"; if [ "$n" -lt 2 ]; then echo "gateway-transient: try again" >&2; exit 1; fi; echo ok`, + ); + const ref = path.relative(REPO_ROOT, script); + const step = shellStep("runtime.gateway-retry", "runtime", ref, { + retry: { attempts: 2, on: ["gateway-transient"] }, + }); + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + + expect(result.status).toBe("passed"); + expect(result.assertions[0].attempts).toBe(2); + expect(result.assertions[0].classifier).toBe("gateway-transient"); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("shell_step_fails_with_clear_message_when_script_missing", async () => { + const ctx = freshCtx(); + try { + const step = shellStep("runtime.missing", "runtime", "test/e2e-scenario/does-not-exist.sh"); + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + + expect(result.status).toBe("failed"); + expect(result.assertions[0].message).toMatch(/script not found/); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("probe_step_without_registered_probe_skips_visibly_never_passes_falsely", async () => { + const ctx 
= freshCtx(); + try { + const step = probeStep("runtime.probe-pending", "runtime"); + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + + expect(result.assertions[0].status).toBe("skipped"); + expect(result.assertions[0].message).toMatch(/probe not registered/); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("pending_step_skips_visibly_with_pending_marker", async () => { + const ctx = freshCtx(); + try { + const step = pendingStep("runtime.pending", "runtime"); + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + + expect(result.assertions[0].status).toBe("skipped"); + expect(result.assertions[0].message).toMatch(/^pending:/); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); +}); + +describe("phase orchestrators - actions execute before assertions", () => { + it("phase_action_runs_before_assertions_and_records_evidence", async () => { + const ctx = freshCtx(); + try { + const actionScript = writeTempScript(ctx.contextDir, "setup.sh", "echo phase-action-evidence"); + const action = shellAction("environment.setup-ok", "environment", path.relative(REPO_ROOT, actionScript)); + const stepScript = writeTempScript(ctx.contextDir, "after.sh", "echo after-action"); + const step = shellStep("environment.assert-ok", "environment", path.relative(REPO_ROOT, stepScript)); + const orchestrator = new PhaseOrchestrator("environment"); + + const result = await orchestrator.run(ctx, makePhaseWithActions("environment", [action], [step])); + + expect(result.status).toBe("passed"); + expect(result.actions).toHaveLength(1); + expect(result.actions[0]).toEqual( + expect.objectContaining({ id: "environment.setup-ok", status: "passed" }), + ); + expect(result.actions[0].evidence).toBeTruthy(); + const actionLog = fs.readFileSync(result.actions[0].evidence!, 
"utf8"); + expect(actionLog).toContain("phase-action-evidence"); + expect(result.assertions).toHaveLength(1); + expect(result.assertions[0].status).toBe("passed"); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("phase_action_failure_short_circuits_assertions", async () => { + const ctx = freshCtx(); + try { + const failScript = writeTempScript(ctx.contextDir, "fail.sh", 'echo "setup boom" >&2; exit 5'); + const action = shellAction("environment.setup-fail", "environment", path.relative(REPO_ROOT, failScript)); + const stepScript = writeTempScript(ctx.contextDir, "after.sh", "echo should-not-run"); + const step = shellStep("environment.never-runs", "environment", path.relative(REPO_ROOT, stepScript)); + const orchestrator = new PhaseOrchestrator("environment"); + + const result = await orchestrator.run(ctx, makePhaseWithActions("environment", [action], [step])); + + expect(result.status).toBe("failed"); + expect(result.actions).toHaveLength(1); + expect(result.actions[0].status).toBe("failed"); + expect(result.actions[0].message).toMatch(/exit 5/); + // Assertions must NOT have run, so they must NOT show a misleading + // pass for an environment that was never set up. 
+ expect(result.assertions).toEqual([]); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("phase_action_times_out_via_orchestrator_policy", async () => { + const ctx = freshCtx(); + try { + const slow = writeTempScript(ctx.contextDir, "slow.sh", "sleep 30"); + const action = shellAction("environment.setup-slow", "environment", path.relative(REPO_ROOT, slow), { + timeoutSeconds: 1, + }); + const orchestrator = new PhaseOrchestrator("environment"); + + const started = Date.now(); + const result = await orchestrator.run(ctx, makePhaseWithActions("environment", [action], [])); + + expect(result.status).toBe("failed"); + expect(result.actions[0].status).toBe("failed"); + expect(result.actions[0].message).toMatch(/exceeded 1s/); + // The orchestrator must enforce the timeout, not depend on the + // script self-killing. Allow some headroom but fail if we waited + // anywhere near the script's 30s sleep. + expect(Date.now() - started).toBeLessThan(15_000); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("phase_action_publishes_alias_path_on_success", async () => { + const ctx = freshCtx(); + try { + const actionScript = writeTempScript(ctx.contextDir, "alias.sh", "echo aliased-output"); + const action: PhaseAction = { + id: "onboarding.profile.alias-demo", + phase: "onboarding", + kind: "shell", + scriptRef: path.relative(REPO_ROOT, actionScript), + aliasPath: "onboard.log", + }; + const orchestrator = new PhaseOrchestrator("onboarding"); + + const result = await orchestrator.run(ctx, makePhaseWithActions("onboarding", [action], [])); + + expect(result.actions[0].status).toBe("passed"); + const aliasContents = fs.readFileSync(path.join(ctx.contextDir, "onboard.log"), "utf8"); + expect(aliasContents).toContain("aliased-output"); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("phase_action_evidence_log_is_flushed_before_resolve", async () 
=> { + const ctx = freshCtx(); + try { + const actionScript = writeTempScript(ctx.contextDir, "flush.sh", "echo flushed-phase-action-output"); + const action = shellAction("environment.flush", "environment", path.relative(REPO_ROOT, actionScript)); + const orchestrator = new PhaseOrchestrator("environment"); + + const result = await orchestrator.run(ctx, makePhaseWithActions("environment", [action], [])); + + // Synchronous read must already see the output - the orchestrator + // must wait for the WriteStream's 'finish' before resolving. + const log = fs.readFileSync(result.actions[0].evidence!, "utf8"); + expect(log).toContain("flushed-phase-action-output"); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); +}); + +describe("plan compiler emits phase actions for canonical scenarios", () => { + it("compiler_emits_install_and_onboard_actions_for_canonical_scenarios", async () => { + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + const ids = [ + "ubuntu-repo-cloud-openclaw", + "ubuntu-repo-cloud-hermes", + "gpu-repo-local-ollama-openclaw", + "macos-repo-cloud-openclaw", + "wsl-repo-cloud-openclaw", + "brev-launchable-cloud-openclaw", + "ubuntu-no-docker-preflight-negative", + ]; + const plans = compileRunPlans(ids); + expect(plans).toHaveLength(ids.length); + for (const plan of plans) { + const env = plan.phases.find((p) => p.name === "environment")!; + const onb = plan.phases.find((p) => p.name === "onboarding")!; + expect(env.actions.some((a) => a.id.startsWith("environment.install."))).toBe(true); + expect(onb.actions.some((a) => a.id.startsWith("onboarding.profile."))).toBe(true); + // context.env emission is framework infrastructure (ScenarioRunner), + // not a shell action. The compiler must NOT emit a shell context + // action - if it did we'd be coupling back to the old resolver's + // plan.json shape. 
+ expect(env.actions.map((a) => a.id)).not.toContain("environment.context.emit"); + // Onboarding action must publish a stable alias path so legacy + // shell assertions referencing ${E2E_CONTEXT_DIR}/onboard.log + // keep working without coupling them to action ids. + const onboardingAction = onb.actions.find((a) => a.id.startsWith("onboarding.profile.")); + expect(onboardingAction?.aliasPath).toBe("onboard.log"); + // Every install/onboard action must be a typed shell-fn referencing + // the canonical dispatcher script - no free-form strings. + for (const action of [...env.actions, ...onb.actions]) { + if (action.id.startsWith("environment.install.") || action.id.startsWith("onboarding.profile.")) { + expect(action.kind).toBe("shell-fn"); + expect(action.scriptRef).toMatch(/dispatch\.sh$/); + expect(action.fn).toMatch(/^e2e_(install|onboard)$/); + expect(action.arg).toBeTruthy(); + } + } + } + }); + + it("compiler_routes_docker_missing_runtime_to_no_docker_onboarding_profile", async () => { + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + // Negative scenario declares runtime=docker-missing in scenarios.yaml. + // The compiler must substitute the onboarding profile id from the + // base 'cloud-openclaw' to 'cloud-openclaw-no-docker' so the + // dispatcher routes to the worker that installs the docker shim and + // captures negative-preflight.log. Without this routing, the + // 'onboarding.preflight.expected-failed' assertion has nothing to grep. 
+ const [plan] = compileRunPlans(["ubuntu-no-docker-preflight-negative"]); + const onb = plan.phases.find((p) => p.name === "onboarding")!; + const action = onb.actions.find((a) => a.id.startsWith("onboarding.profile.")); + expect(action?.id).toBe("onboarding.profile.cloud-openclaw-no-docker"); + expect(action?.arg).toBe("cloud-openclaw-no-docker"); + expect(action?.evidencePath).toBe( + ".e2e/actions/onboarding.profile.cloud-openclaw-no-docker.log", + ); + // Secret env must still include NVIDIA_API_KEY so behavior matches + // a real user invocation (CLI loads creds even if preflight aborts). + expect(action?.secretEnv).toContain("NVIDIA_API_KEY"); + // Positive scenarios must NOT pick up the -no-docker suffix. + const [posPlan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + const posAction = posPlan.phases + .find((p) => p.name === "onboarding")! + .actions.find((a) => a.id.startsWith("onboarding.profile.")); + expect(posAction?.arg).toBe("cloud-openclaw"); + }); +}); + +describe("ScenarioRunner seeds context.env and short-circuits across phases", () => { + it("seedContextEnv_writes_normalized_keys_at_top_level_context_env_path", async () => { + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + const { seedContextEnv } = await import("../scenarios/orchestrators/context.ts"); + const ctx = freshCtx(); + try { + const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + const result = seedContextEnv(ctx, plan); + + // Path matches the shell helper's e2e_context_init: top-level, + // not under .e2e/. Runtime steps source ${E2E_CONTEXT_DIR}/context.env. + expect(result.path).toBe(path.join(ctx.contextDir, "context.env")); + const body = fs.readFileSync(result.path, "utf8"); + // Required keys downstream shell assertions look up. 
+ expect(body).toMatch(/^E2E_SCENARIO=ubuntu-repo-cloud-openclaw$/m); + expect(body).toMatch(/^E2E_PLATFORM_OS=ubuntu$/m); + expect(body).toMatch(/^E2E_AGENT=openclaw$/m); + expect(body).toMatch(/^E2E_PROVIDER=nvidia$/m); + expect(body).toMatch(/^E2E_GATEWAY_URL=http:\/\/127\.0\.0\.1:18789$/m); + expect(body).toMatch(/^E2E_SANDBOX_NAME=e2e-ubuntu-repo-cloud-openclaw$/m); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("hermes_scenario_seeds_hermes_gateway_url", async () => { + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + const { seedContextEnv } = await import("../scenarios/orchestrators/context.ts"); + const ctx = freshCtx(); + try { + const [plan] = compileRunPlans(["ubuntu-repo-cloud-hermes"]); + const result = seedContextEnv(ctx, plan); + const body = fs.readFileSync(result.path, "utf8"); + expect(body).toMatch(/^E2E_AGENT=hermes$/m); + expect(body).toMatch(/^E2E_GATEWAY_URL=http:\/\/127\.0\.0\.1:8642$/m); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("runner_skips_downstream_phases_when_prior_phase_action_fails", async () => { + const { ScenarioRunner } = await import("../scenarios/orchestrators/runner.ts"); + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + const ctx = freshCtx(); + try { + const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + // Inject a failing environment phase to simulate an install action + // failure. Onboarding and runtime must report skipped, not run + // their own actions or assertions. 
+ const failingEnv = { + run: async () => ({ + phase: "environment" as const, + status: "failed" as const, + actions: [ + { + id: "environment.install.repo-current", + status: "failed" as const, + durationMs: 5, + message: "simulated install failure", + }, + ], + assertions: [], }), + }; + let onboardingCalled = false; + let runtimeCalled = false; + const onboarding = { + run: async () => { + onboardingCalled = true; + return { phase: "onboarding" as const, status: "passed" as const, actions: [], assertions: [] }; + }, + }; + const runtime = { + run: async () => { + runtimeCalled = true; + return { phase: "runtime" as const, status: "passed" as const, actions: [], assertions: [] }; + }, + }; + const runner = new ScenarioRunner({ environment: failingEnv, onboarding, runtime }); + + const results = await runner.run(ctx, plan); + + // Downstream orchestrators must NOT have been invoked. + expect(onboardingCalled).toBe(false); + expect(runtimeCalled).toBe(false); + // Each phase still has a result, and the downstream ones are + // skipped with a message that names the blocking action. + expect(results.map((r) => r.phase)).toEqual(["environment", "onboarding", "runtime"]); + expect(results[1].status).toBe("skipped"); + expect(results[2].status).toBe("skipped"); + expect(results[1].assertions[0].message).toMatch(/blocked by prior failure/); + expect(results[1].assertions[0].message).toMatch(/environment.install.repo-current/); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("runner_does_not_short_circuit_on_assertion_failure_only", async () => { + // Assertion failures (as opposed to action failures) must not block + // downstream phases - reviewers need to see all failure layers. 
+ const { ScenarioRunner } = await import("../scenarios/orchestrators/runner.ts"); + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + const ctx = freshCtx(); + try { + const [plan] = compileRunPlans(["ubuntu-repo-cloud-openclaw"]); + const env = { + run: async () => ({ + phase: "environment" as const, + status: "failed" as const, + actions: [], + assertions: [ + { id: "environment.something", status: "failed" as const, attempts: 1, durationMs: 1 }, + ], + }), + }; + let onboardingCalled = false; + const onboarding = { + run: async () => { + onboardingCalled = true; + return { phase: "onboarding" as const, status: "passed" as const, actions: [], assertions: [] }; + }, + }; + const runner = new ScenarioRunner({ + environment: env, + onboarding, + runtime: { + run: async () => ({ phase: "runtime" as const, status: "passed" as const, actions: [], assertions: [] }), + }, + }); + + await runner.run(ctx, plan); + expect(onboardingCalled).toBe(true); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); +}); + +describe("required probe and pending steps fail closed", () => { + it("test_required_probe_step_that_is_unregistered_fails_the_phase", async () => { + const ctx = freshCtx(); + try { + const step: AssertionStep = { + id: "runtime.security.required-probe", + phase: "runtime", + implementation: { kind: "probe", ref: "unregisteredSecurityProbe" }, + evidencePath: ".e2e/assertions/runtime.security.required-probe.json", + required: true, + }; + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + + expect(result.status).toBe("failed"); + expect(result.assertions[0].status).toBe("failed"); + expect(result.assertions[0].message).toMatch(/required probe not registered/); + expect(result.assertions[0].message).toContain("unregisteredSecurityProbe"); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + 
it("test_non_required_probe_step_continues_to_skip_visibly", async () => { + const ctx = freshCtx(); + try { + const step: AssertionStep = { + id: "runtime.diagnostics.non-required-probe", + phase: "runtime", + implementation: { kind: "probe", ref: "diagnosticsProbe" }, + evidencePath: ".e2e/assertions/runtime.diagnostics.non-required-probe.json", + // required intentionally omitted (defaults to false) + }; + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + + expect(result.assertions[0].status).toBe("skipped"); + expect(result.assertions[0].message).toMatch(/probe not registered/); + // Non-required skipped step does not fail the phase. + expect(result.status).not.toBe("failed"); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("test_required_pending_step_fails_closed", async () => { + const ctx = freshCtx(); + try { + const step: AssertionStep = { + id: "runtime.expected-failure.no-side-effects", + phase: "runtime", + implementation: { kind: "pending", ref: "expectedFailureNoSideEffectsProbe" }, + evidencePath: ".e2e/assertions/runtime.expected-failure.no-side-effects.json", + required: true, + }; + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + + expect(result.status).toBe("failed"); + expect(result.assertions[0].status).toBe("failed"); + expect(result.assertions[0].message).toMatch(/required pending step not implemented/); + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("test_security_suite_groups_in_registry_mark_their_steps_as_required", async () => { + const { assertionGroupForSuite } = await import("../scenarios/assertions/registry.ts"); + for (const suiteId of ["security-shields", "security-policy", "security-injection"]) { + const group = assertionGroupForSuite(suiteId); + expect(group, `missing assertion group for 
suite ${suiteId}`).toBeDefined(); + for (const step of group?.steps ?? []) { + expect( + step.required, + `${suiteId} step ${step.id} must be required so it fails closed`, + ).toBe(true); + } + } + }); + + it("test_expected_failure_no_side_effects_step_in_registry_is_required", async () => { + const { assertionRegistry } = await import("../scenarios/assertions/registry.ts"); + const group = assertionRegistry.groups.find( + (g) => g.id === "runtime.expected-failure.no-side-effects", + ); + expect(group).toBeDefined(); + for (const step of group?.steps ?? []) { + expect(step.required).toBe(true); + } + }); +}); + +describe("framework-owned secret hygiene at the spawn boundary", () => { + it("test_should_not_persist_secret_shaped_child_output_into_evidence", async () => { + const ctx = freshCtx(); + try { + // Child writes secret-shaped tokens (NVIDIA, GitHub, OpenAI, + // Slack, Bearer-prefixed) on both stdout and stderr, then exits + // non-zero so stderrTail also flows into result.message. None of + // those literal tokens may persist anywhere in the evidence. 
+ const body = [ + 'echo "step prints nvapi-1234567890abcdef0123456789"', + 'echo "and ghp_abcdefghijklmnopqrstuvwxyz0123456789"', + 'echo "and sk-abcdefghijklmnopqrstuvwxyz0123456789"', + 'echo "and xoxb-9876543210-fake-bot-token-abc"', + 'echo "Authorization: Bearer eyJhbGciOiJIUzI1NiJ9.payload.signature" 1>&2', + 'exit 7', + ].join("\n"); + const script = writeTempScript(ctx.contextDir, "leak.sh", body); + const ref = path.relative(REPO_ROOT, script); + const step = shellStep("runtime.leak", "runtime", ref); + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + const assertion = result.assertions[0]; + const logBody = fs.readFileSync(path.join(ctx.contextDir, ".e2e", "logs", `${step.id}.log`), "utf8"); + const phaseResultJson = fs.readFileSync( + path.join(ctx.contextDir, ".e2e", "runtime.result.json"), + "utf8", + ); + const surfaces = [logBody, assertion.message ?? "", phaseResultJson]; + + // Every secret-shaped token canonicalized in + // src/lib/security/secret-patterns.ts must be redacted on the + // way to disk, regardless of which surface is read. 
+ const forbiddenPatterns = [ + /nvapi-[A-Za-z0-9_-]{10,}/, + /ghp_[A-Za-z0-9_-]{10,}/, + /sk-[A-Za-z0-9_-]{20,}/, + /(?:xox[bpas]|xapp)-[A-Za-z0-9-]{10,}/, + /Bearer\s+[A-Za-z0-9_.+\/=-]{10,}/i, + ]; + for (const surface of surfaces) { + for (const pat of forbiddenPatterns) { + expect(surface, `evidence surface must not contain ${pat}`).not.toMatch(pat); + } + expect(surface).toMatch(//); + } + } finally { + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("test_should_drop_non_allowlisted_parent_env_unless_declared_in_secretEnv", async () => { + const ctx = freshCtx(); + const sentinelKey = "SECRET_LEAK_PROBE_TOKEN"; + const previous = process.env[sentinelKey]; + process.env[sentinelKey] = "sentinel-value-that-must-not-leak"; + try { + const script = writeTempScript( + ctx.contextDir, + "env-leak.sh", + `printenv | sort\n`, + ); + const ref = path.relative(REPO_ROOT, script); + // Step does NOT declare SECRET_LEAK_PROBE_TOKEN in secretEnv, + // so the framework must drop it before spawn. + const step = shellStep("runtime.env-drop", "runtime", ref); + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + const logBody = fs.readFileSync(path.join(ctx.contextDir, ".e2e", "logs", `${step.id}.log`), "utf8"); + + expect(result.assertions[0].status).toBe("passed"); + expect(logBody, "non-allowlisted parent env must not reach the child").not.toContain(sentinelKey); + expect(logBody).not.toContain("sentinel-value-that-must-not-leak"); + // Framework allowlist + overlay still arrive: PATH and E2E_PHASE. 
+ expect(logBody).toMatch(/^PATH=/m); + expect(logBody).toMatch(/^E2E_PHASE=runtime$/m); + } finally { + if (previous === undefined) delete process.env[sentinelKey]; + else process.env[sentinelKey] = previous; + fs.rmSync(ctx.contextDir, { recursive: true, force: true }); + } + }); + + it("test_should_pass_declared_secretEnv_through_to_child", async () => { + const ctx = freshCtx(); + const declaredKey = "NEMOCLAW_TEST_API_KEY"; // matches SECRET_ENV_KEY_SHAPE + const previous = process.env[declaredKey]; + process.env[declaredKey] = "declared-secret-value-passes-through"; + try { + const script = writeTempScript( + ctx.contextDir, + "declared.sh", + `printenv ${declaredKey} || echo MISSING\n`, ); + const ref = path.relative(REPO_ROOT, script); + const step: AssertionStep = { + ...shellStep("runtime.env-declared", "runtime", ref), + secretEnv: [declaredKey], + }; + const orchestrator = new PhaseOrchestrator("runtime"); + + const result = await orchestrator.run(ctx, makePhase([step])); + const logBody = fs.readFileSync(path.join(ctx.contextDir, ".e2e", "logs", `${step.id}.log`), "utf8"); + + expect(result.assertions[0].status).toBe("passed"); + // Declared secret reaches the child verbatim. + expect(logBody).toContain("declared-secret-value-passes-through"); + // It is NOT redacted in printenv output because nothing about + // the literal value matches a token-shape pattern. (Real + // secrets that match secret-patterns.ts WILL be redacted as a + // second line of defense; this synthetic value is intentionally + // shape-free to isolate the env-passthrough behavior.) 
} finally { + if (previous === undefined) delete process.env[declaredKey]; + else process.env[declaredKey] = previous; fs.rmSync(ctx.contextDir, { recursive: true, force: true }); } }); + it("test_should_reject_non_secret_shaped_keys_in_secretEnv_at_runtime", async () => { + const { buildChildEnv } = await import("../scenarios/orchestrators/redaction.ts"); + expect(() => + buildChildEnv(process.env, { secretEnv: ["FOO_VAR"], frameworkOverlay: {} }), + ).toThrow(/secret-key shape/); + }); + + it("test_should_declare_NVIDIA_API_KEY_only_for_cloud_onboarding_actions", async () => { + const { compileRunPlans } = await import("../scenarios/compiler.ts"); + const plans = compileRunPlans([ + "ubuntu-repo-cloud-openclaw", + "gpu-repo-local-ollama-openclaw", + ]); + const cloudOnboard = plans[0].phases + .find((p) => p.name === "onboarding") + ?.actions.find((a) => a.id.startsWith("onboarding.profile.")); + const localOnboard = plans[1].phases + .find((p) => p.name === "onboarding") + ?.actions.find((a) => a.id.startsWith("onboarding.profile.")); + expect(cloudOnboard?.secretEnv).toEqual(["NVIDIA_API_KEY"]); + expect(localOnboard?.secretEnv).toEqual([]); + }); +}); + +describe("clients are pass/fail/policy free", () => { it("test_should_keep_clients_free_of_pass_fail_and_retry_semantics", () => { - const source = fs.readFileSync( - path.join(process.cwd(), "test/e2e-scenario/scenarios/clients/host-cli.ts"), - "utf8", - ); const observation = new HostCliClient().observeVersion(); + // The client returns a raw act/observe shape only: the command it would + // run. It must NOT decide pass/fail, attach retry policy, surface a + // classifier, or expose AssertionResult/PhaseResult-shaped fields. expect(observation).toEqual(expect.objectContaining({ command: ["nemoclaw", "--version"] })); - expect(source).not.toMatch(/AssertionResult|PhaseResult|retry|timeout|passed|failed/); + // Raw act/observe fields are allowed (exitCode/stdout/stderr/timing). 
+ // Pass/fail and reliability-policy fields are not. + const forbiddenKeys = [ + "status", + "attempts", + "classifier", + "evidence", + "retry", + "timeout", + "timeoutSeconds", + "phase", + "assertions", + "passed", + "failed", + ]; + for (const key of forbiddenKeys) { + expect(observation).not.toHaveProperty(key); + } }); }); diff --git a/test/e2e-scenario/framework-tests/e2e-redaction-parity.test.ts b/test/e2e-scenario/framework-tests/e2e-redaction-parity.test.ts new file mode 100644 index 0000000000..eb6c785a91 --- /dev/null +++ b/test/e2e-scenario/framework-tests/e2e-redaction-parity.test.ts @@ -0,0 +1,73 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Parity test: the framework's local secret-pattern set + * (test/e2e-scenario/scenarios/orchestrators/redaction.ts) must stay in + * lockstep with the canonical product source + * (src/lib/security/secret-patterns.ts). + * + * The framework deliberately mirrors rather than imports — see the + * "Framework-local mirror" comment in redaction.ts for why — but the + * mirror is only safe if it is actually a mirror. This test parses + * both source files at the textual level and compares the regex + * literals. + */ + +import { describe, expect, it } from "vitest"; +import fs from "node:fs"; +import path from "node:path"; + +const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); + +// Pull only regex literals (lines starting with `/` and ending with +// a flag set like /g or /gi). Filters out comment lines like `// NVIDIA` +// that begin with `/` but are not regex. 
+const REGEX_LITERAL_LINE = /^\/(?![\/*]).+\/[a-z]*,?$/; // (?![\/*]) rejects "//" and "/*" comment lines
+
+function extractFromBlock(block: string): string[] {
+  return block
+    .split("\n")
+    .map((line) => line.trim())
+    .filter((line) => REGEX_LITERAL_LINE.test(line))
+    .map((line) => line.replace(/,\s*$/, ""));
+}
+
+function extractRegexLiterals(source: string, exportName: string): string[] {
+  const re = new RegExp(`export const ${exportName}[^=]*=\\s*\\[([\\s\\S]*?)\\];`, "m");
+  const m = source.match(re);
+  return m ? extractFromBlock(m[1]) : [];
+}
+
+function extractFrameworkArray(source: string, constName: string): string[] {
+  const re = new RegExp(`const ${constName}: RegExp\\[\\] = \\[([\\s\\S]*?)\\];`, "m");
+  const m = source.match(re);
+  return m ? extractFromBlock(m[1]) : [];
+}
+
+describe("framework redaction parity with product source-of-truth", () => {
+  const productSource = fs.readFileSync(
+    path.join(REPO_ROOT, "src/lib/security/secret-patterns.ts"),
+    "utf8",
+  );
+  const frameworkSource = fs.readFileSync(
+    path.join(REPO_ROOT, "test/e2e-scenario/scenarios/orchestrators/redaction.ts"),
+    "utf8",
+  );
+
+  it("test_framework_TOKEN_PREFIX_PATTERNS_matches_product_source", () => {
+    const product = extractRegexLiterals(productSource, "TOKEN_PREFIX_PATTERNS");
+    const framework = extractFrameworkArray(frameworkSource, "TOKEN_PREFIX_PATTERNS");
+    expect(framework.length).toBeGreaterThan(0);
+    expect(product.length).toBeGreaterThan(0);
+    expect(framework).toEqual(product);
+  });
+
+  it("test_framework_CONTEXT_PATTERNS_matches_product_source", () => {
+    const product = extractRegexLiterals(productSource, "CONTEXT_PATTERNS");
+    const framework = extractFrameworkArray(frameworkSource, "CONTEXT_PATTERNS");
+    expect(framework.length).toBeGreaterThan(0);
+    expect(product.length).toBeGreaterThan(0);
+    expect(framework).toEqual(product);
+  });
+});
diff --git a/test/e2e-scenario/framework-tests/e2e-scenario-additional-families.test.ts
b/test/e2e-scenario/framework-tests/e2e-scenario-additional-families.test.ts index 8c2e70caae..2d3c42fba0 100644 --- a/test/e2e-scenario/framework-tests/e2e-scenario-additional-families.test.ts +++ b/test/e2e-scenario/framework-tests/e2e-scenario-additional-families.test.ts @@ -2,17 +2,15 @@ // SPDX-License-Identifier: Apache-2.0 /** - * Phase 9: Migrate Additional Scenario Families. - * Verifies metadata for new scenarios (macOS, WSL, GPU local Ollama, Brev - * launchable, Ubuntu cloud Hermes, and the no-docker negative preflight) - * plus the deferred schema concepts (scenario-level overrides, negative - * expected state). + * Phase 9: Additional Scenario Families - resolver-level metadata only. + * + * Plan-printout tests that exercised the deprecated bash entrypoint + * (run-scenario.sh --plan-only) were deleted alongside the bash runner. + * The TS runner is exercised by e2e-plan-compiler / e2e-scenario-registry + * / e2e-phase-orchestrators tests instead. */ import { describe, it, expect } from "vitest"; -import { spawnSync } from "node:child_process"; -import fs from "node:fs"; -import os from "node:os"; import path from "node:path"; import { loadMetadataFromDir } from "../runtime/resolver/load.ts"; @@ -20,27 +18,6 @@ import { resolveScenario } from "../runtime/resolver/plan.ts"; const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); const E2E_DIR = path.join(REPO_ROOT, "test/e2e-scenario"); -const RUN_SCENARIO = path.join(E2E_DIR, "runtime", "run-scenario.sh"); - -function planOnly(scenarioId: string): { stdout: string; stderr: string; status: number | null; plan: Record } { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-p9-")); - try { - const r = spawnSync("bash", [RUN_SCENARIO, scenarioId, "--plan-only"], { - env: { ...process.env, E2E_CONTEXT_DIR: tmp }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 
60_000), - cwd: REPO_ROOT, - }); - let plan = {}; - const pj = path.join(tmp, "plan.json"); - if (fs.existsSync(pj)) { - plan = JSON.parse(fs.readFileSync(pj, "utf8")); - } - return { stdout: r.stdout, stderr: r.stderr, status: r.status, plan }; - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } -} describe("Issue 3812: inference/provider suite families", () => { it("test_should_route_inference_suite_families_to_domain_specific_steps", () => { @@ -74,37 +51,6 @@ describe("Phase 9: additional scenario families - metadata", () => { }); }); -describe("Phase 9: macOS / WSL plan-only", () => { - it("macos scenario plan identifies macOS platform", () => { - const { status, plan } = planOnly("macos-repo-cloud-openclaw"); - expect(status).toBe(0); - const dims = (plan as { dimensions: { platform: { profile: { os?: string } } } }).dimensions; - expect(dims.platform.profile.os).toBe("macos"); - }); - - it("wsl scenario plan identifies WSL platform", () => { - const { status, plan } = planOnly("wsl-repo-cloud-openclaw"); - expect(status).toBe(0); - const dims = (plan as { dimensions: { platform: { profile: { os?: string } } } }).dimensions; - expect(dims.platform.profile.os).toBe("wsl"); - }); -}); - -describe("Phase 9: GPU local Ollama plan-only", () => { - it("runtime indicates GPU/CDI and provider is ollama", () => { - const { status, plan } = planOnly("gpu-repo-local-ollama-openclaw"); - expect(status).toBe(0); - const dims = (plan as { - dimensions: { - runtime: { profile: { gpu_runtime?: string } }; - onboarding: { profile: { provider?: string } }; - }; - }).dimensions; - expect(dims.runtime.profile.gpu_runtime).toBe("cdi"); - expect(dims.onboarding.profile.provider).toBe("ollama"); - }); -}); - describe("Phase 9: Brev launchable scenario (overrides schema)", () => { it("should_support_scenario_overrides_on_brev_launchable", () => { const meta = loadMetadataFromDir(E2E_DIR); @@ -116,21 +62,6 @@ describe("Phase 9: Brev launchable scenario (overrides 
schema)", () => { expect(overrides?.onboarding?.gateway?.bind_address).toBeTypeOf("string"); expect(overrides?.onboarding?.gateway?.bind_address?.length).toBeGreaterThan(0); }); - - it("plan shows remote target, launchable install, and gateway bind override", () => { - const { status, stdout, plan } = planOnly("brev-launchable-cloud-openclaw"); - expect(status).toBe(0); - const dims = (plan as { - dimensions: { - platform: { profile: { execution_target?: string } }; - install: { id: string }; - }; - }).dimensions; - expect(dims.platform.profile.execution_target).toBe("remote"); - expect(dims.install.id).toBe("launchable"); - expect(stdout).toMatch(/Overrides:/); - expect(stdout).toMatch(/bind_address/); - }); }); describe("Phase 9: negative preflight", () => { @@ -148,27 +79,4 @@ describe("Phase 9: negative preflight", () => { expect(es?.sandbox?.expected).toBe("absent"); expect(es?.failure?.expected).toBe(true); }); - - it("negative scenario plan identifies docker missing and negative state", () => { - const { status, plan } = planOnly("ubuntu-no-docker-preflight-negative"); - expect(status).toBe(0); - const p = plan as { - dimensions: { runtime: { profile: { container_daemon?: string } } }; - expected_state: { id: string }; - expected_failure?: { - phase?: string; - error_class?: string; - message_pattern?: string; - forbidden_side_effects?: string[]; - }; - }; - expect(p.dimensions.runtime.profile.container_daemon).toBe("missing"); - expect(p.expected_state.id).toBe("preflight-failure-no-sandbox"); - expect(p.expected_failure?.phase).toBe("preflight"); - expect(p.expected_failure?.error_class).toBe("docker-missing"); - expect(p.expected_failure?.message_pattern).toBeTypeOf("string"); - expect(p.expected_failure?.forbidden_side_effects).toEqual( - expect.arrayContaining(["sandbox-created", "gateway-started", "credentials-written"]), - ); - }); }); diff --git a/test/e2e-scenario/framework-tests/e2e-scenario-first-migration.test.ts 
b/test/e2e-scenario/framework-tests/e2e-scenario-first-migration.test.ts deleted file mode 100644 index 0307ca9103..0000000000 --- a/test/e2e-scenario/framework-tests/e2e-scenario-first-migration.test.ts +++ /dev/null @@ -1,102 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -/** - * Phase 6: Migrate First Scenario - ubuntu-repo-cloud-openclaw. - * Verifies resolver output, plan printout, and dry-run phase ordering. - */ - -import { describe, it, expect } from "vitest"; -import { spawnSync } from "node:child_process"; -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; - -import { loadMetadataFromDir } from "../runtime/resolver/load.ts"; -import { resolveScenario } from "../runtime/resolver/plan.ts"; - -const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); -const E2E_DIR = path.join(REPO_ROOT, "test/e2e-scenario"); -const RUN_SCENARIO = path.join(E2E_DIR, "runtime", "run-scenario.sh"); - -describe("Phase 6: ubuntu-repo-cloud-openclaw migration", () => { - it("ubuntu_repo_cloud_openclaw_should_resolve_to_cloud_openclaw_ready", () => { - const meta = loadMetadataFromDir(E2E_DIR); - const plan = resolveScenario("ubuntu-repo-cloud-openclaw", meta); - expect(plan.expected_state.id).toBe("cloud-openclaw-ready"); - const suiteIds = plan.suites.map((s) => s.id); - expect(suiteIds).toContain("smoke"); - expect(suiteIds).toContain("inference"); - }); - - it("ubuntu_repo_cloud_openclaw_plan_should_include_setup_install_onboard", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-first-")); - try { - const r = spawnSync( - "bash", - [RUN_SCENARIO, "ubuntu-repo-cloud-openclaw", "--plan-only"], - { env: { ...process.env, E2E_CONTEXT_DIR: tmp }, encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 
60_000), cwd: REPO_ROOT }, - ); - expect(r.status, r.stderr).toBe(0); - expect(r.stdout).toMatch(/install=repo-current/); - expect(r.stdout).toMatch(/runtime=docker-running/); - expect(r.stdout).toMatch(/onboarding=cloud-openclaw/); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("ubuntu_repo_cloud_openclaw_dry_run_should_execute_phases_in_order", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-first-")); - try { - const trace = path.join(tmp, "trace.log"); - const r = spawnSync( - "bash", - [RUN_SCENARIO, "ubuntu-repo-cloud-openclaw", "--dry-run"], - { - env: { ...process.env, E2E_CONTEXT_DIR: tmp, E2E_TRACE_FILE: trace }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), - cwd: REPO_ROOT, - }, - ); - expect(r.status, r.stderr).toBe(0); - expect(fs.existsSync(trace)).toBe(true); - const contents = fs.readFileSync(trace, "utf8"); - const order = [ - "env:noninteractive", - "install:repo-current", - "onboard:cloud-openclaw", - "gateway:check", - "sandbox:check", - ]; - let pos = 0; - for (const marker of order) { - const idx = contents.indexOf(marker, pos); - expect(idx, `missing marker ${marker}. trace:\n${contents}`).toBeGreaterThanOrEqual(0); - pos = idx + marker.length; - } - // The run should also seed the context and produce plan.json. - expect(fs.existsSync(path.join(tmp, "context.env"))).toBe(true); - expect(fs.existsSync(path.join(tmp, "plan.json"))).toBe(true); - // After dry-run, suite runner should be able to execute the full - // suite sequence against the emitted context. - const suites = spawnSync( - "bash", - [path.join(E2E_DIR, "runtime", "run-suites.sh"), "smoke", "inference"], - { - env: { ...process.env, E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 
60_000), - cwd: REPO_ROOT, - }, - ); - expect(suites.status, `suite stderr:${suites.stderr}\nstdout:${suites.stdout}`).toBe(0); - expect(suites.stdout).toMatch(/PASS smoke\/cli-available/); - expect(suites.stdout).toMatch(/PASS inference\/models-health/); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); -}); diff --git a/test/e2e-scenario/framework-tests/e2e-scenario-resolver.test.ts b/test/e2e-scenario/framework-tests/e2e-scenario-resolver.test.ts index dc4f105884..0111aa0e42 100644 --- a/test/e2e-scenario/framework-tests/e2e-scenario-resolver.test.ts +++ b/test/e2e-scenario/framework-tests/e2e-scenario-resolver.test.ts @@ -199,62 +199,6 @@ suites: }); }); -describe("run-scenario.sh --plan-only", () => { - it("run_scenario_plan_only_should_print_plan", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-plan-")); - try { - const result = spawnSync( - "bash", - [ - path.join(E2E_DIR, "runtime", "run-scenario.sh"), - "ubuntu-repo-cloud-openclaw", - "--plan-only", - ], - { - env: { ...process.env, E2E_CONTEXT_DIR: tmp }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 
60_000), - cwd: REPO_ROOT, - }, - ); - expect(result.status, result.stderr).toBe(0); - expect(result.stdout).toContain("ubuntu-repo-cloud-openclaw"); - expect(result.stdout).toContain("cloud-openclaw-ready"); - expect(result.stdout).toContain("smoke"); - expect(result.stdout).toContain("inference"); - const planJsonPath = path.join(tmp, "plan.json"); - expect(fs.existsSync(planJsonPath)).toBe(true); - const doc = JSON.parse(fs.readFileSync(planJsonPath, "utf8")); - expect(doc.scenario_id).toBe("ubuntu-repo-cloud-openclaw"); - expect(doc.expected_state.id).toBe("cloud-openclaw-ready"); - expect(Array.isArray(doc.suites)).toBe(true); - expect(doc.suites.map((s: { id: string }) => s.id)).toContain("smoke"); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("run_scenario_plan_only_should_fail_for_unknown_scenario", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-plan-")); - try { - const result = spawnSync( - "bash", - [ - path.join(E2E_DIR, "runtime", "run-scenario.sh"), - "does-not-exist", - "--plan-only", - ], - { - env: { ...process.env, E2E_CONTEXT_DIR: tmp }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 60_000), - cwd: REPO_ROOT, - }, - ); - expect(result.status).not.toBe(0); - expect(`${result.stderr}${result.stdout}`).toMatch(/does-not-exist/); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); -}); +// run-scenario.sh-based plan-only tests removed: the bash runner is +// now a fail-fast stub. Equivalent coverage of the typed runner lives in +// e2e-plan-compiler.test.ts and e2e-scenario-registry.test.ts. 
diff --git a/test/e2e-scenario/framework-tests/e2e-scenarios-workflow.test.ts b/test/e2e-scenario/framework-tests/e2e-scenarios-workflow.test.ts index eb1be9ae19..5a1e3d8906 100644 --- a/test/e2e-scenario/framework-tests/e2e-scenarios-workflow.test.ts +++ b/test/e2e-scenario/framework-tests/e2e-scenarios-workflow.test.ts @@ -50,8 +50,9 @@ jobs: "run-scenario job must use the resolved runner output", "run-scenario job missing step: Run typed scenarios in WSL", "artifact upload name must include the scenarios input", - "artifact upload must include hidden .e2e files", - "artifact upload path must include .e2e/", + "artifact upload must set include-hidden-files: false (raw context.env must not leak)", + "artifact upload path must include .e2e/actions/ (redacted action evidence)", + "artifact upload path must include .e2e/logs/ (redacted shell-step evidence)", ]), ); } finally { diff --git a/test/e2e-scenario/framework-tests/e2e-suite-runner.test.ts b/test/e2e-scenario/framework-tests/e2e-suite-runner.test.ts deleted file mode 100644 index ded16c1917..0000000000 --- a/test/e2e-scenario/framework-tests/e2e-suite-runner.test.ts +++ /dev/null @@ -1,250 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -import { describe, it, expect } from "vitest"; -import { spawnSync, type SpawnSyncReturns } from "node:child_process"; -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; -const REPO_ROOT = path.resolve(import.meta.dirname, "../../.."); -const RUN_SUITES = path.join(REPO_ROOT, "test/e2e-scenario/runtime/run-suites.sh"); - -function runSuites(args: string[], env: Record = {}): SpawnSyncReturns { - return spawnSync("bash", [RUN_SUITES, ...args], { - env: { ...process.env, ...env }, - encoding: "utf8", - timeout: Number(process.env.E2E_SPAWN_TIMEOUT_MS ?? 
60_000), - cwd: REPO_ROOT, - }); -} - -function seedContext(tmp: string, values: Record): void { - fs.mkdirSync(tmp, { recursive: true }); - const ctx = Object.entries(values) - .map(([k, v]) => `${k}=${v}`) - .join("\n"); - fs.writeFileSync(path.join(tmp, "context.env"), `${ctx}\n`); -} - -function fullContext(): Record { - return { - E2E_SCENARIO: "ubuntu-repo-cloud-openclaw", - E2E_PLATFORM_OS: "ubuntu", - E2E_EXECUTION_TARGET: "local", - E2E_INSTALL_METHOD: "repo-checkout", - E2E_CONTAINER_ENGINE: "docker", - E2E_CONTAINER_DAEMON: "running", - E2E_ONBOARDING_PATH: "cloud", - E2E_AGENT: "openclaw", - E2E_PROVIDER: "nvidia", - E2E_SANDBOX_NAME: "e2e-ubuntu-repo-cloud-openclaw", - E2E_GATEWAY_URL: "http://127.0.0.1:18789", - E2E_INFERENCE_ROUTE: "inference-local", - }; -} - -describe("Issue #3810 messaging suite wiring", () => { - it("should_define_real_steps_for_messaging_provider_suites", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-messaging-suites-")); - try { - const baseContext = { - ...fullContext(), - E2E_PROVIDER: "telegram", - E2E_MESSAGING_PROVIDER: "telegram", - E2E_MESSAGING_BRIDGE_URL: "http://127.0.0.1:18789", - E2E_MESSAGING_CONFIG_CONTENT: "TELEGRAM_BOT_TOKEN=PLACEHOLDER", - }; - seedContext(tmp, baseContext); - const telegram = runSuites(["messaging-telegram"], { - E2E_CONTEXT_DIR: tmp, - E2E_DRY_RUN: "1", - }); - expect(telegram.status, `stderr:${telegram.stderr}\nstdout:${telegram.stdout}`).toBe(0); - seedContext(tmp, { - ...baseContext, - E2E_MESSAGING_PROVIDER: "discord", - E2E_MESSAGING_CONFIG_CONTENT: "DISCORD_BOT_TOKEN=PLACEHOLDER", - }); - const discord = runSuites(["messaging-discord"], { - E2E_CONTEXT_DIR: tmp, - E2E_DRY_RUN: "1", - }); - expect(discord.status, `stderr:${discord.stderr}\nstdout:${discord.stdout}`).toBe(0); - seedContext(tmp, { - ...baseContext, - E2E_MESSAGING_PROVIDER: "slack", - E2E_MESSAGING_CHANNEL: "bot", - E2E_MESSAGING_CONFIG_CONTENT: "SLACK_BOT_TOKEN=PLACEHOLDER", - }); - const slack = 
runSuites(["messaging-slack"], { - E2E_CONTEXT_DIR: tmp, - E2E_DRY_RUN: "1", - }); - expect(slack.status, `stderr:${slack.stderr}\nstdout:${slack.stdout}`).toBe(0); - const output = `${telegram.stdout}\n${discord.stdout}\n${slack.stdout}`; - for (const id of [ - "messaging-provider-attached", - "messaging-placeholder-configured", - "messaging-no-secret-leak", - "messaging-bridge-reachable", - "telegram-injection-safety", - "discord-gateway-path", - "slack-provider-state", - "slack.runtime-discovery", - ]) { - expect(output).toContain(id); - } - expect(output).not.toContain("cli-available"); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); -}); - -describe("run-suites.sh", () => { - it("security_credentials_suite_should_emit_stable_assertion_ids", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-security-credentials-")); - try { - seedContext(tmp, { ...fullContext(), E2E_CREDENTIALS_EXPECTED: "present" }); - const r = runSuites(["security-credentials"], { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1", HOME: tmp }); - expect(r.status, `stderr:${r.stderr}\nstdout:${r.stdout}`).toBe(0); - expect(r.stdout).toContain("post-onboard.credentials.gateway-list-redacts-values"); - expect(r.stdout).toContain("post-onboard.credentials.no-plaintext-host-store"); - expect(r.stdout).not.toMatch(/no-credentials-leaked|assert\//); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("run_suites_should_run_steps_in_declared_order", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-suite-")); - try { - seedContext(tmp, fullContext()); - const r = runSuites(["smoke"], { - E2E_CONTEXT_DIR: tmp, - E2E_DRY_RUN: "1", - }); - expect(r.status, `stderr:${r.stderr}\nstdout:${r.stdout}`).toBe(0); - // Smoke order is: cli-available, gateway-health, sandbox-listed, sandbox-shell - const order = ["cli-available", "gateway-health", "sandbox-listed", "sandbox-shell"]; - let pos = 0; - for (const marker of order) { - 
const idx = r.stdout.indexOf(marker, pos); - expect(idx, `missing marker ${marker} after ${pos} in:\n${r.stdout}`).toBeGreaterThanOrEqual(0); - pos = idx + marker.length; - } - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("run_suites_should_fail_on_unknown_suite", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-suite-")); - try { - seedContext(tmp, fullContext()); - const r = runSuites(["does-not-exist"], { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }); - expect(r.status).not.toBe(0); - expect(`${r.stdout}${r.stderr}`).toMatch(/does-not-exist/); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("run_suites_should_stop_on_first_failed_step", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-suite-")); - try { - seedContext(tmp, fullContext()); - // Use a fixture suites file with a failing middle step. - const fixtureSuites = path.join(tmp, "suites.yaml"); - const fixtureDir = path.join(tmp, "suites", "fixture"); - fs.mkdirSync(fixtureDir, { recursive: true }); - fs.writeFileSync(path.join(fixtureDir, "00-a.sh"), "#!/usr/bin/env bash\necho A-RAN\nexit 0\n"); - fs.writeFileSync(path.join(fixtureDir, "01-b.sh"), "#!/usr/bin/env bash\necho B-RAN\nexit 1\n"); - fs.writeFileSync(path.join(fixtureDir, "02-c.sh"), "#!/usr/bin/env bash\necho C-RAN\nexit 0\n"); - fs.chmodSync(path.join(fixtureDir, "00-a.sh"), 0o755); - fs.chmodSync(path.join(fixtureDir, "01-b.sh"), 0o755); - fs.chmodSync(path.join(fixtureDir, "02-c.sh"), 0o755); - fs.writeFileSync( - fixtureSuites, - `suites: - fixture: - steps: - - { id: a, script: suites/fixture/00-a.sh } - - { id: b, script: suites/fixture/01-b.sh } - - { id: c, script: suites/fixture/02-c.sh } -`, - ); - const r = runSuites(["fixture"], { - E2E_CONTEXT_DIR: tmp, - E2E_SUITES_FILE: fixtureSuites, - E2E_SUITES_DIR: tmp, - }); - expect(r.status).not.toBe(0); - expect(r.stdout).toContain("A-RAN"); - expect(r.stdout).toContain("B-RAN"); - 
expect(r.stdout).not.toContain("C-RAN"); - expect(`${r.stdout}${r.stderr}`).toMatch(/FAIL.*(fixture\/b|step=b)/i); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("smoke_suite_should_require_context", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-suite-")); - try { - // No context.env written to tmp. - const r = runSuites(["smoke"], { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }); - expect(r.status).not.toBe(0); - expect(`${r.stderr}${r.stdout}`).toMatch(/context\.env|E2E_SCENARIO|missing/i); - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("rebuild_and_upgrade_suites_should_emit_stable_assertion_ids_in_dry_run", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-suite-")); - try { - seedContext(tmp, fullContext()); - const r = runSuites(["rebuild", "upgrade"], { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }); - expect(r.status, `stderr:${r.stderr}\nstdout:${r.stdout}`).toBe(0); - for (const id of [ - "suite.rebuild.workspace_state_preserved", - "suite.rebuild.agent_version_upgraded", - "suite.rebuild.inference_still_works", - "suite.rebuild.policy_presets_preserved", - "suite.rebuild.hermes_config_preserved", - "suite.upgrade.sandbox_registry_preserved", - "suite.upgrade.gateway_version_upgraded", - "suite.upgrade.survivor_agent_reachable", - ]) { - expect(r.stdout).toContain(id); - } - } finally { - fs.rmSync(tmp, { recursive: true, force: true }); - } - }); - - it("smoke_and_inference_run_with_stub_context", () => { - const tmp = fs.mkdtempSync(path.join(os.tmpdir(), "e2e-suite-")); - try { - seedContext(tmp, fullContext()); - const r = runSuites(["smoke", "inference"], { E2E_CONTEXT_DIR: tmp, E2E_DRY_RUN: "1" }); - expect(r.status, `stderr:${r.stderr}\nstdout:${r.stdout}`).toBe(0); - for (const id of [ - "cli-available", - "gateway-health", - "sandbox-listed", - "sandbox-shell", - "models-health", - "chat-completion", - "sandbox-inference-local", - ]) { - 
-        expect(r.stdout).toContain(id);
-      }
-      // Summary should call out PASS for each step.
-      expect(r.stdout).toMatch(/PASS/);
-    } finally {
-      fs.rmSync(tmp, { recursive: true, force: true });
-    }
-  });
-});
diff --git a/test/e2e-scenario/nemoclaw_scenarios/dispatch-action.sh b/test/e2e-scenario/nemoclaw_scenarios/dispatch-action.sh
new file mode 100755
index 0000000000..5aaca1b2c1
--- /dev/null
+++ b/test/e2e-scenario/nemoclaw_scenarios/dispatch-action.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Phase-action launcher for the hybrid scenario E2E framework.
+#
+# The phase orchestrators (EnvironmentOrchestrator, OnboardingOrchestrator)
+# call this launcher to invoke a function defined in a sourced shell
+# dispatcher (install/dispatch.sh or onboard/dispatch.sh). Those
+# dispatchers are intentionally library-style (function definitions
+# only); this script gives them a deterministic executable entrypoint
+# the typed runner can spawn.
+#
+# Usage:
+#   dispatch-action.sh <function> <arg> <dispatcher-script>
+#
+# Examples:
+#   dispatch-action.sh e2e_install repo-current \
+#     test/e2e-scenario/nemoclaw_scenarios/install/dispatch.sh
+#
+#   dispatch-action.sh e2e_onboard cloud-openclaw \
+#     test/e2e-scenario/nemoclaw_scenarios/onboard/dispatch.sh
+#
+# Environment (set by the orchestrator):
+#   E2E_CONTEXT_DIR   artifact directory
+#   E2E_PHASE         environment | onboarding
+#   E2E_ACTION_ID     stable action id, used for trace/log correlation
+
+set -euo pipefail
+
+if [[ $# -lt 3 ]]; then
+  echo "dispatch-action.sh: usage: dispatch-action.sh <function> <arg> <dispatcher-script>" >&2
+  exit 2
+fi
+
+ACTION_FN="$1"
+ACTION_ARG="$2"
+DISPATCHER="$3"
+
+if [[ ! -f "${DISPATCHER}" ]]; then
+  echo "dispatch-action.sh: dispatcher script not found: ${DISPATCHER}" >&2
+  exit 2
+fi
+
+# Source the runtime/lib helpers the dispatchers (and their workers) rely on.
+RUNTIME_LIB="$(cd "$(dirname "${BASH_SOURCE[0]}")/../runtime/lib" && pwd)" +# shellcheck source=runtime/lib/env.sh +. "${RUNTIME_LIB}/env.sh" +# shellcheck source=runtime/lib/context.sh +. "${RUNTIME_LIB}/context.sh" + +# Apply the standard non-interactive env once, on the very first action of +# the run. Subsequent actions in the same run see the env via process +# inheritance. e2e_env_apply_noninteractive is idempotent. +e2e_env_apply_noninteractive +e2e_env_trace "phase:${E2E_PHASE:-unknown}/action:${E2E_ACTION_ID:-unknown}" + +# IMPORTANT: do NOT call e2e_context_init here. The TS framework +# (ScenarioRunner.seedContextEnv) is the single owner of context.env +# initialization for the run; e2e_context_init opens with `: > ctx` +# which would truncate the file and wipe seeded keys (E2E_SCENARIO, +# E2E_GATEWAY_URL, ...) that runtime assertions require. +# Workers may still call e2e_context_set to extend context.env in place. + +# Source the dispatcher last so its function definitions are in scope +# when we invoke the requested function. +# shellcheck source=/dev/null +. "${DISPATCHER}" + +if ! declare -F "${ACTION_FN}" >/dev/null 2>&1; then + echo "dispatch-action.sh: function not found in dispatcher: ${ACTION_FN}" >&2 + exit 2 +fi + +"${ACTION_FN}" "${ACTION_ARG}" diff --git a/test/e2e-scenario/nemoclaw_scenarios/fixtures/older-base-image.sh b/test/e2e-scenario/nemoclaw_scenarios/fixtures/older-base-image.sh index 3d49c03116..d10fbd2c9d 100755 --- a/test/e2e-scenario/nemoclaw_scenarios/fixtures/older-base-image.sh +++ b/test/e2e-scenario/nemoclaw_scenarios/fixtures/older-base-image.sh @@ -12,8 +12,6 @@ # older_base_image_prepare [--registry ghcr.io/nvidia/nemoclaw] # Writes a minimal Dockerfile to a temp location whose first line is # `FROM :`, and prints the Dockerfile path on stdout. -# Honors E2E_DRY_RUN: skips the `docker pull` step (but still writes -# the Dockerfile, which is what callers inspect). 
# older_base_image_cleanup # Removes the generated Dockerfile and (if present) its build context. @@ -50,11 +48,9 @@ LABEL nemoclaw.e2e.fixture=older-base-image EOF e2e_env_trace "fixture:older-base-image" "${registry}:${tag}" - if ! e2e_env_is_dry_run; then - if command -v docker >/dev/null 2>&1; then - docker pull "${registry}:${tag}" >&2 \ - || echo "older_base_image_prepare: docker pull failed (continuing; build may still succeed on cached layers)" >&2 - fi + if command -v docker >/dev/null 2>&1; then + docker pull "${registry}:${tag}" >&2 \ + || echo "older_base_image_prepare: docker pull failed (continuing; build may still succeed on cached layers)" >&2 fi printf '%s\n' "${dockerfile}" } diff --git a/test/e2e-scenario/nemoclaw_scenarios/install/dispatch.sh b/test/e2e-scenario/nemoclaw_scenarios/install/dispatch.sh index 7ea798cfdf..1a2ec2b0aa 100755 --- a/test/e2e-scenario/nemoclaw_scenarios/install/dispatch.sh +++ b/test/e2e-scenario/nemoclaw_scenarios/install/dispatch.sh @@ -4,7 +4,7 @@ # # Install dispatcher. Routes by install-method / profile id to one of four # split helpers (repo-current.sh, public-curl.sh, ollama.sh, -# launchable.sh). Honors E2E_DRY_RUN. +# launchable.sh). 
# # Accepts both legacy install-method names (repo-checkout, # curl-install-script) and the new profile-centric names used by diff --git a/test/e2e-scenario/nemoclaw_scenarios/install/launchable.sh b/test/e2e-scenario/nemoclaw_scenarios/install/launchable.sh index 5ec638e90a..09d8aa3bbb 100755 --- a/test/e2e-scenario/nemoclaw_scenarios/install/launchable.sh +++ b/test/e2e-scenario/nemoclaw_scenarios/install/launchable.sh @@ -18,11 +18,6 @@ _E2E_INST_LNCH_RUNTIME_LIB="$(cd "${_E2E_INST_LNCH_DIR}/../../runtime/lib" && pw e2e_install_launchable() { e2e_env_trace "install-launchable" - if e2e_env_is_dry_run; then - echo "[dry-run] install-launchable (skipped)" - return 0 - fi - # Match nightly launchable-smoke-e2e: exercise the launchable bootstrap # script on the current runner instead of assuming a pre-provisioned Brev VM. # The script has no Brev API dependency; it installs Docker/OpenShell/NemoClaw diff --git a/test/e2e-scenario/nemoclaw_scenarios/install/ollama.sh b/test/e2e-scenario/nemoclaw_scenarios/install/ollama.sh index a9d5f81c14..449eae519a 100755 --- a/test/e2e-scenario/nemoclaw_scenarios/install/ollama.sh +++ b/test/e2e-scenario/nemoclaw_scenarios/install/ollama.sh @@ -17,10 +17,6 @@ _E2E_INST_OL_RUNTIME_LIB="$(cd "${_E2E_INST_OL_DIR}/../../runtime/lib" && pwd)" e2e_install_ollama() { e2e_env_trace "install-ollama" - if e2e_env_is_dry_run; then - echo "[dry-run] install-ollama (skipped)" - return 0 - fi local ollama_url="${E2E_OLLAMA_INSTALL_URL:-https://ollama.ai/install.sh}" if ! command -v ollama >/dev/null 2>&1; then if ! 
curl -fsSL --retry 3 --retry-delay 2 "${ollama_url}" | bash; then diff --git a/test/e2e-scenario/nemoclaw_scenarios/install/public-curl.sh b/test/e2e-scenario/nemoclaw_scenarios/install/public-curl.sh index 143d097f0d..6628e332a2 100755 --- a/test/e2e-scenario/nemoclaw_scenarios/install/public-curl.sh +++ b/test/e2e-scenario/nemoclaw_scenarios/install/public-curl.sh @@ -16,10 +16,6 @@ _E2E_INST_CURL_RUNTIME_LIB="$(cd "${_E2E_INST_CURL_DIR}/../../runtime/lib" && pw e2e_install_curl() { e2e_env_trace "install-curl" - if e2e_env_is_dry_run; then - echo "[dry-run] install-curl (skipped)" - return 0 - fi local url="${E2E_INSTALLER_URL:-https://raw.githubusercontent.com/NVIDIA/NemoClaw/main/scripts/install.sh}" local sha256="${E2E_INSTALLER_SHA256:-}" local tmp diff --git a/test/e2e-scenario/nemoclaw_scenarios/install/repo-current.sh b/test/e2e-scenario/nemoclaw_scenarios/install/repo-current.sh index 8c985dc3f7..000431a4b8 100755 --- a/test/e2e-scenario/nemoclaw_scenarios/install/repo-current.sh +++ b/test/e2e-scenario/nemoclaw_scenarios/install/repo-current.sh @@ -5,7 +5,6 @@ # Install from a checked-out repo (repo-current / repo-checkout profile). # # Split from the install dispatcher to keep scenario setup logic flat and to -# make the per-profile code discoverable by grep. Honors E2E_DRY_RUN. _E2E_INST_REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" _E2E_INST_REPO_RUNTIME_LIB="$(cd "${_E2E_INST_REPO_DIR}/../../runtime/lib" && pwd)" @@ -16,10 +15,6 @@ _E2E_INST_REPO_RUNTIME_LIB="$(cd "${_E2E_INST_REPO_DIR}/../../runtime/lib" && pw e2e_install_repo() { e2e_env_trace "install-repo" - if e2e_env_is_dry_run; then - echo "[dry-run] install-repo (skipped)" - return 0 - fi local repo_root repo_root="$(cd "${_E2E_INST_REPO_DIR}/../../../.." 
&& pwd)" cd "${repo_root}" || return diff --git a/test/e2e-scenario/nemoclaw_scenarios/onboard/cloud-openclaw-no-docker.sh b/test/e2e-scenario/nemoclaw_scenarios/onboard/cloud-openclaw-no-docker.sh new file mode 100644 index 0000000000..9c7b9803f1 --- /dev/null +++ b/test/e2e-scenario/nemoclaw_scenarios/onboard/cloud-openclaw-no-docker.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Onboard worker: cloud-openclaw-no-docker profile. +# +# Drives the negative `ubuntu-no-docker-preflight-negative` scenario by: +# +# 1. Installing a `docker` shim earlier on PATH that exits non-zero +# with a "Cannot connect to the Docker daemon" message. This makes +# `commandExists("docker")` succeed (the binary is present) while +# `docker info` fails — matching the production failure mode users +# see when Docker is installed but the daemon is not running. +# +# 2. Running `nemoclaw onboard --non-interactive` with stdout+stderr +# captured to `${E2E_CONTEXT_DIR}/negative-preflight.log`. The +# `onboarding.preflight.expected-failed` assertion greps that file. +# +# 3. Asserting that nemoclaw exits non-zero (preflight DID fail). If +# onboard unexpectedly succeeds, the action fails so the operator +# sees a clear "expected failure did not happen" signal instead of a +# green light masking a regression. +# +# 4. Returning 0 on the *expected* failure path so the orchestrator +# reports the action as passed and the assertion phase runs against +# the captured log. Without this, the action would be marked failed +# and the dependent assertions would be skipped. +# +# Pattern mirrors test/e2e/e2e-cloud-experimental/test-port8080-conflict.sh, +# which sets up a different failure condition (port 8080 occupied) but +# follows the same capture-output / check-exit / grep-log shape. 
+ +e2e_onboard_cloud_openclaw_no_docker() { + e2e_env_apply_noninteractive + e2e_context_init + + local log shim_dir rc=0 + log="${E2E_CONTEXT_DIR}/negative-preflight.log" + shim_dir="$(mktemp -d -t e2e-no-docker-XXXXXX)" + + cat >"${shim_dir}/docker" <<'SHIM' +#!/usr/bin/env bash +# Negative-preflight docker shim — preserves "docker is installed" while +# breaking "docker info" / "docker version" so preflight fails with the +# real "Cannot connect to the Docker daemon" message. +printf 'Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?\n' >&2 +exit 1 +SHIM + chmod +x "${shim_dir}/docker" + + echo "negative-preflight: shim docker installed at ${shim_dir}/docker" + echo "negative-preflight: log_file=${log}" + echo "negative-preflight: invoking nemoclaw onboard --non-interactive (expected to fail at preflight)" + + PATH="${shim_dir}:${PATH}" \ + nemoclaw onboard --non-interactive --yes-i-accept-third-party-software \ + >"${log}" 2>&1 || rc=$? + + rm -rf "${shim_dir}" + + echo "negative-preflight: nemoclaw onboard exited ${rc}" + if [[ -f "${log}" ]]; then + echo "--- captured log tail (${log}) ---" + tail -50 "${log}" 2>/dev/null || true + echo "--- end captured log ---" + fi + + if [[ "${rc}" -eq 0 ]]; then + echo "negative-preflight: ERROR: nemoclaw onboard unexpectedly exited 0; preflight should have failed when docker is unreachable" >&2 + return 1 + fi + + return 0 +} diff --git a/test/e2e-scenario/nemoclaw_scenarios/onboard/dispatch.sh b/test/e2e-scenario/nemoclaw_scenarios/onboard/dispatch.sh index 2baf698986..fba1004559 100755 --- a/test/e2e-scenario/nemoclaw_scenarios/onboard/dispatch.sh +++ b/test/e2e-scenario/nemoclaw_scenarios/onboard/dispatch.sh @@ -14,6 +14,8 @@ _E2E_ONBOARD_RUNTIME_LIB="$(cd "${_E2E_ONBOARD_DIR}/../../runtime/lib" && pwd)" . "${_E2E_ONBOARD_RUNTIME_LIB}/context.sh" # shellcheck source=cloud-openclaw.sh . 
"${_E2E_ONBOARD_DIR}/cloud-openclaw.sh" +# shellcheck source=cloud-openclaw-no-docker.sh +. "${_E2E_ONBOARD_DIR}/cloud-openclaw-no-docker.sh" # shellcheck source=cloud-hermes.sh . "${_E2E_ONBOARD_DIR}/cloud-hermes.sh" # shellcheck source=local-ollama-openclaw.sh @@ -26,14 +28,13 @@ e2e_onboard() { return 2 fi e2e_env_trace "onboard:${profile}" - if e2e_env_is_dry_run; then - echo "[dry-run] onboard profile=${profile} (skipped)" - return 0 - fi case "${profile}" in cloud-openclaw) e2e_onboard_cloud_openclaw ;; + cloud-openclaw-no-docker) + e2e_onboard_cloud_openclaw_no_docker + ;; cloud-openclaw-custom-policies) E2E_ONBOARDING_MODEL="${E2E_ONBOARDING_MODEL:-nvidia/nemotron-3-super-120b-a12b}" E2E_ONBOARDING_POLICY_PRESETS="${E2E_ONBOARDING_POLICY_PRESETS:-npm,pypi}" diff --git a/test/e2e-scenario/onboarding_assertions/preflight/00-preflight-passed.sh b/test/e2e-scenario/onboarding_assertions/preflight/00-preflight-passed.sh index 69bda6c47c..fb05606494 100755 --- a/test/e2e-scenario/onboarding_assertions/preflight/00-preflight-passed.sh +++ b/test/e2e-scenario/onboarding_assertions/preflight/00-preflight-passed.sh @@ -9,7 +9,14 @@ if [[ ! -f "${E2E_CONTEXT_DIR:-}/onboard.log" ]]; then exit 1 fi -if grep -Eiq "preflight.*(fail|error)|docker|container|daemon|socket" "${E2E_CONTEXT_DIR}/onboard.log"; then +# The onboarding action already completed (exit 0) for this assertion to +# run; we only need to confirm the captured onboard.log does not contain +# explicit preflight FAILURE markers. The previous regex matched any +# mention of 'docker' / 'container' / 'daemon' / 'socket', which a normal +# successful onboarding always logs. Tighten to actual failure phrases. 
+if grep -Eiq \ + "preflight[[:space:]]+(failed|error)|cannot connect to[[:space:]]+(the[[:space:]]+)?docker daemon|permission denied[[:space:]]+while trying to connect to.*docker.*sock|onboarding aborted|FATAL: docker|ERROR: docker daemon" \ + "${E2E_CONTEXT_DIR}/onboard.log"; then echo "FAIL: onboarding.preflight.passed - onboard log contains preflight failure evidence" exit 1 fi diff --git a/test/e2e-scenario/runtime/lib/env.sh b/test/e2e-scenario/runtime/lib/env.sh index ed33fb8a6a..9c33af97cc 100755 --- a/test/e2e-scenario/runtime/lib/env.sh +++ b/test/e2e-scenario/runtime/lib/env.sh @@ -40,8 +40,3 @@ e2e_env_trace() { printf '%s %s\n' "${event}" "$*" >>"${E2E_TRACE_FILE}" fi } - -# e2e_env_is_dry_run: true if E2E_DRY_RUN=1 -e2e_env_is_dry_run() { - [[ "${E2E_DRY_RUN:-0}" == "1" ]] -} diff --git a/test/e2e-scenario/runtime/run-scenario.sh b/test/e2e-scenario/runtime/run-scenario.sh index 58042c8523..2477ce79ec 100755 --- a/test/e2e-scenario/runtime/run-scenario.sh +++ b/test/e2e-scenario/runtime/run-scenario.sh @@ -2,482 +2,24 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # -# E2E scenario runner entrypoint. -# -# Usage: -# bash test/e2e-scenario/runtime/run-scenario.sh [--plan-only|--validate-only|--dry-run] -# -# Flags: -# --plan-only Resolve metadata and print the plan only. Writes -# ${E2E_CONTEXT_DIR:-.e2e}/plan.json for artifact upload. -# --validate-only Run the expected-state validator against the current -# context.env without running install/onboard/suites. -# Emits probe results JSON to stdout and writes -# ${E2E_CONTEXT_DIR}/expected-state-report.json. Used by -# the parity-compare workflow to collect per-assertion -# probe results. Mutually exclusive with --plan-only. -# --dry-run (reserved) Run orchestration with real side effects -# replaced by trace-logged stubs. Sets E2E_DRY_RUN=1 for -# helpers. Full dry-run orchestration lands in later phases. 
-# -# Environment: -# E2E_CONTEXT_DIR Override the scenario artifact directory -# (default: /.e2e/). +# DEPRECATED. The hybrid scenario architecture has a single supported runtime +# entrypoint: test/e2e-scenario/scenarios/run.ts. This bash runner duplicated +# install/onboard/gateway-check/suite-execution that now belongs in TS phase +# orchestrators (EnvironmentOrchestrator, OnboardingOrchestrator, +# RuntimeOrchestrator) and shared clients (HostCliClient, GatewayClient, +# SandboxClient). It is fail-fast so the deprecation is loud, not silent. set -euo pipefail -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -E2E_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" - -SCENARIO_ID="" -PLAN_ONLY=0 -VALIDATE_ONLY=0 -DRY_RUN=0 - -usage() { - cat >&2 <<'USAGE' -Usage: bash test/e2e-scenario/runtime/run-scenario.sh [--plan-only|--validate-only|--dry-run] -USAGE -} - -while [[ $# -gt 0 ]]; do - case "$1" in - --plan-only) - PLAN_ONLY=1 - shift - ;; - --validate-only) - VALIDATE_ONLY=1 - shift - ;; - --dry-run) - DRY_RUN=1 - shift - ;; - -h | --help) - usage - exit 0 - ;; - --*) - echo "run-scenario: unknown flag: $1" >&2 - usage - exit 2 - ;; - *) - if [[ -z "${SCENARIO_ID}" ]]; then - SCENARIO_ID="$1" - else - echo "run-scenario: unexpected positional argument: $1" >&2 - usage - exit 2 - fi - shift - ;; - esac -done - -if [[ -z "${SCENARIO_ID}" ]]; then - echo "run-scenario: missing scenario id" >&2 - usage - exit 2 -fi - -if [[ "${PLAN_ONLY}" -eq 1 && "${VALIDATE_ONLY}" -eq 1 ]]; then - echo "run-scenario: --plan-only and --validate-only are mutually exclusive" >&2 - usage - exit 2 -fi - -export E2E_CONTEXT_DIR="${E2E_CONTEXT_DIR:-${REPO_ROOT}/.e2e}" -mkdir -p "${E2E_CONTEXT_DIR}" - -if [[ "${DRY_RUN}" -eq 1 ]]; then - export E2E_DRY_RUN=1 -fi - -# Prefer the locally-installed tsx if present, otherwise fall back to npx. -TSX_BIN="${REPO_ROOT}/node_modules/.bin/tsx" -if [[ ! 
-x "${TSX_BIN}" ]]; then - TSX_BIN="" -fi - -run_resolver() { - if [[ -n "${TSX_BIN}" ]]; then - "${TSX_BIN}" "${SCRIPT_DIR}/resolver/index.ts" "$@" - return - fi - # CodeRabbit review item #10: fail closed with a clear hint instead of - # silently pulling tsx from the network via `npx --yes`. - if ! (cd "${REPO_ROOT}" && npx --no-install tsx "${SCRIPT_DIR}/resolver/index.ts" "$@"); then - echo "run-scenario: tsx is required but not installed. Run 'npm ci' at the repo root and retry." >&2 - return 1 - fi -} - -run_resolver plan "${SCENARIO_ID}" --context-dir "${E2E_CONTEXT_DIR}" - -if [[ "${PLAN_ONLY}" -eq 1 ]]; then - exit 0 -fi - -# --validate-only: assume setup has already completed. Skip install / -# onboard / suite execution and dispatch the expected-state validator -# using probes resolved from E2E_PROBE_OVERRIDE_* env vars. Emits the -# probe results JSON report to stdout and writes it to -# ${E2E_CONTEXT_DIR}/expected-state-report.json. -if [[ "${VALIDATE_ONLY}" -eq 1 ]]; then - validate_args=("${SCENARIO_ID}" --context-dir "${E2E_CONTEXT_DIR}") - if ! run_resolver validate-state "${validate_args[@]}"; then - echo "run-scenario: --validate-only: expected-state validation failed" >&2 - exit 3 - fi - exit 0 -fi - -# Source the shared helper library so we can exercise the full -# setup → install → onboard → gateway/sandbox check sequence. In dry-run -# mode each helper short-circuits (and writes to E2E_TRACE_FILE if set). -# shellcheck source=lib/env.sh -. "${SCRIPT_DIR}/lib/env.sh" -# shellcheck source=lib/context.sh -. "${SCRIPT_DIR}/lib/context.sh" -# shellcheck source=lib/negative.sh -. "${SCRIPT_DIR}/lib/negative.sh" -# shellcheck source=lib/port-holder.sh -. "${SCRIPT_DIR}/lib/port-holder.sh" -# shellcheck source=../nemoclaw_scenarios/install/dispatch.sh -. "${E2E_ROOT}/nemoclaw_scenarios/install/dispatch.sh" -# shellcheck source=../nemoclaw_scenarios/onboard/dispatch.sh -. 
"${E2E_ROOT}/nemoclaw_scenarios/onboard/dispatch.sh" -# shellcheck source=../validation_suites/assert/gateway-alive.sh -. "${E2E_ROOT}/validation_suites/assert/gateway-alive.sh" -# shellcheck source=../validation_suites/assert/sandbox-alive.sh -. "${E2E_ROOT}/validation_suites/assert/sandbox-alive.sh" - -# Apply standard non-interactive env (and trace it). -e2e_env_apply_noninteractive -e2e_env_trace "env:noninteractive" - -# Emit normalized context from the resolved plan. -e2e_context_init -"${E2E_ROOT}/nemoclaw_scenarios/helpers/emit-context-from-plan.sh" "${E2E_CONTEXT_DIR}/plan.json" - -# Extract the install method and onboarding profile from the plan so we can -# dispatch to the right helpers. -read_plan_string() { - local key="$1" - node -e " - const p = JSON.parse(require('fs').readFileSync(process.argv[1], 'utf8')); - const parts = process.argv[2].split('.'); - let cur = p; - for (const part of parts) { if (cur == null) { cur = ''; break; } cur = cur[part]; } - process.stdout.write(cur == null ? '' : String(cur)); - " "${E2E_CONTEXT_DIR}/plan.json" "${key}" -} - -INSTALL_ID="$(read_plan_string dimensions.install.id)" -INSTALL_METHOD="$(read_plan_string dimensions.install.profile.method)" -ONBOARDING_ID="$(read_plan_string dimensions.onboarding.id)" -RUNTIME_ID="$(read_plan_string dimensions.runtime.id)" -RUNTIME_CONTAINER_DAEMON="$(read_plan_string dimensions.runtime.profile.container_daemon)" -EXPECTED_STATE_ID="$(read_plan_string expected_state.id)" -FAILURE_STAGE="$(read_plan_string expected_state.config.failure.stage)" -FAILURE_EXIT_CODE="$(read_plan_string expected_state.config.failure.exit_code)" -FAILURE_MESSAGE_CONTAINS="$(read_plan_string expected_state.config.failure.message_contains)" -FAILURE_NO_STACK_TRACE="$(read_plan_string expected_state.config.failure.no_stack_trace)" - -# Trace the dimension id so scenario-level assertions can identify the -# configured install (e.g. repo-current); e2e_install internally traces -# the resolved method. 
-e2e_env_trace "install:${INSTALL_ID}" - -install_log="${E2E_CONTEXT_DIR}/install.log" -set +e -e2e_install "${INSTALL_METHOD}" >"${install_log}" 2>&1 -install_status=$? -set -e -if [[ "${install_status}" -ne 0 ]]; then - cat "${install_log}" >&2 - echo "run-scenario: install ${INSTALL_METHOD} failed with status ${install_status}" >&2 - exit "${install_status}" -fi -export PATH="${HOME}/.local/bin:${PATH}" -{ - printf 'PATH=%s\n' "${PATH}" - command -v nemoclaw || true -} >"${E2E_CONTEXT_DIR}/post-install-path.log" 2>&1 -if [[ "${DRY_RUN}" -eq 1 ]]; then - printf 'run-scenario: dry-run skipping post-install nemoclaw PATH verification\n' >&2 -else - nemoclaw_bin="$(command -v nemoclaw || true)" - if [[ -z "${nemoclaw_bin}" ]]; then - cat "${E2E_CONTEXT_DIR}/post-install-path.log" >&2 - echo "run-scenario: nemoclaw not found on PATH after install" >&2 - exit 127 - fi - printf 'run-scenario: using nemoclaw at %s\n' "${nemoclaw_bin}" >&2 -fi - -# Negative scenarios declare an `expected_failure` block on their expected -# state (see NemoClaw issue #3608). The runner forces the failure mode for -# the scenario, captures the setup log, gathers a side-effect inventory, and -# delegates structured matching to `resolver/index.ts match-failure`. The -# matcher writes `expected-vs-actual.json` for CI artifact upload. - -read_plan_failure_field() { - local key="$1" - node -e " - (() => { - const p = JSON.parse(require('fs').readFileSync(process.argv[1], 'utf8')); - const ef = p.expected_failure; - if (!ef) { process.stdout.write(''); return; } - const v = ef[process.argv[2]]; - process.stdout.write(v == null ? '' : Array.isArray(v) ? 
v.join(',') : String(v)); - })(); - " "${E2E_CONTEXT_DIR}/plan.json" "${key}" -} - -EXPECTED_FAILURE_PHASE="$(read_plan_failure_field phase)" - -if [[ -n "${EXPECTED_FAILURE_PHASE}" ]]; then - expected_error_class="$(read_plan_failure_field error_class)" - negative_log="${E2E_CONTEXT_DIR}/negative-${EXPECTED_FAILURE_PHASE}.log" - sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" - - # Snapshot the side-effect baseline BEFORE forcing the failure so we only - # report effects newly introduced by this scenario. A pre-existing gateway - # or credentials file from an earlier run would otherwise look like a fresh - # side effect and falsely fail negative scenarios in dirty environments. - baseline_sandbox=0 - if [[ -n "${sandbox_name}" ]] && openshell sandbox list 2>/dev/null | grep -Fq "${sandbox_name}"; then - baseline_sandbox=1 - fi - baseline_gateway=0 - if nemoclaw gateway status >/dev/null 2>&1; then - baseline_gateway=1 - fi - baseline_credentials=0 - if [[ -s "${HOME}/.nemoclaw/credentials.json" ]]; then - baseline_credentials=1 - fi - - # Force the failure mode declared by the scenario. Only `preflight` / - # `docker-missing` is implemented here; other phases are accepted by the - # schema but their forcing logic lands alongside the first consumer. 
- case "${EXPECTED_FAILURE_PHASE}:${expected_error_class}" in - preflight:docker-missing) - if [[ "${DRY_RUN}" -eq 1 ]]; then - printf 'Cannot connect to the Docker daemon during preflight\n' >"${negative_log}" - else - if DOCKER_HOST="unix:///tmp/nemoclaw-e2e-missing-docker.sock" \ - e2e_onboard "${ONBOARDING_ID}" >"${negative_log}" 2>&1; then - echo "run-scenario: expected preflight failure, but onboarding succeeded" >&2 - cat "${negative_log}" >&2 - exit 4 - fi - fi - ;; - *) - echo "run-scenario: expected_failure phase=${EXPECTED_FAILURE_PHASE} class=${expected_error_class} has no forcing implementation yet" >&2 - exit 2 - ;; - esac - - # Compute the side-effect delta: only count effects that were absent in the - # baseline and present after the forced failure. - observed_side_effects="" - if [[ "${baseline_sandbox}" -eq 0 ]] && [[ -n "${sandbox_name}" ]] \ - && openshell sandbox list 2>/dev/null | grep -Fq "${sandbox_name}"; then - observed_side_effects="${observed_side_effects:+${observed_side_effects},}sandbox-created" - fi - if [[ "${baseline_gateway}" -eq 0 ]] && nemoclaw gateway status >/dev/null 2>&1; then - observed_side_effects="${observed_side_effects:+${observed_side_effects},}gateway-started" - fi - if [[ "${baseline_credentials}" -eq 0 ]] && [[ -s "${HOME}/.nemoclaw/credentials.json" ]]; then - observed_side_effects="${observed_side_effects:+${observed_side_effects},}credentials-written" - fi - - # `--observed-error-class` is intentionally omitted: the runner does not yet - # derive a structured error class from the actual failure output, and - # reporting the planned class back to the matcher would make the check - # tautological. The matcher logs this as a skipped check. 
- match_args=( - match-failure "${SCENARIO_ID}" - --context-dir "${E2E_CONTEXT_DIR}" - --log "${negative_log}" - --observed-phase "${EXPECTED_FAILURE_PHASE}" - ) - if [[ -n "${observed_side_effects}" ]]; then - match_args+=(--observed-side-effects "${observed_side_effects}") - fi - if ! run_resolver "${match_args[@]}"; then - echo "run-scenario: expected-failure match failed; see ${E2E_CONTEXT_DIR}/expected-vs-actual.json" >&2 - exit 4 - fi - echo "run-scenario: negative scenario passed (phase=${EXPECTED_FAILURE_PHASE} class=${expected_error_class})" - exit 0 -fi - -if [[ "${EXPECTED_STATE_ID}" == "preflight-failure-no-sandbox" ]]; then - negative_log="${E2E_CONTEXT_DIR}/negative-preflight.log" - sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" - if [[ "${DRY_RUN}" -eq 1 ]]; then - printf 'Cannot connect to the Docker daemon during preflight\n' >"${negative_log}" - elif DOCKER_HOST="unix:///tmp/nemoclaw-e2e-missing-docker.sock" e2e_onboard "${ONBOARDING_ID}" >"${negative_log}" 2>&1; then - echo "run-scenario: expected preflight failure, but onboarding succeeded" >&2 - exit 4 - fi - if ! 
grep -Eiq "docker|container|daemon|socket|preflight" "${negative_log}"; then - echo "run-scenario: negative preflight failed without a clear Docker/preflight reason" >&2 - cat "${negative_log}" >&2 - exit 4 - fi - if openshell sandbox list 2>/dev/null | grep -Fq "${sandbox_name}"; then - echo "run-scenario: negative preflight left behind sandbox ${sandbox_name}" >&2 - exit 4 - fi - echo "run-scenario: negative preflight passed; Docker daemon unavailable and no sandbox was created" - exit 0 -fi - -if [[ "${FAILURE_STAGE}" == "onboarding" ]]; then - negative_log="${E2E_CONTEXT_DIR}/negative-onboarding.log" - sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" - port_holder_started=0 - onboard_env=(NEMOCLAW_SANDBOX_NAME="${sandbox_name}" NEMOCLAW_RECREATE_SANDBOX=1 NEMOCLAW_POLICY_MODE=skip) - case "${ONBOARDING_ID}" in - cloud-openclaw-invalid-nvidia-key) - onboard_env+=(NVIDIA_API_KEY=not-a-nvidia-key) - ;; - cloud-openclaw-gateway-port-conflict) - conflict_port="$(read_plan_string dimensions.onboarding.profile.gateway_port)" - : "${conflict_port:=18080}" - if e2e_port_holder_start "${conflict_port}"; then - port_holder_started=1 - else - echo "run-scenario: could not start port holder on ${conflict_port}; continuing against any existing listener" >&2 - fi - onboard_env+=(NEMOCLAW_GATEWAY_PORT="${conflict_port}") - ;; - esac - if [[ "${DRY_RUN}" -eq 1 ]]; then - printf '%s -' "${FAILURE_MESSAGE_CONTAINS}" >"${negative_log}" - negative_status="${FAILURE_EXIT_CODE:-1}" - else - set +e - ( - export "${onboard_env[@]}" - e2e_onboard "${ONBOARDING_ID}" - ) >"${negative_log}" 2>&1 - negative_status=$? - set -e - fi - if [[ "${port_holder_started}" -eq 1 ]]; then - e2e_port_holder_stop - fi - if ! 
e2e_negative_assert_failure "${negative_log}" "${negative_status}" "${FAILURE_EXIT_CODE:-1}" "${FAILURE_MESSAGE_CONTAINS}" "$([[ "${FAILURE_NO_STACK_TRACE}" == "true" ]] && echo 1 || echo 0)"; then - exit 4 - fi - if openshell sandbox list 2>/dev/null | grep -Fq "${sandbox_name}"; then - echo "run-scenario: negative onboarding left behind sandbox ${sandbox_name}" >&2 - exit 4 - fi - echo "run-scenario: negative onboarding ${ONBOARDING_ID} passed" - exit 0 -fi - -DOCKER_OPTIONAL_UNAVAILABLE=0 -if [[ "${RUNTIME_CONTAINER_DAEMON}" == "optional" ]] && ! docker info >/dev/null 2>&1; then - DOCKER_OPTIONAL_UNAVAILABLE=1 - echo "SKIP: scenario.${SCENARIO_ID}.docker-dependent-suites Docker unavailable for optional runtime ${RUNTIME_ID}; gateway/sandbox/inference coverage skipped" - echo "run-scenario: Docker unavailable for optional runtime ${RUNTIME_ID}; scaling back to platform-only suites" -else - onboard_log="${E2E_CONTEXT_DIR}/onboard.log" - set +e - e2e_onboard "${ONBOARDING_ID}" >"${onboard_log}" 2>&1 - onboard_status=$? - set -e - if [[ "${onboard_status}" -ne 0 ]]; then - cat "${onboard_log}" >&2 - echo "run-scenario: onboarding ${ONBOARDING_ID} failed with status ${onboard_status}" >&2 - exit "${onboard_status}" - fi - if [[ "${RUNTIME_ID}" == "gpu-docker-cdi" ]] && ! e2e_env_is_dry_run; then - echo "run-scenario: GPU Docker CDI uses host-network gateway; validating gateway from suites" - else - e2e_gateway_assert_healthy - fi - e2e_sandbox_assert_running -fi - -# Expected state validation. The validator reads E2E_PROBE_OVERRIDE_* env -# variables to simulate real probe outputs in dry-run/test contexts. -# Live probe wiring lands scenario-by-scenario; by default, live runs move -# straight from setup checks to suites so migrated suite assertions can be -# debugged against the real environment. 
-if [[ "${E2E_VALIDATE_EXPECTED_STATE:-0}" == "1" || "${DRY_RUN}" -eq 1 ]]; then - validate_args=("${SCENARIO_ID}" --context-dir "${E2E_CONTEXT_DIR}") - if [[ "${DRY_RUN}" -eq 1 ]]; then - # CodeRabbit review item #9: explicitly opt in to seeding probes from - # the expected state in dry-run/test mode. Live runs go through real - # probes and must fail closed if any are missing. - validate_args+=(--probes-from-state) - fi - if ! run_resolver validate-state "${validate_args[@]}"; then - echo "run-scenario: expected-state validation failed; suites will NOT run" >&2 - exit 3 - fi -fi - -if [[ "${DRY_RUN}" -eq 1 ]]; then - echo "run-scenario: dry-run complete; context.env emitted under ${E2E_CONTEXT_DIR}" - exit 0 -fi - -SUITE_IDS=() -while IFS= read -r suite_id; do - SUITE_IDS+=("${suite_id}") -done < <(node -e " - try { - const planPath = process.argv[1]; - const p = JSON.parse(require('fs').readFileSync(planPath, 'utf8')); - if (!Array.isArray(p.suites)) { - throw new Error('missing or invalid suites array'); - } - const filter = process.env.E2E_SUITE_FILTER || ''; - const selected = filter ? 
filter.split(',').map((s) => s.trim()).filter(Boolean) : p.suites.map((s) => s.id); - for (const id of selected) console.log(id); - } catch (err) { - console.error('run-scenario: failed to parse plan.json ' + process.argv[1] + ': ' + err.message); - process.exit(1); - } -" "${E2E_CONTEXT_DIR}/plan.json") - -if [[ "${#SUITE_IDS[@]}" -eq 0 ]]; then - echo "run-scenario: no suites selected for ${SCENARIO_ID}" >&2 - exit 4 -fi - -if [[ "${DOCKER_OPTIONAL_UNAVAILABLE}" -eq 1 ]]; then - FILTERED_SUITE_IDS=() - for suite_id in "${SUITE_IDS[@]}"; do - case "${suite_id}" in - smoke | inference | credentials | hermes-specific | local-ollama-inference | ollama-proxy | gateway-health | sandbox-shell | cloud-inference | ollama-auth-proxy | security-credentials | messaging-telegram | messaging-discord | messaging-slack | security-shields | inference-routing | sandbox-lifecycle | sandbox-operations | snapshot | rebuild | upgrade | diagnostics | docs-validation | openai-compatible-inference | inference-switch | kimi-compatibility | messaging-token-rotation | security-policy | security-injection | model-router) - echo "SKIP: suite.${suite_id} skipped because optional Docker runtime ${RUNTIME_ID} is unavailable" - ;; - *) - FILTERED_SUITE_IDS+=("${suite_id}") - ;; - esac - done - SUITE_IDS=("${FILTERED_SUITE_IDS[@]}") -fi +cat >&2 <<'MSG' +run-scenario.sh is deprecated. 
Use the TS runner instead: -if [[ "${#SUITE_IDS[@]}" -eq 0 ]]; then - echo "run-scenario: all suites skipped for ${SCENARIO_ID}" >&2 - exit 0 -fi + npx tsx test/e2e-scenario/scenarios/run.ts --scenarios <scenario-id> -bash "${SCRIPT_DIR}/run-suites.sh" "${SUITE_IDS[@]}" +Other run.ts modes (read-only): + --list List canonical scenario ids + --emit-matrix Emit GitHub Actions matrix payload from the registry + --plan-only Local debug: print the compiled plan, do not execute + (must NOT appear in any CI workflow) +MSG +exit 2 diff --git a/test/e2e-scenario/runtime/run-suites.sh b/test/e2e-scenario/runtime/run-suites.sh index e99c069408..dac69cd422 100755 --- a/test/e2e-scenario/runtime/run-suites.sh +++ b/test/e2e-scenario/runtime/run-suites.sh @@ -2,136 +2,20 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # -# Run one or more functional suites against a completed E2E environment. -# -# Usage: -# bash test/e2e-scenario/runtime/run-suites.sh [ ...] -# -# Reads suite metadata from test/e2e-scenario/validation_suites/suites.yaml -# (or $E2E_SUITES_FILE). Each suite script receives .e2e/context.env -# via E2E_CONTEXT_DIR and is expected to source runtime/lib/context.sh if -# it needs specific keys. -# -# Environment: -# E2E_CONTEXT_DIR Directory containing context.env (default: /.e2e) -# E2E_SUITES_FILE Override suites metadata file (for tests) -# E2E_SUITES_DIR Override the directory that suite scripts are resolved -# against (default: test/e2e-scenario/validation_suites/) -# E2E_DRY_RUN When 1, suite scripts run in dry-run mode themselves. -# -# Exit code: 0 if all steps pass; non-zero at the first failing step. +# DEPRECATED. Suite execution is now driven directly by the TS phase +# orchestrator (RuntimeOrchestrator -> PhaseOrchestrator.runShellStep) which +# spawns each migrated assertion step's implementation.ref shell script. +# There is no longer a YAML-walking bash suite runner.
set -euo pipefail -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -E2E_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" -REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" -VALIDATION_SUITES_DIR="${E2E_ROOT}/validation_suites" - -if (($# == 0)); then - echo "run-suites: at least one suite id required" >&2 - echo "Usage: bash test/e2e-scenario/runtime/run-suites.sh [ ...]" >&2 - exit 2 -fi - -export E2E_CONTEXT_DIR="${E2E_CONTEXT_DIR:-${REPO_ROOT}/.e2e}" -SUITES_FILE="${E2E_SUITES_FILE:-${VALIDATION_SUITES_DIR}/suites.yaml}" -SUITES_DIR="${E2E_SUITES_DIR:-${VALIDATION_SUITES_DIR}}" - -CTX_FILE="${E2E_CONTEXT_DIR}/context.env" -if [[ ! -f "${CTX_FILE}" ]]; then - echo "run-suites: missing ${CTX_FILE}; run-scenario.sh must emit context before running suites" >&2 - exit 1 -fi - -# Sanity-check that the baseline scenario key is present. -if ! grep -q '^E2E_SCENARIO=' "${CTX_FILE}"; then - echo "run-suites: ${CTX_FILE} is missing required key E2E_SCENARIO" >&2 - exit 1 -fi - -# Resolve the suite step list by reading the YAML via node. -resolve_suite() { - local suite_id="$1" - node -e " - const fs = require('fs'); - const path = process.argv[1]; - const wanted = process.argv[2]; - const raw = fs.readFileSync(path, 'utf8'); - // Minimal YAML reader: prefer js-yaml if available; else fall back. 
- let yaml; - try { yaml = require('js-yaml'); } catch (_) { - process.stderr.write('run-suites: js-yaml required to parse suite metadata\n'); - process.exit(2); - } - const doc = yaml.load(raw); - if (!doc || !doc.suites || !doc.suites[wanted]) { - process.stderr.write('run-suites: unknown suite: ' + wanted + '\n'); - process.exit(3); - } - const steps = doc.suites[wanted].steps || []; - for (const s of steps) { - if (!s || typeof s.id !== 'string' || typeof s.script !== 'string') { - process.stderr.write('run-suites: malformed step in ' + wanted + '\n'); - process.exit(4); - } - process.stdout.write(s.id + '\t' + s.script + '\n'); - } - " "${SUITES_FILE}" "${suite_id}" -} - -declare -a FAILED_STEPS=() -declare -a PASSED_STEPS=() -OVERALL_STATUS=0 - -run_one_suite() { - local suite_id="$1" - echo "== suite: ${suite_id} ==" - local steps - if ! steps="$(resolve_suite "${suite_id}")"; then - OVERALL_STATUS=1 - return 1 - fi - if [[ -z "${steps}" ]]; then - echo " (no steps)" - return 0 - fi - while IFS=$'\t' read -r step_id script; do - [[ -z "${step_id}" ]] && continue - local full="${SUITES_DIR}/${script}" - echo " -> step: ${step_id} (${script})" - if [[ ! -f "${full}" ]]; then - echo " FAIL: script not found at ${full}" >&2 - FAILED_STEPS+=("${suite_id}/${step_id}") - OVERALL_STATUS=1 - return 1 - fi - if ! bash "${full}"; then - echo " FAIL: suite=${suite_id} step=${step_id}" >&2 - FAILED_STEPS+=("${suite_id}/${step_id}") - OVERALL_STATUS=1 - return 1 - fi - echo " PASS: ${step_id}" - PASSED_STEPS+=("${suite_id}/${step_id}") - done <<<"${steps}" -} - -for suite_id in "$@"; do - if ! run_one_suite "${suite_id}"; then - break - fi -done +cat >&2 <<'MSG' +run-suites.sh is deprecated. Suite assertions are now executed by +test/e2e-scenario/scenarios/orchestrators/phase.ts via child_process.spawn, +walking the typed assertionGroups defined in the scenario registry. 
-echo -echo "== suite summary ==" -# bash 3.2 (macOS) fails on "${arr[@]}" when the array is empty under `set -u`; -# use the `${arr[@]+...}` guard to expand to nothing when empty. -for p in ${PASSED_STEPS[@]+"${PASSED_STEPS[@]}"}; do - echo " PASS ${p}" -done -for f in ${FAILED_STEPS[@]+"${FAILED_STEPS[@]}"}; do - echo " FAIL ${f}" -done +Run scenarios via: -exit "${OVERALL_STATUS}" + npx tsx test/e2e-scenario/scenarios/run.ts --scenarios <scenario-id> +MSG +exit 2 diff --git a/test/e2e-scenario/scenarios/assertions/environment.ts b/test/e2e-scenario/scenarios/assertions/environment.ts deleted file mode 100644 index be7a62e6fb..0000000000 --- a/test/e2e-scenario/scenarios/assertions/environment.ts +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -import type { AssertionGroup } from "../types.ts"; - -export function environmentBaseline(): AssertionGroup { - return { - id: "environment.baseline", - phase: "environment", - description: "Skeleton environment baseline assertion group.", - migrationStatus: "complete", - steps: [ - { - id: "environment.plan.skeleton", - phase: "environment", - description: "Placeholder step until live environment orchestration is migrated.", - implementation: { kind: "pending", ref: "phase-1-skeleton" }, - evidencePath: ".e2e/environment.result.json", - }, - ], - }; -} diff --git a/test/e2e-scenario/scenarios/assertions/onboarding.ts b/test/e2e-scenario/scenarios/assertions/onboarding.ts deleted file mode 100644 index 9886a701fb..0000000000 --- a/test/e2e-scenario/scenarios/assertions/onboarding.ts +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0 - -import type { AssertionGroup } from "../types.ts"; - -export function onboardingBaseline(): AssertionGroup { - return { - id: "onboarding.baseline", - phase: "onboarding", - description: "Skeleton onboarding assertion group.", - steps: [ - { - id: "onboarding.plan.skeleton", - phase: "onboarding", - description: "Placeholder step until onboarding assertions are migrated.", - implementation: { kind: "pending", ref: "phase-1-skeleton" }, - evidencePath: ".e2e/onboarding.result.json", - }, - ], - }; -} diff --git a/test/e2e-scenario/scenarios/assertions/registry.ts b/test/e2e-scenario/scenarios/assertions/registry.ts index d6ef59fe1c..2a7d6603f4 100644 --- a/test/e2e-scenario/scenarios/assertions/registry.ts +++ b/test/e2e-scenario/scenarios/assertions/registry.ts @@ -3,7 +3,6 @@ import fs from "node:fs"; import path from "node:path"; -import { environmentBaseline } from "./environment.ts"; import type { AssertionGroup, AssertionStep, PhaseName, ScenarioDefinition } from "../types.ts"; type Reliability = AssertionStep["reliability"]; @@ -25,22 +24,42 @@ function shellStep(input: ShellStepInput): AssertionStep { }; } -function probeStep(id: string, phase: PhaseName, ref: string, reliability?: Reliability): AssertionStep { +interface ProbeStepOptions { + reliability?: Reliability; + // When true, an unregistered probe fails the phase (and the run) + // instead of skipping. Use for security-sensitive probes the run + // is not safe without. 
+ required?: boolean; +} + +function probeStep( + id: string, + phase: PhaseName, + ref: string, + options: ProbeStepOptions = {}, +): AssertionStep { return { id, phase, implementation: { kind: "probe", ref }, evidencePath: `.e2e/assertions/${id}.json`, - reliability, + reliability: options.reliability, + required: options.required, }; } -function pendingStep(id: string, phase: PhaseName, ref: string): AssertionStep { +function pendingStep( + id: string, + phase: PhaseName, + ref: string, + options: { required?: boolean } = {}, +): AssertionStep { return { id, phase, implementation: { kind: "pending", ref }, evidencePath: `.e2e/assertions/${id}.json`, + required: options.required, }; } @@ -186,7 +205,21 @@ export const runtimeControlGroups: AssertionGroup[] = [ phase: "runtime", description: "Negative scenario runtime check ensuring forbidden side effects did not occur.", migrationStatus: "complete", - steps: [pendingStep("runtime.expected-failure.no-side-effects", "runtime", "expectedFailureNoSideEffectsProbe")], + steps: [ + pendingStep( + "runtime.expected-failure.no-side-effects", + "runtime", + "expectedFailureNoSideEffectsProbe", + // Negative scenarios assert that a declared failure mode + // produced no forbidden side effects. Until the side-effect + // validator is implemented, this step must fail closed for + // any scenario that opts into runtimeControlGroups[0] + // (i.e. scenario.expectedFailure is set). Skipping it would + // let negative scenarios silently "pass" without verifying + // their core contract. 
+ { required: true }, + ), + ], }, ]; @@ -219,9 +252,19 @@ export const validationSuiteGroups: AssertionGroup[] = [ ]), suiteGroup("credentials", credentialsSteps), suiteGroup("security-credentials", credentialsSteps), - suiteGroup("security-shields", [probeStep("security.shields.config", "runtime", "shieldsConfigProbe")]), - suiteGroup("security-policy", [probeStep("security.policy.enforced", "runtime", "networkPolicyProbe")]), - suiteGroup("security-injection", [probeStep("security.injection.blocked", "runtime", "injectionBlockedProbe")]), + // Security-sensitive probes MUST fail closed until the probe + // registry lands. A skipped shields/policy/injection check would + // produce fake-green for the exact suites these scenarios exist to + // protect. + suiteGroup("security-shields", [ + probeStep("security.shields.config", "runtime", "shieldsConfigProbe", { required: true }), + ]), + suiteGroup("security-policy", [ + probeStep("security.policy.enforced", "runtime", "networkPolicyProbe", { required: true }), + ]), + suiteGroup("security-injection", [ + probeStep("security.injection.blocked", "runtime", "injectionBlockedProbe", { required: true }), + ]), suiteGroup("messaging-telegram", [ shellStep({ id: "messaging.telegram.injection-safety", phase: "runtime", ref: "test/e2e-scenario/validation_suites/messaging/telegram/00-telegram-injection-safety.sh", reliability: { timeoutSeconds: 30, retry: { attempts: 2, on: ["external-tunnel"] } } }), shellStep({ id: "messaging.telegram.injection-payload-classes", phase: "runtime", ref: "test/e2e-scenario/validation_suites/messaging/telegram/01-telegram-injection-payload-classes.sh", reliability: { timeoutSeconds: 30, retry: { attempts: 2, on: ["external-tunnel"] } } }), @@ -254,7 +297,7 @@ export const validationSuiteGroups: AssertionGroup[] = [ ]; export const assertionRegistry = { - groups: [environmentBaseline(), ...onboardingAssertionGroups, ...runtimeControlGroups, ...validationSuiteGroups], + groups: 
[...onboardingAssertionGroups, ...runtimeControlGroups, ...validationSuiteGroups], }; export function assertionGroupForSuite(suiteId: string): AssertionGroup | undefined { @@ -349,8 +392,11 @@ export function assertionGroupsForScenario(scenario: ScenarioDefinition): Assert return group; }); + // Environment phase work is performed by typed PhaseAction entries + // (environment.install.${installId}) emitted from compiler.phaseActions(), + // not by assertion groups; context.env is seeded by the framework itself. + // No environment-phase assertion group is included in scenario plans. const groups: (AssertionGroup | undefined)[] = [ - environmentBaseline(), ...onboardingGroups, ...suiteGroups, ...supplementalGroups, diff --git a/test/e2e-scenario/scenarios/assertions/runtime.ts b/test/e2e-scenario/scenarios/assertions/runtime.ts deleted file mode 100644 index 5ed7031279..0000000000 --- a/test/e2e-scenario/scenarios/assertions/runtime.ts +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -import type { AssertionGroup } from "../types.ts"; - -export function runtimeSmokeSkeleton(): AssertionGroup { - return { - id: "runtime.smoke.skeleton", - phase: "runtime", - description: "Skeleton runtime smoke assertion group.", - steps: [ - { - id: "runtime.plan.skeleton", - phase: "runtime", - description: "Placeholder step until validation suites are migrated.", - implementation: { kind: "pending", ref: "phase-1-skeleton" }, - evidencePath: ".e2e/runtime.result.json", - }, - ], - }; -} diff --git a/test/e2e-scenario/scenarios/compiler.ts b/test/e2e-scenario/scenarios/compiler.ts index 5046c77dd2..796e8a05fc 100644 --- a/test/e2e-scenario/scenarios/compiler.ts +++ b/test/e2e-scenario/scenarios/compiler.ts @@ -6,7 +6,17 @@ import path from "node:path"; import { fileURLToPath } from "node:url"; import { loadManifest } from "./manifests.ts"; import { requireScenarios } from "./registry.ts"; -import type { AssertionGroup, NemoClawInstanceManifest, PhaseName, RunPlan, ScenarioDefinition, SutBoundary } from "./types.ts"; +import type { + AssertionGroup, + ExpectedFailureContract, + ExpectedFailurePhase, + NemoClawInstanceManifest, + PhaseAction, + PhaseName, + RunPlan, + ScenarioDefinition, + SutBoundary, +} from "./types.ts"; const PHASES: PhaseName[] = ["environment", "onboarding", "runtime"]; const REPO_ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../.."); @@ -67,17 +77,114 @@ function validateManifestCompatibility(scenario: ScenarioDefinition, manifest?: } } -function phaseActions(phase: PhaseName, scenario: ScenarioDefinition): string[] { +// Centralized paths to the existing shell helpers. Spec rule: shell +// scripts can remain as implementations, but invocation goes through +// typed assertion/action definitions, not bare workflow YAML or a +// resurrected bash runner. 
+const INSTALL_DISPATCH = "test/e2e-scenario/nemoclaw_scenarios/install/dispatch.sh"; +const ONBOARD_DISPATCH = "test/e2e-scenario/nemoclaw_scenarios/onboard/dispatch.sh"; + +// Default action timeouts. Install and onboarding can take a while on +// cold runners (Docker pulls, image builds, sandbox bootstrap). +const INSTALL_TIMEOUT_SECONDS = 900; +const ONBOARD_TIMEOUT_SECONDS = 900; + +// Declared parent-env secrets each onboarding profile actually needs. +// Anything not listed here (and not in the framework allowlist) is +// dropped before spawn by buildChildEnv. Keep this list minimal — +// every entry widens the secret blast radius if the child or one of +// its descendants logs unredacted output. +const ONBOARD_PROFILE_SECRET_ENV: Readonly> = { + // Cloud profiles invoke `nemoclaw onboard` which authenticates to the + // NVIDIA cloud provider via NVIDIA_API_KEY. + "cloud-openclaw": ["NVIDIA_API_KEY"], + "cloud-openclaw-custom-policies": ["NVIDIA_API_KEY"], + "cloud-openclaw-invalid-nvidia-key": ["NVIDIA_API_KEY"], + "cloud-openclaw-gateway-port-conflict": ["NVIDIA_API_KEY"], + // Negative scenario: nemoclaw onboard runs against a docker shim that + // exits non-zero. Onboard never reaches the cloud auth step, but the + // CLI still loads NVIDIA_API_KEY when present — keep it in the secret + // env so behavior matches a real user invocation. + "cloud-openclaw-no-docker": ["NVIDIA_API_KEY"], + "cloud-hermes": ["NVIDIA_API_KEY"], + "cloud-hermes-discord": ["NVIDIA_API_KEY"], + "cloud-hermes-slack": ["NVIDIA_API_KEY"], + // Local profiles do not need any cloud secret. + "local-ollama-openclaw": [], +}; + +function phaseActions(phase: PhaseName, scenario: ScenarioDefinition): PhaseAction[] { if (phase === "environment") { + if (!scenario.environment) { + // Scenarios without any environment dimension (skeleton scenarios) + // legitimately have no actions yet. Don't fail-fast here. 
+ return []; + } + const installId = scenario.environment.install; + if (!installId) { + // Environment is declared but install is missing - that IS a + // malformed scenario; fail fast so the caller sees a clear error + // rather than a phase that silently no-ops setup work. + throw new Error(`Scenario ${scenario.id} is missing environment.install`); + } return [ - `install:${scenario.environment?.install ?? "unknown"}`, - `runtime:${scenario.environment?.runtime ?? "unknown"}`, + { + id: `environment.install.${installId}`, + phase: "environment", + description: `Run e2e_install ${installId} to set up the host control plane.`, + kind: "shell-fn", + scriptRef: INSTALL_DISPATCH, + fn: "e2e_install", + arg: installId, + timeoutSeconds: INSTALL_TIMEOUT_SECONDS, + evidencePath: `.e2e/actions/environment.install.${installId}.log`, + }, ]; } if (phase === "onboarding") { - return [`onboard:${scenario.environment?.onboarding ?? "unknown"}`]; + if (!scenario.environment) { + return []; + } + const baseOnboardingId = scenario.environment.onboarding; + if (!baseOnboardingId) { + throw new Error(`Scenario ${scenario.id} is missing environment.onboarding`); + } + // Negative-runtime scenarios route to a dedicated onboarding profile + // that sets up the failure condition (e.g. docker-missing) BEFORE + // invoking `nemoclaw onboard` and captures the resulting output to + // the log file the assertion phase reads. The profile id convention + // is `${baseOnboardingId}-no-docker`. New negative profiles register a worker in + // nemoclaw_scenarios/onboard/dispatch.sh and a secret-env mapping + // above. + const onboardingId = + scenario.environment.runtime === "docker-missing" + ? `${baseOnboardingId}-no-docker` + : baseOnboardingId; + // secretEnv defaults to [] (no parent-env secrets pass through) + // unless the profile is explicitly listed above. Unknown profiles + // get the safest setting and surface the gap loudly the first + // time they actually need a secret to authenticate. 
+ const secretEnv = ONBOARD_PROFILE_SECRET_ENV[onboardingId] ?? []; + return [ + { + id: `onboarding.profile.${onboardingId}`, + phase: "onboarding", + description: `Run e2e_onboard ${onboardingId} to bring the gateway and sandbox online.`, + kind: "shell-fn", + scriptRef: ONBOARD_DISPATCH, + fn: "e2e_onboard", + arg: onboardingId, + timeoutSeconds: ONBOARD_TIMEOUT_SECONDS, + evidencePath: `.e2e/actions/onboarding.profile.${onboardingId}.log`, + // Legacy preflight assertions look for ${E2E_CONTEXT_DIR}/onboard.log; + // publish a stable alias so they keep working without rewiring. + aliasPath: "onboard.log", + secretEnv, + }, + ]; } - return (scenario.suiteIds ?? []).map((suiteId) => `suite:${suiteId}`); + // Runtime phase has no actions; suites are assertion groups. + return []; } const SUT_BOUNDARIES: SutBoundary[] = [ @@ -89,6 +196,41 @@ const SUT_BOUNDARIES: SutBoundary[] = [ { id: "state", client: "StateClient" }, ]; +// Negative scenarios advertise their failure mode against one of these +// user-facing phases. "preflight" is intentionally distinct from the +// internal PhaseName union: scenario manifests speak the user's vocab +// ("preflight failed") and the matcher resolves preflight to the +// onboarding phase orchestrator. See orchestrators/negative-matcher.ts. 
+const EXPECTED_FAILURE_PHASES: readonly ExpectedFailurePhase[] = [ + "environment", + "onboarding", + "runtime", + "preflight", +]; + +function validateExpectedFailure(scenarioId: string, contract: ExpectedFailureContract): void { + if (!EXPECTED_FAILURE_PHASES.includes(contract.phase)) { + throw new Error( + `Scenario ${scenarioId} expectedFailure.phase invalid: ${String(contract.phase)} (allowed: ${EXPECTED_FAILURE_PHASES.join(", ")})`, + ); + } + if (typeof contract.errorClass !== "string" || contract.errorClass.trim().length === 0) { + throw new Error(`Scenario ${scenarioId} expectedFailure.errorClass must be a non-empty string`); + } + if (contract.forbiddenSideEffects !== undefined) { + if (!Array.isArray(contract.forbiddenSideEffects)) { + throw new Error(`Scenario ${scenarioId} expectedFailure.forbiddenSideEffects must be an array`); + } + for (const entry of contract.forbiddenSideEffects) { + if (typeof entry !== "string" || entry.trim().length === 0) { + throw new Error( + `Scenario ${scenarioId} expectedFailure.forbiddenSideEffects entries must be non-empty strings`, + ); + } + } + } +} + export function validateRunPlan(plan: RunPlan): void { if (!plan.scenarioId) { throw new Error("RunPlan missing scenarioId"); @@ -101,6 +243,9 @@ export function validateRunPlan(plan: RunPlan): void { if (plan.sutBoundaries.length === 0) { throw new Error(`RunPlan ${plan.scenarioId} missing SUT boundaries`); } + if (plan.expectedFailure) { + validateExpectedFailure(plan.scenarioId, plan.expectedFailure); + } } export function compileRunPlans(inputs: Array): RunPlan[] { @@ -112,7 +257,7 @@ export function compileRunPlans(inputs: Array): Run const plan: RunPlan = { scenarioId: scenario.id, status: "compiled", - note: "compiled plan-only preview; live execution lands in later phases", + note: "compiled plan; phase orchestrators execute actions then assertions", manifestPath: scenario.manifestPath, manifest, environment: scenario.environment, @@ -182,6 +327,18 @@ export 
function renderPlanText(plans: RunPlan[]): string { } for (const phase of plan.phases) { lines.push(`Phase: ${phase.name}`); + for (const action of phase.actions) { + const policy: string[] = []; + if (action.timeoutSeconds) { + policy.push(`timeout=${action.timeoutSeconds}s`); + } + const target = action.kind === "shell-fn" + ? `${action.fn ?? ""}${action.arg ? ` ${action.arg}` : ""}`.trim() + : action.scriptRef; + const policySuffix = policy.length > 0 ? ` (${policy.join(", ")})` : ""; + const targetSuffix = target ? ` -> ${target}` : ""; + lines.push(` Action: ${action.id}${policySuffix}${targetSuffix}`); + } for (const group of phase.assertionGroups) { lines.push(` Group: ${group.id}`); for (const step of group.steps) { diff --git a/test/e2e-scenario/scenarios/orchestrators/context.ts b/test/e2e-scenario/scenarios/orchestrators/context.ts new file mode 100644 index 0000000000..35394121fc --- /dev/null +++ b/test/e2e-scenario/scenarios/orchestrators/context.ts @@ -0,0 +1,108 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import fs from "node:fs"; +import path from "node:path"; +import type { RunContext, RunPlan } from "../types.ts"; + +// Spec ownership: emitting the normalized context.env that downstream +// shell helpers consume is FRAMEWORK INFRASTRUCTURE, not a phase action. +// Doing it as a shell action coupled the typed runner back to the old +// resolver's plan.json shape; doing it here keeps the typed RunPlan as +// the single source of truth. +// +// We seed context.env with values derivable from the typed RunPlan +// (scenario id, install method, agent/provider/route, default sandbox +// name and gateway URL). Onboarding helpers may overwrite these via +// e2e_context_set (e.g. assigning a real sandbox name, real gateway +// URL after the gateway boots). 
+ +function platformOsFromManifest(plan: RunPlan): string { + const explicit = plan.manifest?.spec.setup.platform.os; + if (typeof explicit === "string" && explicit.length > 0) { + return explicit; + } + // Fall back to the scenario environment platform id ("ubuntu-local", + // "macos-local", "wsl-local", "gpu-runner", "brev-launchable"). + const platform = plan.environment?.platform ?? ""; + if (platform.startsWith("macos")) return "macos"; + if (platform.startsWith("wsl")) return "wsl"; + if (platform.startsWith("brev")) return "ubuntu"; + if (platform.startsWith("gpu")) return "ubuntu"; + return "ubuntu"; +} + +function executionTargetFromManifest(plan: RunPlan): string { + const explicit = plan.manifest?.spec.setup.platform.executionTarget; + if (typeof explicit === "string" && explicit.length > 0) { + return explicit; + } + return plan.environment?.platform === "brev-launchable" ? "remote" : "local"; +} + +function containerEngine(plan: RunPlan): string { + const explicit = plan.manifest?.spec.setup.runtime.containerEngine; + return typeof explicit === "string" && explicit.length > 0 ? explicit : "docker"; +} + +function containerDaemon(plan: RunPlan): string { + const explicit = plan.manifest?.spec.setup.runtime.containerDaemon; + if (typeof explicit === "string" && explicit.length > 0) { + return explicit; + } + return plan.environment?.runtime === "docker-missing" ? "missing" : "running"; +} + +function defaultGatewayUrl(agent: string): string { + // Mirrors the historical defaults from emit-context-from-plan.sh so + // existing shell helpers see the same seed values they used to. + return agent === "hermes" ? "http://127.0.0.1:8642" : "http://127.0.0.1:18789"; +} + +function escapeContextValue(value: string): string { + // The context library accepts plain `KEY=value` lines without quoting. + // Reject newlines (would corrupt the file) and otherwise pass through. 
+ if (value.includes("\n")) { + throw new Error(`context.env value for must not contain newline: ${JSON.stringify(value)}`); + } + return value; +} + +export interface ContextSeedResult { + path: string; + keys: string[]; +} + +export function seedContextEnv(ctx: RunContext, plan: RunPlan): ContextSeedResult { + const onboarding = plan.manifest?.spec.onboarding; + const agent = onboarding?.agent ?? "openclaw"; + const provider = onboarding?.provider ?? "nvidia"; + const inferenceRoute = onboarding?.modelRoute ?? "inference-local"; + const onboardingPath = plan.environment?.onboarding ?? "unknown"; + const installMethod = plan.environment?.install ?? "unknown"; + + const entries: Record = { + E2E_SCENARIO: plan.scenarioId, + E2E_PLATFORM_OS: platformOsFromManifest(plan), + E2E_EXECUTION_TARGET: executionTargetFromManifest(plan), + E2E_INSTALL_METHOD: installMethod, + E2E_CONTAINER_ENGINE: containerEngine(plan), + E2E_CONTAINER_DAEMON: containerDaemon(plan), + E2E_ONBOARDING_PATH: onboardingPath, + E2E_AGENT: agent, + E2E_PROVIDER: provider, + E2E_INFERENCE_ROUTE: inferenceRoute, + E2E_SANDBOX_NAME: `e2e-${plan.scenarioId}`, + E2E_GATEWAY_URL: defaultGatewayUrl(agent), + }; + + // Path matches the shell helper's e2e_context_init: ${E2E_CONTEXT_DIR}/context.env + const contextPath = path.join(ctx.contextDir, "context.env"); + fs.mkdirSync(ctx.contextDir, { recursive: true }); + const lines = Object.entries(entries) + .map(([key, value]) => `${key}=${escapeContextValue(value)}`) + .join("\n"); + fs.writeFileSync(contextPath, `${lines}\n`); + + return { path: contextPath, keys: Object.keys(entries) }; +} diff --git a/test/e2e-scenario/scenarios/orchestrators/phase.ts b/test/e2e-scenario/scenarios/orchestrators/phase.ts index ae59a58e62..de952b23fc 100644 --- a/test/e2e-scenario/scenarios/orchestrators/phase.ts +++ b/test/e2e-scenario/scenarios/orchestrators/phase.ts @@ -1,31 +1,49 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. // SPDX-License-Identifier: Apache-2.0 +import { spawn } from "node:child_process"; import fs from "node:fs"; import path from "node:path"; +import { fileURLToPath } from "node:url"; import type { AssertionResult, AssertionStep, + PhaseAction, + PhaseActionResult, PhaseName, PhaseResult, RunContext, RunPlanPhase, TransientClassifier, } from "../types.ts"; +import { buildChildEnv, pipeRedacted, redactString } from "./redaction.ts"; + +const REPO_ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../.."); +const DEFAULT_STEP_TIMEOUT_SECONDS = 300; interface StepAttemptOutcome { - status: "passed" | "failed"; + status: "passed" | "failed" | "skipped"; classifier?: TransientClassifier; message?: string; + evidence?: string; } -function transientForRef(ref: string): TransientClassifier { - if (ref.includes("provider") || ref.includes("transient")) { - return "provider-transient"; +// Heuristic transient classifier for shell step refs that don't print +// their own classifier hint. Phase orchestrators own classification; +// clients/scripts do not. +function classifierForRef(ref: string): TransientClassifier { + if (/provider|inference|chat-completion|cloudflared|tunnel/i.test(ref)) { + // Use case-insensitive matching here too; the outer guard is /i, so + // mixed-case refs (Tunnel, Cloudflared) must still classify as + // external-tunnel rather than fall through to provider-transient. + return /tunnel|cloudflared/i.test(ref) ? 
"external-tunnel" : "provider-transient"; } - if (ref.includes("gateway")) { + if (/gateway/i.test(ref)) { return "gateway-transient"; } + if (/event-capture|tui|chat-events/i.test(ref)) { + return "empty-event-capture"; + } return "runner-infra"; } @@ -33,35 +51,208 @@ export class PhaseOrchestrator { constructor(private readonly phaseName: PhaseName) {} async run(ctx: RunContext, phase: RunPlanPhase): Promise { + const actions: PhaseActionResult[] = []; + let actionFailed = false; + for (const action of phase.actions) { + const actionResult = await this.runAction(ctx, action); + actions.push(actionResult); + if (actionResult.status === "failed") { + actionFailed = true; + // Spec failure-layer rule: setup failure must not let assertions + // run and accidentally pass. Stop the phase here. + break; + } + } const assertions: AssertionResult[] = []; - for (const group of phase.assertionGroups) { - for (const step of group.steps) { - assertions.push(await this.runStep(ctx, step)); + if (!actionFailed) { + for (const group of phase.assertionGroups) { + for (const step of group.steps) { + assertions.push(await this.runStep(ctx, step)); + } } } - const status = assertions.some((assertion) => assertion.status === "failed") ? 
"failed" : "passed"; - const result: PhaseResult = { phase: this.phaseName, status, assertions }; + const assertionsFailed = assertions.some((assertion) => assertion.status === "failed"); + const allSkipped = + !actionFailed && + assertions.length > 0 && + assertions.every((assertion) => assertion.status === "skipped"); + let status: PhaseResult["status"]; + if (actionFailed || assertionsFailed) { + status = "failed"; + } else if (allSkipped || (actions.length === 0 && assertions.length === 0)) { + status = "skipped"; + } else { + status = "passed"; + } + const result: PhaseResult = { phase: this.phaseName, status, actions, assertions }; this.writePhaseResult(ctx, result); return result; } + private async runAction(ctx: RunContext, action: PhaseAction): Promise { + const startedAt = Date.now(); + const scriptPath = path.isAbsolute(action.scriptRef) + ? action.scriptRef + : path.resolve(REPO_ROOT, action.scriptRef); + if (!fs.existsSync(scriptPath)) { + return { + id: action.id, + status: "failed", + durationMs: Date.now() - startedAt, + message: `phase action ${action.id} script not found: ${scriptPath}`, + }; + } + const timeoutSeconds = action.timeoutSeconds ?? DEFAULT_STEP_TIMEOUT_SECONDS; + const logDir = path.join(ctx.contextDir, ".e2e", "actions"); + fs.mkdirSync(logDir, { recursive: true }); + const logPath = path.join(logDir, `${action.id}.log`); + + // Compose the bash invocation. shell-fn sources the dispatcher and + // calls the named function with its single positional arg; shell + // executes the script directly. We always go through bash -lc so + // sourced shell helpers see a normal interactive-style env. + const dispatchAction = path.join(REPO_ROOT, "test/e2e-scenario/nemoclaw_scenarios/dispatch-action.sh"); + const useDispatchLauncher = action.kind === "shell-fn" && fs.existsSync(dispatchAction); + const bashArgs: string[] = useDispatchLauncher + ? [dispatchAction, action.fn ?? "", action.arg ?? "", scriptPath] + : [scriptPath, ...(action.arg ? 
[action.arg] : [])]; + + // Framework-owned secret hygiene at the spawn boundary. The child + // gets a minimal allowlisted env plus only the secrets this action + // explicitly declared via PhaseAction.secretEnv. See + // orchestrators/redaction.ts for the full contract. + const env = buildChildEnv(process.env, { + secretEnv: action.secretEnv, + frameworkOverlay: { + E2E_CONTEXT_DIR: ctx.contextDir, + E2E_PHASE: action.phase, + E2E_ACTION_ID: action.id, + }, + }); + + return await new Promise((resolve) => { + const child = spawn("bash", bashArgs, { env, cwd: REPO_ROOT, detached: true }); + const pgid = child.pid; + const logStream = fs.createWriteStream(logPath); + let stderrTail = ""; + // Every byte from the child passes through redactString before + // hitting the evidence log or the stderr tail; raw output never + // touches disk or PhaseActionResult.message. + pipeRedacted(child.stdout, logStream); + pipeRedacted(child.stderr, logStream, (redactedChunk) => { + stderrTail = (stderrTail + redactedChunk).slice(-4096); + }); + + const killGroup = (signal: NodeJS.Signals) => { + if (typeof pgid !== "number") { + child.kill(signal); + return; + } + try { + process.kill(-pgid, signal); + } catch { + /* group already gone */ + } + }; + + let timedOut = false; + const timeout = setTimeout(() => { + timedOut = true; + killGroup("SIGTERM"); + setTimeout(() => { + if (!child.killed) { + killGroup("SIGKILL"); + } + }, 5_000).unref(); + }, timeoutSeconds * 1_000); + + const finishLog = (): Promise => + new Promise((res) => { + if ((logStream as unknown as { closed?: boolean }).closed) { + res(); + return; + } + logStream.once("finish", () => res()); + logStream.once("error", () => res()); + logStream.end(); + }); + + child.on("error", (err) => { + clearTimeout(timeout); + void finishLog().then(() => + resolve({ + id: action.id, + status: "failed", + durationMs: Date.now() - startedAt, + evidence: logPath, + message: redactString(`phase action ${action.id} spawn error: 
${err.message}`), + }), + ); + }); + + child.on("close", (code, signal) => { + clearTimeout(timeout); + void finishLog().then(() => { + const durationMs = Date.now() - startedAt; + if (timedOut) { + resolve({ + id: action.id, + status: "failed", + durationMs, + evidence: logPath, + message: `phase action ${action.id} exceeded ${timeoutSeconds}s (signal=${signal ?? "SIGTERM"})`, + }); + return; + } + if (code === 0) { + // Publish the action's evidence log under a stable alias for + // legacy assertions that reference fixed filenames + // (onboard.log, install.log, ...). Best-effort; alias copy + // failures do not fail the action. + if (action.aliasPath) { + try { + const aliasFull = path.isAbsolute(action.aliasPath) + ? action.aliasPath + : path.join(ctx.contextDir, action.aliasPath); + fs.mkdirSync(path.dirname(aliasFull), { recursive: true }); + fs.copyFileSync(logPath, aliasFull); + } catch { + /* alias is a convenience; never fail action on copy */ + } + } + resolve({ id: action.id, status: "passed", durationMs, evidence: logPath }); + return; + } + resolve({ + id: action.id, + status: "failed", + durationMs, + evidence: logPath, + message: `phase action ${action.id} exit ${code ?? "null"}: ${stderrTail.split("\n").slice(-3).join(" | ").trim()}`, + }); + }); + }); + }); + } + private async runStep(ctx: RunContext, step: AssertionStep): Promise { const startedAt = Date.now(); const rawAttempts = step.reliability?.retry?.attempts; - const maxAttempts = typeof rawAttempts === "number" && Number.isFinite(rawAttempts) ? Math.max(1, Math.floor(rawAttempts)) : 1; + const maxAttempts = + typeof rawAttempts === "number" && Number.isFinite(rawAttempts) ? 
Math.max(1, Math.floor(rawAttempts)) : 1; let attempts = 0; let lastOutcome: StepAttemptOutcome = { status: "failed", message: "step did not run" }; for (let attempt = 1; attempt <= maxAttempts; attempt += 1) { attempts = attempt; lastOutcome = await this.executeStep(ctx, step, attempt); - if (lastOutcome.status === "passed") { + if (lastOutcome.status === "passed" || lastOutcome.status === "skipped") { return { id: step.id, - status: "passed", + status: lastOutcome.status, attempts, durationMs: Date.now() - startedAt, classifier: attempt > 1 ? step.reliability?.retry?.on[0] : lastOutcome.classifier, - evidence: step.evidencePath, + evidence: lastOutcome.evidence ?? step.evidencePath, message: lastOutcome.message, }; } @@ -75,7 +266,7 @@ export class PhaseOrchestrator { attempts, durationMs: Date.now() - startedAt, classifier: lastOutcome.classifier, - evidence: step.evidencePath, + evidence: lastOutcome.evidence ?? step.evidencePath, message: lastOutcome.message, }; } @@ -92,26 +283,192 @@ export class PhaseOrchestrator { return step.reliability?.retry?.on.includes(classifier) ?? false; } - private async executeStep(_ctx: RunContext, step: AssertionStep, attempt: number): Promise { - const ref = step.implementation?.ref ?? ""; - if (ref === "fake-pass" || ref === "phase-1-skeleton") { - return { status: "passed" }; + private async executeStep(ctx: RunContext, step: AssertionStep, _attempt: number): Promise { + const kind = step.implementation?.kind; + if (kind === "shell") { + return this.runShellStep(ctx, step); } - if (ref === "fake-retry-once-pass") { - return attempt === 1 - ? { status: "failed", classifier: step.reliability?.retry?.on[0] ?? "gateway-transient" } - : { status: "passed" }; + if (kind === "probe") { + // Probe registry lands in a follow-up PR. Until then, probes + // surface as visibly skipped — never as fake green. 
For + // security-sensitive or otherwise required probes, the run + // must NOT pass on this gap; the typed registry marks those + // with `required: true` and we reclassify the skip as a + // failure so the phase result fails closed. + const ref = step.implementation?.ref ?? ""; + if (step.required) { + return { + status: "failed", + classifier: "runner-infra", + message: `required probe not registered: ${ref} (step ${step.id})`, + }; + } + return { status: "skipped", message: `probe not registered: ${ref}` }; } - if (ref === "fake-always-transient") { - return { status: "failed", classifier: step.reliability?.retry?.on[0] ?? transientForRef(ref) }; + if (kind === "pending") { + // pending steps surface as skipped with the placeholder ref so + // gaps are visible in plan output and phase results. Required + // pending steps (e.g. expected-failure side-effect validators + // for negative scenarios) fail closed instead — the run cannot + // honestly pass while the contract is unimplemented. + const ref = step.implementation?.ref ?? ""; + if (step.required) { + return { + status: "failed", + classifier: "runner-infra", + message: `required pending step not implemented: ${ref} (step ${step.id})`, + }; + } + return { status: "skipped", message: `pending: ${ref}` }; + } + throw new Error(`Unknown assertion step kind for ${step.id}: ${String(kind)}`); + } + + private async runShellStep(ctx: RunContext, step: AssertionStep): Promise { + const ref = step.implementation?.ref; + if (!ref) { + return { status: "failed", message: `shell step ${step.id} missing implementation.ref` }; } - if (step.implementation?.kind === "shell" && _ctx.dryRun) { - return { status: "passed", message: `dry-run shell ${ref}` }; + const scriptPath = path.isAbsolute(ref) ? 
ref : path.resolve(REPO_ROOT, ref); + if (!fs.existsSync(scriptPath)) { + return { status: "failed", message: `shell step ${step.id} script not found: ${scriptPath}` }; } - if (step.implementation?.kind === "probe" && _ctx.dryRun) { - return { status: "passed", message: `dry-run probe ${ref}` }; + + const timeoutSeconds = step.reliability?.timeoutSeconds ?? DEFAULT_STEP_TIMEOUT_SECONDS; + const logDir = path.join(ctx.contextDir, ".e2e", "logs"); + fs.mkdirSync(logDir, { recursive: true }); + const logPath = path.join(logDir, `${step.id}.log`); + + // Framework-owned secret hygiene at the spawn boundary (mirrors + // runAction). The shell step's child gets only the framework + // allowlist + scenario context.env keys + step.secretEnv + // declarations. See orchestrators/redaction.ts. + const env = buildChildEnv(process.env, { + secretEnv: step.secretEnv, + frameworkOverlay: { + E2E_CONTEXT_DIR: ctx.contextDir, + E2E_STEP_ID: step.id, + E2E_PHASE: step.phase, + }, + }); + // Surface scenario-derived context (E2E_SCENARIO, E2E_SANDBOX_NAME, + // E2E_GATEWAY_URL, etc.) that the framework wrote at the start of the + // run and that environment+onboarding phases extended via + // e2e_context_set. The shell context library writes to + // ${E2E_CONTEXT_DIR}/context.env, NOT to ${E2E_CONTEXT_DIR}/.e2e/. 
+ const contextEnvPath = path.join(ctx.contextDir, "context.env"); + if (fs.existsSync(contextEnvPath)) { + const contextEnv = fs.readFileSync(contextEnvPath, "utf8"); + for (const line of contextEnv.split("\n")) { + const trimmed = line.trim(); + if (!trimmed || trimmed.startsWith("#")) { + continue; + } + const eq = trimmed.indexOf("="); + if (eq <= 0) { + continue; + } + const key = trimmed.slice(0, eq); + let value = trimmed.slice(eq + 1); + if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) { + value = value.slice(1, -1); + } + env[key] = value; + } } - return { status: "failed", message: `unsupported live step ${step.id}` }; + + return await new Promise((resolve) => { + // detached: true puts the child (and any of its children, e.g. a `sleep` + // spawned by bash) into its own process group. We send signals to the + // negative pid so the whole group dies on timeout. Without this, bash + // ignores SIGTERM until its current foreground command (e.g. sleep) + // returns, and timeouts effectively don't work. + const child = spawn("bash", [scriptPath], { env, cwd: REPO_ROOT, detached: true }); + const pgid = child.pid; + const logStream = fs.createWriteStream(logPath); + let stderrTail = ""; + // Redact at the I/O boundary; raw bytes from the child must not + // reach the evidence log or the stderr tail that flows into + // step result.message. 
+ pipeRedacted(child.stdout, logStream); + pipeRedacted(child.stderr, logStream, (redactedChunk) => { + stderrTail = (stderrTail + redactedChunk).slice(-4096); + }); + + const killGroup = (signal: NodeJS.Signals) => { + if (typeof pgid !== "number") { + child.kill(signal); + return; + } + try { + process.kill(-pgid, signal); + } catch { + /* group already gone */ + } + }; + + let timedOut = false; + const timeout = setTimeout(() => { + timedOut = true; + killGroup("SIGTERM"); + setTimeout(() => { + if (!child.killed) { + killGroup("SIGKILL"); + } + }, 5_000).unref(); + }, timeoutSeconds * 1_000); + + // Wait for the log writeStream to fully flush before resolving so + // callers can synchronously read the evidence file. Without this, the + // 'close' event on the child fires before the WriteStream finishes + // draining, and tests/orchestrators see an empty log file. + const finishLog = (): Promise => + new Promise((res) => { + if ((logStream as unknown as { closed?: boolean }).closed) { + res(); + return; + } + logStream.once("finish", () => res()); + logStream.once("error", () => res()); + logStream.end(); + }); + + child.on("error", (err) => { + clearTimeout(timeout); + void finishLog().then(() => + resolve({ + status: "failed", + message: redactString(`shell step ${step.id} spawn error: ${err.message}`), + evidence: logPath, + }), + ); + }); + + child.on("close", (code, signal) => { + clearTimeout(timeout); + void finishLog().then(() => { + if (timedOut) { + resolve({ + status: "failed", + classifier: "runner-infra", + message: `shell step ${step.id} exceeded ${timeoutSeconds}s (signal=${signal ?? "SIGTERM"})`, + evidence: logPath, + }); + return; + } + if (code === 0) { + resolve({ status: "passed", evidence: logPath }); + return; + } + resolve({ + status: "failed", + classifier: classifierForRef(ref), + message: `shell step ${step.id} exit ${code ?? 
"null"}: ${stderrTail.split("\n").slice(-3).join(" | ").trim()}`, + evidence: logPath, + }); + }); + }); + }); } private writePhaseResult(ctx: RunContext, result: PhaseResult) { diff --git a/test/e2e-scenario/scenarios/orchestrators/redaction.ts b/test/e2e-scenario/scenarios/orchestrators/redaction.ts new file mode 100644 index 0000000000..745ec61126 --- /dev/null +++ b/test/e2e-scenario/scenarios/orchestrators/redaction.ts @@ -0,0 +1,212 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Framework-owned secret hygiene at the spawn boundary. + * + * Spec ownership: redaction and child-env minimization are FRAMEWORK + * INFRASTRUCTURE, not a per-action / per-script / per-workflow concern. + * Children spawned by PhaseOrchestrator must (a) receive a minimal, + * typed env (framework allowlist + per-action declared `secretEnv` + * passthrough only), and (b) have their stdout/stderr passed through + * redaction before any byte reaches an evidence log or + * PhaseResult.message. There is no opt-out flag, no env switch, no + * helper that bypasses this. One execution mode, secrets always + * redacted in evidence — same one-mode discipline that motivates the + * rest of this PR. + * + * Pattern source-of-truth: src/lib/security/secret-patterns.ts. We + * import the canonical regex sets and apply them here so framework + * redaction stays in lockstep with product-runtime redaction without + * coupling the framework to product runtime modules. + * + * Bash side: test/e2e-scenario/runtime/lib/context.sh::e2e_context_dump + * already redacts on dump via _e2e_context_is_sensitive_key. Bash + * helpers must continue to use that for diagnostic dumps; this module + * only covers the TS-spawned-child I/O path. 
+ * + * Tests: + * test/e2e-scenario/framework-tests/e2e-phase-orchestrators.test.ts + * - test_should_not_persist_secret_shaped_child_output_into_evidence + * - test_should_drop_non_allowlisted_parent_env_unless_declared_in_secretEnv + * - test_should_pass_declared_secretEnv_through_to_child + */ + +import type { Readable, Writable } from "node:stream"; + +const REDACTED = ""; + +// Framework-local mirror of src/lib/security/secret-patterns.ts. The +// framework deliberately does not import from src/lib/security/ so it +// stays decoupled from product runtime modules and the cross-tsconfig +// boundary. A parity test +// (test/e2e-scenario/framework-tests/e2e-redaction-parity.test.ts) +// asserts these regex sources stay in lockstep with the canonical +// product source so adding a token shape there keeps both layers +// honest at once. +const TOKEN_PREFIX_PATTERNS: RegExp[] = [ + /nvapi-[A-Za-z0-9_-]{10,}/g, + /nvcf-[A-Za-z0-9_-]{10,}/g, + /ghp_[A-Za-z0-9_-]{10,}/g, + /(?:github_pat_)[A-Za-z0-9_]{30,}/g, + /sk-proj-[A-Za-z0-9_-]{10,}/g, + /sk-ant-[A-Za-z0-9_-]{10,}/g, + /sk-[A-Za-z0-9_-]{20,}/g, + /(?:xox[bpas]|xapp)-[A-Za-z0-9-]{10,}/g, + /A(?:K|S)IA[A-Z0-9]{16}/g, + /hf_[A-Za-z0-9]{10,}/g, + /glpat-[A-Za-z0-9_-]{10,}/g, + /gsk_[A-Za-z0-9]{10,}/g, + /pypi-[A-Za-z0-9_-]{10,}/g, + /\bbot\d{8,10}:[A-Za-z0-9_-]{35}\b/g, + /\b\d{8,10}:[A-Za-z0-9_-]{35}\b/g, + /\b[A-Za-z0-9]{24}\.[A-Za-z0-9_-]{6}\.[A-Za-z0-9_-]{27,}\b/g, +]; + +const CONTEXT_PATTERNS: RegExp[] = [ + /(?<=Bearer\s+)[A-Za-z0-9_.+/=-]{10,}/gi, + /(?<=(?:_KEY|API_KEY|SECRET|TOKEN|PASSWORD|CREDENTIAL)[=: ]['"]?)[A-Za-z0-9_.+/=-]{10,}/gi, +]; + +/** + * Replace every secret-shaped token in `text` with ``. Uses + * the canonical TOKEN_PREFIX_PATTERNS + CONTEXT_PATTERNS sets. + * + * Best-effort against unknown token shapes. The actual defense is the + * env allowlist (buildChildEnv); pattern redaction catches what slips + * through (e.g. error messages that echo a secret value). 
+ */ +export function redactString(text: string): string { + if (!text) return text; + let out = text; + for (const p of TOKEN_PREFIX_PATTERNS) { + p.lastIndex = 0; + out = out.replace(p, REDACTED); + } + for (const p of CONTEXT_PATTERNS) { + p.lastIndex = 0; + out = out.replace(p, REDACTED); + } + return out; +} + +// Env keys the framework guarantees children may always see. Anything +// outside this set, outside FRAMEWORK_ENV_PREFIXES, and not declared +// in PhaseAction.secretEnv / AssertionStep.secretEnv is dropped before +// the child spawns. +const FRAMEWORK_ENV_ALLOWLIST: ReadonlySet = new Set([ + "PATH", + "HOME", + "SHELL", + "USER", + "LOGNAME", + "LANG", + "LC_ALL", + "LC_CTYPE", + "TZ", + "TERM", + "TMPDIR", + "RUNNER_TEMP", + "RUNNER_OS", + "GITHUB_ACTIONS", + "CI", + "NEMOCLAW_NON_INTERACTIVE", + "NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE", +]); + +const FRAMEWORK_ENV_PREFIXES: readonly string[] = ["E2E_", "NEMOCLAW_LOG_"]; + +// Shape required of any declared secretEnv key — must look like a +// secret-bearing variable. Prevents accidental allowlisting of +// non-secret values via the secretEnv channel and keeps the +// "framework-allowlist vs declared-secret" distinction honest. +const SECRET_ENV_KEY_SHAPE = + /^[A-Z][A-Z0-9_]*(?:API[_]?KEY|TOKEN|SECRET|PASSWORD|CREDENTIAL|PASSPHRASE|PRIVATE[_]?KEY|ACCESS[_]?KEY)$/; + +export function isValidSecretEnvKey(key: string): boolean { + return SECRET_ENV_KEY_SHAPE.test(key); +} + +export interface BuildChildEnvOptions { + /** Per-action / per-step declared secret-bearing env keys to pass through. */ + secretEnv?: readonly string[]; + /** Framework-controlled overlay (E2E_CONTEXT_DIR, E2E_PHASE, E2E_*_ID). */ + frameworkOverlay: NodeJS.ProcessEnv; +} + +/** + * Build the child's env from `base` (typically `process.env`) by + * keeping only: + * 1. keys in FRAMEWORK_ENV_ALLOWLIST + * 2. keys starting with one of FRAMEWORK_ENV_PREFIXES + * 3. 
keys explicitly declared in `opts.secretEnv` (validated shape) + * then layering `opts.frameworkOverlay` on top. + * + * Throws if a `secretEnv` entry doesn't match the secret-key shape; + * better to fail loudly at compile/runtime than silently leak a + * non-secret env var (which would defeat the allowlist purpose). + */ +export function buildChildEnv( + base: NodeJS.ProcessEnv, + opts: BuildChildEnvOptions, +): NodeJS.ProcessEnv { + const out: NodeJS.ProcessEnv = {}; + for (const [key, value] of Object.entries(base)) { + if (value === undefined) continue; + if (FRAMEWORK_ENV_ALLOWLIST.has(key)) { + out[key] = value; + continue; + } + if (FRAMEWORK_ENV_PREFIXES.some((prefix) => key.startsWith(prefix))) { + out[key] = value; + continue; + } + } + for (const key of opts.secretEnv ?? []) { + if (!isValidSecretEnvKey(key)) { + throw new Error( + `secretEnv entry '${key}' does not match the secret-key shape ` + + `(must end with API_KEY, TOKEN, SECRET, PASSWORD, CREDENTIAL, ` + + `PASSPHRASE, PRIVATE_KEY, or ACCESS_KEY). Refusing to allowlist.`, + ); + } + if (base[key] !== undefined) { + out[key] = base[key]; + } + } + Object.assign(out, opts.frameworkOverlay); + return out; +} + +/** + * Pipe `src` into `log`, redacting every chunk on the way through. + * Optional `onChunk` receives the already-redacted text (used by the + * orchestrator to keep a redacted stderr tail for failure messages). + * + * No raw bytes from the child ever reach `log` or the tail callback. + */ +export function pipeRedacted( + src: Readable, + log: Writable, + onChunk?: (redactedChunk: string) => void, +): void { + src.on("data", (chunk: Buffer) => { + const redacted = redactString(chunk.toString("utf8")); + log.write(redacted); + onChunk?.(redacted); + }); +} + +/** + * Compact array of all framework env keys the child sees by default. + * Exported for tests/diagnostics; do not use to bypass the boundary. 
+ */ +export function frameworkEnvAllowlistSnapshot(): { + keys: string[]; + prefixes: string[]; +} { + return { + keys: [...FRAMEWORK_ENV_ALLOWLIST].sort(), + prefixes: [...FRAMEWORK_ENV_PREFIXES], + }; +} diff --git a/test/e2e-scenario/scenarios/orchestrators/runner.ts b/test/e2e-scenario/scenarios/orchestrators/runner.ts index 6ab3b76c62..228d32d452 100644 --- a/test/e2e-scenario/scenarios/orchestrators/runner.ts +++ b/test/e2e-scenario/scenarios/orchestrators/runner.ts @@ -1,7 +1,8 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -import type { PhaseResult, RunContext, RunPlan, RunPlanPhase } from "../types.ts"; +import type { PhaseActionResult, PhaseResult, RunContext, RunPlan, RunPlanPhase } from "../types.ts"; +import { seedContextEnv } from "./context.ts"; import { EnvironmentOrchestrator } from "./environment.ts"; import { OnboardingOrchestrator } from "./onboarding.ts"; import { RuntimeOrchestrator } from "./runtime.ts"; @@ -28,22 +29,65 @@ export class ScenarioRunner { } async run(ctx: RunContext, plan: RunPlan): Promise { + // Seed context.env from the typed RunPlan once, before any phase + // runs. Spec ownership: framework infrastructure (the runner), not + // a shell action. Onboarding may extend context.env via + // e2e_context_set; the runtime phase reads whatever is on disk. + seedContextEnv(ctx, plan); + const results: PhaseResult[] = []; for (const phase of plan.phases) { - if (phase.name === "environment") { - results.push(await this.environment.run(ctx, phase, results)); - continue; - } - if (phase.name === "onboarding") { - results.push(await this.onboarding.run(ctx, phase, results)); + const blocked = blockingPriorResult(results); + if (blocked) { + // Cross-phase short-circuit: the previous phase's setup work + // failed, so this phase cannot meaningfully run. 
Synthesize a + // skipped PhaseResult with a clear reason so artifacts stay + // honest (no false greens, no <1s assertion explosion). + results.push({ + phase: phase.name, + status: "skipped", + actions: [], + assertions: [ + { + id: `${phase.name}.blocked`, + status: "skipped", + attempts: 0, + durationMs: 0, + message: `phase blocked by prior failure: ${blocked.phase} action ${blocked.action.id} failed (${blocked.action.message ?? "no message"})`, + }, + ], + }); continue; } - if (phase.name === "runtime") { - results.push(await this.runtime.run(ctx, phase, results)); - continue; - } - throw new Error(`Unsupported phase: ${String(phase.name)}`); + const orchestrator = this.orchestratorFor(phase.name); + results.push(await orchestrator.run(ctx, phase, results)); } return results; } + + private orchestratorFor(name: RunPlanPhase["name"]): PhaseRunner { + if (name === "environment") return this.environment; + if (name === "onboarding") return this.onboarding; + if (name === "runtime") return this.runtime; + throw new Error(`Unsupported phase: ${String(name)}`); + } +} + +interface BlockingFailure { + phase: PhaseResult["phase"]; + action: PhaseActionResult; +} + +function blockingPriorResult(results: PhaseResult[]): BlockingFailure | undefined { + // A phase action failure (real setup work didn't succeed) blocks + // downstream phases. Assertion failures do NOT block downstream + // phases - they are expected to be reported alongside other phase + // results so reviewers can see all failure layers at once. 
+ for (const result of results) { + const failedAction = result.actions.find((action) => action.status === "failed"); + if (failedAction) { + return { phase: result.phase, action: failedAction }; + } + } + return undefined; } diff --git a/test/e2e-scenario/scenarios/run.ts b/test/e2e-scenario/scenarios/run.ts index e666e07844..2a16c85996 100644 --- a/test/e2e-scenario/scenarios/run.ts +++ b/test/e2e-scenario/scenarios/run.ts @@ -4,33 +4,29 @@ import { compileRunPlans, renderPlanText, writePlanArtifacts } from "./compiler.ts"; import { ScenarioRunner } from "./orchestrators/runner.ts"; import { listScenarios } from "./registry.ts"; +import type { PhaseResult } from "./types.ts"; interface Args { list: boolean; + emitMatrix: boolean; planOnly: boolean; - dryRun: boolean; - validateOnly: boolean; scenarios: string[]; } function parseArgs(argv: string[]): Args { - const args: Args = { list: false, planOnly: false, dryRun: false, validateOnly: false, scenarios: [] }; + const args: Args = { list: false, emitMatrix: false, planOnly: false, scenarios: [] }; for (let i = 0; i < argv.length; i += 1) { const arg = argv[i]; if (arg === "--list") { args.list = true; continue; } - if (arg === "--plan-only") { - args.planOnly = true; + if (arg === "--emit-matrix") { + args.emitMatrix = true; continue; } - if (arg === "--dry-run") { - args.dryRun = true; - continue; - } - if (arg === "--validate-only") { - args.validateOnly = true; + if (arg === "--plan-only") { + args.planOnly = true; continue; } if (arg === "--scenarios") { @@ -54,17 +50,29 @@ function printList() { } } +function emitMatrix() { + // Read-only emission of the typed registry as a GitHub Actions matrix + // payload. Consumed by the dynamic matrix workflow (PR #4359). + const payload = { + include: listScenarios().map((scenario) => ({ + id: scenario.id, + description: scenario.description ?? 
"", + })), + }; + console.log(JSON.stringify(payload)); +} + async function main() { const args = parseArgs(process.argv.slice(2)); if (args.list) { printList(); return; } - - const modeCount = [args.planOnly, args.dryRun, args.validateOnly].filter(Boolean).length; - if (modeCount !== 1) { - throw new Error("Use exactly one of --plan-only, --dry-run, or --validate-only with --scenarios "); + if (args.emitMatrix) { + emitMatrix(); + return; } + if (args.scenarios.length === 0) { throw new Error("scenario execution requires --scenarios "); } @@ -78,12 +86,43 @@ async function main() { writePlanArtifacts(plans, contextDir); console.log(renderPlanText(plans)); - if (args.dryRun) { - const runner = new ScenarioRunner(); - for (const plan of plans) { - await runner.run({ contextDir, dryRun: true }, plan); + if (args.planOnly) { + // Local debug only. Workflows must not pass --plan-only. + return; + } + + const runner = new ScenarioRunner(); + const allResults: PhaseResult[] = []; + let anyFailed = false; + for (const plan of plans) { + const results = await runner.run({ contextDir }, plan); + allResults.push(...results); + if (results.some((result) => result.status === "failed")) { + anyFailed = true; } } + + // Surface a compact run summary so phase results don't have to be opened + // to see what passed. + console.log(""); + console.log("Phase results:"); + for (const result of allResults) { + const counts = result.assertions.reduce( + (acc, assertion) => { + acc[assertion.status] = (acc[assertion.status] ?? 
0) + 1; + return acc; + }, + {} as Record, + ); + const detail = Object.entries(counts) + .map(([status, count]) => `${status}=${count}`) + .join(" "); + console.log(` ${result.phase}: ${result.status} (${detail || "no steps"})`); + } + + if (anyFailed) { + process.exitCode = 1; + } } try { diff --git a/test/e2e-scenario/scenarios/types.ts b/test/e2e-scenario/scenarios/types.ts index b29f8458d6..46201f55a2 100644 --- a/test/e2e-scenario/scenarios/types.ts +++ b/test/e2e-scenario/scenarios/types.ts @@ -66,6 +66,21 @@ export interface AssertionStep { }; evidencePath?: string; reliability?: AssertionStepReliability; + // Declared parent-env keys this step requires beyond the framework's + // allowlist. Anything not allowlisted and not declared here is + // dropped before spawn. See orchestrators/redaction.ts. Each entry + // must match the secret-key shape; the framework rejects non-secret + // names to keep the allowlist-vs-declared-secret boundary honest. + secretEnv?: readonly string[]; + // When true, a probe/pending step that resolves as "skipped" is + // reclassified as "failed" by the phase orchestrator. Required + // steps fail closed when their underlying implementation isn't + // available yet (probe registry not landed, expected-failure + // side-effect validator not implemented, ...) instead of silently + // producing fake green. Defaults to false; set true for security- + // sensitive suites and expected-failure validators that the run + // is not safe without. + required?: boolean; } export interface AssertionGroup { @@ -100,9 +115,53 @@ export interface ScenarioDefinition { expectedFailure?: Record; } +// A phase action is real, deterministic setup work the phase orchestrator +// performs BEFORE running its assertions: install nemoclaw, run +// onboarding, emit context.env, etc. Actions short-circuit assertions on +// failure (assertions don't run if the action they depend on failed). +// +// Spec ownership: phase orchestrators own actions. 
The top-level runner +// must not execute actions; clients must not embed action policy. +export interface PhaseAction { + id: string; + phase: PhaseName; + description?: string; + // "shell-fn" sources the bash dispatcher and invokes the named function. + // "shell" runs an executable script (used for context-emit helper). + kind: "shell-fn" | "shell"; + // Repo-relative path to the script. + scriptRef: string; + // For "shell-fn": the bash function to invoke after sourcing scriptRef. + fn?: string; + // Single positional arg passed to the function/script (install method or + // onboarding profile id today). Kept as a single string to keep stable + // ids predictable; multi-arg variants can extend this later. + arg?: string; + // Per-action timeout. No retry by default - install/onboard must fail + // loudly so the regression is visible. Retry stays a property of + // assertion steps, not actions. + timeoutSeconds?: number; + // Repo-relative evidence log path. + evidencePath?: string; + // Optional stable alias the orchestrator copies the evidence log to + // after a successful action. Lets legacy shell assertions that + // reference well-known filenames (e.g. ${E2E_CONTEXT_DIR}/onboard.log) + // keep working without coupling them to the action's stable id. + aliasPath?: string; + // Declared parent-env keys this action requires beyond the + // framework's allowlist (PATH, HOME, E2E_*, NEMOCLAW_*, ...). + // Anything not allowlisted and not declared here is dropped before + // spawn. See orchestrators/redaction.ts. Each entry must match the + // secret-key shape; the framework rejects non-secret names so the + // allowlist-vs-declared-secret boundary stays honest. Cloud install + // declares ["NVIDIA_API_KEY"]; slack onboarding declares the slack + // tokens it actually needs; etc. 
+ secretEnv?: readonly string[]; +} + export interface RunPlanPhase { name: PhaseName; - actions: string[]; + actions: PhaseAction[]; assertionGroups: AssertionGroup[]; } @@ -126,7 +185,6 @@ export interface RunPlan { export interface RunContext { contextDir: string; - dryRun: boolean; } export interface AssertionResult { @@ -139,8 +197,20 @@ export interface AssertionResult { message?: string; } +export interface PhaseActionResult { + id: string; + status: "passed" | "failed" | "skipped"; + durationMs: number; + evidence?: string; + message?: string; +} + export interface PhaseResult { phase: PhaseName; status: "passed" | "failed" | "skipped"; + // Action results are recorded distinctly from assertion results so + // failure-layer attribution stays unambiguous: a failure in actions + // means setup never completed; assertions did not have a fair chance. + actions: PhaseActionResult[]; assertions: AssertionResult[]; } diff --git a/test/e2e-scenario/validation_suites/assert/gateway-alive.sh b/test/e2e-scenario/validation_suites/assert/gateway-alive.sh index a498602d35..42f33e1c50 100755 --- a/test/e2e-scenario/validation_suites/assert/gateway-alive.sh +++ b/test/e2e-scenario/validation_suites/assert/gateway-alive.sh @@ -9,6 +9,8 @@ _E2E_GW_LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../runtime/lib" && pwd) . "${_E2E_GW_LIB_DIR}/env.sh" # shellcheck source=../../runtime/lib/context.sh . "${_E2E_GW_LIB_DIR}/context.sh" +# shellcheck source=../sandbox-exec.sh +. "$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)/sandbox-exec.sh" # e2e_gateway_assert_healthy [url] # Defaults to E2E_GATEWAY_URL from context; returns non-zero with a clear @@ -23,10 +25,6 @@ e2e_gateway_assert_healthy() { return 2 fi e2e_env_trace "gateway:check" "${url}" - if e2e_env_is_dry_run; then - echo "[dry-run] gateway check ${url} (skipped)" - return 0 - fi # Prefer /health if available, otherwise just hit the base URL. 
local http_code http_code="$(curl -fsS -o /dev/null -w '%{http_code}' --max-time 5 "${url%/}/health" 2>/dev/null || echo 000)" @@ -41,7 +39,9 @@ e2e_gateway_assert_healthy() { local sandbox_name sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" if [[ -n "${sandbox_name}" ]] && command -v openshell >/dev/null 2>&1; then - http_code="$(openshell sandbox exec -n "${sandbox_name}" -- curl -fsS -o /dev/null -w '%{http_code}' --max-time 5 http://localhost:18789/health 2>/dev/null || echo 000)" + # Wrapper applies a per-call timeout so a wedged ssh handshake here + # cannot consume the orchestrator's whole step budget. + http_code="$(E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=15 e2e_sandbox_exec "${sandbox_name}" -- curl -fsS -o /dev/null -w '%{http_code}' --max-time 5 http://localhost:18789/health 2>/dev/null || echo 000)" if [[ "${http_code}" == "200" || "${http_code}" == "401" ]]; then return 0 fi diff --git a/test/e2e-scenario/validation_suites/assert/sandbox-alive.sh b/test/e2e-scenario/validation_suites/assert/sandbox-alive.sh index b85ef9cd60..473061e972 100755 --- a/test/e2e-scenario/validation_suites/assert/sandbox-alive.sh +++ b/test/e2e-scenario/validation_suites/assert/sandbox-alive.sh @@ -12,7 +12,6 @@ _E2E_SB_LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../runtime/lib" && pwd) # e2e_sandbox_assert_running # Requires E2E_SANDBOX_NAME in context. Real implementation queries -# `nemoclaw list`; honors E2E_DRY_RUN. e2e_sandbox_assert_running() { if ! e2e_context_require E2E_SANDBOX_NAME; then return 1 @@ -20,10 +19,6 @@ e2e_sandbox_assert_running() { local name name="$(e2e_context_get E2E_SANDBOX_NAME)" e2e_env_trace "sandbox:check" "${name}" - if e2e_env_is_dry_run; then - echo "[dry-run] sandbox check ${name} (skipped)" - return 0 - fi if ! 
command -v nemoclaw >/dev/null 2>&1; then echo "e2e_sandbox_assert_running: nemoclaw CLI not on PATH" >&2 return 1 diff --git a/test/e2e-scenario/validation_suites/hermes/00-hermes-health.sh b/test/e2e-scenario/validation_suites/hermes/00-hermes-health.sh index 0fff0fd9ab..4b8161aea4 100755 --- a/test/e2e-scenario/validation_suites/hermes/00-hermes-health.sh +++ b/test/e2e-scenario/validation_suites/hermes/00-hermes-health.sh @@ -16,10 +16,6 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../runtime/lib" && pwd)" echo "hermes-specific:hermes-health" e2e_context_require E2E_AGENT -if e2e_env_is_dry_run; then - echo "[dry-run] would run Hermes health checks" - exit 0 -fi agent="$(e2e_context_get E2E_AGENT)" if [[ "${agent}" != "hermes" ]]; then echo "hermes-specific: E2E_AGENT should be 'hermes', got '${agent}'" >&2 diff --git a/test/e2e-scenario/validation_suites/inference/cloud/00-models-health.sh b/test/e2e-scenario/validation_suites/inference/cloud/00-models-health.sh index 64e1b086fc..8277f05f38 100755 --- a/test/e2e-scenario/validation_suites/inference/cloud/00-models-health.sh +++ b/test/e2e-scenario/validation_suites/inference/cloud/00-models-health.sh @@ -13,17 +13,16 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" . "${LIB_DIR}/env.sh" # shellcheck source=../../../runtime/lib/context.sh . "${LIB_DIR}/context.sh" +# shellcheck source=../../sandbox-exec.sh +. "${SCRIPT_DIR}/../../sandbox-exec.sh" echo "inference:models-health" e2e_context_require E2E_SANDBOX_NAME -if e2e_env_is_dry_run; then - echo "[dry-run] would GET inference.local/v1/models from inside the sandbox" - exit 0 -fi - name="$(e2e_context_get E2E_SANDBOX_NAME)" -body="$(openshell sandbox exec --name "${name}" -- curl -fsS --max-time 30 "https://inference.local/v1/models")" +# Orchestrator step cap is 30s; wrapper default 25s applies. Inner curl +# --max-time keeps a hung HTTP read from consuming the whole budget. 
+body="$(e2e_sandbox_exec "${name}" -- curl -fsS --max-time 20 "https://inference.local/v1/models")" if [[ -z "${body}" ]]; then echo "inference:models-health: no response from models endpoint" >&2 exit 1 diff --git a/test/e2e-scenario/validation_suites/inference/cloud/01-chat-completion.sh b/test/e2e-scenario/validation_suites/inference/cloud/01-chat-completion.sh index f54ff8806b..20f481504e 100755 --- a/test/e2e-scenario/validation_suites/inference/cloud/01-chat-completion.sh +++ b/test/e2e-scenario/validation_suites/inference/cloud/01-chat-completion.sh @@ -12,18 +12,20 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" . "${LIB_DIR}/env.sh" # shellcheck source=../../../runtime/lib/context.sh . "${LIB_DIR}/context.sh" +# shellcheck source=../../sandbox-exec.sh +. "${SCRIPT_DIR}/../../sandbox-exec.sh" echo "inference:chat-completion" e2e_context_require E2E_SANDBOX_NAME -if e2e_env_is_dry_run; then - echo "[dry-run] would POST a chat completion to inference.local from inside the sandbox" - exit 0 -fi - name="$(e2e_context_get E2E_SANDBOX_NAME)" payload='{"model":"nvidia/nemotron-3-super-120b-a12b","messages":[{"role":"user","content":"Reply with exactly one word: PONG"}],"max_tokens":100}' -response="$(openshell sandbox exec --name "${name}" -- curl -fsS --max-time 60 -H 'Content-Type: application/json' \ +# Orchestrator step cap is 60s; widen the wrapper cap to 50s so a hung +# upstream surfaces with a clear diagnostic before SIGTERM. Inner curl +# --max-time stays ~10s under the wrapper cap. +# shellcheck disable=SC2034 # consumed by e2e_sandbox_exec via env +E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=50 \ +response="$(e2e_sandbox_exec "${name}" -- curl -fsS --max-time 40 -H 'Content-Type: application/json' \ -d "${payload}" "https://inference.local/v1/chat/completions")" # CodeRabbit review item #12: substring expansion instead of `| head` # avoids SIGPIPE-driven false failures under `set -o pipefail`. 
diff --git a/test/e2e-scenario/validation_suites/inference/cloud/02-inference-local-from-sandbox.sh b/test/e2e-scenario/validation_suites/inference/cloud/02-inference-local-from-sandbox.sh index 6d1343a736..f5102efd74 100755 --- a/test/e2e-scenario/validation_suites/inference/cloud/02-inference-local-from-sandbox.sh +++ b/test/e2e-scenario/validation_suites/inference/cloud/02-inference-local-from-sandbox.sh @@ -13,18 +13,18 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" . "${LIB_DIR}/env.sh" # shellcheck source=../../../runtime/lib/context.sh . "${LIB_DIR}/context.sh" +# shellcheck source=../../sandbox-exec.sh +. "${SCRIPT_DIR}/../../sandbox-exec.sh" echo "inference:sandbox-inference-local" e2e_context_require E2E_SANDBOX_NAME E2E_INFERENCE_ROUTE -if e2e_env_is_dry_run; then - echo "[dry-run] would resolve inference-local from inside the sandbox" - exit 0 -fi - name="$(e2e_context_get E2E_SANDBOX_NAME)" route="$(e2e_context_get E2E_INFERENCE_ROUTE)" +# Orchestrator step cap is 45s; widen wrapper cap to 35s. # CodeRabbit review item #13: capture then truncate to avoid `| head` racing # curl under `pipefail` and flagging a successful request as failed. 
-body="$(openshell sandbox exec --name "${name}" -- curl -fsS --max-time 10 "https://${route}/v1/models")" +# shellcheck disable=SC2034 # consumed by e2e_sandbox_exec via env +E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=35 \ +body="$(e2e_sandbox_exec "${name}" -- curl -fsS --max-time 25 "https://${route}/v1/models")" printf '%s\n' "${body:0:512}" diff --git a/test/e2e-scenario/validation_suites/inference/ollama-auth-proxy/00-proxy-reachable.sh b/test/e2e-scenario/validation_suites/inference/ollama-auth-proxy/00-proxy-reachable.sh index 77d4772c17..d172615795 100755 --- a/test/e2e-scenario/validation_suites/inference/ollama-auth-proxy/00-proxy-reachable.sh +++ b/test/e2e-scenario/validation_suites/inference/ollama-auth-proxy/00-proxy-reachable.sh @@ -12,18 +12,16 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" . "${LIB_DIR}/env.sh" # shellcheck source=../../../runtime/lib/context.sh . "${LIB_DIR}/context.sh" +# shellcheck source=../../sandbox-exec.sh +. "${SCRIPT_DIR}/../../sandbox-exec.sh" echo "ollama-proxy:proxy-reachable" e2e_context_require E2E_SANDBOX_NAME -if e2e_env_is_dry_run; then - echo "[dry-run] would verify the Ollama auth proxy is reachable from the sandbox" - exit 0 -fi name="$(e2e_context_get E2E_SANDBOX_NAME)" # The Ollama auth proxy intentionally rejects unauthenticated requests to # /api/tags (legacy test-gpu-e2e.sh accepts 401/403 as proof the proxy is # live and enforcing auth). Do not use curl -f here. 
-status="$(openshell sandbox exec --name "${name}" -- curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "http://inference-local/api/tags" 2>/dev/null || echo 000)" +status="$(e2e_sandbox_exec "${name}" -- curl -sS -o /dev/null -w '%{http_code}' --max-time 10 "http://inference-local/api/tags" 2>/dev/null || echo 000)" case "${status}" in 200 | 401 | 403) echo "ollama-proxy:proxy-reachable status=${status}" diff --git a/test/e2e-scenario/validation_suites/inference/ollama-gpu/00-ollama-models-health.sh b/test/e2e-scenario/validation_suites/inference/ollama-gpu/00-ollama-models-health.sh index 47e9f1fd43..d61ead2e98 100755 --- a/test/e2e-scenario/validation_suites/inference/ollama-gpu/00-ollama-models-health.sh +++ b/test/e2e-scenario/validation_suites/inference/ollama-gpu/00-ollama-models-health.sh @@ -15,10 +15,6 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" echo "local-ollama-inference:ollama-models-health" e2e_context_require E2E_PROVIDER -if e2e_env_is_dry_run; then - echo "[dry-run] would GET ollama /api/tags via host Ollama" - exit 0 -fi # GPU Ollama scenarios mirror legacy test-gpu-e2e.sh: validate the host # Ollama daemon directly because Docker GPU host networking bypasses the # normal dashboard/gateway forward path. 
diff --git a/test/e2e-scenario/validation_suites/inference/ollama-gpu/01-ollama-chat-completion.sh b/test/e2e-scenario/validation_suites/inference/ollama-gpu/01-ollama-chat-completion.sh index ad8ff54faa..5d18b4209a 100755 --- a/test/e2e-scenario/validation_suites/inference/ollama-gpu/01-ollama-chat-completion.sh +++ b/test/e2e-scenario/validation_suites/inference/ollama-gpu/01-ollama-chat-completion.sh @@ -15,10 +15,6 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" echo "local-ollama-inference:ollama-chat-completion" e2e_context_require E2E_SANDBOX_NAME -if e2e_env_is_dry_run; then - echo "[dry-run] would POST chat completion from sandbox to host-network Ollama" - exit 0 -fi name="$(e2e_context_get E2E_SANDBOX_NAME)" model="$(curl -fsS --max-time 10 http://127.0.0.1:11434/api/tags \ | node -e "const fs=require('fs'); const data=JSON.parse(fs.readFileSync(0,'utf8')); process.stdout.write(data.models?.[0]?.name || data.models?.[0]?.model || 'default');")" diff --git a/test/e2e-scenario/validation_suites/lib/inference_routing.sh b/test/e2e-scenario/validation_suites/lib/inference_routing.sh index b4f4c1d63f..17db0bbedb 100755 --- a/test/e2e-scenario/validation_suites/lib/inference_routing.sh +++ b/test/e2e-scenario/validation_suites/lib/inference_routing.sh @@ -31,16 +31,6 @@ _e2e_inference_sandbox_name() { e2e_context_get E2E_SANDBOX_NAME } -_e2e_inference_plan() { - local assertion_id="${1:-}" - local detail="${2:-planned inference/provider check}" - e2e_env_trace "inference:plan" "${assertion_id} ${detail}" - echo "[dry-run] ${assertion_id}: ${detail}" - if [[ -f "$(e2e_context_path)" ]]; then - e2e_context_dump | sed -E 's/(TOKEN|SECRET|API_KEY|APIKEY|CREDENTIAL|PASSWORD)([^=]*)=.*/\1\2=REDACTED/' - fi -} - _e2e_inference_curl_json() { local sandbox="$1" local url="$2" @@ -64,10 +54,6 @@ e2e_inference_routing_assert_chat_completion() { local assertion_id="${1:-post-onboard.inference-routing.inference-local-chat-completion}" _e2e_inference_assertion 
"${assertion_id}" _e2e_inference_require_sandbox - if e2e_env_is_dry_run; then - _e2e_inference_plan "${assertion_id}" "POST https://inference.local/v1/chat/completions with bounded curl" - return 0 - fi local sandbox payload output sandbox="$(_e2e_inference_sandbox_name)" payload='{"model":"default","messages":[{"role":"user","content":"Say ok"}],"max_tokens":8}' @@ -84,10 +70,6 @@ e2e_inference_routing_assert_health() { local url="${2:-https://inference.local/v1/models}" _e2e_inference_assertion "${assertion_id}" _e2e_inference_require_sandbox - if e2e_env_is_dry_run; then - _e2e_inference_plan "${assertion_id}" "GET ${url} with bounded curl" - return 0 - fi local sandbox status sandbox="$(_e2e_inference_sandbox_name)" status="$(_e2e_inference_status "${sandbox}" "${url}")" @@ -103,10 +85,6 @@ e2e_inference_routing_assert_auth_proxy() { local mode="${2:-valid}" _e2e_inference_assertion "${assertion_id}" _e2e_inference_require_sandbox - if e2e_env_is_dry_run; then - _e2e_inference_plan "${assertion_id}" "auth-proxy ${mode} request; sensitive context redacted" - return 0 - fi local sandbox status token sandbox="$(_e2e_inference_sandbox_name)" case "${mode}" in diff --git a/test/e2e-scenario/validation_suites/lib/messaging_providers.sh b/test/e2e-scenario/validation_suites/lib/messaging_providers.sh index 77eb1f1176..01250b784f 100755 --- a/test/e2e-scenario/validation_suites/lib/messaging_providers.sh +++ b/test/e2e-scenario/validation_suites/lib/messaging_providers.sh @@ -104,10 +104,6 @@ e2e_messaging_read_config_surface() { return 0 fi path="$(e2e_messaging_agent_config_path)" - if [[ -n "${E2E_DRY_RUN:-}" ]]; then - printf '%s=PLACEHOLDER\n' "$(e2e_messaging_config_key)" - return 0 - fi if [[ -f "${path}" ]]; then cat "${path}" return 0 @@ -177,9 +173,6 @@ e2e_messaging_assert_literal_payload() { local assertion_id="${1:?assertion id required}" local payload="${2:?payload required}" local observed="${3:-}" - if [[ -z "${observed}" && -n "${E2E_DRY_RUN:-}" ]]; 
then - observed="${payload}" - fi if [[ -z "${observed}" ]]; then e2e_fail "${assertion_id} missing observed payload output" fi diff --git a/test/e2e-scenario/validation_suites/lib/rebuild_upgrade.sh b/test/e2e-scenario/validation_suites/lib/rebuild_upgrade.sh index c6483c99fb..4870a68c64 100755 --- a/test/e2e-scenario/validation_suites/lib/rebuild_upgrade.sh +++ b/test/e2e-scenario/validation_suites/lib/rebuild_upgrade.sh @@ -10,6 +10,15 @@ _REBUILD_UPGRADE_REPO_ROOT="$(cd "${_REBUILD_UPGRADE_DIR}/../../../.." && pwd)" . "${_REBUILD_UPGRADE_REPO_ROOT}/test/e2e-scenario/runtime/lib/context.sh" # shellcheck source=../../runtime/lib/logging.sh . "${_REBUILD_UPGRADE_REPO_ROOT}/test/e2e-scenario/runtime/lib/logging.sh" +# shellcheck source=../sandbox-exec.sh +. "${_REBUILD_UPGRADE_REPO_ROOT}/test/e2e-scenario/validation_suites/sandbox-exec.sh" + +# Sandbox-exec calls in this lib feed the lifecycle.rebuild/upgrade +# orchestrator steps, which carry 120s caps. Default the per-call wrapper +# cap to 100s so a hung 'openshell sandbox exec'/'ssh -F' surfaces as a +# classified exit 124 well before the orchestrator's SIGTERM. Callers +# may still override per-call. +: "${E2E_SANDBOX_EXEC_TIMEOUT_SECONDS:=100}" rebuild_upgrade_require_context() { e2e_context_require E2E_SCENARIO E2E_AGENT E2E_SANDBOX_NAME E2E_GATEWAY_URL @@ -30,15 +39,30 @@ _rebuild_upgrade_run() { "$@" } +# _rebuild_upgrade_sandbox_exec [args...] +# Routes through the canonical `e2e_sandbox_exec` wrapper (ssh-config +# preferred, openshell-exec fallback, per-call timeout, classified +# diagnostic on hang) for production; honors the legacy +# REBUILD_UPGRADE_SANDBOX_CMD override so tests can inject a fake. The +# override contract preserves the original argv shape +# (` -n -- ...`) so existing test fakes +# (e.g. `REBUILD_UPGRADE_SANDBOX_CMD=fake_sandbox`) keep working. 
+_rebuild_upgrade_sandbox_exec() { + local sandbox="$1" + shift + if [[ -n "${REBUILD_UPGRADE_SANDBOX_CMD:-}" ]]; then + # shellcheck disable=SC2086 + ${REBUILD_UPGRADE_SANDBOX_CMD} -n "${sandbox}" -- "$@" + return $? + fi + e2e_sandbox_exec "${sandbox}" -- "$@" +} + rebuild_upgrade_assert_sandbox_reachable() { rebuild_upgrade_require_context || return 1 - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.upgrade.survivor_agent_reachable dry-run" - return 0 - fi local sandbox sandbox="$(_rebuild_upgrade_ctx E2E_SANDBOX_NAME)" - if _rebuild_upgrade_run REBUILD_UPGRADE_SANDBOX_CMD openshell sandbox exec -n "${sandbox}" -- true; then + if _rebuild_upgrade_sandbox_exec "${sandbox}" true; then e2e_pass "suite.upgrade.survivor_agent_reachable" else e2e_fail "suite.upgrade.survivor_agent_reachable" @@ -47,15 +71,11 @@ rebuild_upgrade_assert_sandbox_reachable() { rebuild_upgrade_assert_marker_preserved() { rebuild_upgrade_require_context || return 1 - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.rebuild.workspace_state_preserved dry-run" - return 0 - fi local sandbox marker_path expected actual sandbox="$(_rebuild_upgrade_ctx E2E_SANDBOX_NAME)" marker_path="${E2E_REBUILD_MARKER_PATH:-/workspace/.nemoclaw-rebuild-marker}" expected="${E2E_REBUILD_MARKER_EXPECTED:-${E2E_STATE_MARKER_EXPECTED:-}}" - actual="$(_rebuild_upgrade_run REBUILD_UPGRADE_SANDBOX_CMD openshell sandbox exec -n "${sandbox}" -- cat "${marker_path}" 2>/dev/null || true)" + actual="$(_rebuild_upgrade_sandbox_exec "${sandbox}" cat "${marker_path}" 2>/dev/null || true)" if [[ -n "${actual}" && (-z "${expected}" || "${actual}" == "${expected}") ]]; then e2e_pass "suite.rebuild.workspace_state_preserved" else @@ -65,16 +85,12 @@ rebuild_upgrade_assert_marker_preserved() { rebuild_upgrade_assert_agent_version_upgraded() { rebuild_upgrade_require_context || return 1 - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.rebuild.agent_version_upgraded dry-run" - return 0 - fi local 
sandbox old expected actual cmd sandbox="$(_rebuild_upgrade_ctx E2E_SANDBOX_NAME)" old="${E2E_OLD_AGENT_VERSION:-}" expected="${E2E_EXPECTED_AGENT_VERSION:-}" cmd="${E2E_AGENT_VERSION_COMMAND:-openclaw --version}" - actual="$(_rebuild_upgrade_run REBUILD_UPGRADE_SANDBOX_CMD openshell sandbox exec -n "${sandbox}" -- bash -lc "${cmd}" 2>/dev/null || true)" + actual="$(_rebuild_upgrade_sandbox_exec "${sandbox}" bash -lc "${cmd}" 2>/dev/null || true)" if [[ -n "${actual}" && (-z "${old}" || "${actual}" != *"${old}"*) && (-z "${expected}" || "${actual}" == *"${expected}"*) ]]; then e2e_pass "suite.rebuild.agent_version_upgraded" else @@ -84,14 +100,10 @@ rebuild_upgrade_assert_agent_version_upgraded() { rebuild_upgrade_assert_inference_works() { rebuild_upgrade_require_context || return 1 - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.rebuild.inference_still_works dry-run" - return 0 - fi local sandbox cmd output sandbox="$(_rebuild_upgrade_ctx E2E_SANDBOX_NAME)" cmd="${E2E_INFERENCE_CHECK_COMMAND:-curl -fsS http://inference.local/v1/models}" - output="$(_rebuild_upgrade_run REBUILD_UPGRADE_SANDBOX_CMD openshell sandbox exec -n "${sandbox}" -- bash -lc "${cmd}" 2>/dev/null || true)" + output="$(_rebuild_upgrade_sandbox_exec "${sandbox}" bash -lc "${cmd}" 2>/dev/null || true)" if [[ -n "${output}" ]]; then e2e_pass "suite.rebuild.inference_still_works" else @@ -101,10 +113,6 @@ rebuild_upgrade_assert_inference_works() { rebuild_upgrade_assert_policy_presets_preserved() { rebuild_upgrade_require_context || return 1 - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.rebuild.policy_presets_preserved dry-run" - return 0 - fi local presets output preset presets="${E2E_EXPECTED_POLICY_PRESETS:-npm pypi}" output="$(_rebuild_upgrade_run REBUILD_UPGRADE_NEMOCLAW_CMD nemoclaw policy status 2>/dev/null || true)" @@ -123,13 +131,9 @@ rebuild_upgrade_assert_hermes_config_preserved() { e2e_pass "suite.rebuild.hermes_config_preserved skipped non-hermes" return 
0 fi - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.rebuild.hermes_config_preserved dry-run" - return 0 - fi local sandbox output sandbox="$(_rebuild_upgrade_ctx E2E_SANDBOX_NAME)" - output="$(_rebuild_upgrade_run REBUILD_UPGRADE_SANDBOX_CMD openshell sandbox exec -n "${sandbox}" -- bash -lc "grep -R 'platforms.discord\|DISCORD' ~/.hermes . 2>/dev/null" || true)" + output="$(_rebuild_upgrade_sandbox_exec "${sandbox}" bash -lc "grep -R 'platforms.discord\|DISCORD' ~/.hermes . 2>/dev/null" || true)" if [[ "${output}" == *"discord"* || "${output}" == *"DISCORD"* ]]; then e2e_pass "suite.rebuild.hermes_config_preserved" else @@ -139,10 +143,6 @@ rebuild_upgrade_assert_hermes_config_preserved() { rebuild_upgrade_assert_sandbox_registry_preserved() { rebuild_upgrade_require_context || return 1 - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.upgrade.sandbox_registry_preserved dry-run" - return 0 - fi local sandbox output sandbox="$(_rebuild_upgrade_ctx E2E_SANDBOX_NAME)" output="$(_rebuild_upgrade_run REBUILD_UPGRADE_NEMOCLAW_CMD nemoclaw list 2>/dev/null || true)" @@ -155,10 +155,6 @@ rebuild_upgrade_assert_sandbox_registry_preserved() { rebuild_upgrade_assert_gateway_version_upgraded() { rebuild_upgrade_require_context || return 1 - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - e2e_pass "suite.upgrade.gateway_version_upgraded dry-run" - return 0 - fi local expected output expected="${E2E_EXPECTED_OPENSHELL_VERSION:-}" output="$(_rebuild_upgrade_run REBUILD_UPGRADE_GATEWAY_CMD curl -fsS "$(_rebuild_upgrade_ctx E2E_GATEWAY_URL)/version" 2>/dev/null || true)" diff --git a/test/e2e-scenario/validation_suites/lib/sandbox_lifecycle.sh b/test/e2e-scenario/validation_suites/lib/sandbox_lifecycle.sh index df942487e7..3cca8966b4 100755 --- a/test/e2e-scenario/validation_suites/lib/sandbox_lifecycle.sh +++ b/test/e2e-scenario/validation_suites/lib/sandbox_lifecycle.sh @@ -37,11 +37,6 @@ sandbox_lifecycle_run_with_timeout() { local seconds="$1" shift 
SANDBOX_LIFECYCLE_LAST_OUTPUT="" - if [[ "${E2E_DRY_RUN:-0}" == "1" ]]; then - SANDBOX_LIFECYCLE_LAST_OUTPUT="dry-run: $*" - printf '%s\n' "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" - return 0 - fi if command -v timeout >/dev/null 2>&1; then SANDBOX_LIFECYCLE_LAST_OUTPUT="$(timeout "${seconds}" "$@" 2>&1)" || { local rc=$? @@ -64,7 +59,10 @@ sandbox_lifecycle_assert_nemoclaw_list_contains_sandbox() { sandbox_lifecycle_fail "${id}" "nemoclaw list failed" return 1 } - [[ "${E2E_DRY_RUN:-0}" == "1" || "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" == *"${E2E_SANDBOX_NAME}"* ]] || { + # Match the sandbox name exactly as a whole token; substring match + # would let `sb1` falsely match `sb10`. + awk -v n="${E2E_SANDBOX_NAME}" '$1 == n { found = 1 } END { exit !found }' \ + <<<"${SANDBOX_LIFECYCLE_LAST_OUTPUT}" || { sandbox_lifecycle_fail "${id}" "sandbox not listed: ${E2E_SANDBOX_NAME}" return 1 } @@ -77,16 +75,25 @@ sandbox_lifecycle_assert_status_fields_present() { sandbox_lifecycle_fail "${id}" "nemoclaw status failed" return 1 } - if [[ "${E2E_DRY_RUN:-0}" != "1" ]]; then - local status_output_lower - status_output_lower="$(printf '%s' "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" | tr '[:upper:]' '[:lower:]')" - for field in status gateway sandbox; do - [[ "${status_output_lower}" == *"${field}"* ]] || { - sandbox_lifecycle_fail "${id}" "missing status field: ${field}" - return 1 - } - done + # The real `nemoclaw status` output (src/lib/actions/sandbox/status.ts) + # always emits a 'Sandbox: ' header plus structured fields like + # 'Model:', 'OpenShell:', 'Policies:'. The original assertion required + # literal 'status' and 'gateway' tokens that never appear in normal + # output — it only passed against the test-suite mock. Align with the + # production CLI: require the sandbox name and a couple of substantive + # field labels that are unconditionally printed. 
+ local output="${SANDBOX_LIFECYCLE_LAST_OUTPUT}" + if [[ "${output}" != *"${E2E_SANDBOX_NAME}"* ]]; then + sandbox_lifecycle_fail "${id}" "status output did not mention sandbox '${E2E_SANDBOX_NAME}'" + return 1 fi + local field + for field in Sandbox Model OpenShell; do + [[ "${output}" == *"${field}"* ]] || { + sandbox_lifecycle_fail "${id}" "missing status field: ${field}" + return 1 + } + done sandbox_lifecycle_pass "${id}" "status fields present" } @@ -96,7 +103,7 @@ sandbox_lifecycle_assert_logs_available() { sandbox_lifecycle_fail "${id}" "nemoclaw logs failed" return 1 } - [[ "${E2E_DRY_RUN:-0}" == "1" || -n "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" ]] || { + [[ -n "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" ]] || { sandbox_lifecycle_fail "${id}" "logs empty" return 1 } @@ -109,7 +116,7 @@ sandbox_lifecycle_assert_openshell_exec_ok() { sandbox_lifecycle_fail "${id}" "openshell exec failed" return 1 } - [[ "${E2E_DRY_RUN:-0}" == "1" || "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" == *"lifecycle-ok"* ]] || { + [[ "${SANDBOX_LIFECYCLE_LAST_OUTPUT}" == *"lifecycle-ok"* ]] || { sandbox_lifecycle_fail "${id}" "unexpected exec output" return 1 } diff --git a/test/e2e-scenario/validation_suites/lib/security_policy_credentials.sh b/test/e2e-scenario/validation_suites/lib/security_policy_credentials.sh index 3e1872d62a..8d34a5444f 100755 --- a/test/e2e-scenario/validation_suites/lib/security_policy_credentials.sh +++ b/test/e2e-scenario/validation_suites/lib/security_policy_credentials.sh @@ -55,10 +55,6 @@ spc_assert_credentials_expected() { return 1 fi spc_log_provider_metadata "$(spc_context_get E2E_PROVIDER)" "gateway" - if e2e_env_is_dry_run; then - echo "[dry-run] would list gateway credentials without raw values" - return 0 - fi local raw_file listed_raw listed list_rc raw_file="$(mktemp "${TMPDIR:-/tmp}/nemoclaw-credentials-list.XXXXXX")" chmod 600 "${raw_file}" @@ -105,10 +101,6 @@ spc_assert_policy_preset_present() { spc_assertion_id 
"post-onboard.security-policy.${preset}-preset-applied" spc_require_context E2E_SCENARIO E2E_SANDBOX_NAME echo "policy preset expected: ${preset}" - if e2e_env_is_dry_run; then - echo "[dry-run] would verify policy preset ${preset}" - return 0 - fi local sandbox_name active sandbox_name="$(spc_context_get E2E_SANDBOX_NAME)" if ! active="$(nemoclaw "${sandbox_name}" policy-list 2>&1)"; then @@ -143,10 +135,6 @@ spc_semver_ge() { spc_assert_openshell_credential_rewrite_supported() { spc_assertion_id "post-onboard.gateway.openshell-version-supports-credential-rewrite" spc_require_context E2E_SCENARIO - if e2e_env_is_dry_run; then - echo "[dry-run] would verify OpenShell gateway capability metadata" - return 0 - fi local openshell_bin version_output version minimum_version binary_strings feature minimum_version="0.0.39" openshell_bin="$(command -v openshell 2>/dev/null || true)" @@ -221,10 +209,6 @@ spc_assert_shields_permissions_match_state() { spc_assert_shields_config_consistent() { spc_assertion_id "post-onboard.security-shields.config-consistent" spc_require_context E2E_SCENARIO E2E_SANDBOX_NAME E2E_AGENT - if e2e_env_is_dry_run; then - echo "[dry-run] would verify shields config consistency" - return 0 - fi local sandbox_name status observed expected sandbox_name="$(spc_context_get E2E_SANDBOX_NAME)" if ! 
status="$(nemoclaw "${sandbox_name}" shields status 2>&1)"; then @@ -262,10 +246,6 @@ spc_assert_telegram_payload_not_shell_executed() { if [[ -n "${fixture_payload}" ]]; then printf 'telegram payload fixture loaded (%s bytes)\n' "${#fixture_payload}" fi - if e2e_env_is_dry_run; then - echo "[dry-run] would submit payload without shell evaluation" - return 0 - fi local sandbox_name marker payload send_output marker_state sandbox_name="$(spc_context_get E2E_SANDBOX_NAME)" marker="/tmp/nemoclaw-telegram-injection-proof-$RANDOM-$$" diff --git a/test/e2e-scenario/validation_suites/messaging/common/03-bridge-reachable.sh b/test/e2e-scenario/validation_suites/messaging/common/03-bridge-reachable.sh index 9fc2156ad0..8ec82f8aeb 100755 --- a/test/e2e-scenario/validation_suites/messaging/common/03-bridge-reachable.sh +++ b/test/e2e-scenario/validation_suites/messaging/common/03-bridge-reachable.sh @@ -5,9 +5,4 @@ set -euo pipefail . "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib/messaging_providers.sh" e2e_messaging_load_context -if [[ -n "${E2E_DRY_RUN:-}" ]]; then - provider="$(e2e_messaging_provider_name)" - e2e_pass "expected-state.messaging.${provider}.bridge-reachable dry-run" - exit 0 -fi e2e_messaging_assert_bridge_reachable diff --git a/test/e2e-scenario/validation_suites/messaging/slack/00-slack-provider-state.sh b/test/e2e-scenario/validation_suites/messaging/slack/00-slack-provider-state.sh index 0f1afa2e14..bac54bb501 100755 --- a/test/e2e-scenario/validation_suites/messaging/slack/00-slack-provider-state.sh +++ b/test/e2e-scenario/validation_suites/messaging/slack/00-slack-provider-state.sh @@ -3,7 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 set -euo pipefail -. "$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)/lib/messaging_providers.sh" +_SLACK_SUITES_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +. "${_SLACK_SUITES_DIR}/lib/messaging_providers.sh" +# shellcheck source=../../sandbox-exec.sh +. 
"${_SLACK_SUITES_DIR}/sandbox-exec.sh" e2e_messaging_load_context provider="$(e2e_messaging_provider_name)" case "${provider}" in @@ -12,25 +15,25 @@ case "${provider}" in esac e2e_messaging_assert_provider_attached if [[ "$(e2e_context_get E2E_AGENT)" == "openclaw" ]]; then - if [[ -n "${E2E_DRY_RUN:-}" ]]; then - e2e_pass "expected-state.messaging.slack.openclaw-enabled dry-run" - e2e_pass "expected-state.messaging.slack.runtime-discovery dry-run" - else - content="$(e2e_messaging_read_config_surface)" - if ! printf '%s\n' "${content}" | python3 -c ' + content="$(e2e_messaging_read_config_surface)" + if ! printf '%s\n' "${content}" | python3 -c ' import json import sys cfg = json.load(sys.stdin) assert cfg["channels"]["slack"]["enabled"] is True assert cfg["plugins"]["entries"]["slack"]["enabled"] is True '; then - e2e_fail "expected-state.messaging.slack.openclaw-enabled missing channels.slack.enabled or plugins.entries.slack.enabled" - fi - e2e_pass "expected-state.messaging.slack.openclaw-enabled channel and plugin enabled" + e2e_fail "expected-state.messaging.slack.openclaw-enabled missing channels.slack.enabled or plugins.entries.slack.enabled" + fi + e2e_pass "expected-state.messaging.slack.openclaw-enabled channel and plugin enabled" - sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" - runtime_json="$(openshell sandbox exec --name "${sandbox_name}" -- timeout 45 openclaw channels list --all --json --no-color 2>/dev/null || true)" - runtime_state="$(printf '%s\n' "${runtime_json}" | python3 -c ' + sandbox_name="$(e2e_context_get E2E_SANDBOX_NAME)" + # Wrapper cap (50s) sits just above the inner `timeout 45` so the inner + # cap is what fires under normal upstream slowness; the wrapper only + # catches the case where openshell itself wedges before delivering the + # `timeout` invocation to the sandbox. 
+ runtime_json="$(E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=50 e2e_sandbox_exec "${sandbox_name}" -- timeout 45 openclaw channels list --all --json --no-color 2>/dev/null || true)" + runtime_state="$(printf '%s\n' "${runtime_json}" | python3 -c ' import json import sys try: @@ -44,10 +47,9 @@ try: except Exception as exc: print("error %s" % exc) ' 2>/dev/null || true)" - if [[ "${runtime_state}" != "yes" ]]; then - e2e_fail "expected-state.messaging.slack.runtime-discovery OpenClaw did not report Slack installed/configured (${runtime_state}; output=${runtime_json:0:300})" - fi - e2e_pass "expected-state.messaging.slack.runtime-discovery OpenClaw reports Slack installed and configured" + if [[ "${runtime_state}" != "yes" ]]; then + e2e_fail "expected-state.messaging.slack.runtime-discovery OpenClaw did not report Slack installed/configured (${runtime_state}; output=${runtime_json:0:300})" fi + e2e_pass "expected-state.messaging.slack.runtime-discovery OpenClaw reports Slack installed and configured" fi e2e_pass "expected-state.messaging.slack.provider-state ${provider} provider state configured" diff --git a/test/e2e-scenario/validation_suites/platform/macos/00-macos-smoke.sh b/test/e2e-scenario/validation_suites/platform/macos/00-macos-smoke.sh index 2f42115f5e..4f2f094c67 100755 --- a/test/e2e-scenario/validation_suites/platform/macos/00-macos-smoke.sh +++ b/test/e2e-scenario/validation_suites/platform/macos/00-macos-smoke.sh @@ -19,11 +19,6 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" echo "platform-macos:macos-smoke" e2e_context_require E2E_PLATFORM_OS -if e2e_env_is_dry_run; then - echo "[dry-run] would run macOS-specific smoke checks" - exit 0 -fi - os="$(e2e_context_get E2E_PLATFORM_OS)" if [[ "${os}" != "macos" ]]; then echo "platform-macos: E2E_PLATFORM_OS should be 'macos', got '${os}'" >&2 diff --git a/test/e2e-scenario/validation_suites/platform/wsl/00-wsl-smoke.sh b/test/e2e-scenario/validation_suites/platform/wsl/00-wsl-smoke.sh index 
1aeb39fe7c..ef96795a0c 100755 --- a/test/e2e-scenario/validation_suites/platform/wsl/00-wsl-smoke.sh +++ b/test/e2e-scenario/validation_suites/platform/wsl/00-wsl-smoke.sh @@ -17,11 +17,6 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../../runtime/lib" && pwd)" echo "platform-wsl:wsl-smoke" e2e_context_require E2E_PLATFORM_OS E2E_SANDBOX_NAME -if e2e_env_is_dry_run; then - echo "[dry-run] would run WSL-specific smoke checks" - exit 0 -fi - os="$(e2e_context_get E2E_PLATFORM_OS)" if [[ "${os}" != "wsl" ]]; then echo "platform-wsl: E2E_PLATFORM_OS should be 'wsl', got '${os}'" >&2 diff --git a/test/e2e-scenario/validation_suites/sandbox-exec.sh b/test/e2e-scenario/validation_suites/sandbox-exec.sh index 0682c4cf2f..44e4288111 100755 --- a/test/e2e-scenario/validation_suites/sandbox-exec.sh +++ b/test/e2e-scenario/validation_suites/sandbox-exec.sh @@ -12,7 +12,6 @@ # Functions: # e2e_sandbox_exec -- [args...] # Run inside via `openshell sandbox exec`. No stdin passed. -# Exit code propagates from . Honors E2E_DRY_RUN. # # e2e_sandbox_exec_stdin -- [args...] # Like e2e_sandbox_exec but pipes the caller's stdin into the @@ -23,6 +22,174 @@ _E2E_SBEX_LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../runtime/lib" && pwd)" # shellcheck source=../runtime/lib/env.sh . "${_E2E_SBEX_LIB_DIR}/env.sh" +# Per-call timeout (seconds) applied to every `openshell sandbox exec` +# invocation routed through this wrapper. Callers MAY override per call: +# E2E_SANDBOX_EXEC_TIMEOUT_SECONDS=50 e2e_sandbox_exec ... +# +# Why a wrapper-level cap exists: +# The orchestrator (phase.ts) enforces step-level timeouts via SIGTERM on +# the script's process group. When openshell ssh-into-sandbox hangs, +# SIGTERM eventually kills the script — but the script has no chance to +# emit a structured diagnostic, so logs end mid-line. An inner per-call +# `timeout` lets the wrapper observe the hang, emit a classified +# diagnostic, and exit cleanly *before* the orchestrator's SIGTERM. 
+# +# The default (25s) sits below the most common orchestrator step caps +# (30s smoke / kimi, 45s sandbox-local). Steps with longer caps (60s +# chat-completion, 120s rebuild) export a larger value before calling. +: "${E2E_SANDBOX_EXEC_TIMEOUT_SECONDS:=25}" + +# Resolve the timeout binary once. Empty string == not available. +_e2e_sbex_resolve_timeout_cmd() { + if command -v timeout >/dev/null 2>&1; then + printf '%s' timeout + elif command -v gtimeout >/dev/null 2>&1; then + printf '%s' gtimeout + else + printf '%s' '' + fi +} + +# ---------------------------------------------------------------------- +# ssh-config transport (preferred) +# +# `openshell sandbox exec` has been observed to wedge in CI (PR #4380 +# scenario run — host can curl the gateway but `openshell sandbox exec` +# never returns). The legacy test/e2e/ scripts have always entered the +# sandbox via `openshell sandbox ssh-config` + `ssh -F`, which works in +# the same environments. We mirror that pattern here: +# +# 1. On first call per sandbox, materialize an ssh-config under +# ${E2E_CONTEXT_DIR}/.ssh-config-cache/.cfg. +# 2. Subsequent calls reuse the cached config. +# 3. Each ssh invocation gets `-o ConnectTimeout=10`, +# `-o StrictHostKeyChecking=no`, `-o UserKnownHostsFile=/dev/null`, +# `-o LogLevel=ERROR` to mirror the legacy pattern. +# +# Opt-out: set E2E_SANDBOX_EXEC_VIA_OPENSHELL=1 to force the original +# `openshell sandbox exec` transport (e.g. for debugging or for runners +# where ssh-config is unavailable). +# ---------------------------------------------------------------------- + +_e2e_sbex_ssh_cfg_dir() { + local base="${E2E_CONTEXT_DIR:-/tmp}" + printf '%s/.ssh-config-cache' "${base}" +} + +# _e2e_sbex_ssh_config_for +# Prints the path to a populated ssh-config for on stdout. +# Returns non-zero (and prints nothing) if `openshell sandbox ssh-config` +# fails — callers fall back to `openshell sandbox exec`. 
+_e2e_sbex_ssh_config_for() { + local sandbox="$1" + local dir cfg + dir="$(_e2e_sbex_ssh_cfg_dir)" + mkdir -p "${dir}" || return 1 + cfg="${dir}/${sandbox}.cfg" + if [[ ! -s "${cfg}" ]]; then + if ! openshell sandbox ssh-config "${sandbox}" >"${cfg}" 2>/dev/null; then + rm -f "${cfg}" + return 1 + fi + fi + printf '%s' "${cfg}" +} + +# _e2e_sbex_quote_args +# Outputs the args quoted into a single shell string suitable for +# embedding as the remote command in `ssh host 'cmd args ...'`. +_e2e_sbex_quote_args() { + local arg out="" + for arg in "$@"; do + out+="$(printf '%q' "${arg}") " + done + printf '%s' "${out% }" +} + +# _e2e_sbex_invoke_via_ssh +# stdin_mode is 'pipe' (forward caller stdin) or 'none' (close stdin). +# Returns ssh's exit code (124 if timed out, 137 if SIGKILLed). +_e2e_sbex_invoke_via_ssh() { + local cfg="$1" stdin_mode="$2" seconds="$3" timeout_cmd="$4" + local remote_cmd ssh_args + remote_cmd="$(_e2e_sbex_quote_args "${_E2E_SBEX_CMD[@]}")" + ssh_args=( + -F "${cfg}" + -o ConnectTimeout=10 + -o StrictHostKeyChecking=no + -o UserKnownHostsFile=/dev/null + -o LogLevel=ERROR + "openshell-${_E2E_SBEX_SB_NAME}" + "${remote_cmd}" + ) + if [[ "${stdin_mode}" == "none" ]]; then + if [[ -z "${timeout_cmd}" ]]; then + ssh "${ssh_args[@]}" +# Fallback path that uses `openshell sandbox exec`. +_e2e_sbex_invoke_via_openshell() { + local stdin_mode="$1" seconds="$2" timeout_cmd="$3" + if [[ -z "${timeout_cmd}" ]]; then + openshell sandbox exec --name "${_E2E_SBEX_SB_NAME}" -- "${_E2E_SBEX_CMD[@]}" + else + "${timeout_cmd}" --kill-after=5s "${seconds}" \ + openshell sandbox exec --name "${_E2E_SBEX_SB_NAME}" -- "${_E2E_SBEX_CMD[@]}" + fi +} + +# _e2e_sbex_dispatch +# Shared body for e2e_sandbox_exec / e2e_sandbox_exec_stdin. Picks the +# transport (ssh-config preferred; openshell sandbox exec on opt-out or +# ssh-config failure), applies the per-call timeout, and emits a +# classified diagnostic on hang. +_e2e_sbex_dispatch() { + local stdin_mode="$1" + if ! 
command -v openshell >/dev/null 2>&1; then + echo "e2e_sandbox_exec: openshell CLI not on PATH" >&2 + return 127 + fi + local timeout_cmd seconds="${E2E_SANDBOX_EXEC_TIMEOUT_SECONDS}" + timeout_cmd="$(_e2e_sbex_resolve_timeout_cmd)" + if [[ -z "${timeout_cmd}" ]]; then + # Make the missing safety net visible so CI can flag it; do not + # abort — the orchestrator's step-level timeout still applies. + echo "e2e_sandbox_exec: 'timeout' not available; running without per-call cap (sandbox=${_E2E_SBEX_SB_NAME})" >&2 + fi + + local cfg="" via="ssh" rc=0 + if [[ "${E2E_SANDBOX_EXEC_VIA_OPENSHELL:-0}" == "1" ]]; then + via="openshell" + elif ! cfg="$(_e2e_sbex_ssh_config_for "${_E2E_SBEX_SB_NAME}")"; then + echo "e2e_sandbox_exec: ssh-config unavailable for ${_E2E_SBEX_SB_NAME}; falling back to 'openshell sandbox exec'" >&2 + via="openshell" + fi + + if [[ "${via}" == "ssh" ]]; then + _e2e_sbex_invoke_via_ssh "${cfg}" "${stdin_mode}" "${seconds}" "${timeout_cmd}" + rc=$? + else + _e2e_sbex_invoke_via_openshell "${stdin_mode}" "${seconds}" "${timeout_cmd}" + rc=$? + fi + + if [[ "${rc}" -eq 124 || "${rc}" -eq 137 ]]; then + echo "e2e_sandbox_exec: ${via} transport hung after ${seconds}s (sandbox=${_E2E_SBEX_SB_NAME}, cmd=${_E2E_SBEX_CMD[0]:-?}; classifier=gateway-transient)" >&2 + fi + return "${rc}" +} + # _e2e_sbex_split_args -- [args...] # Parses the shared calling convention. Prints on stderr on misuse and # returns 2. On success, sets the two global arrays _E2E_SBEX_SB_NAME and @@ -52,15 +219,7 @@ _e2e_sbex_parse() { e2e_sandbox_exec() { _e2e_sbex_parse "$@" || return $? e2e_env_trace "sandbox:exec" "${_E2E_SBEX_SB_NAME}" "${_E2E_SBEX_CMD[*]}" - if e2e_env_is_dry_run; then - echo "[dry-run] sandbox_exec ${_E2E_SBEX_SB_NAME} -- ${_E2E_SBEX_CMD[*]} (skipped)" - return 0 - fi - if ! 
command -v openshell >/dev/null 2>&1; then - echo "e2e_sandbox_exec: openshell CLI not on PATH" >&2 - return 127 - fi - openshell sandbox exec --name "${_E2E_SBEX_SB_NAME}" -- "${_E2E_SBEX_CMD[@]}" + _e2e_sbex_dispatch none } # e2e_sandbox_exec_stdin -- [args...] @@ -70,15 +229,5 @@ e2e_sandbox_exec() { e2e_sandbox_exec_stdin() { _e2e_sbex_parse "$@" || return $? e2e_env_trace "sandbox:exec_stdin" "${_E2E_SBEX_SB_NAME}" "${_E2E_SBEX_CMD[*]}" - if e2e_env_is_dry_run; then - # Consume stdin so the caller's pipeline doesn't SIGPIPE. - cat >/dev/null 2>&1 || true - echo "[dry-run] sandbox_exec_stdin ${_E2E_SBEX_SB_NAME} -- ${_E2E_SBEX_CMD[*]} (skipped)" - return 0 - fi - if ! command -v openshell >/dev/null 2>&1; then - echo "e2e_sandbox_exec_stdin: openshell CLI not on PATH" >&2 - return 127 - fi - openshell sandbox exec --name "${_E2E_SBEX_SB_NAME}" -- "${_E2E_SBEX_CMD[@]}" + _e2e_sbex_dispatch pipe } diff --git a/test/e2e-scenario/validation_suites/smoke/00-cli-available.sh b/test/e2e-scenario/validation_suites/smoke/00-cli-available.sh index e56925b1f9..ab733f039d 100755 --- a/test/e2e-scenario/validation_suites/smoke/00-cli-available.sh +++ b/test/e2e-scenario/validation_suites/smoke/00-cli-available.sh @@ -18,11 +18,6 @@ echo "smoke:cli-available" e2e_context_require E2E_SCENARIO -if e2e_env_is_dry_run; then - echo "[dry-run] would check that nemoclaw CLI is on PATH" - exit 0 -fi - if ! command -v nemoclaw >/dev/null 2>&1; then echo "smoke:cli-available: nemoclaw CLI not on PATH" >&2 exit 1 diff --git a/test/e2e-scenario/validation_suites/smoke/03-sandbox-shell.sh b/test/e2e-scenario/validation_suites/smoke/03-sandbox-shell.sh index b92dc33e8a..966efeb2d8 100755 --- a/test/e2e-scenario/validation_suites/smoke/03-sandbox-shell.sh +++ b/test/e2e-scenario/validation_suites/smoke/03-sandbox-shell.sh @@ -4,7 +4,6 @@ # # smoke step: sandbox-shell # Verifies that OpenShell can execute a trivial command inside the sandbox. -# Honors E2E_DRY_RUN. 
set -euo pipefail @@ -14,17 +13,15 @@ LIB_DIR="$(cd "${SCRIPT_DIR}/../../runtime/lib" && pwd)" . "${LIB_DIR}/env.sh" # shellcheck source=../../runtime/lib/context.sh . "${LIB_DIR}/context.sh" +# shellcheck source=../sandbox-exec.sh +. "${SCRIPT_DIR}/../sandbox-exec.sh" echo "smoke:sandbox-shell" e2e_context_require E2E_SANDBOX_NAME -if e2e_env_is_dry_run; then - echo "[dry-run] would run: openshell sandbox exec --name -- echo ok" - exit 0 -fi - name="$(e2e_context_get E2E_SANDBOX_NAME)" -output="$(openshell sandbox exec --name "${name}" -- echo ok 2>&1)" +# Orchestrator step cap is 30s; wrapper default 25s applies. +output="$(e2e_sandbox_exec "${name}" -- echo ok 2>&1)" echo "${output}" if ! echo "${output}" | grep -q '^ok$'; then echo "smoke:sandbox-shell: did not receive expected 'ok' from sandbox" >&2 diff --git a/tools/e2e-scenarios/workflow-boundary.mts b/tools/e2e-scenarios/workflow-boundary.mts index 26394d1b4c..a06b21f3ea 100644 --- a/tools/e2e-scenarios/workflow-boundary.mts +++ b/tools/e2e-scenarios/workflow-boundary.mts @@ -49,6 +49,13 @@ function requireRunContains(errors: string[], step: WorkflowStep | undefined, ex } } +function requireRunDoesNotContain(errors: string[], step: WorkflowStep | undefined, forbidden: string): void { + if (!step) return; + if (stringValue(step.run).includes(forbidden)) { + errors.push(`step '${step.name ?? ""}' run script must not include ${forbidden}`); + } +} + export function validateE2eScenariosWorkflowBoundary( workflowPath = DEFAULT_WORKFLOW_PATH, ): string[] { @@ -92,7 +99,11 @@ export function validateE2eScenariosWorkflowBoundary( const normalRun = requireStep(errors, steps, "Run typed scenarios"); requireRunContains(errors, normalRun, "npx tsx test/e2e-scenario/scenarios/run.ts"); requireRunContains(errors, normalRun, "--scenarios"); - requireRunContains(errors, normalRun, "--dry-run"); + // The TS runner has one execution mode: live. 
Workflows must not pass + // --dry-run, --plan-only, or --validate-only — they hide real test runs. + requireRunDoesNotContain(errors, normalRun, "--dry-run"); + requireRunDoesNotContain(errors, normalRun, "--plan-only"); + requireRunDoesNotContain(errors, normalRun, "--validate-only"); const wslInstall = requireStep(errors, steps, "Ensure Ubuntu WSL exists"); requireRunContains(errors, wslInstall, "wsl --install"); @@ -113,7 +124,16 @@ export function validateE2eScenariosWorkflowBoundary( const wslRun = requireStep(errors, steps, "Run typed scenarios in WSL"); requireRunContains(errors, wslRun, "npx tsx test/e2e-scenario/scenarios/run.ts"); requireRunContains(errors, wslRun, "--scenarios"); - requireRunContains(errors, wslRun, "--dry-run"); + // The typed runner is the only execution path; the + // bash runner / dry-run / validate-only / plan-only modes are + // removed from CI. + requireRunDoesNotContain(errors, wslRun, "--dry-run"); + requireRunDoesNotContain(errors, wslRun, "--plan-only"); + requireRunDoesNotContain(errors, wslRun, "--validate-only"); + // The WSL step must use the robust PowerShell wrapper (#4346) + // that materializes a bash script, copies it into WSL via + // wslpath, and invokes it with `bash -l` so Docker WSL integration + // and Ubuntu first-run races are handled. requireRunContains(errors, wslRun, "$env:WSL_WORKDIR"); requireRunContains(errors, wslRun, "WriteAllText"); requireRunContains(errors, wslRun, "bash -l $wslTmp"); @@ -123,11 +143,28 @@ export function validateE2eScenariosWorkflowBoundary( if (uploadWith.name !== "e2e-scenario-${{ inputs.scenarios || github.event.inputs.scenarios }}") { errors.push("artifact upload name must include the scenarios input"); } - if (uploadWith["include-hidden-files"] !== true) { - errors.push("artifact upload must include hidden .e2e files"); + // Framework-owned secret hygiene: include-hidden-files MUST be false.
+ // Hidden dotfiles under the workspace can carry raw secrets (notably + // .e2e/context.env, written by e2e_context_set without redaction). + // The redacted surfaces are explicit subpaths under .e2e/ that the + // framework writes via orchestrators/redaction.ts::pipeRedacted. + if (uploadWith["include-hidden-files"] !== false) { + errors.push("artifact upload must set include-hidden-files: false (raw context.env must not leak)"); + } + const uploadPath = stringValue(uploadWith.path); + if (!uploadPath.includes(".e2e/actions/")) { + errors.push("artifact upload path must include .e2e/actions/ (redacted action evidence)"); + } + if (!uploadPath.includes(".e2e/logs/")) { + errors.push("artifact upload path must include .e2e/logs/ (redacted shell-step evidence)"); } - if (!stringValue(uploadWith.path).includes(".e2e/")) { - errors.push("artifact upload path must include .e2e/"); + // Bare blanket '.e2e/' (without a trailing subdir) would re-include + // the raw context.env file. Reject it so the explicit-subpath + // contract stays honest. Subpaths like '.e2e/actions/' are fine. + for (const line of uploadPath.split("\n")) { + if (line.trim() === ".e2e/") { + errors.push("artifact upload path must not list bare .e2e/ (use explicit subpaths to avoid context.env leakage)"); + } } return errors;