Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 111 additions & 9 deletions .github/workflows/runtime-live-e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@
#
# The offline job carries NO environment and NO secret: it builds + runs the
# default Go suite (the live-tagged test compiles out) as the secret-free gate,
# and runs unconditionally on every PR. Only the live job declares an
# `environment:` (a required-reviewer approval gate, reviewer = clkao) and reads
# only the host-specific secret it needs: ANTHROPIC_API_KEY for Claude,
# OPENAI_API_KEY for Codex. The Claude live job is a matrix over two variants:
# sonnet (the Python-land floor) on CI-E2E and claude-opus-4-8 on CI-E2E-OPUS — each its own
# separately-approved deployment. So a live PR run needs same-repo-or-no-secrets
# + the per-variant environment approval; each variant pauses in `waiting` until
# a maintainer approves its environment, so the API-spending dispatch cannot
# start unapproved.
# and runs unconditionally on every PR. Only live jobs declare an `environment:`
# (a required-reviewer approval gate, reviewer = clkao) and read only the
# host-specific secret they need: ANTHROPIC_API_KEY for Claude, OPENAI_API_KEY
# for Codex and Pi. The Claude live job is a matrix over two variants: sonnet
# (the Python-land floor) on CI-E2E and claude-opus-4-8 on CI-E2E-OPUS — each its
# own separately-approved deployment. So a live PR run needs
# same-repo-or-no-secrets + the per-variant environment approval; each variant
# pauses in `waiting` until a maintainer approves its environment, so the
# API-spending dispatch cannot start unapproved.

name: Runtime Live E2E

Expand Down Expand Up @@ -314,3 +314,105 @@ jobs:
live-artifacts/codex/**
live-artifacts/journey-metrics/**
if-no-files-found: warn

pi-live:
needs: offline
runs-on: ubuntu-latest
environment:
name: CI-E2E-PI
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SPACEDOCK_PI_LIVE_REQUIRED: "1"
SPACEDOCK_LIVE_ARTIFACT_DIR: ${{ github.workspace }}/live-artifacts/pi
SPACEDOCK_JOURNEY_METRICS_DIR: ${{ github.workspace }}/live-artifacts/journey-metrics/pi
PI_OFFLINE: "1"
steps:
- name: Check required secret
run: |
if [ -z "${OPENAI_API_KEY}" ]; then
echo "OPENAI_API_KEY is required for pi-live after CI-E2E-PI approval." >&2
exit 1
fi

- uses: actions/checkout@v4

- uses: actions/setup-go@v5
with:
go-version: "1.22"

- uses: actions/setup-node@v4
with:
node-version: "20"

- name: Install Pi CLI and substrates
run: |
npm --version
NPM_BEFORE="$(node -e 'console.log(new Date(Date.now() - 24*60*60*1000).toISOString())')"
echo "Using npm --before age gate for pi-live installs: $NPM_BEFORE"

npm install -g @earendil-works/pi-coding-agent --before="$NPM_BEFORE" --ignore-scripts --no-audit --no-fund --omit=dev
command -v pi
test -x "$(command -v pi)"
pi --version
global_npm_root="$(npm root -g)"
node -e "const p=require('$global_npm_root/@earendil-works/pi-coding-agent/package.json'); if (p.name !== '@earendil-works/pi-coding-agent') throw new Error('unexpected Pi package name '+p.name); if (!p.bin || p.bin.pi !== 'dist/cli.js') throw new Error('unexpected Pi bin '+JSON.stringify(p.bin)); console.log('verified '+p.name+'@'+p.version+' bin pi='+p.bin.pi)"

pi_npm_root="$HOME/.pi/agent/npm"
mkdir -p "$pi_npm_root"
npm install --prefix "$pi_npm_root" \
pi-subagents \
pi-intercom \
--before="$NPM_BEFORE" --ignore-scripts --no-audit --no-fund --omit=dev
node -e "const p=require('$pi_npm_root/node_modules/pi-subagents/package.json'); if (p.name !== 'pi-subagents') throw new Error('unexpected pi-subagents package name '+p.name); console.log('verified '+p.name+'@'+p.version)"
node -e "const p=require('$pi_npm_root/node_modules/pi-intercom/package.json'); if (p.name !== 'pi-intercom') throw new Error('unexpected pi-intercom package name '+p.name); console.log('verified '+p.name+'@'+p.version)"
test -f "$pi_npm_root/node_modules/pi-subagents/src/extension/index.ts"
test -f "$pi_npm_root/node_modules/pi-subagents/skills/pi-subagents/SKILL.md"
echo "PI_SUBAGENTS_PACKAGE_ROOT=$pi_npm_root/node_modules/pi-subagents" >> "$GITHUB_ENV"

- name: Build spacedock binary
run: |
go build -o ./spacedock ./cmd/spacedock
echo "SPACEDOCK_BIN=$(pwd)/spacedock" >> "$GITHUB_ENV"
echo "SPACEDOCK_REPO_ROOT=$GITHUB_WORKSPACE" >> "$GITHUB_ENV"
echo "$(pwd)" >> "$GITHUB_PATH"

- name: Configure git identity
run: |
git config --global user.name "github-actions[bot]"
git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com"
git config --global init.defaultBranch main

- name: Verify Pi current-checkout setup
run: |
mkdir -p "$SPACEDOCK_LIVE_ARTIFACT_DIR"
spacedock doctor --host pi --plugin-dir "$GITHUB_WORKSPACE" | tee "$SPACEDOCK_LIVE_ARTIFACT_DIR/pi-doctor.txt"
test -f "$GITHUB_WORKSPACE/skills/first-officer/references/pi-first-officer-runtime.md"
test -f "$GITHUB_WORKSPACE/skills/ensign/references/pi-ensign-runtime.md"
test -f "$PI_SUBAGENTS_PACKAGE_ROOT/src/extension/index.ts"
test -f "$PI_SUBAGENTS_PACKAGE_ROOT/skills/pi-subagents/SKILL.md"

- name: Show tool versions
  env:
    # Hand the workflow input to the script through the environment. Expanding
    # `${{ inputs.effort }}` inside `run:` pastes its raw text into the shell
    # script, where quotes, backticks or `$(...)` in the value would be executed.
    EFFORT: ${{ inputs.effort }}
  run: |
    pi --version
    go version
    echo "### Pi live tool versions" >> "$GITHUB_STEP_SUMMARY"
    echo "- \`pi --version\`: \`$(pi --version)\`" >> "$GITHUB_STEP_SUMMARY"
    echo "- \`go version\`: \`$(go version)\`" >> "$GITHUB_STEP_SUMMARY"
    echo "- Effort: \`${EFFORT}\`" >> "$GITHUB_STEP_SUMMARY"

- name: Run Pi shared scenario coverage guard
run: go test -tags live -count=1 -run 'TestSharedScenarioRunnerCoverage|TestPiSharedScenarioCoverage' ./internal/ensigncycle -v

- name: Run live Pi front-door smoke
run: go test -tags live -count=1 -run TestLivePiFrontDoorSmoke ./internal/ensigncycle -v

- name: Upload live artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: runtime-live-e2e-pi-live
path: |
./spacedock
live-artifacts/pi/**
live-artifacts/journey-metrics/**
if-no-files-found: warn
19 changes: 13 additions & 6 deletions docs/dev/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ spacedock status --workflow-dir docs/dev --next

The live lanes prove runtime behavior, not text shape. Static grep checks over workflow YAML or skill prose are not a substitute for launching the real host front door, observing its output, and checking the resulting workflow state.

A runtime regression should be caught once per user journey and then exercised by EACH supported host. The shared runtime scenarios make that real: one host-neutral scenario table, two per-host runner adapters (Claude and Codex) implementing the same scenario IDs, and a parity guard that fails if a scenario exists for one host only.
A runtime regression should be caught once per user journey and then exercised by EACH supported host. The shared runtime scenarios make that real: one host-neutral scenario table, per-host runner adapters (Claude and Codex today, with Pi tracked through an explicit live/codified/gap coverage map until its shared runners are live-safe) implementing or accounting for the same scenario IDs, and a parity guard that fails if a scenario exists for one host only.

### Shared runtime scenarios

Expand All @@ -159,7 +159,7 @@ The scenario surface lives in `internal/ensigncycle` and splits into four host-n
| Scenario table | `shared_scenarios_test.go` (`sharedRuntimeScenarios()`) | Yes |
| Fixtures + prompts | `shared_fixtures_test.go` | Yes |
| Assertions | `gate_assert_impl_test.go`, `shared_assertions_impl_test.go` | Yes |
| Runner adapter | `codex_live_runner_test.go`, `claude_live_runner_test.go` | No — one per host |
| Runner adapter | `codex_live_runner_test.go`, `claude_live_runner_test.go`, `pi_shared_coverage_test.go` | No — one per host; Pi currently records explicit live/codified/gap status for each shared scenario |

The shared table (`sharedRuntimeScenario`) carries ONLY runtime-neutral facts: scenario `name` (ID), `oldPythonTest` provenance, behavior `intent`, and a live `timeout`. It encodes NO launch, auth, plugin, artifact, or transcript field — `TestSharedRuntimeScenarioDefinitions` reflects over the type and fails if any field names a single host.

Expand Down Expand Up @@ -188,7 +188,7 @@ Assertions prefer durable workflow state over transcript phrasing: entity frontm
3. Add a host-neutral assertion over `(before, after, observed)` strings (or reuse an existing one) and at least one offline negative case in `shared_scenarios_negative_test.go` that builds the broken end-state and proves the assertion goes red.
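4. Add a runner entry for the new `name` to BOTH `codexScenarioRunners()` and `claudeScenarioRunners()`, and a Pi live/codified/gap coverage entry to `piSharedScenarioCoverageMap()`. `TestSharedScenarioRunnerCoverage` fails until both runners and the Pi coverage entry exist.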

The shared coverage meta-test enforces parity in both directions: every shared scenario must have a runner for each host, and every runner must map to a defined scenario.
The shared coverage meta-test enforces parity in both directions: every shared scenario must have a Claude and Codex runner plus a Pi live/codified/gap coverage entry, and every runner or Pi coverage entry must map to a defined scenario.

### Local live execution

Expand All @@ -212,22 +212,29 @@ Run the Codex shared suite locally (`npm install -g @openai/codex` then `codex l
go test -tags live -count=1 -run TestLiveCodexSharedScenarios ./internal/ensigncycle -v
```

Run the Pi front-door smoke locally (`npm install -g @earendil-works/pi-coding-agent`, `pi install npm:pi-subagents`, and either `pi login` or `OPENAI_API_KEY`). The smoke loads the current checkout's Spacedock first-officer and ensign skills plus the local pi-subagents extension/skill explicitly; it verifies durable state in the split-root state checkout rather than transcript wording alone.

```bash
go test -tags live -count=1 -run TestLivePiFrontDoorSmoke ./internal/ensigncycle -v
```

The parity and definition guards run with no model spend — useful before paying for a live run:

```bash
go test -tags live -run 'TestSharedScenarioRunnerCoverage|TestSharedRuntimeScenarioDefinitions' ./internal/ensigncycle -v
go test -tags live -run 'TestSharedScenarioRunnerCoverage|TestSharedRuntimeScenarioDefinitions|TestPiSharedScenarioCoverage' ./internal/ensigncycle -v
```

Without auth, the respective live suite skips locally (Claude/Codex), except in CI where the lane requires it.
Without auth, the respective live suite skips locally (Claude/Codex/Pi), except in CI where the lane requires it.

### GitHub setup

Workflow: `.github/workflows/runtime-live-e2e.yml`. The offline gate job (`go test ./...`, no secrets) must pass before any live lane burns its environment approval.

- `claude-live` (matrix: `sonnet` on `CI-E2E`, `claude-opus-4-8` on `CI-E2E-OPUS`): secret `ANTHROPIC_API_KEY`. Runs `TestLiveEnsignCycle` (the full-cycle smoke) AND `TestLiveClaudeSharedScenarios` (the shared suite). Artifacts under `live-artifacts/claude/<model>/` plus the session jsonl under `$CLAUDE_CONFIG_DIR`.
- `codex-live` (environment `CI-E2E-CODEX`): secret `OPENAI_API_KEY`, `SPACEDOCK_CODEX_LIVE_REQUIRED=1` so a missing key fails clearly after approval. Runs `TestLiveCodexSharedScenarios`. Artifacts under `live-artifacts/codex/`.
- `pi-live` (environment `CI-E2E-PI`): secret `OPENAI_API_KEY`, `SPACEDOCK_PI_LIVE_REQUIRED=1` so missing Pi/OpenAI prerequisites fail clearly after approval. Installs `@earendil-works/pi-coding-agent`, `pi-subagents`, and `pi-intercom`, runs the Pi shared coverage guard plus `TestLivePiFrontDoorSmoke`, and uploads artifacts under `live-artifacts/pi/`.

Both live lanes must test the current checkout, not a remote `--ref next` install. The Codex lane generates a local marketplace under `$RUNNER_TEMP`:
All live lanes must test the current checkout, not a remote `--ref next` install. The Codex lane generates a local marketplace under `$RUNNER_TEMP`:

```text
.agents/plugins/marketplace.json
Expand Down
4 changes: 3 additions & 1 deletion internal/cli/pi.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ type piRuntimeConfig struct {
firstOfficer string
ensign string
authPath string
openAIAPIKey string
pluginDirSource string
}

Expand Down Expand Up @@ -232,14 +233,15 @@ func piRuntimeConfigFromEnv(env []string, dir, pluginDir string) piRuntimeConfig
firstOfficer: filepath.Join(repo, "skills", "first-officer", "SKILL.md"),
ensign: filepath.Join(repo, "skills", "ensign", "SKILL.md"),
authPath: authPath,
openAIAPIKey: envMap["OPENAI_API_KEY"],
pluginDirSource: pluginDirSource,
}
}

func checkPiRuntime(ops piRuntimeOps, cfg piRuntimeConfig) piCheckResult {
bin, err := ops.LookPath("pi")
res := piCheckResult{piBinOK: err == nil, piBin: bin, packageRoot: cfg.packageRoot, repoRoot: cfg.repoRoot, authPath: cfg.authPath}
res.authOK = ops.Stat(cfg.authPath) == nil
res.authOK = ops.Stat(cfg.authPath) == nil || strings.TrimSpace(cfg.openAIAPIKey) != ""
res.extensionOK = ops.Stat(cfg.extensionPath) == nil
res.subagentsSkillOK = ops.Stat(filepath.Join(cfg.subagentsSkill, "SKILL.md")) == nil
res.firstOfficerOK = ops.Stat(cfg.firstOfficer) == nil
Expand Down
14 changes: 14 additions & 0 deletions internal/cli/pi_frontdoor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,20 @@ func TestPiDoctorReportsMissingAndHealthyRuntime(t *testing.T) {
}
})

t.Run("openai-api-key-auth", func(t *testing.T) {
var stdout, stderr bytes.Buffer
code := runDoctorWithPi(context.Background(), []string{"--host", "pi", "--plugin-dir", repo}, &fakeHost{}, &fakePiRuntimeOps{
lookPath: map[string]string{"pi": "/bin/pi"},
statOK: statOKForPiResources(repo, pkg),
}, append(piTestEnv(pkg, home), "OPENAI_API_KEY=test-key"), &stdout, &stderr)
if code != 0 {
t.Fatalf("exit=%d stderr=%q stdout=%q", code, stderr.String(), stdout.String())
}
if !strings.Contains(stdout.String(), "OK Pi auth") {
t.Fatalf("OpenAI-key doctor output should accept env auth:\n%s", stdout.String())
}
})

t.Run("healthy", func(t *testing.T) {
var stdout, stderr bytes.Buffer
statOK := statOKForPiResources(repo, pkg)
Expand Down
39 changes: 24 additions & 15 deletions internal/ensigncycle/pi_live_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ func newPiLiveSmokeFixture(t *testing.T, name, repo, piSubagentsRoot, binary str
piHome := t.TempDir()
sessionDir := t.TempDir()
cleanHome := t.TempDir()
seedPiLocalAuth(t, piHome, os.Getenv("HOME"))
seedPiLiveAuth(t, piHome, os.Getenv("HOME"), os.Getenv("OPENAI_API_KEY"), os.Getenv("SPACEDOCK_PI_LIVE_REQUIRED"))
workflowRoot, stateRoot, entityPath = writePiSplitRootSmokeWorkflow(t)
artifactDir = filepath.Join(piLiveArtifactDir(t, name), "run")
if err := os.MkdirAll(filepath.Join(artifactDir, "sessions"), 0o755); err != nil {
Expand Down Expand Up @@ -214,23 +214,32 @@ func piLiveSmokeEntity() string {

func seedPiLocalAuth(t *testing.T, piHome, realHome string) {
t.Helper()
if realHome == "" {
t.Skip("no HOME set; cannot locate ~/.pi/agent/auth.json for Pi live smoke")
}
authPath := filepath.Join(realHome, ".pi", "agent", "auth.json")
b, err := os.ReadFile(authPath)
if err != nil {
t.Skipf("no live Pi auth available: expected %s; run pi login or provide the auth file", authPath)
}
if strings.TrimSpace(string(b)) == "" {
t.Skipf("live Pi auth file is empty: %s", authPath)
seedPiLiveAuth(t, piHome, realHome, os.Getenv("OPENAI_API_KEY"), os.Getenv("SPACEDOCK_PI_LIVE_REQUIRED"))
}

func seedPiLiveAuth(t *testing.T, piHome, realHome, openAIAPIKey, required string) {
t.Helper()
if realHome != "" {
authPath := filepath.Join(realHome, ".pi", "agent", "auth.json")
b, err := os.ReadFile(authPath)
if err == nil && strings.TrimSpace(string(b)) != "" {
if err := os.MkdirAll(piHome, 0o700); err != nil {
t.Fatal(err)
}
if err := os.WriteFile(filepath.Join(piHome, "auth.json"), b, 0o600); err != nil {
t.Fatal(err)
}
return
}
}
if err := os.MkdirAll(piHome, 0o700); err != nil {
t.Fatal(err)
if strings.TrimSpace(openAIAPIKey) != "" {
return
}
if err := os.WriteFile(filepath.Join(piHome, "auth.json"), b, 0o600); err != nil {
t.Fatal(err)
message := "no live Pi auth available: expected ~/.pi/agent/auth.json or OPENAI_API_KEY"
if required != "" {
t.Fatal(message + " for the approval-gated pi-live lane")
}
t.Skip(message + "; run pi login or set OPENAI_API_KEY to run the live Pi suite")
}

func piLiveEnv(piHome, sessionDir, cleanHome, binaryDir, piSubagentsRoot string) []string {
Expand Down
53 changes: 53 additions & 0 deletions internal/ensigncycle/pi_shared_coverage_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//go:build live

package ensigncycle

import (
	"strings"
	"testing"
)

// piSharedScenarioCoverage records how the Pi host accounts for one shared
// runtime scenario.
type piSharedScenarioCoverage struct {
	// mode is the coverage status. TestPiSharedScenarioCoverage accepts only
	// "live", "codified", or "gap".
	mode string
	// reason explains why the scenario has that mode. TestPiSharedScenarioCoverage
	// reports an error when it is empty.
	reason string
}

// piSharedScenarioCoverageMap returns Pi's coverage entry for each shared
// runtime scenario, keyed by scenario name. Every entry listed here is a "gap"
// whose reason states which shared first-officer runner Pi does not have yet.
func piSharedScenarioCoverageMap() map[string]piSharedScenarioCoverage {
	entries := make(map[string]piSharedScenarioCoverage, 3)

	entries["gate-guardrail"] = piSharedScenarioCoverage{
		mode:   "gap",
		reason: "Pi currently has durable live coverage for subagent dispatch/front-door setup, but not a live-safe shared first-officer gate runner.",
	}
	entries["rejection-flow"] = piSharedScenarioCoverage{
		mode:   "gap",
		reason: "Pi currently has durable live coverage for subagent dispatch/front-door setup, but not a live-safe shared first-officer rejection-flow runner.",
	}
	entries["merge-hook-guardrail"] = piSharedScenarioCoverage{
		mode:   "gap",
		reason: "Pi currently has durable live coverage for subagent dispatch/front-door setup, but not a live-safe shared first-officer merge-hook runner.",
	}

	return entries
}

// TestPiSharedScenarioCoverage checks the Pi coverage map against the shared
// scenario table in both directions:
//
//   - every shared scenario has a Pi entry whose mode is "live", "codified", or
//     "gap" and whose reason is not blank;
//   - every Pi entry names a scenario that the shared table defines.
//
// It only inspects in-memory tables; it does not launch a host.
func TestPiSharedScenarioCoverage(t *testing.T) {
	coverage := piSharedScenarioCoverageMap()
	defined := map[string]bool{}
	for _, scenario := range sharedRuntimeScenarios() {
		defined[scenario.name] = true
		entry, ok := coverage[scenario.name]
		if !ok {
			t.Errorf("shared scenario %q has no Pi coverage entry", scenario.name)
			continue
		}
		switch entry.mode {
		case "live", "codified", "gap":
		default:
			t.Errorf("shared scenario %q has invalid Pi coverage mode %q", scenario.name, entry.mode)
		}
		// A whitespace-only reason explains nothing, so treat it the same as a
		// missing one rather than letting it slip past an exact "" comparison.
		if strings.TrimSpace(entry.reason) == "" {
			t.Errorf("shared scenario %q Pi coverage entry needs an honest reason", scenario.name)
		}
	}
	// Reverse direction: a coverage entry with no matching scenario is stale.
	for name := range coverage {
		if !defined[name] {
			t.Errorf("Pi coverage entry %q has no shared scenario definition", name)
		}
	}
}
9 changes: 9 additions & 0 deletions internal/ensigncycle/shared_coverage_meta_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import "testing"
func TestSharedScenarioRunnerCoverage(t *testing.T) {
codexRunners := codexScenarioRunners()
claudeRunners := claudeScenarioRunners()
piCoverage := piSharedScenarioCoverageMap()

scenarios := sharedRuntimeScenarios()
if len(scenarios) == 0 {
Expand All @@ -27,6 +28,9 @@ func TestSharedScenarioRunnerCoverage(t *testing.T) {
if claudeRunners[scenario.name] == nil {
t.Errorf("shared scenario %q has no Claude runner", scenario.name)
}
if _, ok := piCoverage[scenario.name]; !ok {
t.Errorf("shared scenario %q has no Pi live/codified/gap coverage entry", scenario.name)
}
}

// A runner with no matching shared scenario is also drift: a host scenario the
Expand All @@ -46,4 +50,9 @@ func TestSharedScenarioRunnerCoverage(t *testing.T) {
t.Errorf("Claude runner %q has no shared scenario definition", name)
}
}
for name := range piCoverage {
if !defined[name] {
t.Errorf("Pi coverage entry %q has no shared scenario definition", name)
}
}
}
Loading
Loading