diff --git a/services/agent/README.md b/services/agent/README.md
new file mode 100644
index 0000000000..82b5272e17
--- /dev/null
+++ b/services/agent/README.md
@@ -0,0 +1,121 @@
+# Agent runner (TypeScript)
+
+The Node side of the agent workflow service. It runs the actual agent loop and serves one
+contract: a JSON request in, a structured result out. The Python service
+(`services/oss/src/agent/`) decides *what* to run (config, tools, secrets, trace) and calls
+in here; this package *runs* it. It lives in Node because the harnesses (Pi, Claude Code,
+rivet's `sandbox-agent`) are Node libraries with no Python SDK.
+
+## How it is invoked
+
+Two entrypoints, same `/run` contract (see `src/protocol.ts`):
+
+- **`src/cli.ts`** — one JSON request on stdin, one result on stdout. The Python
+  SDK adapters use this subprocess transport when `AGENTA_AGENT_PI_URL` is unset. stdout is
+  the result channel only; logs go to stderr.
+- **`src/server.ts`** — the same thing as a long-lived HTTP server on `:8765`
+  (`GET /health`, `POST /run`). This is the dockerized agent runner sidecar the Python SDK
+  adapters call over HTTP when `AGENTA_AGENT_PI_URL` points at it. The dev image
+  (`docker/Dockerfile.dev`) runs `tsx watch src/server.ts`.
+
+Both route to an engine by the request's `backend` field.
+
+## Layout (`src/`)
+
+```
+src/
+  cli.ts              entrypoint: stdin/stdout (subprocess transport)
+  server.ts           entrypoint: HTTP sidecar on :8765
+  protocol.ts         the /run wire contract (request, result, events, capabilities)
+  engines/
+    pi.ts             engine: drive the Pi SDK in-process
+    rivet.ts          engine: drive a harness over ACP via a rivet sandbox-agent daemon
+  tracing/
+    otel.ts           turn a run into OpenTelemetry spans nested under /invoke
+  tools/
+    callback.ts       the one /tools/call HTTP client
+    code.ts           execute resolved code tools in a scoped subprocess
+    dispatch.ts       dispatch resolved tools by executor kind
+    mcp-bridge.ts     build the MCP server config that exposes tools to a harness
+    mcp-server.ts     the stdio MCP server itself (launched per session by the daemon)
+  extensions/
+    agenta.ts         the Pi extension (tracing + tools), bundled into dist/ for Pi to load
+```
+
+## Engines
+
+- **`pi`** (`engines/pi.ts`) — drives the Pi SDK directly in-process.
+- **`rivet`** (`engines/rivet.ts`) — drives any harness (`pi`, `claude`) over the Agent
+  Client Protocol through a rivet `sandbox-agent` daemon, either local or in a Daytona
+  sandbox. This is the default on the platform.
+
+The engine is a deployment choice (`backend` on the wire / `AGENT_BACKEND` env), not a
+harness. Harness choice (`pi`, `claude`, or experimental `agenta`) and sandbox (`local` or
+`daytona`, where supported) are per-run config the Python service sends.
+
+## Result
+
+```json
+{
+  "ok": true,
+  "output": "Rome",
+  "messages": [{ "role": "assistant", "content": "Rome" }],
+  "events": [{ "type": "message", "text": "Rome" }, { "type": "done" }],
+  "usage": { "input": 1297, "output": 5, "total": 1302, "cost": 0.0066 },
+  "stopReason": "end_turn",
+  "capabilities": { "mcpTools": false, "images": true, "...": "..." },
+  "sessionId": "...",
+  "model": "openai-codex/gpt-5.5",
+  "traceId": "..."
+}
+```
+
+`runRivet` probes the harness's capabilities and branches on them (for example, tools go
+over MCP only when the harness advertises `mcpTools`); usage and the structured event log
+come back on every run.
+
+## Tracing
+
+When the request carries a `trace` block, the run is exported to Agenta as OpenTelemetry
+spans nested under the caller's `/invoke` span. The Pi path self-instruments via the
+bundled extension (`extensions/agenta.ts`); other harnesses are traced from the rivet ACP
+event stream (`tracing/otel.ts`). The Python `tracing` module fills `trace` in from the
+live workflow span.
+
+## Tools
+
+Tools are resolved in the Python backend and arrive on the request as `customTools` plus a
+`toolCallback`. Delivery is capability-routed: the Pi extension registers them natively;
+other harnesses get them over MCP through `tools/mcp-bridge.ts` + `tools/mcp-server.ts`.
+Either way each call POSTs back to Agenta's `/tools/call` (`tools/callback.ts`), so the
+provider key and connection auth stay server-side.
+
+## The extension bundle
+
+`scripts/build-extension.mjs` esbuild-bundles `src/extensions/agenta.ts` into one
+self-contained `dist/extensions/agenta.js` that Pi can load anywhere (host, the sidecar, a
+Daytona snapshot). The dev image bakes it; rebuild after editing the extension or the
+tracer:
+
+```bash
+pnpm run build:extension
+```
+
+## Auth
+
+Provider keys arrive as `request.secrets` (resolved from the project vault) or fall back to
+the harness's own login: Pi reads `~/.pi/agent/auth.json` (`pnpm exec pi` then `/login`),
+Claude Code reads `~/.claude`. Set `OPENAI_API_KEY` / `ANTHROPIC_API_KEY` to override.
+
+## config/
+
+`config/AGENTS.md` and `config/agent.json` are a fallback "hello-world" agent, used only
+when a request arrives with no config. In practice the playground always sends the agent
+revision's config, so these are rarely hit.
+
+## Local use
+
+```bash
+pnpm install
+echo '{"backend":"pi","messages":[{"role":"user","content":"Hi"}]}' | pnpm run run:cli
+```
diff --git a/services/agent/config/AGENTS.md b/services/agent/config/AGENTS.md
new file mode 100644
index 0000000000..767a2cdd49
--- /dev/null
+++ b/services/agent/config/AGENTS.md
@@ -0,0 +1,7 @@
+# Hello-world agent
+
+You are a friendly hello-world agent running on the Agenta agent service.
+
+- Greet the user warmly.
+- Answer the user's message in one or two short sentences.
+- Do not use tools. Keep replies plain text.
diff --git a/services/agent/config/agent.json b/services/agent/config/agent.json
new file mode 100644
index 0000000000..adc26f793c
--- /dev/null
+++ b/services/agent/config/agent.json
@@ -0,0 +1,4 @@
+{
+  "model": "gpt-5.5",
+  "tools": []
+}
diff --git a/services/agent/docker/Dockerfile b/services/agent/docker/Dockerfile
new file mode 100644
index 0000000000..687fea4347
--- /dev/null
+++ b/services/agent/docker/Dockerfile
@@ -0,0 +1,55 @@
+# Agent runner sidecar (sandbox-agent server), production image.
+#
+# Runs the TypeScript runner (src/server.ts) as a long-lived HTTP server on :8765.
+# The Python agent service calls it in-network. Unlike Dockerfile.dev there is no
+# `tsx watch` and no bind mount: the source is baked in.
+#
+# Licensing posture (see docker/README.md):
+#   - Pi (@earendil-works/pi-coding-agent, MIT) is baked via the npm dependencies.
+#   - Claude Code is proprietary (Anthropic Commercial Terms). It is NEVER baked into
+#     this image. The sandbox-agent daemon installs it at runtime from Anthropic over
+#     HTTPS (the reason ca-certificates is installed). That keeps Anthropic as the
+#     distributor, the only compliant path for an image we build and ship.
+#   - No credential is baked: no API key, no OAuth login. Auth is injected at runtime
+#     (ANTHROPIC_API_KEY / request secrets; OAuth self-host is a mounted opt-in only).
+
+FROM node:24-slim
+
+WORKDIR /app
+
+# CA certificates: the sandbox-agent daemon (Rust) downloads harness CLIs (e.g. Claude
+# Code) over HTTPS using the system trust store, which node:*-slim omits — without this
+# the daemon's `install-agent claude` fails TLS verification. git lets npm/installers
+# fetch git deps.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends ca-certificates git \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN corepack enable
+
+# Install deps as a cached layer (manifest + lockfile only). The full dependency set is
+# installed (not --prod): the runtime uses `tsx` and the extension build uses `esbuild`,
+# both devDependencies.
+COPY package.json pnpm-lock.yaml ./
+RUN pnpm install --frozen-lockfile
+
+# Bake the source (no bind mount in production).
+COPY tsconfig.json ./
+COPY scripts ./scripts
+COPY src ./src
+COPY config ./config
+COPY skills ./skills
+
+# Bundle the Agenta Pi extension (tracing + tools) into dist/. runRivet installs
+# this baked copy into Pi's agent dir on every run. Rebuild the image after editing
+# src/extensions/agenta.ts or the tracer (src/tracing/otel.ts).
+RUN pnpm run build:extension
+
+ENV NODE_ENV=production \
+    PORT=8765
+
+EXPOSE 8765
+
+# Call the local tsx binary directly to avoid pnpm/corepack HOME writes when the
+# container runs as a non-root host uid.
+CMD ["node_modules/.bin/tsx", "src/server.ts"]
diff --git a/services/agent/docker/Dockerfile.dev b/services/agent/docker/Dockerfile.dev
new file mode 100644
index 0000000000..4f2f64f126
--- /dev/null
+++ b/services/agent/docker/Dockerfile.dev
@@ -0,0 +1,41 @@
+# Agent runner sidecar (WP-2), dev image.
+#
+# Runs the TypeScript runner (src/server.ts) as an HTTP server on :8765. The Python agent
+# service calls it in-network. Source is bind-mounted in dev so `tsx watch` hot-reloads;
+# node_modules stays baked into the image. Build context is services/agent.
+
+FROM node:24-slim
+
+WORKDIR /app
+
+# CA certificates: the rivet daemon (Rust) downloads harness CLIs (e.g. Claude Code) over
+# HTTPS using the system trust store, which node:*-slim omits — without this the daemon's
+# `install-agent claude` fails TLS verification. git lets npm/installers fetch git deps.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends ca-certificates git \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN corepack enable
+
+# Install deps as a cached layer (manifest + lockfile only).
+COPY package.json pnpm-lock.yaml ./
+RUN pnpm install --frozen-lockfile
+
+# Fallback copy for non-mounted runs; in dev these are bind-mounted over.
+COPY tsconfig.json ./
+COPY scripts ./scripts
+COPY src ./src
+
+# Bundle the Agenta Pi extension (tracing + tools) into dist/. dist/ is NOT bind-mounted
+# in dev, so this baked copy is what runRivet installs into Pi's agent dir. Rebuild the
+# image after editing src/extensions/agenta.ts or src/tracing/otel.ts.
+RUN pnpm run build:extension
+
+ENV NODE_ENV=development \
+    PORT=8765
+
+EXPOSE 8765
+
+# Call the local tsx binary directly to avoid pnpm/corepack HOME writes when the
+# container runs as a non-root host uid.
+CMD ["node_modules/.bin/tsx", "watch", "src/server.ts"]
diff --git a/services/agent/docker/README.md b/services/agent/docker/README.md
new file mode 100644
index 0000000000..63895b109a
--- /dev/null
+++ b/services/agent/docker/README.md
@@ -0,0 +1,66 @@
+# Agent sidecar images
+
+Images for the agent runner sidecar (the `sandbox-agent server` runtime in
+`services/agent/src/server.ts`). The Python service calls it in-network at
+`:8765`.
+
+- `Dockerfile.dev` — dev image. `tsx watch`, source bind-mounted, hot reload.
+- `Dockerfile` — production image. Source baked in, no watcher.
+
+## Licensing posture (read before changing any image or build recipe)
+
+The rule that shapes every image here:
+
+> **We ship build recipes, not Claude-containing images, and we never bake a
+> credential into any image.**
+
+Why:
+
+- **Pi** (`@earendil-works/pi-coding-agent`) is MIT. We bake it freely via the npm
+  dependencies, in every image and snapshot.
+- **Claude Code** is proprietary (© Anthropic PBC, governed by Anthropic's
+  [Commercial Terms](https://www.anthropic.com/legal/commercial-terms);
+  [legal & compliance](https://code.claude.com/docs/en/legal-and-compliance)). The
+  Commercial Terms grant a usage license only. They do not grant any right to
+  redistribute, resell, sublicense, or repackage the Services. So an image **we
+  build and distribute must not contain Claude Code.**
+- Claude Code is installed **from Anthropic** (`npm install -g
+  @anthropic-ai/claude-code`, `https://claude.ai/install.sh`, or the daemon's
+  `install-agent claude`). That keeps Anthropic as the distributor, which is the
+  permitted path. The production sidecar does this at runtime; a snapshot we build
+  for our own use does it at build time.
+
+## Authentication
+
+Auth is injected at runtime, never baked into a layer.
+
+- **API key (default, and the only option for cloud / multi-tenant).** Set
+  `ANTHROPIC_API_KEY` (or pass provider keys as request secrets from the vault).
+  Anthropic directs products and services that interact with Claude to use API key
+  auth, so this is the path for any Agenta-orchestrated run that serves users.
+- **OAuth subscription (self-host opt-in only).** An individual operator may mount
+  their own Claude login (e.g. `~/.claude`) into the container and run with their
+  own subscription. This is for personal, individual use of Claude Code, never for
+  serving other users, and it is the operator's responsibility. Anthropic restricts
+  Free/Pro/Max OAuth to first-party use and forbids third parties routing requests
+  through it (enforced since 2026-03). Cloud and multi-tenant deployments must stay
+  API-key only.
+
+We never bake an OAuth login or an API key into an image.
+
+## Build recipes (two paths)
+
+- **Cloud / Daytona (API key).** The Daytona snapshot recipe bakes Pi. Agenta Cloud
+  builds and uses its own snapshot internally; self-hosters run the same recipe
+  against their own Daytona account. We ship the build script (the recipe), not the
+  built snapshot, so we never distribute a Claude-containing artifact. Snapshot
+  builder: `docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/poc/build_rivet_snapshot.py`.
+  Today it bases on rivet's `-full` image, which already bundles Claude. That is
+  compliant under the recipe-not-image model. **Cleaner-provenance follow-up
+  (needs a live Daytona build to verify):** base on a daemon-only rivet image and
+  install Claude from Anthropic at build, so the snapshot's Claude comes straight
+  from Anthropic rather than from a third party's bundled image. Relocation of the
+  builder into this folder is a follow-up.
+- **Self-host (API key, OAuth optional).** Build the production `Dockerfile` (it
+  bakes neither Claude nor a credential), then supply auth at runtime: an
+  `ANTHROPIC_API_KEY` env var, or, for individual use, a mounted OAuth login dir.
diff --git a/services/agent/scripts/build-extension.mjs b/services/agent/scripts/build-extension.mjs
new file mode 100644
index 0000000000..debdae88d7
--- /dev/null
+++ b/services/agent/scripts/build-extension.mjs
@@ -0,0 +1,30 @@
+/**
+ * Bundle the Agenta Pi extension into one self-contained file so its OpenTelemetry deps
+ * resolve wherever Pi loads it (host, docker sidecar, Daytona snapshot). Pi only accepts
+ * `.ts`/`.js` extension files, so we emit `.js` (ESM) with a default export.
+ *
+ * Run: pnpm run build:extension  ->  dist/extensions/agenta.js
+ */
+import { build } from "esbuild";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
+
+const root = join(dirname(fileURLToPath(import.meta.url)), ".."); // package root: one level above this script's directory
+
+await build({
+  entryPoints: [join(root, "src/extensions/agenta.ts")],
+  outfile: join(root, "dist/extensions/agenta.js"),
+  bundle: true, // inline every import except the ones listed in `external`
+  platform: "node",
+  format: "esm",
+  target: "node20",
+  // Pi provides the ExtensionAPI at load time; never bundle the harness SDK.
+  external: ["@earendil-works/pi-coding-agent"],
+  banner: {
+    // protobufjs and some deps call CommonJS `require` under ESM; define it via createRequire.
+    js: "import{createRequire as __cr}from'node:module';const require=__cr(import.meta.url);",
+  },
+  logLevel: "info",
+});
+
+process.stderr.write("[build-extension] wrote dist/extensions/agenta.js\n"); // status line goes to stderr
diff --git a/services/agent/skills/agenta-getting-started/SKILL.md b/services/agent/skills/agenta-getting-started/SKILL.md
new file mode 100644
index 0000000000..44bc6a7a6b
--- /dev/null
+++ b/services/agent/skills/agenta-getting-started/SKILL.md
@@ -0,0 +1,21 @@
+---
+name: agenta-getting-started
+description: Baseline guidance for agents running on the Agenta platform. Use at the start of a task to recall how to work with the tools and skills Agenta provides and how to report results clearly.
+---
+
+# Agenta getting started
+
+This is a placeholder Agenta skill that ships with the `AgentaHarness`. It proves the
+forced-skill path end to end; replace its content with real Agenta guidance.
+
+## When to use
+
+Read this when you begin a task and want a reminder of the Agenta conventions below.
+
+## Conventions
+
+- Prefer the provided tools and skills over guessing; call a tool when one fits.
+- When another skill matches the task, read its `SKILL.md` fully before acting.
+- Keep answers grounded in what the tools and skills actually return. Do not fabricate
+  results or tool output.
+- Be concise. State what you did, what it returned, and what is left.
diff --git a/services/agent/src/cli.ts b/services/agent/src/cli.ts
new file mode 100644
index 0000000000..7f45ebb714
--- /dev/null
+++ b/services/agent/src/cli.ts
@@ -0,0 +1,88 @@
+/**
+ * WP-2 agent runner CLI: the JSON transport for the Harness port.
+ *
+ * Reads one JSON `AgentRunRequest` from stdin, runs the selected engine (Pi or rivet) once,
+ * and writes one JSON `AgentRunResult` to stdout (NDJSON records with `--stream`). stdout
+ * carries results only; logs go to stderr. This is the one-shot "json adapter" the design
+ * doc describes; a long-lived RPC adapter can replace it later behind the same Python port.
+ */
+import type {
+  AgentRunRequest,
+  AgentRunResult,
+  EmitEvent,
+  StreamRecord,
+} from "./protocol.ts";
+import { runPi } from "./engines/pi.ts";
+import { runRivet } from "./engines/rivet.ts";
+
+// Engine: `rivet` drives a harness over ACP via a rivet daemon; `pi` (default) is the
+// legacy in-process Pi path. The request's `backend` wins, then the AGENT_BACKEND env.
+function runAgent(
+  request: AgentRunRequest,
+  emit?: EmitEvent, // optional event sink, forwarded unchanged to the chosen engine
+): Promise<AgentRunResult> {
+  const backend = (request.backend ?? process.env.AGENT_BACKEND ?? "pi").toLowerCase(); // matched case-insensitively
+  return backend === "rivet" ? runRivet(request, emit) : runPi(request, emit); // any value other than "rivet" runs Pi
+}
+
+async function readStdin(): Promise<string> { // resolves with all of stdin once the stream ends
+  const chunks: Buffer[] = [];
+  for await (const chunk of process.stdin) {
+    chunks.push(chunk as Buffer); // assumes stdin yields Buffers (no encoding set) — TODO confirm
+  }
+  return Buffer.concat(chunks).toString("utf8"); // decode after concatenating, so chunk boundaries cannot split a character
+}
+
+// One-shot mode: the whole result as a single JSON document (the `/invoke` contract).
+function emitResult(result: AgentRunResult): void {
+  process.stdout.write(JSON.stringify(result)); // written without a trailing newline
+}
+
+// Streaming mode (`--stream`): one NDJSON record per line — an `{kind:"event"}` line the
+// moment each event is built, then exactly one terminal `{kind:"result"}` line.
+function writeRecord(record: StreamRecord): void {
+  process.stdout.write(JSON.stringify(record) + "\n"); // the "\n" terminates this NDJSON record
+}
+
+async function main(): Promise<void> { // CLI entry: read request, run engine, write result, exit with 0/1
+  const stream = process.argv.includes("--stream"); // streaming is opted into by a bare `--stream` argument
+  const raw = await readStdin();
+
+  let request: AgentRunRequest;
+  try {
+    request = raw.trim() ? (JSON.parse(raw) as AgentRunRequest) : {}; // empty or whitespace-only stdin means an empty request
+  } catch (err) {
+    const failure: AgentRunResult = { ok: false, error: `Invalid JSON on stdin: ${String(err)}` };
+    if (stream) writeRecord({ kind: "result", result: failure }); // report in whichever format the caller asked for
+    else emitResult(failure);
+    process.exit(1);
+  }
+
+  if (!stream) { // one-shot path: a single JSON document, exit code mirrors result.ok
+    try {
+      const result = await runAgent(request);
+      emitResult(result);
+      process.exit(result.ok ? 0 : 1);
+    } catch (err) {
+      emitResult({
+        ok: false,
+        error: err instanceof Error ? err.stack ?? err.message : String(err), // prefer the stack when there is one
+      });
+      process.exit(1);
+    }
+    return; // normally not reached: both branches above call process.exit
+  }
+
+  const emit: EmitEvent = (event) => writeRecord({ kind: "event", event }); // each emitted event becomes its own NDJSON line
+  let result: AgentRunResult;
+  try {
+    result = await runAgent(request, emit);
+  } catch (err) {
+    result = { ok: false, error: err instanceof Error ? err.stack ?? err.message : String(err) }; // a throw still yields a terminal record
+  }
+  // Streaming delivered the events live, so don't echo them in the terminal record.
+  writeRecord({ kind: "result", result: { ...result, events: [] } });
+  process.exit(result.ok ? 0 : 1);
+}
+
+main(); // NOTE(review): a rejection from main (e.g. readStdin failing) is not caught here — confirm that is acceptable
diff --git a/services/agent/src/engines/pi.ts b/services/agent/src/engines/pi.ts
new file mode 100644
index 0000000000..2be7d1698f
--- /dev/null
+++ b/services/agent/src/engines/pi.ts
@@ -0,0 +1,432 @@
+/**
+ * Legacy backend: drive the Pi SDK in-process for one cold run.
+ *
+ * This is the non-rivet engine. It drives Pi's `createAgentSession` directly: injects
+ * AGENTS.md in memory, resolves the model, sends one user turn, and returns the structured
+ * result (final text, messages, events, usage, capabilities). It also turns the
+ * backend-resolved runnable tools (WP-7) into Pi customTools that route back through
+ * Agenta's /tools/call. The rivet engine (`engines/rivet.ts`) is the ACP path; both serve the
+ * same `/run` contract (see `protocol.ts`).
+ *
+ * Auth: provider keys arrive as `request.secrets` (applied to the env) or fall back to the
+ * local Pi login (`AuthStorage.create()` reads ~/.pi/agent/auth.json). Nothing
+ * invocation-specific is written to a persistent disk: the session is in-memory and the
+ * working dir is a throwaway temp dir.
+ *
+ * Important: stdout is reserved for the JSON result (see cli.ts). Everything here logs to
+ * stderr so it never pollutes the result channel.
+ */
+import { existsSync, mkdtempSync, rmSync, statSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { dirname, isAbsolute, join } from "node:path";
+import { fileURLToPath } from "node:url";
+
+import {
+  AuthStorage,
+  createAgentSession,
+  DefaultResourceLoader,
+  getAgentDir,
+  ModelRegistry,
+  SessionManager,
+  SettingsManager,
+} from "@earendil-works/pi-coding-agent";
+
+import { createAgentaOtel } from "../tracing/otel.ts";
+import {
+  type AgentEvent,
+  type AgentRunRequest,
+  type AgentRunResult,
+  type ChatMessage,
+  type EmitEvent,
+  type HarnessCapabilities,
+  type ResolvedToolSpec,
+  type ToolCallbackContext,
+  resolveRunSessionId,
+  resolvePromptText,
+} from "../protocol.ts";
+import { EMPTY_OBJECT_SCHEMA } from "../tools/callback.ts";
+import { runResolvedTool } from "../tools/dispatch.ts";
+
+/** What the in-process Pi engine supports. Static (no daemon to probe, unlike rivet). */
+const PI_CAPABILITIES: HarnessCapabilities = {
+  textMessages: true,
+  toolCalls: true,
+  reasoning: true,
+  usage: true,
+  streamingDeltas: true,
+  images: false,
+  fileAttachments: false,
+  mcpTools: false, // this engine hands tools to Pi as customTools (see buildCustomTools) rather than over MCP
+  planMode: false,
+  permissions: false,
+  sessionLifecycle: false,
+};
+
+function log(message: string): void { // diagnostics go to stderr only; stdout is reserved for the result
+  process.stderr.write(`[pi-wrapper] ${message}\n`);
+}
+
+// services/agent/src/engines/pi.ts -> services/agent. Bundled skills (the Agenta harness's
+// forced skills) live under services/agent/skills/<name>/. Overridable for non-default layouts.
+const PKG_ROOT = dirname(dirname(dirname(fileURLToPath(import.meta.url)))); // three hops up: engines/ -> src/ -> package root
+const SKILLS_ROOT = process.env.AGENTA_AGENT_SKILLS_DIR || join(PKG_ROOT, "skills"); // `||` so an empty env value also falls back
+
+/**
+ * Resolve the requested skill names to bundled skill directories under SKILLS_ROOT. Each name
+ * must be a committed dir holding a SKILL.md (Pi loads them and surfaces them in the system
+ * prompt). Absolute paths are honored as-is; unknown or non-directory entries are skipped with
+ * a warning so a stale name never fails the run.
+ */
+function resolveSkillDirs(names: string[] | undefined): string[] {
+  const dirs: string[] = [];
+  for (const name of names ?? []) { // an undefined list is treated as empty
+    if (!name) continue; // ignore empty entries
+    const path = isAbsolute(name) ? name : join(SKILLS_ROOT, name); // relative names resolve under SKILLS_ROOT
+    try {
+      if (existsSync(path) && statSync(path).isDirectory()) { // only the directory is checked; SKILL.md is not verified here
+        dirs.push(path);
+      } else {
+        log(`skipping unknown skill "${name}" (no directory at ${path})`);
+      }
+    } catch { // statSync threw; skip this entry rather than fail the run
+      log(`skipping skill "${name}": cannot stat ${path}`);
+    }
+  }
+  return dirs; // resolved paths, in the order the request listed them
+}
+
+// In-process Pi reads provider keys from process.env. Since process.env is process-global,
+// serialize Pi runs while applying request-scoped provider env, then restore the prior env
+// exactly so one request's vault keys cannot leak into the next request.
+let providerEnvQueue: Promise<void> = Promise.resolve(); // tail of the chain; each new run starts after it settles
+
+async function withRequestProviderEnv<T>(
+  secrets: Record<string, string> | undefined,
+  fn: () => Promise<T>,
+): Promise<T> {
+  const run = providerEnvQueue.then(async () => {
+    const previous = new Map<string, string | undefined>(); // prior value per key (undefined = was unset)
+    for (const [key, value] of Object.entries(secrets ?? {})) {
+      previous.set(key, process.env[key]);
+      if (value) process.env[key] = value;
+      else delete process.env[key]; // an empty secret value unsets the variable for this run
+    }
+    try {
+      return await fn();
+    } finally {
+      for (const [key, value] of previous) { // restore runs whether fn resolved or threw
+        if (value === undefined) delete process.env[key];
+        else process.env[key] = value;
+      }
+    }
+  });
+  providerEnvQueue = run.then( // drop the outcome so a failed run does not reject the queue for later runs
+    () => undefined,
+    () => undefined,
+  );
+  return run; // the caller still receives this run's own result or rejection
+}
+
+/** Pick the requested model, else gpt-5.5, else a sensible non-mini default. */
+function pickModel(available: any[], wanted?: string): any {
+  return (
+    (wanted &&
+      available.find((m) => m.id === wanted || `${m.provider}/${m.id}` === wanted)) || // match a bare id or "provider/id"
+    available.find((m) => m.id === "gpt-5.5") ||
+    available.find((m) => !/spark|mini/i.test(m.id)) || // first model whose id contains neither "spark" nor "mini"
+    available[0] // last resort; undefined when `available` is empty
+  );
+}
+
+/** Text of the most recent assistant message that has text: string content, or its text blocks joined. */
+function extractAssistantText(messages: any[]): string {
+  for (let i = messages.length - 1; i >= 0; i--) { // scan newest to oldest
+    const message = messages[i];
+    if (message?.role !== "assistant") continue;
+    const content = message.content;
+    if (typeof content === "string") return content; // string content is returned as-is, even when empty
+    if (Array.isArray(content)) {
+      const text = content
+        .filter((block: any) => block?.type === "text" && block.text)
+        .map((block: any) => block.text)
+        .join("");
+      if (text) return text; // no text blocks: keep looking at earlier assistant messages
+    }
+  }
+  return ""; // no assistant text found
+}
+
+/** The stop reason of the last assistant message, when Pi set one. */
+function lastStopReason(messages: any[]): string | undefined {
+  for (let i = messages.length - 1; i >= 0; i--) { // scan newest to oldest
+    if (messages[i]?.role === "assistant" && messages[i].stopReason) { // assistant messages without a truthy stopReason are skipped
+      return String(messages[i].stopReason);
+    }
+  }
+  return undefined; // no assistant message carried a stop reason
+}
+
+/**
+ * Turn resolved tool specs into Pi customTools, branching on the executor `kind`:
+ *  - `callback` (default): `execute` POSTs back through Agenta's /tools/call, so the Composio
+ *    key and connection auth stay server-side.
+ *  - `code`: `execute` runs the snippet in a sandbox subprocess with its scoped secret env.
+ *  - `client`: browser-fulfilled, so skipped on the in-process path (no browser to answer).
+ *
+ * A failed `execute` throws, which Pi turns into a tool-error result (the loop continues)
+ * rather than a run failure. Pi accepts a plain JSON Schema for `parameters` (non-TypeBox path).
+ */
+export function buildCustomTools(
+  specs: ResolvedToolSpec[],
+  callback: ToolCallbackContext | undefined,
+): any[] {
+  const tools: any[] = [];
+  for (const spec of specs) {
+    const base = { // fields shared by every tool kind
+      name: spec.name,
+      label: spec.name,
+      description: spec.description ?? spec.name, // fall back to the name when no description is given
+      parameters: (spec.inputSchema as any) ?? EMPTY_OBJECT_SCHEMA, // fall back to EMPTY_OBJECT_SCHEMA when the spec has no input schema
+    };
+    if (spec.kind === "client") {
+      log(`skipping client tool '${spec.name}' (browser-fulfilled; not available in-process)`);
+      continue;
+    }
+    if (spec.kind === "code") { // code tools are registered even without a callback endpoint
+      tools.push({
+        ...base,
+        async execute(toolCallId: string, params: unknown, signal?: AbortSignal) {
+          const text = await runResolvedTool(spec, params, { toolCallId, signal });
+          return { content: [{ type: "text", text }], details: { kind: "code" } };
+        },
+      });
+      continue;
+    }
+    // callback (default): route back to Agenta's /tools/call.
+    if (!callback?.endpoint) { // no endpoint to call back to, so the tool is not registered
+      log(`skipping callback tool '${spec.name}': missing toolCallback endpoint`);
+      continue;
+    }
+    tools.push({
+      ...base,
+      async execute(toolCallId: string, params: unknown, signal?: AbortSignal) {
+        const text = await runResolvedTool(spec, params, {
+          toolCallId,
+          endpoint: callback.endpoint,
+          authorization: callback.authorization,
+          signal,
+        });
+        return {
+          content: [{ type: "text", text }],
+          details: { callRef: spec.callRef },
+        };
+      },
+    });
+  }
+  return tools; // client tools and endpoint-less callback tools are omitted
+}
+
+export async function runPi( // public entry point of the in-process Pi engine
+  request: AgentRunRequest,
+  emit?: EmitEvent, // optional event sink, passed through to runPiWithEnv
+): Promise<AgentRunResult> {
+  return withRequestProviderEnv(request.secrets, () => runPiWithEnv(request, emit)); // queued behind other Pi runs; request.secrets are in process.env only for its duration
+}
+
+async function runPiWithEnv(
+  request: AgentRunRequest,
+  emit?: EmitEvent,
+): Promise<AgentRunResult> {
+  const prompt = resolvePromptText(request);
+  if (!prompt) {
+    return { ok: false, error: "No user message to send (prompt/messages empty)." };
+  }
+
+  const cwd = mkdtempSync(join(tmpdir(), "agenta-agent-"));
+
+  try {
+    const authStorage = AuthStorage.create();
+    const modelRegistry = ModelRegistry.create(authStorage);
+    const available = await modelRegistry.getAvailable();
+    if (!available || available.length === 0) {
+      return {
+        ok: false,
+        error:
+          "No model available. Log in with `pnpm exec pi` -> /login, or set OPENAI_API_KEY / ANTHROPIC_API_KEY.",
+      };
+    }
+
+    const model = pickModel(available, request.model);
+    log(`model: ${model.provider}/${model.id}`);
+
+    // Tracing: turn this run into OTel spans. When the caller passed a traceparent,
+    // invoke_agent nests under their /invoke span so the whole agent run is part of the
+    // same trace (just like completion/chat).
+    const otel = createAgentaOtel({
+      traceparent: request.trace?.traceparent,
+      baggage: request.trace?.baggage,
+      endpoint: request.trace?.endpoint,
+      authorization: request.trace?.authorization,
+      captureContent: request.trace?.captureContent,
+    });
+
+    // Inject AGENTS.md in memory and keep on-disk context files out of the run.
+    const agentsMd = request.agentsMd?.trim();
+    // Pi's two system-prompt layers, carried on the request (PiAgentConfig.system /
+    // append_system). `systemPrompt` replaces Pi's base prompt; `appendSystemPrompt` adds to
+    // it. We feed them through the loader overrides so the run stays hermetic: only what the
+    // request carries applies, never a SYSTEM.md / APPEND_SYSTEM.md left on disk.
+    const systemPrompt = request.systemPrompt?.trim();
+    const appendSystemPrompt = request.appendSystemPrompt?.trim();
+    // Forced skills (the Agenta harness): load exactly the bundled dirs the request names.
+    // `noSkills` suppresses host/global discovery so the run is deterministic; the loader still
+    // merges `additionalSkillPaths` on top, so the bundled skills load. They only surface in
+    // the prompt when `read` is enabled (the harness forces it).
+    const skillDirs = resolveSkillDirs(request.skills);
+    if (skillDirs.length > 0) {
+      log(`skills: ${skillDirs.join(", ")}`);
+    }
+    const loader = new DefaultResourceLoader({
+      cwd,
+      agentDir: getAgentDir(),
+      noContextFiles: true,
+      noSkills: true,
+      additionalSkillPaths: skillDirs,
+      systemPromptOverride: () => systemPrompt || undefined,
+      appendSystemPromptOverride: () => (appendSystemPrompt ? [appendSystemPrompt] : []),
+      agentsFilesOverride: () => ({
+        agentsFiles: agentsMd
+          ? [{ path: "/virtual/AGENTS.md", content: agentsMd }]
+          : [],
+      }),
+      extensionFactories: [otel.register],
+    });
+    await loader.reload();
+
+    // Build runnable tools from the resolved specs. Pi's allowlist gates custom tools too,
+    // so their names must be in `tools` for the model to see them.
+    const customTools = buildCustomTools(request.customTools ?? [], request.toolCallback);
+    const toolAllowlist = [
+      ...(request.tools ?? []),
+      ...customTools.map((tool) => tool.name),
+    ];
+    if (customTools.length > 0) {
+      log(`custom tools: ${customTools.map((t) => t.name).join(", ")}`);
+    }
+
+    // Created before the prompt so a throw mid-run still flushes the partial trace and
+    // disposes the session (the inner finally below). Mirrors the rivet engine's pattern.
+    let session: Awaited<ReturnType<typeof createAgentSession>>["session"] | undefined;
+    try {
+      ({ session } = await createAgentSession({
+        cwd,
+        model,
+        authStorage,
+        modelRegistry,
+        tools: toolAllowlist,
+        customTools,
+        sessionManager: SessionManager.inMemory(cwd),
+        settingsManager: SettingsManager.inMemory(),
+        resourceLoader: loader,
+      }));
+
+      // Hand the session id + model to the extension so spans carry them.
+      const sessionId = resolveRunSessionId(request, session.sessionId);
+      otel.config.sessionId = sessionId;
+      otel.config.provider = model.provider;
+      otel.config.requestModel = model.id;
+
+      // Accumulate streamed text as the primary output channel. On the streaming path, flush
+      // each Pi `text_delta` as a `message_delta` live (Pi deltas are already pure, so they
+      // emit verbatim); the block opens on the first delta and closes after the run.
+      let streamed = "";
+      let piTextId: string | undefined;
+      session.subscribe((event: any) => {
+        if (
+          event.type === "message_update" &&
+          event.assistantMessageEvent?.type === "text_delta"
+        ) {
+          const delta = event.assistantMessageEvent.delta ?? "";
+          if (!delta) return;
+          streamed += delta;
+          if (emit) {
+            if (piTextId === undefined) {
+              piTextId = "msg-0";
+              emit({ type: "message_start", id: piTextId });
+            }
+            emit({ type: "message_delta", id: piTextId, delta });
+          }
+        }
+      });
+
+      await session.prompt(prompt);
+
+      const output = streamed.trim() || extractAssistantText(session.messages);
+      const stopReason = lastStopReason(session.messages);
+      const usage = otel.usage();
+
+      // Ship this run's trace before the result is returned (and before the CLI process
+      // exits): invoke_agent has a remote parent, so the per-trace flush is what exports it.
+      await otel.flush();
+
+      // The structured stream is thinner here than on the rivet path: Pi's in-process tool
+      // events feed the trace spans, while the result-level event log carries the final
+      // message, usage, and stop reason (enough for the platform without double-plumbing).
+      //
+      // On the streaming path the events were flushed live via `emit`, so the result log stays
+      // empty; here we only close the open text block (or synthesize one when the text never
+      // streamed) and flush the tail usage/done events.
+      const events: AgentEvent[] = [];
+      const emitOrLog = (event: AgentEvent): void => {
+        if (emit) emit(event);
+        else events.push(event);
+      };
+      if (emit) {
+        if (piTextId !== undefined) {
+          emit({ type: "message_end", id: piTextId });
+        } else if (output) {
+          emit({ type: "message_start", id: "msg-0" });
+          emit({ type: "message_delta", id: "msg-0", delta: output });
+          emit({ type: "message_end", id: "msg-0" });
+        }
+      } else if (output) {
+        events.push({ type: "message", text: output });
+      }
+      if (usage.total > 0) emitOrLog({ type: "usage", ...usage });
+      emitOrLog({ type: "done", stopReason });
+
+      const messages: ChatMessage[] = output
+        ? [{ role: "assistant", content: output }]
+        : [];
+
+      return {
+        ok: true,
+        output,
+        messages,
+        events,
+        usage,
+        stopReason,
+        // `streamingDeltas` is only honest when a live sink carried the deltas end-to-end.
+        capabilities: { ...PI_CAPABILITIES, streamingDeltas: !!emit },
+        sessionId,
+        model: `${model.provider}/${model.id}`,
+        traceId: otel.config.traceId,
+      };
+    } catch (err) {
+      // Flush the partial trace before the error propagates so a failed run is still
+      // observable (the happy-path flush above never ran). Best-effort: never mask `err`.
+      await otel.flush().catch(() => {});
+      throw err;
+    } finally {
+      // Pi keeps the in-memory session alive until disposed; release it on every exit
+      // (success or throw). Guarded for the case where createAgentSession itself threw.
+      session?.dispose();
+    }
+  } finally {
+    try {
+      rmSync(cwd, { recursive: true, force: true });
+    } catch {
+      // best-effort cleanup of the throwaway working dir
+    }
+  }
+}
diff --git a/services/agent/src/engines/rivet.ts b/services/agent/src/engines/rivet.ts
new file mode 100644
index 0000000000..3a5d138106
--- /dev/null
+++ b/services/agent/src/engines/rivet.ts
@@ -0,0 +1,948 @@
+/**
+ * WP-8 rivet harness driver.
+ *
+ * Drives a coding harness (Pi, Claude Code, ...) over the Agent Client Protocol (ACP)
+ * through a rivet `sandbox-agent` daemon, instead of the bespoke Pi SDK calls in the pi
+ * engine. It serves the same /run contract (AgentRunRequest -> AgentRunResult), so the
+ * Python side stays thin and the choice of harness/sandbox is config, not new code.
+ *
+ * Per invoke (cold), mirroring the shipped code-evaluator DaytonaRunner pattern:
+ *
+ *   SandboxAgent.start({ sandbox: local({ env }) | daytona({ create }) })
+ *     -> createSession({ agent: <harness>, cwd, model })
+ *       -> write AGENTS.md into cwd
+ *       -> session.prompt([{ type: "text", text }])
+ *         -> accumulate ACP `agent_message_chunk` text + build the trace
+ *           -> destroySandbox()
+ *
+ * Two orthogonal axes swap independently: the sandbox (where the daemon runs) and the
+ * harness (which engine). The ACP boundary is daemon-to-harness; the service-to-rivet
+ * hop stays harness-agnostic behind the Harness port.
+ *
+ * Tracing is built here from the ACP event stream (see tracing/otel.ts createRivetOtel),
+ * so it is uniform across every harness and always nests under the caller's /invoke
+ * span. stdout is reserved for the JSON result (see cli.ts); logs go to stderr.
+ */
+import { randomBytes } from "node:crypto";
+import {
+  chmodSync,
+  copyFileSync,
+  existsSync,
+  mkdirSync,
+  mkdtempSync,
+  readdirSync,
+  readFileSync,
+  rmSync,
+  writeFileSync,
+} from "node:fs";
+import { createRequire } from "node:module";
+import { tmpdir } from "node:os";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
+
+import { SandboxAgent, InMemorySessionPersistDriver } from "sandbox-agent";
+import { local } from "sandbox-agent/local";
+import { daytona } from "sandbox-agent/daytona";
+
+import { createRivetOtel } from "../tracing/otel.ts";
+import { buildToolMcpServers, type McpServerStdio } from "../tools/mcp-bridge.ts";
+import { executableToolSpecs, publicToolSpecs } from "../tools/public-spec.ts";
+import {
+  localRelayHost,
+  sandboxRelayHost,
+  startToolRelay,
+} from "../tools/relay.ts";
+import {
+  PolicyResponder,
+  decisionToReply,
+  policyFromRequest,
+  type Responder,
+} from "../responder.ts";
+import {
+  type AgentRunRequest,
+  type AgentRunResult,
+  type ChatMessage,
+  type ContentBlock,
+  type EmitEvent,
+  type HarnessCapabilities,
+  type McpServerConfig,
+  type ResolvedToolSpec,
+  type ToolCallbackContext,
+  messageText,
+  resolvePromptText,
+  resolveRunSessionId,
+} from "../protocol.ts";
+
+const require = createRequire(import.meta.url);
+// Package root, three dirname() hops up: services/agent/src/engines/rivet.ts -> services/agent
+const PKG_ROOT = dirname(dirname(dirname(fileURLToPath(import.meta.url))));
+const ADAPTER_BIN_DIR = join(PKG_ROOT, "node_modules", ".bin");
+
+/** Map node `<process.platform>-<process.arch>` to the @sandbox-agent CLI binary package. */
+const CLI_PACKAGES: Record<string, string> = {
+  "darwin-arm64": "@sandbox-agent/cli-darwin-arm64",
+  "darwin-x64": "@sandbox-agent/cli-darwin-x64",
+  "linux-x64": "@sandbox-agent/cli-linux-x64",
+  "linux-arm64": "@sandbox-agent/cli-linux-arm64",
+  "win32-x64": "@sandbox-agent/cli-win32-x64",
+};
+
+function log(message: string): void {
+  process.stderr.write("[rivet-wrapper] " + message + "\n"); // stderr: stdout carries the result
+}
+
+/**
+ * Locate the sandbox-agent daemon binary for this host. Lookup order: SANDBOX_AGENT_BIN
+ * (when the file exists), then the platform CLI package resolved relative to the
+ * `sandbox-agent` SDK (pnpm nests it there), then a scan of node_modules/.pnpm. Every hit
+ * is chmod'ed executable first (pnpm may skip the package's chmod postinstall). Yields
+ * undefined when nothing is found; the local provider then does its own resolution.
+ */
+function resolveDaemonBinary(): string | undefined {
+  const override = process.env.SANDBOX_AGENT_BIN;
+  if (override && existsSync(override)) return ensureExecutable(override);
+
+  const platformKey = `${process.platform}-${process.arch}`;
+  const cliPackage = CLI_PACKAGES[platformKey];
+  if (!cliPackage) return undefined;
+  const binName = process.platform === "win32" ? "sandbox-agent.exe" : "sandbox-agent";
+  try {
+    // The SDK's own node_modules sees the sibling CLI package in the pnpm layout. Its
+    // package.json blocks the subpath, so anchor resolution on the SDK's main entry.
+    const fromSdk = createRequire(require.resolve("sandbox-agent"));
+    const viaSdk = join(dirname(fromSdk.resolve(`${cliPackage}/package.json`)), "bin", binName);
+    if (existsSync(viaSdk)) return ensureExecutable(viaSdk);
+  } catch {
+    // not resolvable through the SDK: try the store scan below
+  }
+  try {
+    // Last resort: walk the pnpm store looking for the platform binary.
+    const pnpmStore = join(PKG_ROOT, "node_modules", ".pnpm");
+    const storePrefix = `@sandbox-agent+cli-${process.platform}`;
+    const hit = readdirSync(pnpmStore)
+      .filter((entry) => entry.startsWith(storePrefix))
+      .map((entry) => join(pnpmStore, entry, "node_modules", cliPackage, "bin", binName))
+      .find((candidate) => existsSync(candidate));
+    if (hit) return ensureExecutable(hit);
+  } catch {
+    // store not present
+  }
+  return undefined;
+}
+
+// Best-effort chmod 0o755 on `path`; the same path is always handed back for chaining.
+function ensureExecutable(path: string): string {
+  const mode = 0o755;
+  try {
+    chmodSync(path, mode);
+  } catch { /* read-only fs (e.g. baked snapshot already +x): ignore */ }
+  return path;
+}
+
+// The bundled Agenta Pi extension (tracing + tools). Built by `pnpm run build:extension`
+// and baked into the image; installed into Pi's agent dir so Pi loads it on every run.
+const EXTENSION_BUNDLE =
+  process.env.AGENTA_RIVET_EXTENSION_BUNDLE ?? join(PKG_ROOT, "dist", "extensions", "agenta.js");
+
+/**
+ * Env consumed by the Agenta Pi extension. Forwarding the trace context through it is what
+ * lets Pi emit its real spans under the caller's `/invoke` span. The tool env holds only
+ * public metadata plus the relay directory; private specs/auth stay in the runner. Keys
+ * with nothing to carry are left out so the extension stays inert when nothing applies.
+ */
+function buildPiExtensionEnv(
+  request: AgentRunRequest,
+  tracing: boolean,
+  opts: { relayDir?: string; usageOutPath?: string } = {},
+): Record<string, string> {
+  const env: Record<string, string> = {};
+  // No tracing env when the harness process can't reach Agenta's OTLP (Daytona): the runner
+  // traces from the event stream there, leaving the extension tools + the usage writeback.
+  if (tracing && request.trace) {
+    const { traceparent, endpoint, authorization, captureContent } = request.trace;
+    if (traceparent) env.AGENTA_TRACEPARENT = traceparent;
+    if (endpoint) env.AGENTA_OTLP_ENDPOINT = endpoint;
+    if (authorization) env.AGENTA_OTLP_AUTHORIZATION = authorization;
+    if (captureContent === false) env.AGENTA_CAPTURE_CONTENT = "false";
+  }
+  const toolSpecs = publicToolSpecs((request.customTools as ResolvedToolSpec[]) ?? []);
+  if (toolSpecs.length && opts.relayDir) {
+    env.AGENTA_TOOL_PUBLIC_SPECS = JSON.stringify(toolSpecs);
+    env.AGENTA_TOOL_RELAY_DIR = opts.relayDir;
+  }
+  if (opts.usageOutPath) env.AGENTA_USAGE_OUT = opts.usageOutPath;
+  return env;
+}
+
+/** Copy the extension bundle to `<agentDir>/extensions/agenta.js`; failures are only logged. */
+function installPiExtensionLocal(agentDir: string): void {
+  if (existsSync(EXTENSION_BUNDLE)) {
+    try {
+      const extensionsDir = join(agentDir, "extensions");
+      mkdirSync(extensionsDir, { recursive: true });
+      copyFileSync(EXTENSION_BUNDLE, join(extensionsDir, "agenta.js"));
+    } catch (err) {
+      log(`pi extension install skipped: ${(err as Error).message}`);
+    }
+    return;
+  }
+  log(`pi extension bundle missing at ${EXTENSION_BUNDLE} (run build:extension)`);
+}
+
+/** Write the extension bundle to `<agentDir>/extensions/agenta.js` in a sandbox; errors are logged. */
+async function uploadPiExtensionToSandbox(sandbox: any, agentDir: string): Promise<void> {
+  if (!existsSync(EXTENSION_BUNDLE)) return;
+  try {
+    await sandbox.mkdirFs({ path: `${agentDir}/extensions` });
+    const bundleText = readFileSync(EXTENSION_BUNDLE, "utf-8");
+    await sandbox.writeFsFile({ path: `${agentDir}/extensions/agenta.js` }, bundleText);
+  } catch (err) {
+    log(`pi extension upload skipped: ${(err as Error).message}`);
+  }
+}
+
+/**
+ * The environment the daemon is born with. The local provider merges this into the
+ * `sandbox-agent server` subprocess, which passes it to the ACP adapter and then to the
+ * harness. Per-invoke trace/secret injection would also go here for a warm-daemon model;
+ * under one-daemon-per-invoke the in-process tracer handles spans, so this only makes the
+ * adapters and harness resolvable + authed. `harness` is accepted but not read here.
+ */
+function buildDaemonEnv(harness: string): Record<string, string> {
+  const env: Record<string, string> = {};
+
+  // Adapters (pi-acp, claude-agent-acp) and the pi CLI live in our node_modules/.bin;
+  // claude CLI is on the inherited PATH. Prepend ours, keep the inherited PATH. Entries use
+  // the host's list separator (";" on Windows, ":" elsewhere) rather than a fixed ":".
+  const extra = process.env.AGENTA_RIVET_ADAPTER_PATH;
+  const sep = process.platform === "win32" ? ";" : ":";
+  env.PATH = [ADAPTER_BIN_DIR, extra, process.env.PATH].filter(Boolean).join(sep);
+
+  // Pi: point pi-acp at our pi bin and the agent dir that carries auth.json.
+  env.PI_ACP_PI_COMMAND = process.env.AGENTA_RIVET_PI_COMMAND ?? join(ADAPTER_BIN_DIR, "pi");
+  const piAgentDir = process.env.PI_CODING_AGENT_DIR;
+  if (piAgentDir) env.PI_CODING_AGENT_DIR = piAgentDir;
+
+  // Keep HOME so harness logins (~/.pi/agent, ~/.claude) resolve.
+  if (process.env.HOME) env.HOME = process.env.HOME;
+
+  // Harness LLM auth passed as launch env, never written into the agent filesystem.
+  for (const key of [
+    "OPENAI_API_KEY",
+    "ANTHROPIC_API_KEY",
+    "ANTHROPIC_AUTH_TOKEN",
+    "CLAUDE_CODE_OAUTH_TOKEN",
+    "CLAUDE_CONFIG_DIR",
+    "GEMINI_API_KEY",
+  ]) {
+    const value = process.env[key];
+    if (value) env[key] = value;
+  }
+  return env;
+}
+
+/** Local alias for the shared protocol helper `resolvePromptText`: the latest user turn as text. */
+const resolvePrompt = resolvePromptText;
+
+/** Prior turns (all messages except the one being sent as the prompt) for trace + history. */
+function priorMessages(request: AgentRunRequest): ChatMessage[] {
+  const all = request.messages ?? [];
+  const promptText = resolvePrompt(request);
+
+  // A trailing user turn IS the prompt being sent: drop it so it is not counted twice.
+  const tail = all[all.length - 1];
+  if (all.length && tail.role === "user") return all.slice(0, -1);
+
+  // Prompt supplied explicitly (no trailing user turn): remove only the LAST user turn whose
+  // text equals the prompt. Earlier identical turns ("yes", "continue") must stay in the
+  // replayed history.
+  let dropAt = -1;
+  for (let i = all.length - 1; i >= 0 && dropAt === -1; i--) {
+    const turn = all[i];
+    if (turn.role === "user" && messageText(turn.content) === promptText) dropAt = i;
+  }
+  if (dropAt === -1) return all;
+  return [...all.slice(0, dropAt), ...all.slice(dropAt + 1)];
+}
+
+// Text form of a tool input/output: "" for null/undefined, strings verbatim, JSON otherwise.
+// Falls back to String(value) when JSON.stringify throws or yields undefined (functions, symbols).
+function safeJson(value: unknown): string {
+  if (value === undefined || value === null) return "";
+  try {
+    return typeof value === "string" ? value : (JSON.stringify(value) ?? String(value));
+  } catch { return String(value); }
+}
+
+/**
+ * Flatten one message's content to transcript text, resolved tool turns INCLUDED. In the
+ * cold model the harness rebuilds its context from this text, and ACP prompt content blocks
+ * cannot carry tool calls/results, so a resolved interaction (an approved tool that ran, a
+ * client-fulfilled tool) is written out as text; the model can then resume from the result
+ * instead of re-asking. This is the cross-turn HITL continuation substrate: the `/messages`
+ * egress folds inbound UIMessage tool/approval parts into `tool_call` / `tool_result`
+ * content blocks, which survive into the replay here. Strings and text blocks pass through
+ * unchanged; image/resource blocks become short placeholders; empty parts are dropped.
+ */
+export function messageTranscript(content: string | ContentBlock[] | undefined): string {
+  if (!content) return "";
+  if (typeof content === "string") return content;
+  const render = (block: ContentBlock): string => {
+    if (!block) return "";
+    switch (block.type) {
+      case "text":
+        return typeof block.text === "string" ? block.text : "";
+      case "tool_call":
+        return `[called ${block.toolName ?? "tool"}(${safeJson(block.input)})]`;
+      case "tool_result":
+        return `[${block.toolName ?? "tool"} ${block.isError ? "error" : "returned"}: ${safeJson(block.output)}]`;
+      case "image": return "[image]";
+      case "resource":
+        return block.uri ? `[resource: ${block.uri}]` : "[resource]";
+      default:
+        return "";
+    }
+  };
+  return content.map(render).filter(Boolean).join("\n");
+}
+
+/**
+ * The text sent over ACP for this turn. Each invoke is a cold sandbox, so prior turns are
+ * replayed as transcript context ahead of the latest user message (the client/playground
+ * holds the history). The transcript keeps only its last AGENTA_AGENT_HISTORY_MAX_CHARS
+ * characters; an unset, non-numeric or < 1 value falls back to 24000 so the cap cannot be
+ * switched off by accident (`slice(-0)` would otherwise keep the whole transcript).
+ */
+export function buildTurnText(request: AgentRunRequest): string {
+  const latest = resolvePrompt(request);
+  const history = priorMessages(request).filter((m) => messageTranscript(m.content));
+  if (history.length === 0) return latest;
+  const parsed = Number(process.env.AGENTA_AGENT_HISTORY_MAX_CHARS);
+  const maxChars = parsed >= 1 ? Math.floor(parsed) : 24000;
+  let transcript = history.map((m) => `${m.role}: ${messageTranscript(m.content)}`).join("\n");
+  if (transcript.length > maxChars) transcript = transcript.slice(-maxChars);
+  return (
+    `Conversation so far:\n${transcript}\n\n` +
+    `Continue the conversation. The user now says:\n${latest}`
+  );
+}
+
+/**
+ * Map user-declared MCP servers (resolved server-side, secrets already injected into `env`)
+ * to ACP stdio entries. Only `stdio` servers that carry a `command` are delivered over ACP
+ * today; `http`/remote carries no auth on the wire by design and is skipped with a log. A
+ * per-server `tools` allowlist is NOT enforced over ACP in v1 (the harness lists all of a
+ * server's tools), so it is dropped with a log instead of implying a filter that never runs.
+ */
+export function toAcpMcpServers(servers: McpServerConfig[] | undefined): McpServerStdio[] {
+  return (servers ?? []).flatMap((server): McpServerStdio[] => {
+    const transport = server.transport ?? "stdio";
+    if (transport !== "stdio" || !server.command) {
+      log(`skipping non-stdio MCP server '${server?.name ?? "?"}' (remote transport deferred)`);
+      return [];
+    }
+    if (server.tools && server.tools.length > 0) {
+      log(`MCP server '${server.name}': per-server tool allowlist not enforced over ACP (v1)`);
+    }
+    const env = Object.entries(server.env ?? {}).map(([name, value]) => ({
+      name,
+      value: String(value),
+    }));
+    return [
+      { name: server.name, command: server.command, args: server.args ?? [], env },
+    ];
+  });
+}
+
+/**
+ * Resolve a requested model name to one of the harness's own ids. An exact id wins; else
+ * the text after the first "/" of each allowed id is matched against the request, then
+ * against the request's own post-"/" text ("gpt-5.5" -> "openai-codex/gpt-5.5").
+ */
+function pickModel(allowed: string[], wanted?: string): string | undefined {
+  if (!wanted) return undefined;
+  if (allowed.indexOf(wanted) !== -1) return wanted;
+  const bare = (id: string): string => id.substring(id.indexOf("/") + 1);
+  for (const target of [wanted, bare(wanted)]) {
+    const hit = allowed.find((id) => bare(id) === target);
+    if (hit !== undefined) return hit;
+  }
+  return undefined;
+}
+
+/** List the model ids a session lets the caller select, read from its config options; [] on any failure. */
+async function allowedModels(session: any): Promise<string[]> {
+  try {
+    const configOptions = (await session.getConfigOptions()) ?? [];
+    // The model selector is the option tagged category "model" or carrying id "model".
+    const isModelOption = (opt: any): boolean => opt.category === "model" || opt.id === "model";
+    const choices = configOptions.find(isModelOption)?.options ?? [];
+    // Choices without an id are discarded.
+    return choices.map((choice: any) => choice.id).filter(Boolean);
+  } catch {
+    return [];
+  }
+}
+
+/** Extract the comma-separated ids that follow "Allowed values:" in an error's message; [] if absent. */
+function allowedFromError(err: unknown): string[] {
+  const text = String((err as Error)?.message ?? err);
+  const listed = /Allowed values:\s*(.+?)\s*$/.exec(text)?.[1];
+  if (listed === undefined) return [];
+  const ids = listed.split(",").map((piece) => piece.trim());
+  // Empty pieces (e.g. from a doubled or trailing comma) are not ids.
+  return ids.filter((id) => id.length > 0);
+}
+
+/**
+ * Set the requested model on a session, translating it to the harness's own id if needed.
+ * The name is tried as given first (already-qualified ids pass). When that is rejected, the
+ * allowed ids are taken from the error message, or from the session config when the message
+ * lists none, and a matching id is retried once. Returns the id that was set, or undefined
+ * when nothing could be set (logged; the harness keeps its default, the run does not fail).
+ */
+async function applyModel(session: any, wanted?: string): Promise<string | undefined> {
+  if (!wanted) return undefined;
+  let rejection: unknown;
+  try {
+    await session.setModel(wanted);
+    return wanted;
+  } catch (err) {
+    rejection = err;
+  }
+  const listed = allowedFromError(rejection);
+  const candidates = listed.length ? listed : await allowedModels(session);
+  const normalized = pickModel(candidates, wanted);
+  if (normalized && normalized !== wanted) {
+    try {
+      await session.setModel(normalized);
+      return normalized;
+    } catch { /* retry rejected too: fall through to the harness default */ }
+  }
+  log(`model '${wanted}' not settable (${(rejection as Error).message}); using harness default`);
+  return undefined;
+}
+
+/**
+ * Env for the daemon inside a Daytona sandbox: Pi's agent dir (where it reads its login),
+ * the Agenta extension env, and the provider keys, plus the path of the sandbox-installed
+ * `pi` when that install is enabled. Local-only entries (PATH) are deliberately absent.
+ */
+function daytonaEnvVars(
+  piExtEnv: Record<string, string>,
+  secrets: Record<string, string>,
+): Record<string, string> {
+  // Later sources win: extension env overrides the agent dir default, and the provider API
+  // keys from the vault (what the in-sandbox harness authenticates with) override both.
+  const vars: Record<string, string> = Object.assign(
+    { PI_CODING_AGENT_DIR: DAYTONA_PI_DIR },
+    piExtEnv,
+    secrets,
+  );
+  if (!DAYTONA_PI_INSTALL) return vars;
+  // The image lacks `pi`: point pi-acp at the copy we install into the sandbox.
+  vars.PI_ACP_PI_COMMAND = `${DAYTONA_PI_INSTALL_DIR}/node_modules/.bin/pi`;
+  return vars;
+}
+
+/**
+ * Build the rivet sandbox provider for the requested axis ("daytona", otherwise local).
+ * Daytona needs an image that carries both the rivet daemon and the harness CLI. Rivet's
+ * `-full` image ships the daemon and the ACP adapters but NOT the `pi` CLI, so we run from a
+ * pre-baked snapshot (`AGENTA_RIVET_DAYTONA_SNAPSHOT`, built by poc/build_rivet_snapshot.py)
+ * that adds `pi`, avoiding a ~150s per-invoke `npm install pi`. NOTE(review): no default
+ * snapshot name is applied here; when the variable is unset no `snapshot` key is sent.
+ * `AGENTA_RIVET_DAYTONA_IMAGE` overrides with a plain image instead. The code-evaluator
+ * DAYTONA_SNAPSHOT is intentionally NOT reused (it has no daemon). The provider key comes
+ * from the vault env; Pi's OAuth login is only uploaded when no key.
+ */
+function buildSandboxProvider(
+  sandboxId: string,
+  env: Record<string, string>,
+  binaryPath: string | undefined,
+  piExtEnv: Record<string, string>,
+  secrets: Record<string, string>,
+) {
+  if (sandboxId === "daytona") {
+    const snapshot = process.env.AGENTA_RIVET_DAYTONA_SNAPSHOT;
+    const image = process.env.AGENTA_RIVET_DAYTONA_IMAGE;
+    const target = process.env.DAYTONA_TARGET;
+    return daytona({
+      ...(image ? { image } : {}),
+      create: {
+        // The rivet provider always sets a default `image`, which Daytona turns into a
+        // build entry that conflicts with `snapshot`. Setting image:undefined together with
+        // the snapshot suppresses that so the snapshot is used as-is.
+        ...(snapshot ? { snapshot, image: undefined } : {}),
+        ...(target ? { target } : {}),
+        envVars: daytonaEnvVars(piExtEnv, secrets),
+        ephemeral: true,
+      } as any,
+    });
+  }
+  // local: spawn `sandbox-agent server` on this host with the daemon env merged in.
+  const logMode = (process.env.AGENTA_RIVET_DAEMON_LOG ?? "silent") as any;
+  return local({ env, binaryPath, log: logMode });
+}
+
+/** In-sandbox Pi agent dir on the rivet `-full` image (daemon runs as user `sandbox`); AGENTA_RIVET_DAYTONA_PI_DIR overrides it. */
+const DAYTONA_PI_DIR = process.env.AGENTA_RIVET_DAYTONA_PI_DIR ?? "/home/sandbox/.pi/agent";
+// The rivet `-full` image ships the pi-acp adapter but NOT the `pi` CLI, so by default we
+// install it into the sandbox at session time and point pi-acp at it. A snapshot that
+// pre-installs `pi` should set AGENTA_RIVET_DAYTONA_INSTALL_PI=false (faster, no per-run
+// npm install); any other value, or unset, keeps the install on. Version mirrors the wrapper's pinned Pi.
+const DAYTONA_PI_INSTALL_DIR = "/home/sandbox/.agenta-pi";
+const DAYTONA_PI_INSTALL = process.env.AGENTA_RIVET_DAYTONA_INSTALL_PI !== "false";
+const DAYTONA_PI_VERSION = process.env.AGENTA_RIVET_PI_VERSION ?? "0.79.4";
+
+/** `npm install` the pinned `pi` CLI inside a Daytona sandbox; failures are logged, never thrown. */
+async function installPiInSandbox(sandbox: any): Promise<void> {
+  const piPackage = `@earendil-works/pi-coding-agent@${DAYTONA_PI_VERSION}`;
+  try {
+    await sandbox.mkdirFs({ path: DAYTONA_PI_INSTALL_DIR });
+    const result = await sandbox.runProcess({
+      command: "npm",
+      args: [
+        "install",
+        "--no-fund",
+        "--no-audit",
+        piPackage,
+      ],
+      cwd: DAYTONA_PI_INSTALL_DIR,
+      timeoutMs: 180_000,
+    });
+    if (result?.exitCode === 0) return;
+    log(`pi install in sandbox exit=${result?.exitCode}: ${String(result?.stderr).slice(-400)}`);
+  } catch (err) {
+    log(`pi install in sandbox skipped: ${(err as Error).message}`);
+  }
+}
+
+/**
+ * Copy the local Pi login (auth.json, plus settings.json when present) into a Daytona
+ * sandbox so the remote Pi authenticates with the dev's ChatGPT/Codex OAuth (it auto-refreshes
+ * from the token in auth.json). `mkdirFs` the parent first (a fresh sandbox lacks it) and pass
+ * a string body: a missing dir or a stream body produced the earlier "Stream Error". Best-effort:
+ * with no local login the remote run falls back to any provider key in the sandbox env.
+ */
+async function uploadPiAuthToSandbox(sandbox: any): Promise<void> {
+  const home = process.env.HOME ?? "";
+  const localDir = process.env.PI_CODING_AGENT_DIR || join(home, ".pi/agent");
+  const localAuth = join(localDir, "auth.json");
+  const localSettings = join(localDir, "settings.json");
+  if (!existsSync(localAuth)) return;
+
+  // Each file is read as a UTF-8 string right before it is written into the sandbox.
+  const put = (name: string, from: string): Promise<unknown> =>
+    sandbox.writeFsFile({ path: `${DAYTONA_PI_DIR}/${name}` }, readFileSync(from, "utf-8"));
+  try {
+    await sandbox.mkdirFs({ path: DAYTONA_PI_DIR });
+    await put("auth.json", localAuth);
+    if (existsSync(localSettings)) await put("settings.json", localSettings);
+  } catch (err) {
+    log(`pi auth upload skipped: ${(err as Error).message}`);
+  }
+}
+
+/**
+ * A `fetch` that persists cookies per host (name=value only; Path/Expires etc. are ignored).
+ * Daytona's preview proxy authenticates with a `daytona-sandbox-auth-*` cookie set on the first
+ * response; Node's fetch keeps no cookie jar, so without this later ACP requests are rejected
+ * ("Authentication required" / 502). Handed to the rivet SDK; takes string, URL or Request.
+ */
+function createCookieFetch(): typeof fetch {
+  const jar = new Map<string, Map<string, string>>(); // host -> (name -> "name=value")
+  return async (input: any, init?: any) => {
+    const isRequest = typeof input !== "string" && !(input instanceof URL);
+    const host = new URL(isRequest ? input.url : input).host;
+    const cookies = jar.get(host);
+    const headers = new Headers(init?.headers ?? (isRequest ? input.headers : undefined));
+    if (cookies && cookies.size > 0) {
+      const existing = headers.get("cookie");
+      const merged = [...cookies.values()];
+      if (existing) merged.unshift(existing);
+      headers.set("cookie", merged.join("; "));
+    }
+    const response = await fetch(input, { ...init, headers });
+    const setCookies =
+      typeof (response.headers as any).getSetCookie === "function"
+        ? (response.headers as any).getSetCookie()
+        : (response.headers.get("set-cookie") ? [response.headers.get("set-cookie")] : []);
+    if (setCookies.length) {
+      const store = jar.get(host) ?? new Map<string, string>();
+      for (const sc of setCookies) {
+        const pair = String(sc).split(";")[0];
+        const name = pair.split("=")[0];
+        if (name) store.set(name, pair);
+      }
+      jar.set(host, store);
+    }
+    return response;
+  };
+}
+
+/** Load the run-total usage JSON Pi wrote on agent_end (sandbox FS API on Daytona, local fs otherwise); undefined unless it has a positive `total`. */
+async function readRunUsage(
+  sandbox: any,
+  path: string | undefined,
+  isDaytona: boolean,
+): Promise<AgentRunResult["usage"]> {
+  if (!path) return undefined;
+  // Fetch the file text from wherever the harness ran; undefined when a local file is absent.
+  const loadText = async (): Promise<string | undefined> => {
+    if (!isDaytona) return existsSync(path) ? readFileSync(path, "utf-8") : undefined;
+    const payload = await sandbox.readFsFile({ path });
+    return typeof payload === "string" ? payload : new TextDecoder().decode(payload);
+  };
+  try {
+    const text = await loadText();
+    if (text === undefined) return undefined;
+    const usage = JSON.parse(text);
+    return usage && usage.total > 0 ? usage : undefined;
+  } catch {
+    return undefined;
+  }
+}
+
+/**
+ * Collapse a harness/SDK error into one line for the caller (the playground shows it
+ * verbatim) instead of a full ACP/JS stack. Credit and auth failures get a fix-it hint; a
+ * status `401` counts as auth only when it is not part of a longer number (port, id, count).
+ */
+function conciseError(err: unknown, harness: string): string {
+  const raw = err instanceof Error ? err.message : String(err);
+  const msg = raw.split("\n")[0].trim();
+  const keyHint = harness === "claude" ? "the project's Anthropic key" : "the project's OpenAI key";
+  if (/credit balance is too low/i.test(raw)) {
+    return `${harness}: the model provider account has insufficient credit (check ${keyHint}).`;
+  }
+  // Digit guards around 401: "127.0.0.1:14010" or "24012 tokens" must not read as auth errors.
+  if (/authentication required|invalid api key|(?<!\d)401(?!\d)|unauthorized/i.test(raw)) {
+    return `${harness}: model authentication failed — add ${keyHint} to the project vault, or log in (OAuth).`;
+  }
+  return msg || "agent run failed";
+}
+
+/**
+ * Translate a rivet `AgentInfo` into our capability flags. Without probed capabilities a
+ * static per-harness guess is used so tool delivery and tracing still pick a sane path.
+ * Rivet has no `usage` capability flag (usage rides on `usage_update` events), so `usage` is
+ * always true here: Pi reports usage through its extension, the other harnesses over ACP.
+ */
+function mapCapabilities(harness: string, info: any): HarnessCapabilities {
+  const probed = info?.capabilities;
+  if (!probed) {
+    // Static fallback by harness id: pi-acp does not forward MCP, Claude/Codex do.
+    const notPi = harness !== "pi";
+    return {
+      textMessages: true,
+      images: false,
+      fileAttachments: false,
+      mcpTools: notPi,
+      toolCalls: true,
+      reasoning: true,
+      planMode: notPi,
+      permissions: notPi,
+      streamingDeltas: true,
+      sessionLifecycle: true,
+      usage: true,
+    };
+  }
+  return {
+    textMessages: probed.textMessages ?? true,
+    images: !!probed.images,
+    fileAttachments: !!probed.fileAttachments,
+    mcpTools: !!probed.mcpTools,
+    toolCalls: !!probed.toolCalls,
+    reasoning: !!probed.reasoning,
+    planMode: !!probed.planMode,
+    permissions: !!probed.permissions,
+    streamingDeltas: !!probed.streamingDeltas,
+    sessionLifecycle: !!probed.sessionLifecycle,
+    usage: true,
+  };
+}
+
+/**
+ * Probe the daemon for the harness's capabilities; any failure yields the static guess.
+ */
+async function probeCapabilities(
+  sandbox: any,
+  harness: string,
+): Promise<HarnessCapabilities> {
+  return Promise.resolve()
+    .then(() => sandbox.getAgent(harness, { config: true }))
+    .then((agentInfo) => mapCapabilities(harness, agentInfo))
+    .catch(() => mapCapabilities(harness, undefined));
+}
+
+/**
+ * Run one agent turn on the selected harness over ACP, inside a local or Daytona sandbox.
+ * Failures after the sandbox starts resolve to `{ ok: false, error }`; teardown always runs.
+ */
+export async function runRivet(
+  request: AgentRunRequest,
+  emit?: EmitEvent,
+  signal?: AbortSignal,
+): Promise<AgentRunResult> {
+  const harness = request.harness || process.env.AGENTA_AGENT_HARNESS || "pi";
+  const sandboxId = request.sandbox || process.env.AGENTA_AGENT_SANDBOX || "local";
+
+  const prompt = resolvePrompt(request);
+  if (!prompt) {
+    return { ok: false, error: "No user message to send (prompt/messages empty)." };
+  }
+  // What we actually send over ACP: the latest turn, with prior turns replayed as
+  // context when this is a continued conversation.
+  const turnText = buildTurnText(request);
+
+  const isPi = harness === "pi";
+  const isDaytona = sandboxId === "daytona";
+
+  // Provider API keys resolved from the vault (OPENAI_API_KEY/ANTHROPIC_API_KEY/...).
+  // Present => the harness authenticates with the key; absent => it uses its own login
+  // (OAuth: local Codex / a mounted-or-uploaded auth.json).
+  const secrets = request.secrets ?? {};
+  const harnessKeyVar = harness === "claude" ? "ANTHROPIC_API_KEY" : "OPENAI_API_KEY";
+  const hasApiKey = !!secrets[harnessKeyVar];
+
+  // Session cwd holds AGENTS.md. Local: a host temp dir. Daytona: an in-sandbox path
+  // (the host path would not exist on the remote sandbox).
+  const cwd = isDaytona
+    ? `/home/sandbox/agenta-${randomBytes(6).toString("hex")}`
+    : mkdtempSync(join(tmpdir(), "agenta-rivet-"));
+  const agentsMd = request.agentsMd?.trim();
+
+  const toolSpecsForRun = (request.customTools as ResolvedToolSpec[]) ?? [];
+  const executableToolSpecsForRun = executableToolSpecs(toolSpecsForRun);
+  const relayDir = `${cwd}/.agenta-tools`;
+  const useToolRelay = executableToolSpecsForRun.length > 0;
+
+  // Pi writes its run totals here on agent_end; we read them back and return them so the
+  // caller can roll them onto the workflow span (separate OTLP batch, see piExtension).
+  const usageOutPath = isPi ? `${cwd}/.agenta-usage.json` : undefined;
+
+  const env = buildDaemonEnv(harness);
+  Object.assign(env, secrets); // local daemon inherits the provider keys
+  // Pi self-instruments locally: propagate the trace context + public tool metadata into Pi
+  // via the Agenta extension. Tool execution always relays back to this runner, which keeps
+  // private specs, scoped env, callback endpoints, and callback auth in memory.
+  const piExtEnv = isPi
+    ? buildPiExtensionEnv(request, !isDaytona, { relayDir, usageOutPath })
+    : {};
+  Object.assign(env, piExtEnv); // local daemon inherits it; daytona gets it via envVars
+  // undefined is fine: the local provider runs its own resolution and errors clearly.
+  const binaryPath = resolveDaemonBinary();
+
+  // For local Pi, install the extension into the agent dir Pi loads from.
+  const localPiAgentDir = process.env.PI_CODING_AGENT_DIR;
+  if (isPi && !isDaytona && localPiAgentDir) installPiExtensionLocal(localPiAgentDir);
+
+  // Pi's system-prompt overrides (systemPrompt / appendSystemPrompt) are honored on the
+  // in-process Pi engine via the resource loader. The ACP path drives Pi through pi-acp,
+  // which gives us no per-run hook to set them (a project SYSTEM.md is trust-gated, and CLI
+  // flags can't be set per session here), so they are not delivered yet. Warn rather than
+  // drop them silently. AGENTS.md still applies on this path regardless.
+  if (isPi && (request.systemPrompt?.trim() || request.appendSystemPrompt?.trim())) {
+    log("systemPrompt/appendSystemPrompt are not yet delivered on the ACP (rivet) Pi path; ignored");
+  }
+
+  log(`harness=${harness} sandbox=${sandboxId} cwd=${cwd}`);
+
+  // Persist events in-process so a follow-up turn can resume by session id.
+  const persist = new InMemorySessionPersistDriver();
+  const sandbox = await SandboxAgent.start({
+    sandbox: buildSandboxProvider(sandboxId, env, binaryPath, piExtEnv, secrets),
+    persist,
+    // Propagate caller cancellation (a client disconnect on the streaming HTTP edge) so an
+    // in-flight run aborts instead of finishing unobserved. The `finally` still disposes.
+    ...(signal ? { signal } : {}),
+    // Daytona's preview proxy authenticates with a per-sandbox cookie; carry it across
+    // requests so ACP calls after the first don't 401. Harmless for local.
+    ...(isDaytona ? { fetch: createCookieFetch() } : {}),
+  });
+
+  // Pi traces itself via the extension under the propagated traceparent; for other
+  // harnesses we build the span tree here from the ACP event stream. Created below, once
+  // the model is resolved, so the chat span carries the harness's actual model rather
+  // than the requested one. Declared here so the catch can flush a partial trace.
+  let otel: ReturnType<typeof createRivetOtel> | undefined;
+  // Daytona tool relay loop (started once the session exists, stopped after the prompt).
+  let toolRelay: { stop: () => Promise<void> } | undefined;
+
+  try {
+    // On Daytona, push the harness login, the extension, and AGENTS.md into the remote
+    // sandbox via the filesystem API (nothing secret is baked into the image). Locally
+    // these use the host filesystem and the harness's own login (PI_CODING_AGENT_DIR).
+    if (isDaytona) {
+      if (isPi) {
+        // With a provider API key the harness authenticates via env; only fall back to
+        // uploading the Codex/OAuth login when no key is available.
+        if (!hasApiKey) await uploadPiAuthToSandbox(sandbox);
+        await uploadPiExtensionToSandbox(sandbox, DAYTONA_PI_DIR);
+        if (DAYTONA_PI_INSTALL) await installPiInSandbox(sandbox);
+      }
+      await sandbox.mkdirFs({ path: cwd }).catch(() => {});
+      if (useToolRelay) await sandbox.mkdirFs({ path: relayDir }).catch(() => {});
+      if (agentsMd) await sandbox.writeFsFile({ path: `${cwd}/AGENTS.md` }, agentsMd);
+    } else {
+      if (useToolRelay) mkdirSync(relayDir, { recursive: true });
+      if (agentsMd) writeFileSync(join(cwd, "AGENTS.md"), agentsMd, "utf-8");
+    }
+
+    // Probe the harness and branch on capabilities, not on its name. Tool delivery: Pi
+    // loads our extension (native tools, set up above); any other harness gets the
+    // synthesized `agenta-tools` server (gateway/code tools) and the user-declared servers
+    // over MCP only when it advertises `mcpTools` (pi-acp does not forward MCP, Claude/Codex do).
+    const capabilities = await probeCapabilities(sandbox, harness);
+    const userMcpCount = request.mcpServers?.length ?? 0;
+    const mcpServers =
+      !isPi && capabilities.mcpTools
+        ? [
+            ...buildToolMcpServers(
+              toolSpecsForRun,
+              request.toolCallback as ToolCallbackContext | undefined,
+              relayDir,
+            ),
+            ...toAcpMcpServers(request.mcpServers),
+          ]
+        : [];
+    if (!isPi && (toolSpecsForRun.length > 0 || userMcpCount > 0) && !capabilities.mcpTools) {
+      log(
+        `harness '${harness}' lacks MCP support; ${toolSpecsForRun.length} tool(s) and ` +
+          `${userMcpCount} user MCP server(s) not delivered`,
+      );
+    }
+
+    const session = await sandbox.createSession({
+      agent: harness,
+      cwd,
+      sessionInit: { cwd, mcpServers },
+    });
+    const sessionId = resolveRunSessionId(request, session.id);
+
+    // Resolve the model first: when the harness rejects the requested id and keeps its
+    // own default (e.g. Claude ignores "gpt-5.5"), `model` is undefined and the chat span
+    // is labelled "chat" instead of falsely claiming the requested model.
+    const model = await applyModel(session, request.model);
+
+    const run = createRivetOtel({
+      harness,
+      model,
+      traceparent: request.trace?.traceparent,
+      baggage: request.trace?.baggage,
+      endpoint: request.trace?.endpoint,
+      authorization: request.trace?.authorization,
+      captureContent: request.trace?.captureContent,
+      emitSpans: !isPi || isDaytona,
+      emit,
+    });
+    otel = run;
+
+    run.start({
+      prompt,
+      sessionId,
+      messages: [...priorMessages(request), { role: "user", content: prompt }],
+    });
+
+    session.onEvent((event: any) => {
+      const payload = event?.payload;
+      const update = payload?.params?.update ?? payload?.update;
+      if (update) run.handleUpdate(update);
+    });
+
+    // Permission gating, behind the Responder seam. Pi never gates; a permission-gating
+    // harness (e.g. Claude) raises a request, which we (a) surface as an `interaction_request`
+    // event so the egress can project it (Vercel `tool-approval-request`) and the trace can
+    // record it, and (b) resolve via the responder. The headless `PolicyResponder` keeps the
+    // prior behavior: auto-allow trusted backend tools, or deny per `permissionPolicy` /
+    // AGENTA_RIVET_DENY_PERMISSIONS. A cross-turn responder (true HITL) slots in here later
+    // without touching the harness. Tools are backend-resolved and trusted; the run is headless.
+    const responder: Responder = new PolicyResponder(policyFromRequest(request.permissionPolicy));
+    session.onPermissionRequest((req: any) => {
+      const id = String(req?.id ?? "");
+      const availableReplies: string[] = req?.availableReplies ?? [];
+      run.emitEvent({
+        type: "interaction_request",
+        id, // ACP permission id -> Vercel approvalId
+        kind: "permission",
+        payload: {
+          // toolCallId of the gated tool, so the cross-turn approval reply correlates back to
+          // its tool call (and the #6 resume finds it). `toolCall` is the ACP ToolCallUpdate.
+          toolCallId: req?.toolCall?.toolCallId,
+          toolCall: req?.toolCall,
+          availableReplies,
+          options: req?.options,
+        },
+      });
+      void responder
+        .onPermission({ id, availableReplies, raw: req })
+        .then((decision) => {
+          if (!req?.id) return;
+          return session.respondPermission(req.id, decisionToReply(decision, availableReplies) as any);
+        })
+        .catch(() => {});
+    });
+
+    if (useToolRelay) {
+      toolRelay = startToolRelay(
+        isDaytona ? sandboxRelayHost(sandbox) : localRelayHost(),
+        relayDir,
+        toolSpecsForRun,
+        request.toolCallback as ToolCallbackContext | undefined,
+      );
+    }
+
+    const result = await session.prompt([{ type: "text", text: turnText }]);
+    await toolRelay?.stop();
+    const stopReason = (result as any)?.stopReason;
+    log(`prompt stopReason=${stopReason}`);
+
+    // Usage: Pi writes its totals to a file via the extension. Other harnesses report the
+    // input/output token split on the PromptResponse and the cost on ACP `usage_update`,
+    // so combine the two (the stream alone carries no per-call token split). Read and stamp
+    // this before finish/flush so exported spans and final events carry the final usage.
+    let usage = await readRunUsage(sandbox, usageOutPath, isDaytona);
+    if (!usage) {
+      const promptUsage = (result as any)?.usage;
+      const streamUsage = run.usage();
+      const inputTokens = promptUsage?.inputTokens ?? streamUsage?.input ?? 0;
+      const outputTokens = promptUsage?.outputTokens ?? streamUsage?.output ?? 0;
+      const total = inputTokens + outputTokens || streamUsage?.total || 0;
+      const cost = streamUsage?.cost ?? 0;
+      usage =
+        total > 0 || cost > 0
+          ? { input: inputTokens, output: outputTokens, total, cost }
+          : undefined;
+    }
+    run.setUsage(usage);
+
+    const output = run.finish();
+    await run.flush();
+
+    return {
+      ok: true,
+      output,
+      messages: output ? [{ role: "assistant", content: output }] : [],
+      // Streaming already delivered every event live, so the terminal result carries none
+      // (re-sending would double them on the consumer).
+      events: emit ? [] : run.events(),
+      usage,
+      stopReason,
+      // `streamingDeltas` advertises end-to-end live deltas, which is only true when a live
+      // sink is wired. The one-shot path reports false even when the harness produces deltas.
+      capabilities: { ...capabilities, streamingDeltas: !!emit && capabilities.streamingDeltas },
+      sessionId,
+      model: model ?? request.model,
+      traceId: run.traceId(),
+    };
+  } catch (err) {
+    otel?.finish();
+    await otel?.flush().catch(() => {});
+    return { ok: false, error: conciseError(err, harness) };
+  } finally {
+    await toolRelay?.stop().catch(() => {});
+    await sandbox.destroySandbox().catch(() => {});
+    await sandbox.dispose().catch(() => {});
+    if (!isDaytona) rmSync(cwd, { recursive: true, force: true }); // Daytona's cwd is remote
+  }
+}
diff --git a/services/agent/src/extensions/agenta.ts b/services/agent/src/extensions/agenta.ts
new file mode 100644
index 0000000000..85b88a79ad
--- /dev/null
+++ b/services/agent/src/extensions/agenta.ts
@@ -0,0 +1,114 @@
+/**
+ * Agenta Pi extension (WP-8): tracing + tools, installed into Pi's agent dir and loaded
+ * by Pi when it runs under rivet (`pi --mode rpc` via pi-acp).
+ *
+ * This is how we keep WP-1/WP-2/WP-7 behavior on the rivet path: instead of a synthetic,
+ * coarse tracer in the runner, we propagate the caller's trace context INTO Pi and let
+ * Pi emit its real span tree (turn / chat / tool, with token usage) under that parent —
+ * and we deliver tools the Pi-native way (`registerTool`), each routing back to Agenta's
+ * /tools/call, rather than over MCP. Pi is highly customizable; this leans on that.
+ *
+ * Everything is read from the environment (injected at the daemon's birth). Tool env is
+ * public-only; execution relays back to the runner, where private specs/auth stay in memory:
+ *   AGENTA_TRACEPARENT            W3C traceparent of the caller's /invoke span
+ *   AGENTA_OTLP_ENDPOINT          OTLP traces URL (e.g. https://host/api/otlp/v1/traces)
+ *   AGENTA_OTLP_AUTHORIZATION     Authorization header for the OTLP export
+ *   AGENTA_CAPTURE_CONTENT        "false" to drop prompt/completion/tool I/O from spans
+ *   AGENTA_TOOL_PUBLIC_SPECS      JSON [{ name, description, inputSchema }]
+ *   AGENTA_TOOL_RELAY_DIR         relay tool calls through the runner via files here
+ *   AGENTA_USAGE_OUT              file the run's usage totals are written to (JSON) on agent_end
+ *
+ * Bundled self-contained (esbuild) so its OpenTelemetry deps resolve wherever Pi loads
+ * it (local, the docker sidecar, a Daytona snapshot). Default export is the Pi
+ * ExtensionFactory.
+ */
+import { writeFileSync } from "node:fs";
+
+import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
+
+import { createAgentaOtel } from "../tracing/otel.ts";
+import type { ResolvedToolSpec } from "../protocol.ts";
+import { EMPTY_OBJECT_SCHEMA } from "../tools/callback.ts";
+import { runResolvedTool } from "../tools/dispatch.ts";
+
+function log(message: string): void {
+  process.stderr.write("[agenta-pi-ext] " + message + "\n");
+}
+
+/** Register public tool metadata as Pi tools that relay to the runner (bad JSON: log, skip). */
+function registerTools(pi: ExtensionAPI): void {
+  const raw = process.env.AGENTA_TOOL_PUBLIC_SPECS;
+  const relayDir = process.env.AGENTA_TOOL_RELAY_DIR;
+  if (!raw || !relayDir) return;
+
+  let specs: ResolvedToolSpec[];
+  try {
+    const parsed: unknown = JSON.parse(raw);
+    if (!Array.isArray(parsed)) throw new Error("expected a JSON array of tool specs");
+    specs = parsed as ResolvedToolSpec[];
+  } catch (err) {
+    log(`bad AGENTA_TOOL_PUBLIC_SPECS: ${(err as Error).message}`);
+    return;
+  }
+
+  for (const spec of specs) {
+    pi.registerTool({
+      name: spec.name,
+      label: spec.name,
+      description: spec.description ?? spec.name,
+      // Pi accepts plain JSON Schema here (non-TypeBox validation path).
+      parameters: (spec.inputSchema as any) ?? EMPTY_OBJECT_SCHEMA,
+      async execute(toolCallId: string, params: unknown, signal?: AbortSignal) {
+        const text = await runResolvedTool(spec, params, {
+          toolCallId,
+          relayDir,
+          signal,
+        });
+        return {
+          content: [{ type: "text", text }],
+          details: { toolName: spec.name },
+        };
+      },
+    } as any);
+  }
+  log(`registered ${specs.length} tool(s) -> relay ${relayDir}`);
+}
+
+/** Pi ExtensionFactory: wires tools, tracing and the usage writeback, each driven by env. */
+const factory = (pi: ExtensionAPI): void => {
+  const env = process.env;
+  const tracingOn = Boolean(env.AGENTA_TRACEPARENT || env.AGENTA_OTLP_ENDPOINT);
+  const toolsOn = Boolean(env.AGENTA_TOOL_PUBLIC_SPECS && env.AGENTA_TOOL_RELAY_DIR);
+  const usagePath = env.AGENTA_USAGE_OUT;
+
+  // With no Agenta env at all, nothing below runs, so installing this globally in a shared
+  // Pi agent dir is safe: a normal `pi` session is left untouched.
+  if (toolsOn) registerTools(pi);
+
+  // The otel state does two jobs: exporting the span tree (when the OTLP target is
+  // reachable, i.e. local runs) and accumulating the usage totals the runner reads back on
+  // Daytona (where the in-sandbox process can't reach Agenta's OTLP and the runner traces
+  // from the event stream instead). Build it when either job applies; export only when
+  // tracing is on.
+  if (!tracingOn && !usagePath) return;
+
+  const otel = createAgentaOtel({
+    traceparent: env.AGENTA_TRACEPARENT,
+    endpoint: env.AGENTA_OTLP_ENDPOINT,
+    authorization: env.AGENTA_OTLP_AUTHORIZATION,
+    captureContent: env.AGENTA_CAPTURE_CONTENT !== "false",
+  });
+  otel.register(pi); // lifecycle handlers (spans + usage accumulation)
+
+  pi.on("agent_end", async () => {
+    if (tracingOn) await otel.flush(); // invoke_agent has a remote parent → flush by id
+    if (!usagePath) return;
+    try {
+      writeFileSync(usagePath, JSON.stringify(otel.usage()), "utf-8");
+    } catch (err) {
+      log(`usage writeback skipped: ${(err as Error).message}`);
+    }
+  });
+};
+
+export default factory;
diff --git a/services/agent/src/responder.ts b/services/agent/src/responder.ts
new file mode 100644
index 0000000000..6af4132841
--- /dev/null
+++ b/services/agent/src/responder.ts
@@ -0,0 +1,77 @@
+/**
+ * The interaction responder seam.
+ *
+ * A harness (the ACP "Agent") does not only emit tool calls. It also raises typed
+ * reverse-RPC interaction requests that something must answer: permission gates today,
+ * elicitation (input) and client-side tools later. Previously the rivet runner answered the
+ * permission gate inline with a hardcoded auto-approve. This module lifts that decision
+ * behind a `Responder` interface so it is pluggable:
+ *
+ *   - `PolicyResponder` is the headless answer (a fixed `auto` / `deny` policy, no human).
+ *     It reproduces the previous behavior exactly and is what `/invoke` uses.
+ *   - A cross-turn responder (the `/messages` HITL path) slots in here later: it surfaces the
+ *     request to the browser, ends the turn, and resolves on the next turn's reply. The
+ *     harness adapter does not change when the responder does.
+ *
+ * Resolution is modeled as `allow` / `deny`; the adapter maps that onto the harness's
+ * available ACP replies via `decisionToReply`.
+ */
+
+export type PermissionPolicy = "auto" | "deny"; // fixed headless policy: allow all or deny all
+
+export type PermissionDecision = "allow" | "deny"; // a responder's answer to one gate
+
+/** A permission gate raised by the harness, normalized from the ACP request. */
+export interface PermissionRequest {
+  /** The ACP permission id; reused as the `interaction_request` event id for reply matching. */
+  id: string;
+  /** Replies the harness offers (e.g. "always" | "once" | "reject"); the reply sent is one of these names. */
+  availableReplies: string[];
+  /** The original ACP request, untyped, for responders that want the tool-call detail. */
+  raw?: unknown;
+}
+
+/**
+ * Answers interaction requests the harness raises, asynchronously. Permission is the only
+ * kind wired today and resolves to a plain allow/deny; `input` (elicitation) and
+ * `client_tool` are forward-looking and will extend this interface alongside the
+ * cross-turn responder.
+ */
+export interface Responder {
+  onPermission(request: PermissionRequest): Promise<PermissionDecision>;
+}
+
+/** Headless responder: every gate gets the same fixed-policy answer; no human is consulted. */
+export class PolicyResponder implements Responder {
+  constructor(private readonly policy: PermissionPolicy) {}
+
+  async onPermission(_request: PermissionRequest): Promise<PermissionDecision> {
+    return this.policy !== "deny" ? "allow" : "deny";
+  }
+}
+
+/**
+ * Pick the run's permission policy. Deny wins when the request carries
+ * `permissionPolicy: "deny"` or `AGENTA_RIVET_DENY_PERMISSIONS` is exactly "true"; otherwise
+ * auto-allow, because backend-resolved tools are trusted and the run is headless.
+ */
+export function policyFromRequest(permissionPolicy?: string): PermissionPolicy {
+  const deniedByRequest = permissionPolicy === "deny";
+  const deniedByEnv = process.env.AGENTA_RIVET_DENY_PERMISSIONS === "true";
+  const deny = deniedByRequest || deniedByEnv;
+  return deny ? "deny" : "auto";
+}
+
+/** Translate an allow/deny decision into one of the ACP replies the harness offers. */
+export function decisionToReply(
+  decision: PermissionDecision,
+  availableReplies: string[],
+): string {
+  // The offered reply with this exact name, if the harness lists it.
+  const offered = (name: string): string | undefined =>
+    availableReplies.find((reply) => reply === name);
+
+  // Deny always resolves to "reject", listed or not.
+  if (decision === "deny") return offered("reject") ?? "reject";
+  // Allow takes the broadest grant on offer and falls back to "once".
+  return offered("always") ?? offered("once") ?? "once";
+}
diff --git a/services/agent/src/server.ts b/services/agent/src/server.ts
new file mode 100644
index 0000000000..aae23c4480
--- /dev/null
+++ b/services/agent/src/server.ts
@@ -0,0 +1,155 @@
+/**
+ * WP-2 Pi wrapper HTTP server: the HTTP transport for the Harness port.
+ *
+ * Same contract as the CLI, exposed over HTTP so the wrapper can run as its own
+ * container (a sidecar) that the Python service calls in-network:
+ *
+ *   GET  /health -> { status: "ok" }
+ *   POST /run    -> body is an AgentRunRequest, response is an AgentRunResult
+ *
+ * Uses Node's built-in http server (no framework dependency). Pi auth comes from
+ * PI_CODING_AGENT_DIR / ~/.pi/agent, mounted into the container.
+ */
+import { createServer, type IncomingMessage, type ServerResponse } from "node:http";
+
+import type {
+  AgentRunRequest,
+  AgentRunResult,
+  EmitEvent,
+  StreamRecord,
+} from "./protocol.ts";
+import { runPi } from "./engines/pi.ts";
+import { runRivet } from "./engines/rivet.ts";
+
+const PORT = Number(process.env.PORT || 8765); // `||`: an empty PORT= must not become port 0
+
+// Select the engine. `rivet` drives a harness over ACP via a rivet daemon; `pi` is the
+// legacy in-process Pi path. The request's explicit `backend` (set by the Python
+// transport) wins; the AGENT_BACKEND env is the sidecar default; `auto` (or any other
+// value) falls back to the request shape (a rivet request carries `harness`/`sandbox`).
+const DEFAULT_BACKEND = (process.env.AGENT_BACKEND ?? "auto").toLowerCase();
+
+function runAgent(
+  request: AgentRunRequest,
+  emit?: EmitEvent,
+  signal?: AbortSignal,
+): Promise<AgentRunResult> {
+  let engine = (request.backend ?? DEFAULT_BACKEND).toLowerCase();
+  // Anything other than an explicit engine name is decided by the request shape.
+  if (engine !== "rivet" && engine !== "pi") {
+    engine = request.harness || request.sandbox ? "rivet" : "pi";
+  }
+  return engine === "rivet" ? runRivet(request, emit, signal) : runPi(request, emit);
+}
+
+/**
+ * Run the request and stream it as NDJSON: a `{kind:"event"}` line for each event as soon as
+ * it is emitted, followed by exactly one closing `{kind:"result"}` line (success or failure).
+ * Callers opt in with `Accept: application/x-ndjson`; the one-shot `/run` path is untouched.
+ */
+async function runAndStream(
+  req: IncomingMessage,
+  res: ServerResponse,
+  request: AgentRunRequest,
+): Promise<void> {
+  res.writeHead(200, {
+    "content-type": "application/x-ndjson",
+    "cache-control": "no-cache",
+    "x-accel-buffering": "no",
+    connection: "keep-alive",
+  });
+
+  // Cancel the in-flight run when the client goes away instead of finishing unobserved.
+  // The hook is on `res`, not `req`: the request body has already been consumed, so the
+  // request's `close` can fire early on a keep-alive connection. The response's `close`
+  // fires when its connection ends — after our own `res.end()` (harmless, the run is
+  // over) or when the client drops mid-stream (the case that should cancel).
+  const abort = new AbortController();
+  res.on("close", () => abort.abort());
+
+  // Records are dropped once the response has been ended.
+  const writeLine = (record: StreamRecord): void => {
+    if (!res.writableEnded) res.write(JSON.stringify(record) + "\n");
+  };
+  const emit: EmitEvent = (event) => writeLine({ kind: "event", event });
+
+  let outcome: AgentRunResult;
+  try {
+    outcome = await runAgent(request, emit, abort.signal);
+  } catch (err) {
+    const detail = err instanceof Error ? err.stack ?? err.message : String(err);
+    outcome = { ok: false, error: detail };
+  }
+  // Events already went out live; the terminal record must not repeat them.
+  writeLine({ kind: "result", result: { ...outcome, events: [] } });
+  res.end();
+}
+
+/** Send `body` as one complete JSON response with the given status. */
+function send(res: ServerResponse, status: number, body: unknown): void {
+  const json = JSON.stringify(body);
+  // Length is counted in bytes, not characters, so non-ASCII payloads are framed correctly.
+  const length = Buffer.byteLength(json);
+  res.writeHead(status, { "content-type": "application/json", "content-length": length });
+  res.end(json);
+}
+
+/** Collect the whole request body and decode it as UTF-8. */
+async function readBody(req: IncomingMessage): Promise<string> {
+  const parts: Buffer[] = [];
+  for await (const part of req) parts.push(part as Buffer);
+  // Join the bytes first, then decode once.
+  return Buffer.concat(parts).toString("utf8");
+}
+
+const server = createServer(async (req, res) => {
+  try {
+    if (req.method === "GET" && req.url === "/health") {
+      return send(res, 200, { status: "ok" });
+    }
+
+    if (req.method === "POST" && req.url === "/run") {
+      const raw = await readBody(req);
+      let request: AgentRunRequest;
+      try {
+        request = raw.trim() ? (JSON.parse(raw) as AgentRunRequest) : {};
+      } catch (err) {
+        return send(res, 400, { ok: false, error: `Invalid JSON: ${String(err)}` });
+      }
+      // Valid JSON that is not an object (`null`, `[]`, `42`) is a client error, not a 500.
+      if (request === null || typeof request !== "object" || Array.isArray(request)) {
+        return send(res, 400, { ok: false, error: "Request body must be a JSON object" });
+      }
+
+      const wantsStream = (req.headers["accept"] ?? "").includes("application/x-ndjson");
+      if (wantsStream) return await runAndStream(req, res, request);
+
+      const result = await runAgent(request);
+      return send(res, result.ok ? 200 : 500, result);
+    }
+
+    return send(res, 404, { ok: false, error: "Not found" });
+  } catch (err) {
+    const message = err instanceof Error ? err.stack ?? err.message : String(err);
+    if (res.headersSent) return void res.end(); // stream already started: cannot re-send headers
+    return send(res, 500, { ok: false, error: message });
+  }
+});
+
+// The rivet SDK can reject a background promise (e.g. an adapter install or the Daytona
+// preview SSE failing) outside any awaited path. Node's default turns that into an
+// uncaught exception that kills the whole process, taking every in-flight request with it
+// (the caller sees "Server disconnected"). Log and keep serving instead; the failing run
+// still returns its own error. Thrown values need not be Errors, so format defensively.
+const describeFailure = (e: unknown): string =>
+  e instanceof Error ? (e.stack ?? e.message) : String(e);
+process.on("unhandledRejection", (reason) => {
+  process.stderr.write(`[pi-wrapper] unhandledRejection: ${describeFailure(reason)}\n`);
+});
+process.on("uncaughtException", (err) => {
+  process.stderr.write(`[pi-wrapper] uncaughtException: ${describeFailure(err)}\n`);
+});
+
+server.listen(PORT, () => {
+  process.stderr.write(`[pi-wrapper] http server listening on :${PORT}\n`); // once bound
+});
diff --git a/services/agent/src/tools/dispatch.ts b/services/agent/src/tools/dispatch.ts
index f948501265..fd68a87b72 100644
--- a/services/agent/src/tools/dispatch.ts
+++ b/services/agent/src/tools/dispatch.ts
@@ -54,7 +54,7 @@ export interface RunResolvedToolOpts {
  */
 export async function relayToolCall(
   dir: string,
-  callRef: string,
+  toolName: string,
   toolCallId: string,
   params: unknown,
   signal?: AbortSignal,
@@ -67,7 +67,7 @@ export async function relayToolCall(
   } catch {
     // The runner also creates it; a race here is harmless.
   }
-  writeFileSync(reqPath, JSON.stringify({ callRef, toolCallId, args: params ?? {} }), "utf-8");
+  writeFileSync(reqPath, JSON.stringify({ toolName, toolCallId, args: params ?? {} }), "utf-8");
 
   const deadline = Date.now() + RELAY_TIMEOUT_MS;
   while (Date.now() < deadline) {
@@ -116,7 +116,7 @@ export async function runResolvedTool(
   }
   // callback (default): route back to Agenta's /tools/call (directly or via the Daytona relay).
   if (opts.relayDir) {
-    return relayToolCall(opts.relayDir, spec.callRef ?? "", opts.toolCallId, params, opts.signal);
+    return relayToolCall(opts.relayDir, spec.name, opts.toolCallId, params, opts.signal);
   }
   return callAgentaTool(
     opts.endpoint ?? "",
diff --git a/services/agent/src/tools/mcp-bridge.ts b/services/agent/src/tools/mcp-bridge.ts
index eaf5683a4d..c94230319b 100644
--- a/services/agent/src/tools/mcp-bridge.ts
+++ b/services/agent/src/tools/mcp-bridge.ts
@@ -3,20 +3,20 @@
  *
  * The Pi engine (engines/pi.ts) injected resolved runnable tools (WP-7) as in-process Pi
  * customTools. Over ACP the harness only accepts tools through MCP, so the same
- * resolved specs are exposed as an MCP server whose tool bodies POST back to Agenta's
- * /tools/call (the provider key and connection auth stay server-side, exactly as in
- * the Pi path). `buildToolMcpServers` returns the ACP `mcpServers` entry to attach to
- * the session.
+ * resolved specs are exposed as an MCP server whose tool bodies relay back to the runner.
+ * The runner keeps private specs/auth in memory and performs the actual execution.
+ * `buildToolMcpServers` returns the ACP `mcpServers` entry to attach to the session.
  *
- * Delivery: a stdio MCP bridge (mcp-server.ts) launched by the daemon. The specs and
- * callback are passed to it as env, so nothing tool-specific is written to the
- * agent-visible filesystem.
+ * Delivery: a stdio MCP bridge (mcp-server.ts) launched by the daemon. Its env carries
+ * only public tool metadata and the relay directory. It never receives scoped env, code,
+ * callback auth, or callback endpoints.
  */
 import { existsSync } from "node:fs";
 import { dirname, join } from "node:path";
 import { fileURLToPath } from "node:url";
 
 import type { ResolvedToolSpec, ToolCallbackContext } from "../protocol.ts";
+import { executableToolSpecs, publicToolSpecs } from "./public-spec.ts";
 
 export type { ResolvedToolSpec, ToolCallbackContext } from "../protocol.ts";
 
@@ -56,47 +56,35 @@ export interface McpServerStdio {
  *    filters them from tools/list), so they never justify attaching the bridge on their own.
  *  - "Executable here" = non-client (`code` and `callback`). With zero executable specs we
  *    return [] (the no-tools path stays untouched).
- *  - `code` tools run locally in mcp-server.ts (runCodeTool) and need NO callback endpoint, so
- *    we attach `agenta-tools` whenever there is at least one executable spec.
- *  - Only `callback` tools require `callback.endpoint`. If callback tools are present but the
- *    endpoint is missing, we do NOT drop the whole server (that would silently lose the `code`
- *    tools too): we still attach it and warn, naming the callback tools whose `tools/call` will
- *    fail. The endpoint/auth env entries are pushed only when the endpoint actually exists.
+ *  - The bridge does not execute tools itself. It sends a request file to `relayDir`, and
+ *    the runner executes the private resolved spec in memory. That keeps scoped env, code,
+ *    callback auth, and callback endpoints out of child-process env.
  */
 export function buildToolMcpServers(
   specs: ResolvedToolSpec[],
-  callback: ToolCallbackContext | undefined,
+  _callbackOrRelayDir?: ToolCallbackContext | string,
+  relayDir?: string,
 ): McpServerStdio[] {
   if (!specs || specs.length === 0) return [];
 
   // Absent kind defaults to `callback` (back-compat); `client` is the only non-executable kind.
-  const executable = specs.filter((s) => (s.kind ?? "callback") !== "client");
+  const executable = executableToolSpecs(specs);
   if (executable.length === 0) return [];
 
-  // The callback subset is the only thing that needs the endpoint to function.
-  const callbackSpecs = executable.filter((s) => (s.kind ?? "callback") === "callback");
-  const hasEndpoint = Boolean(callback?.endpoint);
-
-  if (callbackSpecs.length > 0 && !hasEndpoint) {
-    const names = callbackSpecs.map((s) => s.name).join(", ");
+  const resolvedRelayDir =
+    typeof _callbackOrRelayDir === "string" ? _callbackOrRelayDir : relayDir;
+  if (!resolvedRelayDir) {
+    const names = executable.map((s) => s.name).join(", ");
     process.stderr.write(
-      `[tool-bridge] missing toolCallback endpoint: ${callbackSpecs.length} callback tool(s) ` +
-        `will fail (${names}); still attaching server for the other tool(s)\n`,
+      `[tool-bridge] missing tool relay directory: ${executable.length} tool(s) ` +
+        `will fail (${names})\n`,
     );
   }
 
-  // Pass every executable spec; mcp-server.ts dispatches per kind (code runs locally, callback
-  // routes to the endpoint).
   const env: EnvVariable[] = [
-    { name: "AGENTA_TOOL_SPECS", value: JSON.stringify(executable) },
+    { name: "AGENTA_TOOL_PUBLIC_SPECS", value: JSON.stringify(publicToolSpecs(executable)) },
   ];
-  // Only carry the callback env when there is an endpoint to call back to.
-  if (hasEndpoint) {
-    env.push({ name: "AGENTA_TOOL_CALLBACK_ENDPOINT", value: callback!.endpoint });
-    if (callback!.authorization) {
-      env.push({ name: "AGENTA_TOOL_CALLBACK_AUTH", value: callback!.authorization });
-    }
-  }
+  if (resolvedRelayDir) env.push({ name: "AGENTA_TOOL_RELAY_DIR", value: resolvedRelayDir });
 
   const { command, args } = bridgeLauncher();
   return [{ name: "agenta-tools", command, args, env }];
diff --git a/services/agent/src/tools/mcp-server.ts b/services/agent/src/tools/mcp-server.ts
index 98a240c50e..5628423c77 100644
--- a/services/agent/src/tools/mcp-server.ts
+++ b/services/agent/src/tools/mcp-server.ts
@@ -3,14 +3,13 @@
  *
  * The harness only accepts tools over MCP when driven via ACP. This is a minimal,
  * dependency-free MCP stdio server that exposes the backend-resolved runnable tools
- * (WP-7) and routes each tool call back through Agenta's /tools/call — so the Composio
- * key and connection auth stay server-side, exactly as in the in-process Pi path.
+ * (WP-7) and relays each tool call back to the runner — so private specs/auth stay in
+ * runner memory, exactly as in the in-process Pi path.
  *
- * Launched by the rivet daemon as a session MCP server (see mcp-bridge.ts). It reads
- * everything from env so nothing tool-specific is written to the agent filesystem:
- *   AGENTA_TOOL_SPECS            JSON array of { name, description, inputSchema, callRef }
- *   AGENTA_TOOL_CALLBACK_ENDPOINT  full /tools/call URL
- *   AGENTA_TOOL_CALLBACK_AUTH      Authorization header value (optional)
+ * Launched by the rivet daemon as a session MCP server (see mcp-bridge.ts). Its env
+ * contains only public tool metadata and the relay dir:
+ *   AGENTA_TOOL_PUBLIC_SPECS     JSON array of { name, description, inputSchema }
+ *   AGENTA_TOOL_RELAY_DIR        directory watched by the runner for tool requests
  *
  * Protocol: JSON-RPC 2.0 over stdio, newline-delimited (the MCP stdio framing). Handles
  * initialize, tools/list, tools/call; ignores notifications. stdout carries protocol
@@ -22,9 +21,8 @@ import type { ResolvedToolSpec } from "../protocol.ts";
 import { EMPTY_OBJECT_SCHEMA } from "./callback.ts";
 import { runResolvedTool } from "./dispatch.ts";
 
-const SPECS: ResolvedToolSpec[] = JSON.parse(process.env.AGENTA_TOOL_SPECS ?? "[]");
-const ENDPOINT = process.env.AGENTA_TOOL_CALLBACK_ENDPOINT ?? "";
-const AUTH = process.env.AGENTA_TOOL_CALLBACK_AUTH;
+const SPECS: ResolvedToolSpec[] = JSON.parse(process.env.AGENTA_TOOL_PUBLIC_SPECS ?? "[]");
+const RELAY_DIR = process.env.AGENTA_TOOL_RELAY_DIR;
 const SPEC_BY_NAME = new Map(SPECS.map((s) => [s.name, s]));
 const DEFAULT_PROTOCOL = "2025-06-18";
 
@@ -78,13 +76,12 @@ async function handle(message: any): Promise<unknown | undefined> {
       return { jsonrpc: "2.0", id, error: { code: -32602, message: `unknown tool: ${name}` } };
     }
     try {
-      // `code` runs the snippet locally (scoped secret env); everything else routes back to
-      // Agenta's /tools/call. A unique id per call so two parallel calls in the same
-      // millisecond don't collide (Date.now() would).
+      if (!RELAY_DIR) throw new Error("missing AGENTA_TOOL_RELAY_DIR");
+      // The bridge only has public metadata. A unique id per call keeps parallel calls from
+      // colliding while the runner maps the tool name back to its private resolved spec.
       const text = await runResolvedTool(spec, params?.arguments, {
         toolCallId: randomUUID(),
-        endpoint: ENDPOINT,
-        authorization: AUTH,
+        relayDir: RELAY_DIR,
       });
       return { jsonrpc: "2.0", id, result: { content: [{ type: "text", text }] } };
     } catch (err) {
@@ -104,7 +101,7 @@ async function handle(message: any): Promise<unknown | undefined> {
 }
 
 function main(): void {
-  log(`serving ${SPECS.length} tool(s) -> ${ENDPOINT || "(no endpoint)"}`);
+  log(`serving ${SPECS.length} tool(s) -> relay ${RELAY_DIR || "(missing)"}`);
   let buffer = "";
   process.stdin.setEncoding("utf8");
   process.stdin.on("data", (chunk: string) => {
diff --git a/services/agent/src/tools/public-spec.ts b/services/agent/src/tools/public-spec.ts
new file mode 100644
index 0000000000..01ded7d3ed
--- /dev/null
+++ b/services/agent/src/tools/public-spec.ts
@@ -0,0 +1,31 @@
+/**
+ * Public tool metadata safe to expose to harness child processes.
+ *
+ * ResolvedToolSpec also carries executor-private fields (`callRef`, `code`, scoped `env`,
+ * runtime). Those must stay in runner memory. Child processes only need the advertisement
+ * shape so the model can choose a tool; every execution is relayed back to the runner.
+ */
+import type { ResolvedToolSpec } from "../protocol.ts";
+
+export interface PublicToolSpec {
+  name: string;
+  description?: string;
+  inputSchema?: Record<string, unknown> | null;
+}
+
+/** `client` tools are browser-fulfilled and are not executable by a runner child process. */
+export function executableToolSpecs(specs: ResolvedToolSpec[]): ResolvedToolSpec[] {
+  return specs.filter((spec) => (spec.kind ?? "callback") !== "client");
+}
+
+export function publicToolSpec(spec: ResolvedToolSpec): PublicToolSpec {
+  return {
+    name: spec.name,
+    description: spec.description,
+    inputSchema: spec.inputSchema,
+  };
+}
+
+export function publicToolSpecs(specs: ResolvedToolSpec[]): PublicToolSpec[] {
+  return executableToolSpecs(specs).map(publicToolSpec);
+}
diff --git a/services/agent/src/tools/relay.ts b/services/agent/src/tools/relay.ts
index 952ff8893a..4889b110af 100644
--- a/services/agent/src/tools/relay.ts
+++ b/services/agent/src/tools/relay.ts
@@ -1,23 +1,25 @@
 /**
  * Daytona tool relay.
  *
- * On Daytona the harness runs in a remote cloud sandbox that can reach the public internet
- * but NOT a firewalled / private Agenta backend (the same reason tracing is built from the
- * event stream there instead of in-sandbox OTLP). So the in-sandbox Pi extension cannot
- * POST tool calls to Agenta's /tools/call directly.
+ * Tool child processes do not receive private resolved specs, executable code, scoped env,
+ * callback endpoints, or callback auth. They receive only public tool metadata plus this
+ * relay directory, then ask the runner to execute each call.
  *
  * The runner CAN reach Agenta (it resolved the tools and holds the callback), and it can
  * reach the sandbox filesystem over the daemon API. So tool calls are relayed through the
  * runner via files in a sandbox dir:
  *
- *   extension: write `<id>.req.json` {callRef, args}  ──▶  poll `<id>.res.json`
- *   runner:    poll the dir, read `<id>.req.json` ──▶ /tools/call ──▶ write `<id>.res.json`
+ *   child:  write `<id>.req.json` {toolName, args} ──▶ poll `<id>.res.json`
+ *   runner: poll the dir, read `<id>.req.json` ──▶ execute private spec in memory
+ *           ──▶ write `<id>.res.json`
  *
- * Local runs keep the direct path (the in-process / local-daemon extension reaches Agenta);
- * the relay is only wired when AGENTA_TOOL_RELAY_DIR is set (Daytona + Pi + tools).
+ * The same loop supports local filesystem relays and Daytona sandbox filesystem relays.
  */
+import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync } from "node:fs";
+import { dirname } from "node:path";
 import { callAgentaTool } from "./callback.ts";
-import type { ToolCallbackContext } from "../protocol.ts";
+import { runCodeTool } from "./code.ts";
+import type { ResolvedToolSpec, ToolCallbackContext } from "../protocol.ts";
 
 export const RELAY_REQ_SUFFIX = ".req.json";
 export const RELAY_RES_SUFFIX = ".res.json";
@@ -25,7 +27,7 @@ export const RELAY_POLL_MS = Number(process.env.AGENTA_TOOL_RELAY_POLL_MS ?? 300
 export const RELAY_TIMEOUT_MS = Number(process.env.AGENTA_TOOL_RELAY_TIMEOUT_MS ?? 60000);
 
 export interface RelayRequest {
-  callRef: string;
+  toolName: string;
   toolCallId: string;
   args: unknown;
 }
@@ -42,6 +44,74 @@ export function sanitizeRelayId(id: string): string {
 
 export const sleep = (ms: number): Promise<void> => new Promise((r) => setTimeout(r, ms));
 
+export interface RelayHost {
+  list: (dir: string) => Promise<string[]>;
+  read: (path: string) => Promise<string>;
+  write: (path: string, contents: string) => Promise<void>;
+}
+
+/** Relay host for child processes running on the same filesystem as the runner. */
+export function localRelayHost(): RelayHost {
+  return {
+    // A relay dir that does not exist yet simply has no pending requests.
+    list: async (dir) => (existsSync(dir) ? readdirSync(dir) : []),
+    read: async (path) => readFileSync(path, "utf-8"),
+    write: async (path, contents) => {
+      // dirname() handles slash-less and root-level paths, where slicing at
+      // lastIndexOf("/") yielded a truncated file name or an empty string.
+      mkdirSync(dirname(path), { recursive: true });
+      writeFileSync(path, contents, "utf-8");
+    },
+  };
+}
+
+/** Relay host for child processes running inside a Daytona sandbox. */
+export function sandboxRelayHost(sandbox: any): RelayHost {
+  return {
+    list: async (dir) => {
+      const ls = await sandbox.runProcess({
+        command: "ls",
+        args: ["-1", dir],
+        timeoutMs: 10_000,
+      });
+      return String(ls?.stdout ?? "")
+        .split("\n")
+        .map((s) => s.trim())
+        .filter(Boolean);
+    },
+    read: async (path) => {
+      const bytes = await sandbox.readFsFile({ path });
+      return typeof bytes === "string" ? bytes : new TextDecoder().decode(bytes);
+    },
+    write: async (path, contents) => {
+      await sandbox.writeFsFile({ path }, contents);
+    },
+  };
+}
+
+async function executeRelayedTool(
+  spec: ResolvedToolSpec,
+  req: RelayRequest,
+  callback: ToolCallbackContext | undefined,
+): Promise<string> {
+  if (spec.kind === "client") {
+    throw new Error(`client tool '${spec.name}' is browser-fulfilled and cannot be executed`);
+  }
+  if (spec.kind === "code") {
+    return runCodeTool(spec.runtime, spec.code ?? "", spec.env, req.args);
+  }
+  if (!callback?.endpoint) {
+    throw new Error(`missing toolCallback endpoint for '${spec.name}'`);
+  }
+  return callAgentaTool(
+    callback.endpoint,
+    callback.authorization,
+    spec.callRef ?? "",
+    req.toolCallId,
+    req.args,
+  );
+}
+
 /**
  * Runner-side relay loop. Polls the sandbox relay dir for request files, executes each
  * against Agenta's /tools/call (which the runner can reach), and writes the response file
@@ -49,37 +119,35 @@ export const sleep = (ms: number): Promise<void> => new Promise((r) => setTimeou
  * in-flight executions; call it once the prompt resolves.
  */
 export function startToolRelay(
-  sandbox: any,
+  host: RelayHost,
   relayDir: string,
-  callback: ToolCallbackContext,
+  specs: ResolvedToolSpec[],
+  callback: ToolCallbackContext | undefined,
 ): { stop: () => Promise<void> } {
   let active = true;
   const seen = new Set<string>();
   const inflight: Promise<void>[] = [];
+  const specsByName = new Map(specs.map((spec) => [spec.name, spec]));
 
   const handle = async (reqName: string): Promise<void> => {
     const id = reqName.slice(0, -RELAY_REQ_SUFFIX.length);
     let res: RelayResponse;
     try {
-      const bytes = await sandbox.readFsFile({ path: `${relayDir}/${reqName}` });
-      const raw = typeof bytes === "string" ? bytes : new TextDecoder().decode(bytes);
+      const raw = await host.read(`${relayDir}/${reqName}`);
       const req = JSON.parse(raw) as RelayRequest;
-      const text = await callAgentaTool(
-        callback.endpoint,
-        callback.authorization,
-        req.callRef,
-        req.toolCallId ?? id,
-        req.args,
+      const spec = specsByName.get(req.toolName);
+      if (!spec) throw new Error(`unknown tool '${req.toolName}'`);
+      const text = await executeRelayedTool(
+        spec,
+        { ...req, toolCallId: req.toolCallId ?? id },
+        callback,
       );
       res = { ok: true, text };
     } catch (err) {
       res = { ok: false, error: err instanceof Error ? err.message : String(err) };
     }
     try {
-      await sandbox.writeFsFile(
-        { path: `${relayDir}/${id}${RELAY_RES_SUFFIX}` },
-        JSON.stringify(res),
-      );
+      await host.write(`${relayDir}/${id}${RELAY_RES_SUFFIX}`, JSON.stringify(res));
     } catch {
       // The extension will time out and surface a tool error; nothing else to do here.
     }
@@ -88,15 +156,7 @@ export function startToolRelay(
   const loop = (async () => {
     while (active) {
       try {
-        const ls = await sandbox.runProcess({
-          command: "ls",
-          args: ["-1", relayDir],
-          timeoutMs: 10_000,
-        });
-        const names = String(ls?.stdout ?? "")
-          .split("\n")
-          .map((s) => s.trim())
-          .filter(Boolean);
+        const names = await host.list(relayDir);
         for (const name of names) {
           if (!name.endsWith(RELAY_REQ_SUFFIX) || seen.has(name)) continue;
           seen.add(name);
diff --git a/services/agent/src/tracing/otel.ts b/services/agent/src/tracing/otel.ts
new file mode 100644
index 0000000000..d022095a42
--- /dev/null
+++ b/services/agent/src/tracing/otel.ts
@@ -0,0 +1,1026 @@
+/**
+ * agenta-otel — a Pi extension that turns Pi's `pi.on(...)` lifecycle events into
+ * OpenTelemetry spans and exports them (OTLP/HTTP protobuf) to Agenta.
+ *
+ * This is the service build of the WP-1 POC extension
+ * (docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/agenta-otel.ts). It keeps the
+ * span tree and the load-bearing attribute choices identical, and adds three
+ * things the service needs that the single-run POC did not:
+ *
+ *   1. Per-run state. The POC kept span state in module globals because it ran one
+ *      prompt at a time. The service may drive several runs in one process (the
+ *      HTTP sidecar), so all per-run state lives in the closure returned by
+ *      `createAgentaOtel`. The shared tracer/provider/exporters stay module-level.
+ *   2. Cross-boundary trace context. The caller (the Agenta Python service) passes a
+ *      W3C `traceparent`. When present, `invoke_agent` is started as a CHILD of that
+ *      remote span, so the whole agent run joins the same trace as the `/invoke`
+ *      request — the agent's work becomes part of the response trace, the way
+ *      completion/chat nest their LLM spans under the workflow span.
+ *   3. Per-trace export target. The OTLP endpoint and `Authorization` header come
+ *      from the run config (the caller's host + credentials), falling back to env.
+ *      Each trace is exported with its own target, so a shared process can serve
+ *      more than one project.
+ *
+ * Span tree (per user prompt), unchanged from the POC:
+ *   invoke_agent            (openinference.span.kind = AGENT)
+ *     turn N                (CHAIN)
+ *       chat <model>        (LLM)   — the provider request for that turn
+ *       execute_tool <name> (TOOL)  — each tool the turn ran
+ *
+ * Config (read lazily from the environment for the fallback target):
+ *   AGENTA_HOST, AGENTA_API_KEY  — fallback exporter endpoint + auth
+ *   OTEL_SERVICE_NAME            — resource service.name (default "pi-agent")
+ */
+import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
+import {
+  context,
+  ROOT_CONTEXT,
+  trace,
+  TraceFlags,
+  SpanStatusCode,
+  type Context,
+  type Span,
+  type SpanContext,
+} from "@opentelemetry/api";
+import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-proto";
+import { Resource } from "@opentelemetry/resources";
+import type {
+  ReadableSpan,
+  SpanExporter,
+  SpanProcessor,
+} from "@opentelemetry/sdk-trace-base";
+import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node";
+import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions";
+
+import type { AgentEvent, AgentUsage, EmitEvent } from "../protocol.ts";
+
+// ---------------------------------------------------------------------------
+// Shared, process-wide tracing infrastructure
+// ---------------------------------------------------------------------------
+
+/** Where a trace's spans are shipped: an OTLP endpoint and an Authorization header. */
+interface ExportTarget {
+  endpoint: string;
+  authorization?: string;
+}
+
+/** traceId (hex) -> where that trace's spans should be exported. Set on agent_start. */
+const traceTargets = new Map<string, ExportTarget>();
+
+/** Cache one exporter per distinct endpoint+auth so we do not rebuild per export. */
+const exporterCache = new Map<string, OTLPTraceExporter>();
+
+function targetKey(target: ExportTarget): string {
+  return `${target.endpoint}\n${target.authorization ?? ""}`;
+}
+
+function getExporter(target: ExportTarget): OTLPTraceExporter {
+  const key = targetKey(target);
+  let exporter = exporterCache.get(key);
+  if (!exporter) {
+    exporter = new OTLPTraceExporter({
+      url: target.endpoint,
+      headers: target.authorization
+        ? { Authorization: target.authorization }
+        : {},
+      timeoutMillis: 10_000,
+    });
+    exporterCache.set(key, exporter);
+  }
+  return exporter;
+}
+
+/** Fallback target from env, used when a trace was started without an explicit one. */
+function defaultTarget(): ExportTarget {
+  const host = (process.env.AGENTA_HOST || "https://cloud.agenta.ai").replace(
+    /\/+$/,
+    "",
+  );
+  const apiKey = process.env.AGENTA_API_KEY || "";
+  return {
+    endpoint: `${host}/api/otlp/v1/traces`,
+    authorization: apiKey ? `ApiKey ${apiKey}` : undefined,
+  };
+}
+
+/**
+ * Buffer a trace's spans and export them in ONE OTLP batch. Agenta computes
+ * cumulative (rolled-up) token/cost metrics per ingest batch, so a trace split
+ * across batches loses the root aggregation. Two completion signals:
+ *   - the root span ends (standalone run: invoke_agent IS the root), or
+ *   - the run flushes explicitly by trace id (cross-boundary run: invoke_agent
+ *     has a remote parent that never ends in this process, so root-end never fires).
+ */
+class TraceBatchProcessor implements SpanProcessor {
+  private readonly buffers = new Map<string, ReadableSpan[]>();
+
+  onStart(): void {}
+
+  onEnd(span: ReadableSpan): void {
+    const traceId = span.spanContext().traceId;
+    const spans = this.buffers.get(traceId) ?? [];
+    spans.push(span);
+    this.buffers.set(traceId, spans);
+    // No parent at all (not even a remote one) => this span is the trace root; flush now.
+    if (!span.parentSpanId) {
+      this.flush(traceId);
+    }
+  }
+
+  /** Export and drop one trace's buffered spans. Resolves once the export returns. */
+  flush(traceId: string): Promise<void> {
+    const spans = this.buffers.get(traceId);
+    if (!spans || spans.length === 0) return Promise.resolve();
+    this.buffers.delete(traceId);
+    const target = traceTargets.get(traceId) ?? defaultTarget();
+    traceTargets.delete(traceId);
+    return new Promise((resolve) =>
+      getExporter(target).export(orderParentFirst(spans), () => resolve()),
+    );
+  }
+
+  forceFlush(): Promise<void> {
+    return Promise.all(
+      [...this.buffers.keys()].map((traceId) => this.flush(traceId)),
+    ).then(() => undefined);
+  }
+
+  shutdown(): Promise<void> {
+    return this.forceFlush().then(async () => {
+      await Promise.all(
+        [...exporterCache.values()].map((exporter) => exporter.shutdown()),
+      );
+    });
+  }
+}
+
+let provider: NodeTracerProvider | undefined;
+let processor: TraceBatchProcessor | undefined;
+
+function ensureProvider(): void {
+  if (provider) return;
+  processor = new TraceBatchProcessor();
+  provider = new NodeTracerProvider({
+    resource: new Resource({
+      [ATTR_SERVICE_NAME]: process.env.OTEL_SERVICE_NAME || "pi-agent",
+    }),
+  });
+  provider.addSpanProcessor(processor);
+  provider.register();
+}
+
+/** Flush one trace's spans to Agenta. Call after a run whose root has a remote parent. */
+export async function flushTrace(traceId?: string): Promise<void> {
+  if (!processor || !traceId) return;
+  await processor.flush(traceId);
+}
+
+/** Flush and shut down all exporters. Call once on process exit, not per run. */
+export async function shutdownTracing(): Promise<void> {
+  if (!provider) return;
+  try {
+    await provider.forceFlush();
+    await provider.shutdown();
+  } finally {
+    provider = undefined;
+    processor = undefined;
+    exporterCache.clear();
+  }
+}
+
+/**
+ * Order spans parent-before-child (preorder DFS). Agenta stores timestamps at
+ * millisecond resolution and builds its roll-up tree by sorting on start_time,
+ * attaching a span only if its parent is already seen. A parent-first request
+ * order keeps parents ahead of children on same-millisecond ties.
+ */
+function orderParentFirst(spans: ReadableSpan[]): ReadableSpan[] {
+  const byId = new Map(spans.map((s) => [s.spanContext().spanId, s]));
+  const childrenOf = new Map<string, ReadableSpan[]>();
+  const roots: ReadableSpan[] = [];
+  for (const s of spans) {
+    const parentId = s.parentSpanId;
+    if (parentId && byId.has(parentId)) {
+      const list = childrenOf.get(parentId) ?? [];
+      list.push(s);
+      childrenOf.set(parentId, list);
+    } else {
+      roots.push(s);
+    }
+  }
+  const ordered: ReadableSpan[] = [];
+  const visit = (s: ReadableSpan) => {
+    ordered.push(s);
+    for (const child of childrenOf.get(s.spanContext().spanId) ?? []) visit(child);
+  };
+  roots.forEach(visit);
+  // Any spans not reached (defensive) get appended so nothing is dropped.
+  if (ordered.length !== spans.length) {
+    const seen = new Set(ordered);
+    for (const s of spans) if (!seen.has(s)) ordered.push(s);
+  }
+  return ordered;
+}
+
+/** Build a parent Context from a W3C traceparent string, or undefined if absent/invalid. */
+function parentContext(traceparent?: string): Context | undefined {
+  if (!traceparent) return undefined;
+  const match = /^00-([0-9a-f]{32})-([0-9a-f]{16})-([0-9a-f]{2})$/.exec(
+    traceparent.trim(),
+  );
+  if (!match) return undefined;
+  const [, traceId, spanId, flags] = match;
+  const spanContext: SpanContext = {
+    traceId,
+    spanId,
+    // Honor the incoming sampled bit as-is: an unsampled caller yields TraceFlags.NONE.
+    traceFlags: (parseInt(flags, 16) & 1) === 1 ? TraceFlags.SAMPLED : TraceFlags.NONE,
+    isRemote: true,
+  };
+  return trace.setSpanContext(ROOT_CONTEXT, spanContext);
+}
+
+// ---------------------------------------------------------------------------
+// Per-run config + content helpers
+// ---------------------------------------------------------------------------
+
+/** One run's tracing config. Mutated by the runner after the session is created. */
+export interface RunConfig {
+  /** OTLP traces endpoint for this run's trace (falls back to env). */
+  endpoint?: string;
+  /** Authorization header value for the OTLP export (falls back to env ApiKey). */
+  authorization?: string;
+  /** W3C traceparent from the caller; nests invoke_agent under that span. */
+  traceparent?: string;
+  /** W3C baggage from the caller (carried for future use). */
+  baggage?: string;
+  /** Drop prompt/completion/tool I/O from spans when false. */
+  captureContent: boolean;
+  /** Pi session id, set after createAgentSession so spans carry session.id. */
+  sessionId?: string;
+  /** Resolved provider, set after the model is picked. */
+  provider?: string;
+  /** Resolved model id, set after the model is picked. */
+  requestModel?: string;
+  /** Filled by the extension on agent_start so the runner can flush/return it. */
+  traceId?: string;
+}
+
+/** Set output.value (→ ag.data.outputs, any type is valid there); non-strings are JSON-encoded. */
+function setOutput(span: Span, value: unknown, capture: boolean): void {
+  if (!capture || value == null) return;
+  const text = typeof value === "string" ? value : JSON.stringify(value);
+  if (text) span.setAttribute("output.value", text); // stringify yields undefined for functions/symbols
+}
+
+/**
+ * ag.data.inputs must be a dict, so emit input.value as a JSON object string.
+ * A non-object (raw string) would be relocated to ag.unsupported by Agenta.
+ */
+function setInputs(
+  span: Span,
+  obj: Record<string, unknown>,
+  capture: boolean,
+): void {
+  if (!capture) return;
+  span.setAttribute("input.value", JSON.stringify(obj));
+  span.setAttribute("input.mime_type", "application/json");
+}
+
+function oiRole(role: string): string {
+  return role === "toolResult" ? "tool" : role; // user | assistant | system | tool
+}
+
+function messageText(msg: any): string {
+  const c = msg?.content;
+  if (typeof c === "string") return c;
+  if (Array.isArray(c)) {
+    return c
+      .filter((b: any) => b?.type === "text")
+      .map((b: any) => b.text)
+      .join("");
+  }
+  return "";
+}
+
+/**
+ * Emit OpenInference structured messages so Agenta renders a proper message
+ * thread. `llm.input_messages.*` -> ag.data.inputs.prompt.*,
+ * `llm.output_messages.*` -> ag.data.outputs.completion.*.
+ */
+function emitMessages(
+  span: Span,
+  prefix: string,
+  messages: any[],
+  capture: boolean,
+): void {
+  if (!capture || !Array.isArray(messages)) return;
+  messages.forEach((m, i) => {
+    const base = `${prefix}.${i}.message`;
+    span.setAttribute(`${base}.role`, oiRole(m.role));
+    const text = messageText(m);
+    if (text) span.setAttribute(`${base}.content`, text);
+    if (m.role === "toolResult" && m.toolCallId)
+      span.setAttribute(`${base}.tool_call_id`, m.toolCallId);
+    if (Array.isArray(m.content)) {
+      m.content
+        .filter((b: any) => b?.type === "toolCall")
+        .forEach((call: any, j: number) => {
+          const tc = `${base}.tool_calls.${j}.tool_call`;
+          if (call.id) span.setAttribute(`${tc}.id`, call.id);
+          span.setAttribute(`${tc}.function.name`, call.name);
+          span.setAttribute(
+            `${tc}.function.arguments`,
+            JSON.stringify(call.arguments ?? {}),
+          );
+        });
+    }
+  });
+}
+
+function toolResultText(result: any): string {
+  if (!result) return "";
+  if (typeof result === "string") return result;
+  if (Array.isArray(result)) {
+    return result
+      .filter((c: any) => c?.type === "text")
+      .map((c: any) => c.text)
+      .join("");
+  }
+  if (result.content) return toolResultText(result.content);
+  return JSON.stringify(result);
+}
+
+function lastAssistantText(messages: any): string {
+  if (!Array.isArray(messages)) return "";
+  for (let i = messages.length - 1; i >= 0; i--) {
+    if (messages[i]?.role === "assistant") return messageText(messages[i]);
+  }
+  return "";
+}
+
+/** Fill an LLM span from a finished assistant message (model, tokens, finish, output). */
+function applyAssistant(span: Span, msg: any, capture: boolean): void {
+  if (msg.provider) span.setAttribute("gen_ai.system", msg.provider);
+  if (msg.model) span.setAttribute("gen_ai.request.model", msg.model);
+  if (msg.responseModel || msg.model)
+    span.setAttribute("gen_ai.response.model", msg.responseModel ?? msg.model);
+  if (msg.responseId) span.setAttribute("gen_ai.response.id", msg.responseId);
+  if (msg.stopReason)
+    span.setAttribute("gen_ai.response.finish_reasons", [String(msg.stopReason)]);
+
+  const u = msg.usage;
+  if (u) {
+    // Current GenAI names (mapped by Agenta's logfire adapter) ...
+    span.setAttribute("gen_ai.usage.input_tokens", u.input ?? 0);
+    span.setAttribute("gen_ai.usage.output_tokens", u.output ?? 0);
+    // ... and legacy names (mapped by Agenta's semconv.py). Emit both so token
+    // usage is never silently dropped regardless of which adapter wins.
+    span.setAttribute("gen_ai.usage.prompt_tokens", u.input ?? 0);
+    span.setAttribute("gen_ai.usage.completion_tokens", u.output ?? 0);
+    span.setAttribute(
+      "gen_ai.usage.total_tokens",
+      u.totalTokens ?? (u.input ?? 0) + (u.output ?? 0),
+    );
+    if (u.cacheRead)
+      span.setAttribute("gen_ai.usage.cache_read_input_tokens", u.cacheRead);
+    if (u.cacheWrite)
+      span.setAttribute("gen_ai.usage.cache_creation_input_tokens", u.cacheWrite);
+    if (u.cost?.total != null) span.setAttribute("gen_ai.usage.cost", u.cost.total);
+  }
+
+  emitMessages(span, "llm.output_messages", [msg], capture);
+  if (msg.stopReason === "error" || msg.errorMessage) {
+    span.setStatus({ code: SpanStatusCode.ERROR, message: msg.errorMessage });
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Extension factory (one per run; state is closure-scoped)
+// ---------------------------------------------------------------------------
+
+export interface AgentaOtel {
+  /** Register with DefaultResourceLoader.extensionFactories. */
+  register: (pi: ExtensionAPI) => void;
+  /** Mutable config; set sessionId/provider/requestModel after the session exists. */
+  config: RunConfig;
+  /** Flush this run's trace to Agenta. Await before the process/response ends. */
+  flush: () => Promise<void>;
+  /** Run totals (tokens + cost) summed across turns, for roll-up onto the parent. */
+  usage: () => { input: number; output: number; total: number; cost: number };
+}
+
+/**
+ * Build a tracing extension scoped to a single agent run. Pass `register` to the resource
+ * loader, fill in `config.sessionId`/`provider`/`requestModel` once the session and model are
+ * resolved, then `await flush()` after the prompt completes. Content capture is on unless `captureContent` is `false`.
+ */
+export function createAgentaOtel(
+  init: Partial<RunConfig> & { captureContent?: boolean },
+): AgentaOtel {
+  ensureProvider(); // defined elsewhere in this file; presumably installs the tracer provider — confirm
+
+  const config: RunConfig = {
+    endpoint: init.endpoint,
+    authorization: init.authorization,
+    traceparent: init.traceparent,
+    captureContent: init.captureContent !== false, // on unless explicitly false
+    sessionId: init.sessionId,
+    provider: init.provider,
+    requestModel: init.requestModel,
+  };
+
+  const tracer = trace.getTracer("agenta-pi-otel", "0.1.0");
+
+  // Per-run span state — closure-scoped so concurrent runs never collide.
+  let agentSpan: Span | undefined;
+  let agentCtx: Context | undefined;
+  let pendingPrompt: string | undefined; // prompt from before_agent_start, recorded at agent_start
+  let currentTurn: { span: Span; ctx: Context; index?: number } | undefined;
+  let llmSpan: Span | undefined; // the chat span currently waiting for its assistant message
+  let lastContextMessages: any[] | undefined;
+  const toolSpans = new Map<string, Span>(); // open TOOL spans keyed by toolCallId
+  // Run totals, summed across every assistant turn. Stamped on the agent span and
+  // returned so the caller can roll them up onto the workflow span in its own process
+  // (the agent and workflow spans are exported in separate OTLP batches, so Agenta's
+  // per-batch cumulative roll-up cannot bridge them on its own).
+  const runUsage = { input: 0, output: 0, total: 0, cost: 0 };
+
+  function accumulateUsage(msg: any): void {
+    const u = msg?.usage;
+    if (!u) return;
+    const input = u.input ?? 0;
+    const output = u.output ?? 0;
+    runUsage.input += input;
+    runUsage.output += output;
+    runUsage.total += u.totalTokens ?? input + output; // no reported total: fall back to input + output
+    if (u.cost?.total != null) runUsage.cost += u.cost.total;
+  }
+
+  const register = (pi: ExtensionAPI): void => {
+    pi.on("before_agent_start", async (event: any) => {
+      pendingPrompt = event?.prompt; // held until agent_start records it as the agent span input
+    });
+
+    pi.on("agent_start", async () => {
+      // Nest under the caller's workflow span when a traceparent was supplied,
+      // so the whole run joins the /invoke trace; otherwise start a fresh root.
+      const parent = parentContext(config.traceparent);
+      agentSpan = tracer.startSpan("invoke_agent", undefined, parent);
+      agentSpan.setAttribute("openinference.span.kind", "AGENT");
+      agentSpan.setAttribute("gen_ai.operation.name", "invoke_agent");
+      agentSpan.setAttribute("gen_ai.agent.name", "pi");
+      if (config.sessionId) {
+        agentSpan.setAttribute("session.id", config.sessionId);
+        agentSpan.setAttribute("gen_ai.conversation.id", config.sessionId);
+      }
+      setInputs(agentSpan, { prompt: pendingPrompt ?? "" }, config.captureContent);
+
+      const traceId = agentSpan.spanContext().traceId;
+      config.traceId = traceId;
+      traceTargets.set(traceId, { // per-trace export target; the run's config wins over defaultTarget()
+        endpoint: config.endpoint ?? defaultTarget().endpoint,
+        authorization: config.authorization ?? defaultTarget().authorization,
+      });
+      agentCtx = trace.setSpan(parent ?? context.active(), agentSpan);
+    });
+
+    // The messages handed to the next LLM call — the chat span's input.
+    pi.on("context", async (event: any) => {
+      if (Array.isArray(event?.messages)) lastContextMessages = event.messages;
+    });
+
+    pi.on("turn_start", async (event: any) => {
+      const parent = agentCtx ?? context.active();
+      const name = event?.turnIndex != null ? `turn ${event.turnIndex}` : "turn";
+      const span = tracer.startSpan(name, undefined, parent);
+      span.setAttribute("openinference.span.kind", "CHAIN");
+      if (event?.turnIndex != null) span.setAttribute("pi.turn.index", event.turnIndex);
+      currentTurn = { span, ctx: trace.setSpan(parent, span), index: event?.turnIndex };
+    });
+
+    pi.on("before_provider_request", async (_event: any, ctx: any) => {
+      const parent = currentTurn?.ctx ?? agentCtx ?? context.active();
+      const modelId = config.requestModel ?? ctx?.model?.id; // explicit run config wins over the event's model
+      const providerName = config.provider ?? ctx?.model?.provider;
+      llmSpan = tracer.startSpan(modelId ? `chat ${modelId}` : "chat", undefined, parent);
+      llmSpan.setAttribute("openinference.span.kind", "LLM");
+      llmSpan.setAttribute("gen_ai.operation.name", "chat");
+      if (providerName) llmSpan.setAttribute("gen_ai.system", providerName);
+      if (modelId) llmSpan.setAttribute("gen_ai.request.model", modelId);
+      if (lastContextMessages)
+        emitMessages(llmSpan, "llm.input_messages", lastContextMessages, config.captureContent);
+    });
+
+    pi.on("message_end", async (event: any) => {
+      const msg = event?.message;
+      if (!msg || msg.role !== "assistant" || !llmSpan) return; // only an assistant message closes the chat span
+      applyAssistant(llmSpan, msg, config.captureContent);
+      accumulateUsage(msg);
+      llmSpan.end();
+      llmSpan = undefined;
+    });
+
+    pi.on("tool_execution_start", async (event: any) => {
+      const parent = currentTurn?.ctx ?? agentCtx ?? context.active();
+      const name = event?.toolName ? `execute_tool ${event.toolName}` : "execute_tool";
+      const span = tracer.startSpan(name, undefined, parent);
+      span.setAttribute("openinference.span.kind", "TOOL");
+      span.setAttribute("gen_ai.operation.name", "execute_tool");
+      if (event?.toolName) span.setAttribute("gen_ai.tool.name", event.toolName);
+      if (event?.toolCallId) span.setAttribute("gen_ai.tool.call.id", event.toolCallId);
+      setInputs(span, (event?.args as Record<string, unknown>) ?? {}, config.captureContent);
+      if (event?.toolCallId) toolSpans.set(event.toolCallId, span); // NOTE(review): with no toolCallId the span is not stored, so tool_execution_end can never end it — confirm Pi always sends one
+    });
+
+    pi.on("tool_execution_end", async (event: any) => {
+      const span = event?.toolCallId ? toolSpans.get(event.toolCallId) : undefined;
+      if (!span) return;
+      setOutput(span, toolResultText(event?.result), config.captureContent);
+      if (event?.isError) span.setStatus({ code: SpanStatusCode.ERROR });
+      span.end();
+      toolSpans.delete(event.toolCallId);
+    });
+
+    pi.on("turn_end", async (event: any) => {
+      // Safety net: if the LLM span is still open (no assistant message_end seen),
+      // close it from the turn's assistant message.
+      if (llmSpan && event?.message) {
+        applyAssistant(llmSpan, event.message, config.captureContent);
+        accumulateUsage(event.message);
+        llmSpan.end();
+        llmSpan = undefined;
+      }
+      if (currentTurn) {
+        currentTurn.span.end();
+        currentTurn = undefined;
+      }
+    });
+
+    pi.on("agent_end", async (event: any) => {
+      if (!agentSpan) return;
+      setOutput(agentSpan, lastAssistantText(event?.messages), config.captureContent);
+      // Stamp the run total on the agent span so it shows the agent's tokens/cost even
+      // though Agenta cannot roll the per-turn LLM spans up across batches.
+      if (runUsage.total > 0) { // nothing is stamped when no usage was reported
+        agentSpan.setAttribute("gen_ai.usage.input_tokens", runUsage.input);
+        agentSpan.setAttribute("gen_ai.usage.output_tokens", runUsage.output);
+        agentSpan.setAttribute("gen_ai.usage.prompt_tokens", runUsage.input);
+        agentSpan.setAttribute("gen_ai.usage.completion_tokens", runUsage.output);
+        agentSpan.setAttribute("gen_ai.usage.total_tokens", runUsage.total);
+        if (runUsage.cost > 0) agentSpan.setAttribute("gen_ai.usage.cost", runUsage.cost);
+      }
+      agentSpan.end(); // NOTE(review): an llm/turn/tool span still open at this point is not ended here — confirm Pi always closes them first
+      agentSpan = undefined;
+      agentCtx = undefined;
+      lastContextMessages = undefined;
+    });
+  };
+
+  return {
+    register,
+    config,
+    flush: () => flushTrace(config.traceId),
+    usage: () => ({ ...runUsage }), // copy, so callers cannot mutate the running totals
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Rivet / ACP tracer (one per run; state is closure-scoped)
+// ---------------------------------------------------------------------------
+//
+// The Pi extension above hooks Pi's in-process `pi.on(...)` events. Under rivet the
+// harness runs as a separate process and we never see those events; instead the rivet
+// SDK surfaces the run as ACP `session/update` notifications (agent_message_chunk,
+// tool_call, tool_call_update, usage_update). This tracer builds the SAME span tree
+// from that event stream, so tracing is uniform across every harness rivet drives
+// (Pi, Claude Code, ...) and always nests under the caller's `/invoke` span.
+//
+// Span tree (per prompt turn):
+//   invoke_agent          (AGENT)
+//     turn 0              (CHAIN)
+//       chat <model>      (LLM)   — model interaction; usage where the harness reports it
+//       execute_tool <n>  (TOOL)  — one per ACP tool_call
+
+/** Plain text of an ACP ContentBlock (a bare string or `{ type: "text", text }`); "" for anything else. */
+function acpBlockText(block: any): string {
+  if (typeof block === "string") return block;
+  // Nullish values and non-text blocks carry no text.
+  const isTextBlock = Boolean(block) && block.type === "text" && typeof block.text === "string";
+  return isTextBlock ? block.text : "";
+}
+
+/** Concatenated text of an ACP tool_call `content` array (ToolCallContent[]); a bare string passes through. */
+function acpToolContentText(content: any): string {
+  if (typeof content === "string") return content;
+  if (!Array.isArray(content)) return "";
+  let joined = "";
+  for (const item of content) {
+    // An entry either wraps its block as `{ content: block }` or is the block itself.
+    const block = item?.content ?? item;
+    joined += acpBlockText(block);
+  }
+  return joined;
+}
+
+/**
+ * Strip the pi-acp startup banner that some setups emit as the first agent message chunk
+ * (a "pi vX.Y.Z" / "## Context" / file list / "New version available" prelude ahead of the
+ * real answer). Only a leading run that contains a real marker line is removed; blank and
+ * `---` lines alone never count, so an answer opening with a rule/front matter is untouched.
+ */
+function stripStartupBanner(text: string): string {
+  const lines = text.split("\n");
+  const isMarker = (line: string) =>
+    /^pi v\d+\.\d+\.\d+/.test(line) ||
+    /^## Context\b/.test(line) ||
+    /^-\s+\/.*AGENTS\.md\s*$/.test(line) ||
+    /^New version available:/.test(line) ||
+    /^Run: `npm/.test(line);
+  // Separators and blank lines may sit inside a banner but are never evidence of one.
+  const isFiller = (line: string) => line.trim() === "---" || line.trim() === "";
+  let i = 0;
+  let sawBanner = false;
+  while (i < lines.length && (isMarker(lines[i]) || isFiller(lines[i]))) {
+    if (isMarker(lines[i])) sawBanner = true;
+    i++;
+  }
+  return sawBanner ? lines.slice(i).join("\n").trim() : text;
+}
+
+/** Break a resolved model id ("openai-codex/gpt-5.5") at its first "/" into provider + id; with no slash, id only. */
+function splitModel(model?: string): { provider?: string; id?: string } {
+  if (!model) return {};
+  const cut = model.indexOf("/");
+  // Only the first "/" separates: anything after it (further slashes included) stays in the id.
+  return cut < 0 ? { id: model } : { provider: model.slice(0, cut), id: model.slice(cut + 1) };
+}
+
+export interface RivetOtelInit extends Partial<RunConfig> {
+  captureContent?: boolean; // content capture is on unless this is explicitly false
+  /** Harness id ("pi" / "claude"); becomes gen_ai.agent.name ("agent" when unset). */
+  harness?: string;
+  /** Resolved model id ("openai-codex/gpt-5.5"); split at the first "/" and set on the LLM span. */
+  model?: string;
+  /**
+   * Emit the span tree from the ACP event stream. Default true. Set false when the
+   * harness instruments itself (e.g. Pi via the agenta extension propagates the trace
+   * context and emits its own real turn/chat/tool spans) — then this only accumulates
+   * the reply text and builds no spans, so the two do not double up.
+   */
+  emitSpans?: boolean;
+  /**
+   * Live event sink. When set, each `AgentEvent` is flushed here the moment it is built
+   * (in addition to being recorded in `events[]`), and the text/reasoning blocks are
+   * emitted as `*_start`/`*_delta`/`*_end` lifecycle events rather than coalesced at the
+   * end. When unset (the one-shot path), only the coalesced `message`/`thought` land in
+   * `events[]`. This split is what keeps a delta'd block from being re-sent in full.
+   */
+  emit?: EmitEvent;
+}
+
+export interface RivetOtel {
+  /** Start the invoke_agent (AGENT) span as a child of the caller's traceparent (no spans when `emitSpans` is false). */
+  start(input: { prompt?: string; messages?: any[]; sessionId?: string }): void;
+  /** Feed one ACP `session/update` payload (the `update` object); payloads without `sessionUpdate` are ignored. */
+  handleUpdate(update: any): void;
+  /**
+   * Record an event the ACP stream does not carry (e.g. an `interaction_request` raised via
+   * the permission callback). Routes through the same choke point as stream events, so it
+   * lands in both the live sink and the batch `events()` log in build order.
+   */
+  emitEvent(event: AgentEvent): void;
+  /** Record the tail events (ending with `done`), end all open spans, and return the trimmed, banner-stripped assistant text. */
+  finish(): string;
+  /** Set final run usage before finish/flush so events and exported spans carry final totals; `undefined` is a no-op. */
+  setUsage(usage: AgentUsage | undefined): void;
+  /** Flush this run's trace to Agenta (invoke_agent has a remote parent). */
+  flush(): Promise<void>;
+  /** Trace id of the run (the caller's trace when a traceparent was passed); undefined before `start`. */
+  traceId(): string | undefined;
+  /** Accumulated assistant output text so far, as received (not trimmed, banner not stripped). */
+  output(): string;
+  /** The structured event log built from the ACP stream (tool calls, usage, final message); the live array, not a copy. */
+  events(): AgentEvent[];
+  /** Run token/cost totals from the stream, when the harness reported `usage_update` (or `setUsage` was called). */
+  usage(): AgentUsage | undefined;
+}
+
+/**
+ * Build an ACP-event-driven tracer scoped to a single rivet run. Call `start` once, `handleUpdate`
+ * for every ACP session update, then `finish` + `await flush`. With `init.emit` set, events also stream live.
+ */
+export function createRivetOtel(init: RivetOtelInit): RivetOtel {
+  ensureProvider();
+
+  const capture = init.captureContent !== false; // on unless explicitly false
+  const emitSpans = init.emitSpans !== false; // false: build no spans here, only the event log and reply text
+  const endpoint = init.endpoint ?? defaultTarget().endpoint;
+  const authorization = init.authorization ?? defaultTarget().authorization;
+  const { provider, id: modelId } = splitModel(init.model); // "provider/id" -> gen_ai.system + gen_ai.request.model
+  const tracer = trace.getTracer("agenta-rivet-otel", "0.1.0");
+
+  let agentSpan: Span | undefined;
+  let agentCtx: Context | undefined;
+  let turnSpan: Span | undefined;
+  let turnCtx: Context | undefined;
+  let llmSpan: Span | undefined;
+  let runTraceId: string | undefined;
+  let accumulated = ""; // run-long assistant text, cumulative across chunks
+  let reasoningAccumulated = "";
+  let usage: AgentUsage | undefined;
+  const events: AgentEvent[] = [];
+  const toolSpans = new Map<string, { span?: Span; name: string }>(); // open tool calls by toolCallId; span is unset when none was started
+
+  // Live emission. `record` is the single choke point for every event: it appends to the
+  // result log and, on the streaming path, flushes the event the moment it is built — so
+  // the live order is byte-identical to `events[]`. A sink failure never aborts the run.
+  const sink = init.emit;
+  function record(event: AgentEvent): void {
+    events.push(event);
+    if (sink) {
+      try {
+        sink(event);
+      } catch {
+        // a downstream sink error must not break the agent run
+      }
+    }
+  }
+
+  function stampUsage(span: Span, u: AgentUsage | undefined): void {
+    if (!u) return;
+    span.setAttribute("gen_ai.usage.input_tokens", u.input);
+    span.setAttribute("gen_ai.usage.output_tokens", u.output);
+    span.setAttribute("gen_ai.usage.prompt_tokens", u.input);
+    span.setAttribute("gen_ai.usage.completion_tokens", u.output);
+    span.setAttribute("gen_ai.usage.total_tokens", u.total);
+    if (u.cost > 0) span.setAttribute("gen_ai.usage.cost", u.cost);
+  }
+
+  function setUsage(finalUsage: AgentUsage | undefined): void {
+    if (!finalUsage) return;
+    usage = finalUsage;
+    const event: AgentEvent = { type: "usage", ...finalUsage };
+    if (!sink) {
+      const index = events.findLastIndex((e) => e.type === "usage"); // one-shot log: overwrite the latest usage entry if there is one
+      if (index !== -1) {
+        events[index] = event;
+        return;
+      }
+    }
+    record(event);
+  }
+
+  // Text/reasoning block lifecycle (streaming path only). At most one block of each kind is
+  // open; each gets a stable, monotonic id. `*Emitted` tracks the total text delivered as
+  // deltas across the whole run (NOT per block) — `accumulated` is run-long, so the next
+  // delta is always its remainder. Block boundaries (a tool call between two text runs) only
+  // insert start/end markers; they must not reset the counter, or the second block would
+  // re-emit the first block's text.
+  let textBlockId: string | undefined;
+  let textEmitted = "";
+  let anyTextDelta = false;
+  let reasoningBlockId: string | undefined;
+  let reasoningEmitted = "";
+  let blockSeq = 0;
+  const nextId = (prefix: string): string => `${prefix}-${blockSeq++}`;
+
+  function closeText(): void {
+    if (textBlockId === undefined) return;
+    record({ type: "message_end", id: textBlockId });
+    textBlockId = undefined;
+  }
+
+  function closeReasoning(): void {
+    if (reasoningBlockId === undefined) return;
+    record({ type: "reasoning_end", id: reasoningBlockId });
+    reasoningBlockId = undefined;
+  }
+
+  /** Open (if needed) the assistant text block and emit the pure delta up to `target`. */
+  function streamText(target: string): void {
+    closeReasoning(); // a text chunk ends any open reasoning run (blocks never overlap)
+    const delta = target.startsWith(textEmitted)
+      ? target.slice(textEmitted.length)
+      : target; // fallback when target does not extend what was already sent: emit it whole
+    if (!delta) return;
+    if (textBlockId === undefined) {
+      textBlockId = nextId("msg");
+      record({ type: "message_start", id: textBlockId });
+    }
+    record({ type: "message_delta", id: textBlockId, delta });
+    textEmitted = target.startsWith(textEmitted) ? target : textEmitted + delta;
+    anyTextDelta = true;
+  }
+
+  /** Open (if needed) the reasoning block and emit the pure delta up to `target`. */
+  function streamReasoning(target: string): void {
+    closeText(); // a reasoning chunk ends any open text run
+    const delta = target.startsWith(reasoningEmitted)
+      ? target.slice(reasoningEmitted.length)
+      : target; // same fallback as streamText
+    if (!delta) return;
+    if (reasoningBlockId === undefined) {
+      reasoningBlockId = nextId("reason");
+      record({ type: "reasoning_start", id: reasoningBlockId });
+    }
+    record({ type: "reasoning_delta", id: reasoningBlockId, delta });
+    reasoningEmitted = target.startsWith(reasoningEmitted) ? target : reasoningEmitted + delta;
+  }
+
+  function start(input: { prompt?: string; messages?: any[]; sessionId?: string }): void {
+    // Span-less mode (harness self-instruments): only track the trace id so the run can
+    // report it; the harness emits the spans under the propagated parent.
+    if (!emitSpans) {
+      const m = /^00-([0-9a-f]{32})-/.exec(init.traceparent ?? ""); // traceparent: version "00", then the 32-hex trace id
+      runTraceId = m ? m[1] : undefined;
+      return;
+    }
+    const parent = parentContext(init.traceparent);
+    agentSpan = tracer.startSpan("invoke_agent", undefined, parent);
+    agentSpan.setAttribute("openinference.span.kind", "AGENT");
+    agentSpan.setAttribute("gen_ai.operation.name", "invoke_agent");
+    agentSpan.setAttribute("gen_ai.agent.name", init.harness ?? "agent");
+    const sessionId = input.sessionId ?? init.sessionId; // the per-call id wins over the init-time one
+    if (sessionId) {
+      agentSpan.setAttribute("session.id", sessionId);
+      agentSpan.setAttribute("gen_ai.conversation.id", sessionId);
+    }
+    setInputs(agentSpan, { prompt: input.prompt ?? "" }, capture);
+
+    runTraceId = agentSpan.spanContext().traceId;
+    traceTargets.set(runTraceId, { endpoint, authorization });
+    agentCtx = trace.setSpan(parent ?? context.active(), agentSpan);
+
+    turnSpan = tracer.startSpan("turn 0", undefined, agentCtx); // one turn span and one chat span cover the whole run
+    turnSpan.setAttribute("openinference.span.kind", "CHAIN");
+    turnSpan.setAttribute("pi.turn.index", 0);
+    turnCtx = trace.setSpan(agentCtx, turnSpan);
+
+    llmSpan = tracer.startSpan(modelId ? `chat ${modelId}` : "chat", undefined, turnCtx);
+    llmSpan.setAttribute("openinference.span.kind", "LLM");
+    llmSpan.setAttribute("gen_ai.operation.name", "chat");
+    if (provider) llmSpan.setAttribute("gen_ai.system", provider);
+    if (modelId) llmSpan.setAttribute("gen_ai.request.model", modelId);
+    const inputMessages =
+      input.messages && input.messages.length
+        ? input.messages
+        : [{ role: "user", content: input.prompt ?? "" }]; // no history given: the prompt is the only input message
+    emitMessages(llmSpan, "llm.input_messages", inputMessages, capture);
+  }
+
+  function handleUpdate(update: any): void {
+    const kind = update?.sessionUpdate;
+    if (!kind) return;
+
+    if (kind === "agent_message_chunk") {
+      const t = acpBlockText(update.content);
+      if (!t) return;
+      // Pi streams pure deltas; Claude streams deltas plus a cumulative snapshot.
+      // Replace when a chunk is a superset of what we have, append otherwise.
+      if (t.startsWith(accumulated)) accumulated = t;
+      else accumulated += t;
+      // Live deltas run independent of span emission (text, not a span), so they flow even
+      // when the harness self-instruments (emitSpans=false). `accumulated` is the cumulative
+      // text, so the pure delta is its tail past what we already sent.
+      if (sink) streamText(accumulated);
+      return;
+    }
+
+    if (kind === "agent_thought_chunk") {
+      const t = acpBlockText(update.content);
+      if (!t) return;
+      if (t.startsWith(reasoningAccumulated)) reasoningAccumulated = t;
+      else reasoningAccumulated += t;
+      if (sink) streamReasoning(reasoningAccumulated);
+      return;
+    }
+
+    if (kind === "tool_call") {
+      const id = update.toolCallId;
+      if (!id) return;
+      // A tool call ends any open text/reasoning block (keeps streamed block boundaries
+      // clean across text -> tool -> text interleaving). No-op on the one-shot path.
+      closeText();
+      closeReasoning();
+      const name = update.title || update.kind || "tool"; // first non-empty of title, kind, then the literal "tool"
+      let span: Span | undefined;
+      if (emitSpans && turnCtx) {
+        span = tracer.startSpan(`execute_tool ${name}`, undefined, turnCtx);
+        span.setAttribute("openinference.span.kind", "TOOL");
+        span.setAttribute("gen_ai.operation.name", "execute_tool");
+        span.setAttribute("gen_ai.tool.name", String(name));
+        span.setAttribute("gen_ai.tool.call.id", String(id));
+        if (update.rawInput != null)
+          setInputs(span, update.rawInput as Record<string, unknown>, capture);
+      }
+      toolSpans.set(id, { span, name: String(name) });
+      record({ type: "tool_call", id: String(id), name: String(name), input: update.rawInput });
+      // A tool_call can arrive already completed (status set up front).
+      maybeCloseTool(id, update);
+      return;
+    }
+
+    if (kind === "tool_call_update") {
+      maybeCloseTool(update.toolCallId, update);
+      return;
+    }
+
+    if (kind === "usage_update") {
+      // ACP usage_update carries only `used` (context tokens) and `cost.amount`. The
+      // per-call input/output split is NOT on the stream; it rides on the PromptResponse,
+      // which the rivet engine reads. Keep total + cost here and leave the split to the caller.
+      const cost = update.cost?.amount;
+      const total = update.used;
+      usage = {
+        input: usage?.input ?? 0,
+        output: usage?.output ?? 0,
+        total: typeof total === "number" ? total : usage?.total ?? 0,
+        cost: typeof cost === "number" ? cost : usage?.cost ?? 0,
+      };
+      record({ type: "usage", ...usage });
+    }
+  }
+
+  /** Close a tool span when the update marks it completed or failed. */
+  function maybeCloseTool(id: string | undefined, update: any): void {
+    if (!id) return;
+    const entry = toolSpans.get(id);
+    if (!entry) return;
+    const status = update?.status;
+    if (status !== "completed" && status !== "failed") return; // any other status (or none) leaves the call open
+    const out = acpToolContentText(update.content) || acpToolContentText(update.rawOutput); // prefer content, fall back to rawOutput
+    if (entry.span) {
+      setOutput(entry.span, out, capture);
+      if (status === "failed") entry.span.setStatus({ code: SpanStatusCode.ERROR });
+      entry.span.end();
+    }
+    toolSpans.delete(id);
+    record({ type: "tool_result", id, output: out, isError: status === "failed" }); // NOTE(review): tool_call records String(id), this records the raw id — identical only for string ids
+  }
+
+  function finish(): string {
+    const text = stripStartupBanner(accumulated.trim());
+    // The event log is independent of span emission, so build its tail either way.
+    closeText();
+    closeReasoning();
+    if (sink) {
+      // Streaming path: the block deltas were already flushed, so do NOT re-emit the
+      // coalesced message (that would double it). If the harness produced no token deltas
+      // at all but there is text, synthesize a minimal start/delta/end so the consumer
+      // always sees one uniform block shape regardless of harness streaming support.
+      if (text && !anyTextDelta) {
+        const id = nextId("msg");
+        record({ type: "message_start", id });
+        record({ type: "message_delta", id, delta: text });
+        record({ type: "message_end", id });
+      }
+    } else {
+      // One-shot path: coalesced events only (no per-token granularity to recover).
+      if (text) record({ type: "message", text });
+      const reasoning = reasoningAccumulated.trim();
+      if (reasoning) record({ type: "thought", text: reasoning });
+    }
+    record({ type: "done" }); // NOTE(review): finish() is not guarded — a second call would record these tail events again
+    if (!emitSpans) return text; // span-less mode: there are no spans to close
+    if (llmSpan) {
+      emitMessages(
+        llmSpan,
+        "llm.output_messages",
+        [{ role: "assistant", content: text }],
+        capture,
+      );
+      stampUsage(llmSpan, usage);
+      llmSpan.end();
+      llmSpan = undefined;
+    }
+    for (const { span } of toolSpans.values()) span?.end(); // calls never marked completed/failed: end their spans (no tool_result is recorded)
+    toolSpans.clear();
+    if (turnSpan) {
+      turnSpan.end();
+      turnSpan = undefined;
+    }
+    if (agentSpan) {
+      setOutput(agentSpan, text, capture);
+      stampUsage(agentSpan, usage);
+      agentSpan.end();
+      agentSpan = undefined;
+    }
+    agentCtx = undefined;
+    turnCtx = undefined;
+    return text;
+  }
+
+  return {
+    start,
+    handleUpdate,
+    emitEvent: record,
+    finish,
+    setUsage,
+    flush: () => flushTrace(runTraceId),
+    traceId: () => runTraceId,
+    output: () => accumulated,
+    events: () => events,
+    usage: () => usage,
+  };
+}
diff --git a/services/agent/test/continuation.test.ts b/services/agent/test/continuation.test.ts
new file mode 100644
index 0000000000..c9f9d4356c
--- /dev/null
+++ b/services/agent/test/continuation.test.ts
@@ -0,0 +1,66 @@
+/**
+ * Unit tests for the cross-turn HITL continuation substrate.
+ *
+ * Under the cold model the harness rebuilds context from the replayed transcript, and ACP
+ * prompt content blocks cannot carry tool calls/results. So a resolved interaction (an
+ * approved tool that ran, a client-fulfilled tool) must survive into the replay as text.
+ * `messageTranscript` encodes tool turns; `buildTurnText` keeps them in the replayed history.
+ *
+ * Run: pnpm exec tsx test/continuation.test.ts
+ */
+import assert from "node:assert/strict";
+
+import { messageTranscript, buildTurnText } from "../src/engines/rivet.ts";
+import {
+  resolveRunSessionId,
+  type AgentRunRequest,
+  type ContentBlock,
+} from "../src/protocol.ts";
+
+// --- messageTranscript: string content and each block kind render to text ----
+assert.equal(messageTranscript("hello"), "hello"); // plain string content passes through
+assert.equal(messageTranscript([{ type: "text", text: "a" }, { type: "text", text: "b" }]), "a\nb"); // text blocks joined by a newline
+assert.equal(
+  messageTranscript([{ type: "tool_call", toolName: "getWeather", input: { city: "Paris" } }]),
+  '[called getWeather({"city":"Paris"})]', // input rendered as compact JSON
+);
+assert.equal(
+  messageTranscript([{ type: "tool_result", toolName: "getWeather", output: { temp: 24 } }]),
+  '[getWeather returned: {"temp":24}]',
+);
+assert.equal(
+  messageTranscript([{ type: "tool_result", toolName: "send", output: "boom", isError: true }]),
+  "[send error: boom]", // isError switches the wording from "returned" to "error"
+);
+
+// --- session id metadata: request id wins, else the runner-supplied fallback --
+assert.equal(
+  resolveRunSessionId({ sessionId: "sess_platform" }, "runner-ephemeral"),
+  "sess_platform",
+);
+assert.equal(resolveRunSessionId({}, "runner-ephemeral"), "runner-ephemeral"); // no request sessionId -> fallback id
+
+// --- buildTurnText keeps a resolved tool turn in the replay ------------------
+{
+  const req: AgentRunRequest = {
+    messages: [
+      { role: "user", content: "weather in Paris?" },
+      {
+        role: "assistant",
+        content: [{ type: "tool_call", toolName: "getWeather", input: { city: "Paris" } } as ContentBlock],
+      },
+      {
+        role: "tool",
+        content: [{ type: "tool_result", toolName: "getWeather", output: { temp: 24 } } as ContentBlock],
+      },
+      { role: "user", content: "and tomorrow?" }, // the latest user message is the live turn
+    ],
+  };
+  const text = buildTurnText(req);
+  assert.ok(text.includes("called getWeather"), "tool call survives replay");
+  assert.ok(text.includes("getWeather returned"), "tool result survives replay");
+  assert.ok(text.includes("and tomorrow?"), "latest user prompt is the live turn");
+  assert.ok(text.startsWith("Conversation so far:"), "transcript header present");
+}
+
+console.log("continuation.test.ts: all assertions passed"); // reached only if no assertion above threw
diff --git a/services/agent/test/responder.test.ts b/services/agent/test/responder.test.ts
new file mode 100644
index 0000000000..e06ae43e00
--- /dev/null
+++ b/services/agent/test/responder.test.ts
@@ -0,0 +1,84 @@
+/**
+ * Unit tests for the interaction responder seam and the otel `emitEvent` hook.
+ *
+ * Covers the behavior parity of the responder (it replaces the old inline auto-approve in
+ * rivet.ts) and that an out-of-stream event (an `interaction_request`) routed through
+ * `emitEvent` lands in both the live sink and the batch `events()` log. No harness, no
+ * network.
+ *
+ * Run: pnpm exec tsx test/responder.test.ts
+ */
+import assert from "node:assert/strict";
+
+import { createRivetOtel } from "../src/tracing/otel.ts";
+import type { AgentEvent } from "../src/protocol.ts";
+import {
+  PolicyResponder,
+  decisionToReply,
+  policyFromRequest,
+} from "../src/responder.ts";
+
+// --- policyFromRequest: request value, overridden by the deny env var --------
+{
+  delete process.env.AGENTA_RIVET_DENY_PERMISSIONS; // start from a clean env so the request value decides
+  assert.equal(policyFromRequest(undefined), "auto"); // no policy on the request defaults to "auto"
+  assert.equal(policyFromRequest("auto"), "auto");
+  assert.equal(policyFromRequest("deny"), "deny");
+
+  process.env.AGENTA_RIVET_DENY_PERMISSIONS = "true";
+  assert.equal(policyFromRequest(undefined), "deny", "env forces deny");
+  assert.equal(policyFromRequest("auto"), "deny", "env overrides auto");
+  delete process.env.AGENTA_RIVET_DENY_PERMISSIONS; // restore so the later sections do not see the override
+}
+
+// --- decisionToReply (parity with the old inline mapping) --------------------
+{
+  assert.equal(decisionToReply("allow", ["always", "once", "reject"]), "always"); // "always" is picked over "once" when offered
+  assert.equal(decisionToReply("allow", ["once", "reject"]), "once");
+  assert.equal(decisionToReply("allow", []), "once", "allow falls back to once");
+  assert.equal(decisionToReply("deny", ["always", "once", "reject"]), "reject");
+  assert.equal(decisionToReply("deny", []), "reject", "deny falls back to reject");
+}
+
+// --- PolicyResponder: the fixed policy decides every permission request ------
+{
+  const auto = new PolicyResponder("auto");
+  const deny = new PolicyResponder("deny");
+  const req = { id: "p1", availableReplies: ["once", "reject"] };
+  assert.equal(await auto.onPermission(req), "allow"); // top-level await: this file must run as an ES module
+  assert.equal(await deny.onPermission(req), "deny");
+}
+
+// --- emitEvent: streaming path (sink + batch) --------------------------------
+{
+  const emitted: AgentEvent[] = [];
+  const run = createRivetOtel({ harness: "claude", model: "anthropic/x", emit: (e) => emitted.push(e) });
+  run.start({ prompt: "hi" });
+  const interaction: AgentEvent = {
+    type: "interaction_request",
+    id: "p1",
+    kind: "permission",
+    payload: { availableReplies: ["once", "reject"] },
+  };
+  run.emitEvent(interaction);
+
+  const live = emitted.find((e) => e.type === "interaction_request");
+  assert.ok(live, "interaction_request flushed to the live sink");
+  assert.equal((live as any).id, "p1");
+  assert.ok(
+    run.events().some((e) => e.type === "interaction_request"),
+    "interaction_request also recorded in the batch log",
+  );
+}
+
+// --- emitEvent: one-shot path (batch only) -----------------------------------
+{
+  const run = createRivetOtel({ harness: "claude", model: "anthropic/x" }); // no `emit` sink -> one-shot path
+  run.start({ prompt: "hi" });
+  run.emitEvent({ type: "data", name: "weather", data: { temp: 24 } });
+  const ev = run.events().find((e) => e.type === "data");
+  assert.ok(ev, "data event recorded with no live sink");
+  assert.equal((ev as any).name, "weather");
+}
+
+console.log("responder.test.ts: all assertions passed"); // reached only if no assertion above threw
diff --git a/services/agent/test/stream-events.test.ts b/services/agent/test/stream-events.test.ts
new file mode 100644
index 0000000000..f27e31fc23
--- /dev/null
+++ b/services/agent/test/stream-events.test.ts
@@ -0,0 +1,148 @@
+/**
+ * Unit test for the createRivetOtel delta/lifecycle state machine.
+ *
+ * Drives `handleUpdate` with a hand-built ACP `session/update` sequence (Claude-style
+ * cumulative text snapshots, a tool call between two text runs, a reasoning run) and asserts
+ * the streaming, one-shot, and span-less (`emitSpans: false`) event shapes. No harness, no
+ * network: spans are built offline and never flushed.
+ *
+ * Run: pnpm exec tsx test/stream-events.test.ts
+ */
+import assert from "node:assert/strict";
+
+import { createRivetOtel } from "../src/tracing/otel.ts";
+import type { AgentEvent } from "../src/protocol.ts";
+
+const textChunk = (text: string) => ({ // agent message text chunk
+  sessionUpdate: "agent_message_chunk",
+  content: { type: "text", text },
+});
+const thoughtChunk = (text: string) => ({ // agent thought (reasoning) text chunk
+  sessionUpdate: "agent_thought_chunk",
+  content: { type: "text", text },
+});
+const toolCall = (id: string, title: string, rawInput: unknown) => ({ // tool call with its raw input
+  sessionUpdate: "tool_call",
+  toolCallId: id,
+  title,
+  rawInput,
+});
+const toolDone = (id: string, text: string) => ({ // completed tool_call_update with text content
+  sessionUpdate: "tool_call_update",
+  toolCallId: id,
+  status: "completed",
+  content: [{ content: { type: "text", text } }],
+});
+const usage = () => ({ sessionUpdate: "usage_update", used: 100, cost: { amount: 0.01 } }); // fixed usage fixture
+
+// The same ACP sequence drives every scenario: two text runs around a tool call, then reasoning.
+function drive(run: ReturnType<typeof createRivetOtel>): void {
+  run.start({ prompt: "weather in Paris?" });
+  run.handleUpdate(textChunk("Hello ")); // pure delta
+  run.handleUpdate(textChunk("Hello world")); // cumulative snapshot (Claude-style)
+  run.handleUpdate(toolCall("call_1", "getWeather", { city: "Paris" })); // tool call splits the text
+  run.handleUpdate(toolDone("call_1", "sunny")); // completed update for call_1
+  run.handleUpdate(textChunk("Hello world It is sunny.")); // resumes after the tool
+  run.handleUpdate(thoughtChunk("thinking...")); // the single reasoning chunk
+  run.handleUpdate(usage()); // stream-reported usage_update
+}
+
+const types = (events: AgentEvent[]) => events.map((e) => e.type); // event types, in order
+const ofType = <T extends AgentEvent["type"]>(events: AgentEvent[], t: T) =>
+  events.filter((e) => e.type === t) as Extract<AgentEvent, { type: T }>[]; // one type, narrowed
+
+// --- Scenario 1: streaming (emit set) ---------------------------------------
+{
+  const emitted: AgentEvent[] = [];
+  const run = createRivetOtel({ harness: "claude", model: "anthropic/x", emit: (e) => emitted.push(e) });
+  drive(run);
+  const finalText = run.finish();
+
+  // No coalesced text events on the streaming path.
+  assert.equal(ofType(emitted, "message").length, 0, "no coalesced message when streaming");
+  assert.equal(ofType(emitted, "thought").length, 0, "no coalesced thought when streaming");
+
+  // Exactly one terminal done.
+  assert.equal(ofType(emitted, "done").length, 1, "exactly one done");
+
+  // Two text blocks (split by the tool call), one reasoning block, balanced start/end.
+  const mStart = ofType(emitted, "message_start");
+  const mEnd = ofType(emitted, "message_end");
+  assert.equal(mStart.length, 2, "two message_start");
+  assert.equal(mEnd.length, 2, "two message_end");
+  assert.deepEqual(mStart.map((e) => e.id), ["msg-0", "msg-1"], "stable monotonic text ids");
+  const rStart = ofType(emitted, "reasoning_start");
+  const rEnd = ofType(emitted, "reasoning_end");
+  assert.equal(rStart.length, 1, "one reasoning_start");
+  assert.equal(rEnd.length, 1, "one reasoning_end");
+
+  // Deltas are pure and reconstruct the full text, with no overlap/repeat.
+  const text = ofType(emitted, "message_delta").map((e) => e.delta).join("");
+  assert.equal(text, "Hello world It is sunny.", "concatenated deltas == full text");
+  assert.equal(text, finalText, "deltas match finish() output");
+  const reasoning = ofType(emitted, "reasoning_delta").map((e) => e.delta).join("");
+  assert.equal(reasoning, "thinking...", "concatenated reasoning deltas");
+
+  // Ordering invariant: each block's start precedes its deltas, which precede its end; the tool
+  // result is streamed and lands before the second text block opens (indexOf is -1 if absent).
+  const seq = types(emitted);
+  assert.ok(seq.indexOf("message_end") < seq.indexOf("tool_call"), "first text block closes before the tool call");
+  assert.ok(seq.includes("tool_result") && seq.indexOf("tool_result") < seq.lastIndexOf("message_start"), "tool result is streamed and precedes the second text block");
+  for (const id of ["msg-0", "msg-1", "reason-2"]) {
+    const idxs = emitted
+      .map((e, i) => ((e as any).id === id ? { i, t: e.type } : null))
+      .filter(Boolean) as { i: number; t: string }[];
+    assert.ok(idxs[0]?.t.endsWith("_start"), `${id} starts with *_start`);
+    assert.ok(idxs[idxs.length - 1]?.t.endsWith("_end"), `${id} ends with *_end`);
+  }
+}
+
+// --- Scenario 2: one-shot (no emit) -----------------------------------------
+{
+  const run = createRivetOtel({ harness: "claude", model: "anthropic/x" }); // no emit sink
+  drive(run);
+  const finalText = run.finish();
+  const events = run.events(); // batch log, read back after finish()
+
+  // Coalesced text/thought, no delta lifecycle events.
+  const messages = ofType(events, "message");
+  assert.equal(messages.length, 1, "one coalesced message");
+  assert.equal(messages[0].text, "Hello world It is sunny.", "coalesced text == final");
+  assert.equal(messages[0].text, finalText);
+  assert.equal(ofType(events, "thought").length, 1, "one coalesced thought");
+  for (const t of ["message_start", "message_delta", "message_end", "reasoning_start", "reasoning_delta", "reasoning_end"]) {
+    assert.equal(events.filter((e) => e.type === t).length, 0, `no ${t} on the one-shot path`);
+  }
+
+  // The structured tool/usage events are still present, with exactly one done.
+  assert.equal(ofType(events, "tool_call").length, 1, "tool_call present");
+  assert.equal(ofType(events, "tool_result").length, 1, "tool_result present");
+  assert.equal(ofType(events, "usage").length, 1, "usage present");
+  assert.equal(ofType(events, "done").length, 1, "exactly one done");
+}
+
+// --- Scenario 3: span-less mode still records ACP events ---------------------
+{
+  const run = createRivetOtel({ harness: "pi", model: "openai-codex/x", emitSpans: false }); // spans off, no emit sink
+  drive(run);
+  run.setUsage({ input: 4, output: 6, total: 10, cost: 0.02 }); // final usage, set after the streamed usage_update
+  const finalText = run.finish();
+  const events = run.events();
+
+  assert.equal(finalText, "Hello world It is sunny.");
+  assert.equal(ofType(events, "message").length, 1, "message present without spans");
+  assert.equal(ofType(events, "thought").length, 1, "thought present without spans");
+  assert.equal(ofType(events, "tool_call").length, 1, "tool_call present without spans");
+  assert.equal(ofType(events, "tool_result").length, 1, "tool_result present without spans");
+  const usageEvents = ofType(events, "usage");
+  assert.equal(usageEvents.length, 1, "usage present without spans");
+  assert.deepEqual(
+    usageEvents[0],
+    { type: "usage", input: 4, output: 6, total: 10, cost: 0.02 },
+    "final usage replaces stream-only usage before done",
+  );
+  assert.equal(ofType(events, "done").length, 1, "exactly one done without spans");
+  assert.ok(types(events).indexOf("usage") < types(events).indexOf("done"), "usage precedes done");
+}
+
+console.log("stream-events.test.ts: all assertions passed");
diff --git a/services/agent/test/tool-bridge.test.ts b/services/agent/test/tool-bridge.test.ts
index 1838177918..4dac2b3f9d 100644
--- a/services/agent/test/tool-bridge.test.ts
+++ b/services/agent/test/tool-bridge.test.ts
@@ -22,28 +22,42 @@ function envValue(
   return env.find((e) => e.name === name)?.value;
 }
 
-// code-only specs + NO callback -> one server, with AGENTA_TOOL_SPECS but no endpoint (F4).
+const relayDir = "/tmp/agenta-tools";
+
+// code-only specs + no callback -> one server, with public specs and relay dir.
 {
   const specs: ResolvedToolSpec[] = [
-    { name: "adder", kind: "code", runtime: "python", code: "def main(**k): return 1" },
+    {
+      name: "adder",
+      description: "Add numbers",
+      kind: "code",
+      runtime: "python",
+      code: "def main(**k): return 1",
+      env: { PRIVATE: "secret" },
+    },
   ];
-  const out = buildToolMcpServers(specs, undefined);
+  const out = buildToolMcpServers(specs, relayDir);
   assert.equal(out.length, 1, "code-only run still attaches the server");
   assert.equal(out[0].name, "agenta-tools");
   assert.ok(
-    envValue(out[0].env, "AGENTA_TOOL_SPECS") !== undefined,
-    "AGENTA_TOOL_SPECS is set",
+    envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS") !== undefined,
+    "AGENTA_TOOL_PUBLIC_SPECS is set",
   );
   assert.equal(
     envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"),
     undefined,
     "no endpoint env for code-only run",
   );
-  // The full executable spec list round-trips through AGENTA_TOOL_SPECS.
-  assert.deepEqual(JSON.parse(envValue(out[0].env, "AGENTA_TOOL_SPECS")!), specs);
+  assert.equal(envValue(out[0].env, "AGENTA_TOOL_RELAY_DIR"), relayDir);
+  assert.equal(envValue(out[0].env, "AGENTA_TOOL_CALLBACK_AUTH"), undefined);
+  assert.equal(envValue(out[0].env, "AGENTA_TOOL_SPECS"), undefined);
+  // Only public metadata round-trips; private executor fields stay runner-side.
+  assert.deepEqual(JSON.parse(envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS")!), [
+    { name: "adder", description: "Add numbers" },
+  ]);
 }
 
-// callback specs + a callback with endpoint -> one server carrying endpoint (+ auth).
+// callback specs + a callback with endpoint -> still no endpoint/auth in child env.
 {
   const specs: ResolvedToolSpec[] = [
     { name: "search", kind: "callback", callRef: "composio.search" },
@@ -52,30 +66,31 @@ function envValue(
     endpoint: "https://agenta.example/tools/call",
     authorization: "Bearer tok",
   };
-  const out = buildToolMcpServers(specs, callback);
+  const out = buildToolMcpServers(specs, callback, relayDir);
   assert.equal(out.length, 1);
   assert.equal(
     envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"),
-    "https://agenta.example/tools/call",
-    "endpoint env set for callback tools",
+    undefined,
+    "endpoint env is never exposed to the bridge",
   );
   assert.equal(
     envValue(out[0].env, "AGENTA_TOOL_CALLBACK_AUTH"),
-    "Bearer tok",
-    "auth env set when provided",
+    undefined,
+    "auth env is never exposed to the bridge",
   );
+  assert.equal(envValue(out[0].env, "AGENTA_TOOL_RELAY_DIR"), relayDir);
 }
 
-// callback spec + endpoint but no authorization -> no AUTH env entry.
+// callback spec + endpoint but no authorization -> still only public metadata + relay dir.
 {
   const specs: ResolvedToolSpec[] = [
     { name: "search", kind: "callback", callRef: "composio.search" },
   ];
-  const out = buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" });
+  const out = buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }, relayDir);
   assert.equal(out.length, 1);
   assert.equal(
     envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"),
-    "https://agenta.example/tools/call",
+    undefined,
   );
   assert.equal(
     envValue(out[0].env, "AGENTA_TOOL_CALLBACK_AUTH"),
@@ -87,11 +102,11 @@ function envValue(
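-// absent kind defaults to callback (back-compat): endpoint still wired when present.
+// absent kind defaults to callback (back-compat): server still attaches, endpoint env never exposed.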
 {
   const specs: ResolvedToolSpec[] = [{ name: "legacy", callRef: "composio.legacy" }];
-  const out = buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" });
+  const out = buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }, relayDir);
   assert.equal(out.length, 1, "back-compat (no kind) attaches as a callback tool");
   assert.equal(
     envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"),
-    "https://agenta.example/tools/call",
+    undefined,
   );
 }
 
@@ -101,7 +116,7 @@ function envValue(
     { name: "adder", kind: "code", runtime: "python", code: "def main(**k): return 1" },
     { name: "search", kind: "callback", callRef: "composio.search" },
   ];
-  const out = buildToolMcpServers(specs, undefined);
+  const out = buildToolMcpServers(specs, relayDir);
   assert.notDeepEqual(out, [], "mixed run with no endpoint must not return []");
   assert.equal(out.length, 1, "still attaches the server so the code tool works");
   assert.equal(
@@ -109,8 +124,11 @@ function envValue(
     undefined,
     "endpoint env omitted when missing",
   );
-  // Both executable specs are still passed to the bridge.
-  assert.deepEqual(JSON.parse(envValue(out[0].env, "AGENTA_TOOL_SPECS")!), specs);
+  // Both executable specs are advertised, but only as public metadata.
+  assert.deepEqual(JSON.parse(envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS")!), [
+    { name: "adder" },
+    { name: "search" },
+  ]);
 }
 
 // empty specs -> [].
@@ -126,7 +144,7 @@ assert.deepEqual(buildToolMcpServers([], undefined), [], "empty specs -> []");
   );
   // Even with an endpoint, client-only stays empty.
   assert.deepEqual(
-    buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }),
+    buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }, relayDir),
     [],
     "client-only -> [] even with an endpoint",
   );
@@ -138,9 +156,9 @@ assert.deepEqual(buildToolMcpServers([], undefined), [], "empty specs -> []");
     { name: "confirm", kind: "client" },
     { name: "adder", kind: "code", runtime: "python", code: "def main(**k): return 1" },
   ];
-  const out = buildToolMcpServers(specs, undefined);
+  const out = buildToolMcpServers(specs, relayDir);
   assert.equal(out.length, 1, "executable spec attaches the server");
-  const passed: ResolvedToolSpec[] = JSON.parse(envValue(out[0].env, "AGENTA_TOOL_SPECS")!);
+  const passed: ResolvedToolSpec[] = JSON.parse(envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS")!);
   assert.deepEqual(
     passed.map((s) => s.name),
     ["adder"],