diff --git a/services/agent/README.md b/services/agent/README.md new file mode 100644 index 0000000000..82b5272e17 --- /dev/null +++ b/services/agent/README.md @@ -0,0 +1,121 @@ +# Agent runner (TypeScript) + +The Node side of the agent workflow service. It runs the actual agent loop and serves one +contract: a JSON request in, a structured result out. The Python service +(`services/oss/src/agent/`) decides *what* to run (config, tools, secrets, trace) and calls +in here; this package *runs* it. It lives in Node because the harnesses (Pi, Claude Code, +rivet's `sandbox-agent`) are Node libraries with no Python SDK. + +## How it is invoked + +Two entrypoints, same `/run` contract (see `src/protocol.ts`): + +- **`src/cli.ts`** — one JSON request on stdin, one result on stdout. The Python + SDK adapters use this subprocess transport when `AGENTA_AGENT_PI_URL` is unset. stdout is + the result channel only; logs go to stderr. +- **`src/server.ts`** — the same thing as a long-lived HTTP server on `:8765` + (`GET /health`, `POST /run`). This is the dockerized agent runner sidecar the Python SDK + adapters call over HTTP when `AGENTA_AGENT_PI_URL` points at it. The dev image + (`docker/Dockerfile.dev`) runs `tsx watch src/server.ts`. + +Both route to an engine by the request's `backend` field. 
+ +## Layout (`src/`) + +``` +src/ + cli.ts entrypoint: stdin/stdout (subprocess transport) + server.ts entrypoint: HTTP sidecar on :8765 + protocol.ts the /run wire contract (request, result, events, capabilities) + engines/ + pi.ts engine: drive the Pi SDK in-process + rivet.ts engine: drive a harness over ACP via a rivet sandbox-agent daemon + tracing/ + otel.ts turn a run into OpenTelemetry spans nested under /invoke + tools/ + callback.ts the one /tools/call HTTP client + code.ts execute resolved code tools in a scoped subprocess + dispatch.ts dispatch resolved tools by executor kind + mcp-bridge.ts build the MCP server config that exposes tools to a harness + mcp-server.ts the stdio MCP server itself (launched per session by the daemon) + extensions/ + agenta.ts the Pi extension (tracing + tools), bundled into dist/ for Pi to load +``` + +## Engines + +- **`pi`** (`engines/pi.ts`) — drives the Pi SDK directly in-process. +- **`rivet`** (`engines/rivet.ts`) — drives any harness (`pi`, `claude`) over the Agent + Client Protocol through a rivet `sandbox-agent` daemon, either local or in a Daytona + sandbox. This is the default on the platform. + +The engine is a deployment choice (`backend` on the wire / `AGENT_BACKEND` env), not a +harness. Harness choice (`pi`, `claude`, or experimental `agenta`) and sandbox (`local` or +`daytona`, where supported) are per-run config the Python service sends. + +## Result + +```json +{ + "ok": true, + "output": "Rome", + "messages": [{ "role": "assistant", "content": "Rome" }], + "events": [{ "type": "message", "text": "Rome" }, { "type": "done" }], + "usage": { "input": 1297, "output": 5, "total": 1302, "cost": 0.0066 }, + "stopReason": "end_turn", + "capabilities": { "mcpTools": false, "images": true, "...": "..." }, + "sessionId": "...", + "model": "openai-codex/gpt-5.5", + "traceId": "..." 
+} +``` + +`runRivet` probes the harness's capabilities and branches on them (for example, tools go +over MCP only when the harness advertises `mcpTools`); usage and the structured event log +come back on every run. + +## Tracing + +When the request carries a `trace` block, the run is exported to Agenta as OpenTelemetry +spans nested under the caller's `/invoke` span. The Pi path self-instruments via the +bundled extension (`extensions/agenta.ts`); other harnesses are traced from the rivet ACP +event stream (`tracing/otel.ts`). The Python `tracing` module fills `trace` in from the +live workflow span. + +## Tools + +Tools are resolved in the Python backend and arrive on the request as `customTools` plus a +`toolCallback`. Delivery is capability-routed: the Pi extension registers them natively; +other harnesses get them over MCP through `tools/mcp-bridge.ts` + `tools/mcp-server.ts`. +Either way each call POSTs back to Agenta's `/tools/call` (`tools/callback.ts`), so the +provider key and connection auth stay server-side. + +## The extension bundle + +`scripts/build-extension.mjs` esbuild-bundles `src/extensions/agenta.ts` into one +self-contained `dist/extensions/agenta.js` that Pi can load anywhere (host, the sidecar, a +Daytona snapshot). The dev image bakes it; rebuild after editing the extension or the +tracer: + +```bash +pnpm run build:extension +``` + +## Auth + +Provider keys arrive as `request.secrets` (resolved from the project vault) or fall back to +the harness's own login: Pi reads `~/.pi/agent/auth.json` (`pnpm exec pi` then `/login`), +Claude Code reads `~/.claude`. Set `OPENAI_API_KEY` / `ANTHROPIC_API_KEY` to override. + +## config/ + +`config/AGENTS.md` and `config/agent.json` are a fallback "hello-world" agent, used only +when a request arrives with no config. In practice the playground always sends the agent +revision's config, so these are rarely hit. 
+ +## Local use + +```bash +pnpm install +echo '{"backend":"pi","messages":[{"role":"user","content":"Hi"}]}' | pnpm run run:cli +``` diff --git a/services/agent/config/AGENTS.md b/services/agent/config/AGENTS.md new file mode 100644 index 0000000000..767a2cdd49 --- /dev/null +++ b/services/agent/config/AGENTS.md @@ -0,0 +1,7 @@ +# Hello-world agent + +You are a friendly hello-world agent running on the Agenta agent service. + +- Greet the user warmly. +- Answer the user's message in one or two short sentences. +- Do not use tools. Keep replies plain text. diff --git a/services/agent/config/agent.json b/services/agent/config/agent.json new file mode 100644 index 0000000000..adc26f793c --- /dev/null +++ b/services/agent/config/agent.json @@ -0,0 +1,4 @@ +{ + "model": "gpt-5.5", + "tools": [] +} diff --git a/services/agent/docker/Dockerfile b/services/agent/docker/Dockerfile new file mode 100644 index 0000000000..687fea4347 --- /dev/null +++ b/services/agent/docker/Dockerfile @@ -0,0 +1,55 @@ +# Agent runner sidecar (sandbox-agent server), production image. +# +# Runs the TypeScript runner (src/server.ts) as a long-lived HTTP server on :8765. +# The Python agent service calls it in-network. Unlike Dockerfile.dev there is no +# `tsx watch` and no bind mount: the source is baked in. +# +# Licensing posture (see docker/README.md): +# - Pi (@earendil-works/pi-coding-agent, MIT) is baked via the npm dependencies. +# - Claude Code is proprietary (Anthropic Commercial Terms). It is NEVER baked into +# this image. The sandbox-agent daemon installs it at runtime from Anthropic over +# HTTPS (the reason ca-certificates is installed). That keeps Anthropic as the +# distributor, the only compliant path for an image we build and ship. +# - No credential is baked: no API key, no OAuth login. Auth is injected at runtime +# (ANTHROPIC_API_KEY / request secrets; OAuth self-host is a mounted opt-in only). 
+ +FROM node:24-slim + +WORKDIR /app + +# CA certificates: the sandbox-agent daemon (Rust) downloads harness CLIs (e.g. Claude +# Code) over HTTPS using the system trust store, which node:*-slim omits — without this +# the daemon's `install-agent claude` fails TLS verification. git lets npm/installers +# fetch git deps. +RUN apt-get update \ + && apt-get install -y --no-install-recommends ca-certificates git \ + && rm -rf /var/lib/apt/lists/* + +RUN corepack enable + +# Install deps as a cached layer (manifest + lockfile only). The full dependency set is +# installed (not --prod): the runtime uses `tsx` and the extension build uses `esbuild`, +# both devDependencies. +COPY package.json pnpm-lock.yaml ./ +RUN pnpm install --frozen-lockfile + +# Bake the source (no bind mount in production). +COPY tsconfig.json ./ +COPY scripts ./scripts +COPY src ./src +COPY config ./config +COPY skills ./skills + +# Bundle the Agenta Pi extension (tracing + tools) into dist/. runSandboxAgent installs +# this baked copy into Pi's agent dir on every run. Rebuild the image after editing +# src/extensions/agenta.ts or the tracer. +RUN pnpm run build:extension + +ENV NODE_ENV=production \ + PORT=8765 + +EXPOSE 8765 + +# Call the local tsx binary directly to avoid pnpm/corepack HOME writes when the +# container runs as a non-root host uid. +CMD ["node_modules/.bin/tsx", "src/server.ts"] diff --git a/services/agent/docker/Dockerfile.dev b/services/agent/docker/Dockerfile.dev new file mode 100644 index 0000000000..4f2f64f126 --- /dev/null +++ b/services/agent/docker/Dockerfile.dev @@ -0,0 +1,41 @@ +# Pi harness sidecar (WP-2), dev image. +# +# Runs the TypeScript Pi wrapper as an HTTP server. The Python agent service calls +# it in-network. Source is bind-mounted in dev so `tsx watch` hot-reloads; node_modules +# stays baked into the image. Build context is services/agent. + +FROM node:24-slim + +WORKDIR /app + +# CA certificates: the rivet daemon (Rust) downloads harness CLIs (e.g. 
Claude Code) over +# HTTPS using the system trust store, which node:*-slim omits — without this the daemon's +# `install-agent claude` fails TLS verification. git lets npm/installers fetch git deps. +RUN apt-get update \ + && apt-get install -y --no-install-recommends ca-certificates git \ + && rm -rf /var/lib/apt/lists/* + +RUN corepack enable + +# Install deps as a cached layer (manifest + lockfile only). +COPY package.json pnpm-lock.yaml ./ +RUN pnpm install --frozen-lockfile + +# Fallback copy for non-mounted runs; in dev these are bind-mounted over. +COPY tsconfig.json ./ +COPY scripts ./scripts +COPY src ./src + +# Bundle the Agenta Pi extension (tracing + tools) into dist/. dist/ is NOT bind-mounted +# in dev, so this baked copy is what runRivet installs into Pi's agent dir. Rebuild the +# image after editing src/extensions/agenta.ts or src/tracing/otel.ts. +RUN pnpm run build:extension + +ENV NODE_ENV=development \ + PORT=8765 + +EXPOSE 8765 + +# Call the local tsx binary directly to avoid pnpm/corepack HOME writes when the +# container runs as a non-root host uid. +CMD ["node_modules/.bin/tsx", "watch", "src/server.ts"] diff --git a/services/agent/docker/README.md b/services/agent/docker/README.md new file mode 100644 index 0000000000..63895b109a --- /dev/null +++ b/services/agent/docker/README.md @@ -0,0 +1,66 @@ +# Agent sidecar images + +Images for the agent runner sidecar (the `sandbox-agent server` runtime in +`services/agent/src/server.ts`). The Python service calls it in-network at +`:8765`. + +- `Dockerfile.dev` — dev image. `tsx watch`, source bind-mounted, hot reload. +- `Dockerfile` — production image. Source baked in, no watcher. + +## Licensing posture (read before changing any image or build recipe) + +The rule that shapes every image here: + +> **We ship build recipes, not Claude-containing images, and we never bake a +> credential into any image.** + +Why: + +- **Pi** (`@earendil-works/pi-coding-agent`) is MIT. 
We bake it freely via the npm + dependencies, in every image and snapshot. +- **Claude Code** is proprietary (© Anthropic PBC, governed by Anthropic's + [Commercial Terms](https://www.anthropic.com/legal/commercial-terms); + [legal & compliance](https://code.claude.com/docs/en/legal-and-compliance)). The + Commercial Terms grant a usage license only. They do not grant any right to + redistribute, resell, sublicense, or repackage the Services. So an image **we + build and distribute must not contain Claude Code.** +- Claude Code is installed **from Anthropic** (`npm install -g + @anthropic-ai/claude-code`, `https://claude.ai/install.sh`, or the daemon's + `install-agent claude`). That keeps Anthropic as the distributor, which is the + permitted path. The production sidecar does this at runtime; a snapshot we build + for our own use does it at build time. + +## Authentication + +Auth is injected at runtime, never baked into a layer. + +- **API key (default, and the only option for cloud / multi-tenant).** Set + `ANTHROPIC_API_KEY` (or pass provider keys as request secrets from the vault). + Anthropic directs products and services that interact with Claude to use API key + auth, so this is the path for any Agenta-orchestrated run that serves users. +- **OAuth subscription (self-host opt-in only).** An individual operator may mount + their own Claude login (e.g. `~/.claude`) into the container and run with their + own subscription. This is for personal, individual use of Claude Code, never for + serving other users, and it is the operator's responsibility. Anthropic restricts + Free/Pro/Max OAuth to first-party use and forbids third parties routing requests + through it (enforced since 2026-03). Cloud and multi-tenant deployments must stay + API-key only. + +We never bake an OAuth login or an API key into an image. + +## Build recipes (two paths) + +- **Cloud / Daytona (API key).** The Daytona snapshot recipe bakes Pi. 
Agenta Cloud + builds and uses its own snapshot internally; self-hosters run the same recipe + against their own Daytona account. We ship the build script (the recipe), not the + built snapshot, so we never distribute a Claude-containing artifact. Snapshot + builder: `docs/design/agent-workflows/scratch/wp-8-rivet-acp-runtime/poc/build_rivet_snapshot.py`. + Today it bases on rivet's `-full` image, which already bundles Claude. That is + compliant under the recipe-not-image model. **Cleaner-provenance follow-up + (needs a live Daytona build to verify):** base on a daemon-only rivet image and + install Claude from Anthropic at build, so the snapshot's Claude comes straight + from Anthropic rather than from a third party's bundled image. Relocation of the + builder into this folder is a follow-up. +- **Self-host (API key, OAuth optional).** Build the production `Dockerfile` (it + bakes neither Claude nor a credential), then supply auth at runtime: an + `ANTHROPIC_API_KEY` env var, or, for individual use, a mounted OAuth login dir. diff --git a/services/agent/scripts/build-extension.mjs b/services/agent/scripts/build-extension.mjs new file mode 100644 index 0000000000..debdae88d7 --- /dev/null +++ b/services/agent/scripts/build-extension.mjs @@ -0,0 +1,30 @@ +/** + * Bundle the Agenta Pi extension into one self-contained file so its OpenTelemetry deps + * resolve wherever Pi loads it (host, docker sidecar, Daytona snapshot). Pi only accepts + * `.ts`/`.js` extension files, so we emit `.js` (ESM) with a default export. 
+ * + * Run: pnpm run build:extension -> dist/extensions/agenta.js + */ +import { build } from "esbuild"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +const root = join(dirname(fileURLToPath(import.meta.url)), ".."); + +await build({ + entryPoints: [join(root, "src/extensions/agenta.ts")], + outfile: join(root, "dist/extensions/agenta.js"), + bundle: true, + platform: "node", + format: "esm", + target: "node20", + // Pi provides the ExtensionAPI at load time; never bundle the harness SDK. + external: ["@earendil-works/pi-coding-agent"], + banner: { + // protobufjs and some deps expect CommonJS globals under ESM; shim them. + js: "import{createRequire as __cr}from'node:module';const require=__cr(import.meta.url);", + }, + logLevel: "info", +}); + +process.stderr.write("[build-extension] wrote dist/extensions/agenta.js\n"); diff --git a/services/agent/skills/agenta-getting-started/SKILL.md b/services/agent/skills/agenta-getting-started/SKILL.md new file mode 100644 index 0000000000..44bc6a7a6b --- /dev/null +++ b/services/agent/skills/agenta-getting-started/SKILL.md @@ -0,0 +1,21 @@ +--- +name: agenta-getting-started +description: Baseline guidance for agents running on the Agenta platform. Use at the start of a task to recall how to work with the tools and skills Agenta provides and how to report results clearly. +--- + +# Agenta getting started + +This is a placeholder Agenta skill that ships with the `AgentaHarness`. It proves the +forced-skill path end to end; replace its content with real Agenta guidance. + +## When to use + +Read this when you begin a task and want a reminder of the Agenta conventions below. + +## Conventions + +- Prefer the provided tools and skills over guessing; call a tool when one fits. +- When another skill matches the task, read its `SKILL.md` fully before acting. +- Keep answers grounded in what the tools and skills actually return. Do not fabricate + results or tool output. +- Be concise. 
State what you did, what it returned, and what is left. diff --git a/services/agent/src/cli.ts b/services/agent/src/cli.ts new file mode 100644 index 0000000000..7f45ebb714 --- /dev/null +++ b/services/agent/src/cli.ts @@ -0,0 +1,88 @@ +/** + * WP-2 Pi wrapper CLI: the JSON transport for the Harness port. + * + * Reads one JSON `AgentRunRequest` from stdin, runs Pi once, and writes one JSON + * `AgentRunResult` to stdout. stdout carries the result and nothing else; logs go + * to stderr. This is the one-shot "json adapter" the design doc describes; a + * long-lived RPC adapter can replace it later behind the same Python-side port. + */ +import type { + AgentRunRequest, + AgentRunResult, + EmitEvent, + StreamRecord, +} from "./protocol.ts"; +import { runPi } from "./engines/pi.ts"; +import { runRivet } from "./engines/rivet.ts"; + +// Engine: `rivet` drives a harness over ACP via a rivet daemon; `pi` (default) is the +// legacy in-process Pi path. The request's `backend` wins, then the AGENT_BACKEND env. +function runAgent( + request: AgentRunRequest, + emit?: EmitEvent, +): Promise<AgentRunResult> { + const backend = (request.backend ?? process.env.AGENT_BACKEND ?? "pi").toLowerCase(); + return backend === "rivet" ? runRivet(request, emit) : runPi(request, emit); +} + +async function readStdin(): Promise<string> { + const chunks: Buffer[] = []; + for await (const chunk of process.stdin) { + chunks.push(chunk as Buffer); + } + return Buffer.concat(chunks).toString("utf8"); +} + +// One-shot mode: the whole result as a single JSON document (the `/invoke` contract). +function emitResult(result: AgentRunResult): void { + process.stdout.write(JSON.stringify(result)); +} + +// Streaming mode (`--stream`): one NDJSON record per line — an `{kind:"event"}` line the +// moment each event is built, then exactly one terminal `{kind:"result"}` line. 
+function writeRecord(record: StreamRecord): void { + process.stdout.write(JSON.stringify(record) + "\n"); +} + +async function main(): Promise<void> { + const stream = process.argv.includes("--stream"); + const raw = await readStdin(); + + let request: AgentRunRequest; + try { + request = raw.trim() ? (JSON.parse(raw) as AgentRunRequest) : {}; + } catch (err) { + const failure: AgentRunResult = { ok: false, error: `Invalid JSON on stdin: ${String(err)}` }; + if (stream) writeRecord({ kind: "result", result: failure }); + else emitResult(failure); + process.exit(1); + } + + if (!stream) { + try { + const result = await runAgent(request); + emitResult(result); + process.exit(result.ok ? 0 : 1); + } catch (err) { + emitResult({ + ok: false, + error: err instanceof Error ? err.stack ?? err.message : String(err), + }); + process.exit(1); + } + return; + } + + const emit: EmitEvent = (event) => writeRecord({ kind: "event", event }); + let result: AgentRunResult; + try { + result = await runAgent(request, emit); + } catch (err) { + result = { ok: false, error: err instanceof Error ? err.stack ?? err.message : String(err) }; + } + // Streaming delivered the events live, so don't echo them in the terminal record. + writeRecord({ kind: "result", result: { ...result, events: [] } }); + process.exit(result.ok ? 0 : 1); +} + +main(); diff --git a/services/agent/src/engines/pi.ts b/services/agent/src/engines/pi.ts new file mode 100644 index 0000000000..2be7d1698f --- /dev/null +++ b/services/agent/src/engines/pi.ts @@ -0,0 +1,432 @@ +/** + * Legacy backend: drive the Pi SDK in-process for one cold run. + * + * This is the non-rivet engine. It drives Pi's `createAgentSession` directly: injects + * AGENTS.md in memory, resolves the model, sends one user turn, and returns the structured + * result (final text, messages, events, usage, capabilities). It also turns the + * backend-resolved runnable tools (WP-7) into Pi customTools that route back through + * Agenta's /tools/call. 
The rivet engine (`engines/rivet.ts`) is the ACP path; both serve the + * same `/run` contract (see `protocol.ts`). + * + * Auth: provider keys arrive as `request.secrets` (applied to the env) or fall back to the + * local Pi login (`AuthStorage.create()` reads ~/.pi/agent/auth.json). Nothing + * invocation-specific is written to a persistent disk: the session is in-memory and the + * working dir is a throwaway temp dir. + * + * Important: stdout is reserved for the JSON result (see cli.ts). Everything here logs to + * stderr so it never pollutes the result channel. + */ +import { existsSync, mkdtempSync, rmSync, statSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { dirname, isAbsolute, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +import { + AuthStorage, + createAgentSession, + DefaultResourceLoader, + getAgentDir, + ModelRegistry, + SessionManager, + SettingsManager, +} from "@earendil-works/pi-coding-agent"; + +import { createAgentaOtel } from "../tracing/otel.ts"; +import { + type AgentEvent, + type AgentRunRequest, + type AgentRunResult, + type ChatMessage, + type EmitEvent, + type HarnessCapabilities, + type ResolvedToolSpec, + type ToolCallbackContext, + resolveRunSessionId, + resolvePromptText, +} from "../protocol.ts"; +import { EMPTY_OBJECT_SCHEMA } from "../tools/callback.ts"; +import { runResolvedTool } from "../tools/dispatch.ts"; + +/** What the in-process Pi engine supports. Static (no daemon to probe, unlike rivet). */ +const PI_CAPABILITIES: HarnessCapabilities = { + textMessages: true, + toolCalls: true, + reasoning: true, + usage: true, + streamingDeltas: true, + images: false, + fileAttachments: false, + mcpTools: false, + planMode: false, + permissions: false, + sessionLifecycle: false, +}; + +function log(message: string): void { + process.stderr.write(`[pi-wrapper] ${message}\n`); +} + +// services/agent/src/engines/pi.ts -> services/agent. 
Bundled skills (the Agenta harness's +// forced skills) live under services/agent/skills/<name>/. Overridable for non-default layouts. +const PKG_ROOT = dirname(dirname(dirname(fileURLToPath(import.meta.url)))); +const SKILLS_ROOT = process.env.AGENTA_AGENT_SKILLS_DIR || join(PKG_ROOT, "skills"); + +/** + * Resolve the requested skill names to bundled skill directories under SKILLS_ROOT. Each name + * must be a committed dir holding a SKILL.md (Pi loads them and surfaces them in the system + * prompt). Absolute paths are honored as-is; unknown or non-directory entries are skipped with + * a warning so a stale name never fails the run. + */ +function resolveSkillDirs(names: string[] | undefined): string[] { + const dirs: string[] = []; + for (const name of names ?? []) { + if (!name) continue; + const path = isAbsolute(name) ? name : join(SKILLS_ROOT, name); + try { + if (existsSync(path) && statSync(path).isDirectory()) { + dirs.push(path); + } else { + log(`skipping unknown skill "${name}" (no directory at ${path})`); + } + } catch { + log(`skipping skill "${name}": cannot stat ${path}`); + } + } + return dirs; +} + +// In-process Pi reads provider keys from process.env. Since process.env is process-global, +// serialize Pi runs while applying request-scoped provider env, then restore the prior env +// exactly so one request's vault keys cannot leak into the next request. +let providerEnvQueue: Promise<void> = Promise.resolve(); + +async function withRequestProviderEnv<T>( + secrets: Record<string, string> | undefined, + fn: () => Promise<T>, +): Promise<T> { + const run = providerEnvQueue.then(async () => { + const previous = new Map<string, string | undefined>(); + for (const [key, value] of Object.entries(secrets ?? 
{})) { + previous.set(key, process.env[key]); + if (value) process.env[key] = value; + else delete process.env[key]; + } + try { + return await fn(); + } finally { + for (const [key, value] of previous) { + if (value === undefined) delete process.env[key]; + else process.env[key] = value; + } + } + }); + providerEnvQueue = run.then( + () => undefined, + () => undefined, + ); + return run; +} + +/** Pick the requested model, else gpt-5.5, else a sensible non-mini default. */ +function pickModel(available: any[], wanted?: string): any { + return ( + (wanted && + available.find((m) => m.id === wanted || `${m.provider}/${m.id}` === wanted)) || + available.find((m) => m.id === "gpt-5.5") || + available.find((m) => !/spark|mini/i.test(m.id)) || + available[0] + ); +} + +/** Concatenate the text blocks of the last assistant message. */ +function extractAssistantText(messages: any[]): string { + for (let i = messages.length - 1; i >= 0; i--) { + const message = messages[i]; + if (message?.role !== "assistant") continue; + const content = message.content; + if (typeof content === "string") return content; + if (Array.isArray(content)) { + const text = content + .filter((block: any) => block?.type === "text" && block.text) + .map((block: any) => block.text) + .join(""); + if (text) return text; + } + } + return ""; +} + +/** The stop reason of the last assistant message, when Pi set one. */ +function lastStopReason(messages: any[]): string | undefined { + for (let i = messages.length - 1; i >= 0; i--) { + if (messages[i]?.role === "assistant" && messages[i].stopReason) { + return String(messages[i].stopReason); + } + } + return undefined; +} + +/** + * Turn resolved tool specs into Pi customTools, branching on the executor `kind`: + * - `callback` (default): `execute` POSTs back through Agenta's /tools/call, so the Composio + * key and connection auth stay server-side. + * - `code`: `execute` runs the snippet in a sandbox subprocess with its scoped secret env. 
+ * - `client`: browser-fulfilled, so skipped on the in-process path (no browser to answer). + * + * A failed `execute` throws, which Pi turns into a tool-error result (the loop continues) + * rather than a run failure. Pi accepts a plain JSON Schema for `parameters` (non-TypeBox path). + */ +export function buildCustomTools( + specs: ResolvedToolSpec[], + callback: ToolCallbackContext | undefined, +): any[] { + const tools: any[] = []; + for (const spec of specs) { + const base = { + name: spec.name, + label: spec.name, + description: spec.description ?? spec.name, + parameters: (spec.inputSchema as any) ?? EMPTY_OBJECT_SCHEMA, + }; + if (spec.kind === "client") { + log(`skipping client tool '${spec.name}' (browser-fulfilled; not available in-process)`); + continue; + } + if (spec.kind === "code") { + tools.push({ + ...base, + async execute(toolCallId: string, params: unknown, signal?: AbortSignal) { + const text = await runResolvedTool(spec, params, { toolCallId, signal }); + return { content: [{ type: "text", text }], details: { kind: "code" } }; + }, + }); + continue; + } + // callback (default): route back to Agenta's /tools/call. 
+ if (!callback?.endpoint) { + log(`skipping callback tool '${spec.name}': missing toolCallback endpoint`); + continue; + } + tools.push({ + ...base, + async execute(toolCallId: string, params: unknown, signal?: AbortSignal) { + const text = await runResolvedTool(spec, params, { + toolCallId, + endpoint: callback.endpoint, + authorization: callback.authorization, + signal, + }); + return { + content: [{ type: "text", text }], + details: { callRef: spec.callRef }, + }; + }, + }); + } + return tools; +} + +export async function runPi( + request: AgentRunRequest, + emit?: EmitEvent, +): Promise<AgentRunResult> { + return withRequestProviderEnv(request.secrets, () => runPiWithEnv(request, emit)); +} + +async function runPiWithEnv( + request: AgentRunRequest, + emit?: EmitEvent, +): Promise<AgentRunResult> { + const prompt = resolvePromptText(request); + if (!prompt) { + return { ok: false, error: "No user message to send (prompt/messages empty)." }; + } + + const cwd = mkdtempSync(join(tmpdir(), "agenta-agent-")); + + try { + const authStorage = AuthStorage.create(); + const modelRegistry = ModelRegistry.create(authStorage); + const available = await modelRegistry.getAvailable(); + if (!available || available.length === 0) { + return { + ok: false, + error: + "No model available. Log in with `pnpm exec pi` -> /login, or set OPENAI_API_KEY / ANTHROPIC_API_KEY.", + }; + } + + const model = pickModel(available, request.model); + log(`model: ${model.provider}/${model.id}`); + + // Tracing: turn this run into OTel spans. When the caller passed a traceparent, + // invoke_agent nests under their /invoke span so the whole agent run is part of the + // same trace (just like completion/chat). 
+ const otel = createAgentaOtel({ + traceparent: request.trace?.traceparent, + baggage: request.trace?.baggage, + endpoint: request.trace?.endpoint, + authorization: request.trace?.authorization, + captureContent: request.trace?.captureContent, + }); + + // Inject AGENTS.md in memory and keep on-disk context files out of the run. + const agentsMd = request.agentsMd?.trim(); + // Pi's two system-prompt layers, carried on the request (PiAgentConfig.system / + // append_system). `systemPrompt` replaces Pi's base prompt; `appendSystemPrompt` adds to + // it. We feed them through the loader overrides so the run stays hermetic: only what the + // request carries applies, never a SYSTEM.md / APPEND_SYSTEM.md left on disk. + const systemPrompt = request.systemPrompt?.trim(); + const appendSystemPrompt = request.appendSystemPrompt?.trim(); + // Forced skills (the Agenta harness): load exactly the bundled dirs the request names. + // `noSkills` suppresses host/global discovery so the run is deterministic; the loader still + // merges `additionalSkillPaths` on top, so the bundled skills load. They only surface in + // the prompt when `read` is enabled (the harness forces it). + const skillDirs = resolveSkillDirs(request.skills); + if (skillDirs.length > 0) { + log(`skills: ${skillDirs.join(", ")}`); + } + const loader = new DefaultResourceLoader({ + cwd, + agentDir: getAgentDir(), + noContextFiles: true, + noSkills: true, + additionalSkillPaths: skillDirs, + systemPromptOverride: () => systemPrompt || undefined, + appendSystemPromptOverride: () => (appendSystemPrompt ? [appendSystemPrompt] : []), + agentsFilesOverride: () => ({ + agentsFiles: agentsMd + ? [{ path: "/virtual/AGENTS.md", content: agentsMd }] + : [], + }), + extensionFactories: [otel.register], + }); + await loader.reload(); + + // Build runnable tools from the resolved specs. Pi's allowlist gates custom tools too, + // so their names must be in `tools` for the model to see them. 
+ const customTools = buildCustomTools(request.customTools ?? [], request.toolCallback); + const toolAllowlist = [ + ...(request.tools ?? []), + ...customTools.map((tool) => tool.name), + ]; + if (customTools.length > 0) { + log(`custom tools: ${customTools.map((t) => t.name).join(", ")}`); + } + + // Created before the prompt so a throw mid-run still flushes the partial trace and + // disposes the session (the inner finally below). Mirrors the rivet engine's pattern. + let session: Awaited<ReturnType<typeof createAgentSession>>["session"] | undefined; + try { + ({ session } = await createAgentSession({ + cwd, + model, + authStorage, + modelRegistry, + tools: toolAllowlist, + customTools, + sessionManager: SessionManager.inMemory(cwd), + settingsManager: SettingsManager.inMemory(), + resourceLoader: loader, + })); + + // Hand the session id + model to the extension so spans carry them. + const sessionId = resolveRunSessionId(request, session.sessionId); + otel.config.sessionId = sessionId; + otel.config.provider = model.provider; + otel.config.requestModel = model.id; + + // Accumulate streamed text as the primary output channel. On the streaming path, flush + // each Pi `text_delta` as a `message_delta` live (Pi deltas are already pure, so they + // emit verbatim); the block opens on the first delta and closes after the run. + let streamed = ""; + let piTextId: string | undefined; + session.subscribe((event: any) => { + if ( + event.type === "message_update" && + event.assistantMessageEvent?.type === "text_delta" + ) { + const delta = event.assistantMessageEvent.delta ?? 
""; + if (!delta) return; + streamed += delta; + if (emit) { + if (piTextId === undefined) { + piTextId = "msg-0"; + emit({ type: "message_start", id: piTextId }); + } + emit({ type: "message_delta", id: piTextId, delta }); + } + } + }); + + await session.prompt(prompt); + + const output = streamed.trim() || extractAssistantText(session.messages); + const stopReason = lastStopReason(session.messages); + const usage = otel.usage(); + + // Ship this run's trace before the result is returned (and before the CLI process + // exits): invoke_agent has a remote parent, so the per-trace flush is what exports it. + await otel.flush(); + + // The structured stream is thinner here than on the rivet path: Pi's in-process tool + // events feed the trace spans, while the result-level event log carries the final + // message, usage, and stop reason (enough for the platform without double-plumbing). + // + // On the streaming path the events were flushed live via `emit`, so the result log stays + // empty; here we only close the open text block (or synthesize one when the text never + // streamed) and flush the tail usage/done events. + const events: AgentEvent[] = []; + const emitOrLog = (event: AgentEvent): void => { + if (emit) emit(event); + else events.push(event); + }; + if (emit) { + if (piTextId !== undefined) { + emit({ type: "message_end", id: piTextId }); + } else if (output) { + emit({ type: "message_start", id: "msg-0" }); + emit({ type: "message_delta", id: "msg-0", delta: output }); + emit({ type: "message_end", id: "msg-0" }); + } + } else if (output) { + events.push({ type: "message", text: output }); + } + if (usage.total > 0) emitOrLog({ type: "usage", ...usage }); + emitOrLog({ type: "done", stopReason }); + + const messages: ChatMessage[] = output + ? 
[{ role: "assistant", content: output }] + : []; + + return { + ok: true, + output, + messages, + events, + usage, + stopReason, + // `streamingDeltas` is only honest when a live sink carried the deltas end-to-end. + capabilities: { ...PI_CAPABILITIES, streamingDeltas: !!emit }, + sessionId, + model: `${model.provider}/${model.id}`, + traceId: otel.config.traceId, + }; + } catch (err) { + // Flush the partial trace before the error propagates so a failed run is still + // observable (the happy-path flush above never ran). Best-effort: never mask `err`. + await otel.flush().catch(() => {}); + throw err; + } finally { + // Pi keeps the in-memory session alive until disposed; release it on every exit + // (success or throw). Guarded for the case where createAgentSession itself threw. + session?.dispose(); + } + } finally { + try { + rmSync(cwd, { recursive: true, force: true }); + } catch { + // best-effort cleanup of the throwaway working dir + } + } +} diff --git a/services/agent/src/engines/rivet.ts b/services/agent/src/engines/rivet.ts new file mode 100644 index 0000000000..3a5d138106 --- /dev/null +++ b/services/agent/src/engines/rivet.ts @@ -0,0 +1,948 @@ +/** + * WP-8 rivet harness driver. + * + * Drives a coding harness (Pi, Claude Code, ...) over the Agent Client Protocol (ACP) + * through a rivet `sandbox-agent` daemon, instead of the bespoke Pi SDK calls in the pi + * engine. It serves the same /run contract (AgentRunRequest -> AgentRunResult), so the + * Python side stays thin and the choice of harness/sandbox is config, not new code. 
+ * + * Per invoke (cold), mirroring the shipped code-evaluator DaytonaRunner pattern: + * + * SandboxAgent.start({ sandbox: local({ env }) | daytona({ create }) }) + * -> createSession({ agent: , cwd, model }) + * -> write AGENTS.md into cwd + * -> session.prompt([{ type: "text", text }]) + * -> accumulate ACP `agent_message_chunk` text + build the trace + * -> destroySandbox() + * + * Two orthogonal axes swap independently: the sandbox (where the daemon runs) and the + * harness (which engine). The ACP boundary is daemon-to-harness; the service-to-rivet + * hop stays harness-agnostic behind the Harness port. + * + * Tracing is built here from the ACP event stream (see tracing/otel.ts createRivetOtel), + * so it is uniform across every harness and always nests under the caller's /invoke + * span. stdout is reserved for the JSON result (see cli.ts); logs go to stderr. + */ +import { randomBytes } from "node:crypto"; +import { + chmodSync, + copyFileSync, + existsSync, + mkdirSync, + mkdtempSync, + readdirSync, + readFileSync, + rmSync, + writeFileSync, +} from "node:fs"; +import { createRequire } from "node:module"; +import { tmpdir } from "node:os"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +import { SandboxAgent, InMemorySessionPersistDriver } from "sandbox-agent"; +import { local } from "sandbox-agent/local"; +import { daytona } from "sandbox-agent/daytona"; + +import { createRivetOtel } from "../tracing/otel.ts"; +import { buildToolMcpServers, type McpServerStdio } from "../tools/mcp-bridge.ts"; +import { executableToolSpecs, publicToolSpecs } from "../tools/public-spec.ts"; +import { + localRelayHost, + sandboxRelayHost, + startToolRelay, +} from "../tools/relay.ts"; +import { + PolicyResponder, + decisionToReply, + policyFromRequest, + type Responder, +} from "../responder.ts"; +import { + type AgentRunRequest, + type AgentRunResult, + type ChatMessage, + type ContentBlock, + type EmitEvent, + type 
HarnessCapabilities, + type McpServerConfig, + type ResolvedToolSpec, + type ToolCallbackContext, + messageText, + resolvePromptText, + resolveRunSessionId, +} from "../protocol.ts"; + +const require = createRequire(import.meta.url); +// services/agent/src/engines/rivet.ts -> services/agent +const PKG_ROOT = dirname(dirname(dirname(fileURLToPath(import.meta.url)))); +const ADAPTER_BIN_DIR = join(PKG_ROOT, "node_modules", ".bin"); + +/** Map node platform/arch to the @sandbox-agent CLI binary package. */ +const CLI_PACKAGES: Record = { + "darwin-arm64": "@sandbox-agent/cli-darwin-arm64", + "darwin-x64": "@sandbox-agent/cli-darwin-x64", + "linux-x64": "@sandbox-agent/cli-linux-x64", + "linux-arm64": "@sandbox-agent/cli-linux-arm64", + "win32-x64": "@sandbox-agent/cli-win32-x64", +}; + +function log(message: string): void { + process.stderr.write(`[rivet-wrapper] ${message}\n`); +} + +/** + * Resolve the sandbox-agent daemon binary. Prefers SANDBOX_AGENT_BIN, then the + * platform CLI package shipped with `sandbox-agent` (resolved from the SDK's own + * location, since pnpm nests it under `sandbox-agent`). Ensures it is executable + * (pnpm may skip the package's chmod postinstall). Returns undefined when not found; + * the local provider then runs its own resolution and surfaces a clear error. + */ +function resolveDaemonBinary(): string | undefined { + const fromEnv = process.env.SANDBOX_AGENT_BIN; + if (fromEnv && existsSync(fromEnv)) return ensureExecutable(fromEnv); + + const pkg = CLI_PACKAGES[`${process.platform}-${process.arch}`]; + if (!pkg) return undefined; + const bin = process.platform === "win32" ? "sandbox-agent.exe" : "sandbox-agent"; + try { + // Resolve from the sandbox-agent package context (its node_modules sees the + // sibling CLI package in the pnpm layout); package.json blocks the subpath, so + // resolve from the main entry instead. 
/**
 * Best-effort `chmod 755` on a resolved daemon binary.
 *
 * The chmod is attempted unconditionally; any failure (for example a read-only
 * filesystem where the binary is already executable) is ignored on purpose so that
 * resolution never fails because of permissions.
 *
 * @param path - Filesystem path of the binary.
 * @returns The same `path`, unchanged, whether or not the chmod succeeded.
 */
function ensureExecutable(path: string): string {
  const executableMode = 0o755;
  try {
    chmodSync(path, executableMode);
  } catch {
    // read-only fs (e.g. baked snapshot already +x): ignore
  }
  return path;
}
/**
 * Copy the bundled Agenta Pi extension to `<agentDir>/extensions/agenta.js` on the
 * local filesystem. Best-effort: a missing bundle or a failed mkdir/copy is logged to
 * stderr and the function returns normally.
 *
 * @param agentDir - Local Pi agent directory; `extensions/` is created beneath it.
 */
function installPiExtensionLocal(agentDir: string): void {
  const bundlePresent = existsSync(EXTENSION_BUNDLE);
  if (!bundlePresent) {
    log(`pi extension bundle missing at ${EXTENSION_BUNDLE} (run build:extension)`);
    return;
  }
  try {
    const extensionsDir = join(agentDir, "extensions");
    const target = join(extensionsDir, "agenta.js");
    mkdirSync(extensionsDir, { recursive: true });
    copyFileSync(EXTENSION_BUNDLE, target);
  } catch (err) {
    log(`pi extension install skipped: ${(err as Error).message}`);
  }
}

/**
 * Upload the bundled Agenta Pi extension to `<agentDir>/extensions/agenta.js` inside a
 * sandbox through its filesystem API (`mkdirFs` / `writeFsFile`). Best-effort: returns
 * silently when the bundle does not exist locally, and logs (without throwing) when the
 * upload fails.
 *
 * @param sandbox - Sandbox handle exposing `mkdirFs` and `writeFsFile`.
 * @param agentDir - Pi agent directory inside the sandbox (POSIX-style path).
 */
async function uploadPiExtensionToSandbox(sandbox: any, agentDir: string): Promise<void> {
  if (existsSync(EXTENSION_BUNDLE)) {
    try {
      const remoteDir = `${agentDir}/extensions`;
      await sandbox.mkdirFs({ path: remoteDir });
      // The bundle is sent as a string body, read only after the directory exists.
      const bundleSource = readFileSync(EXTENSION_BUNDLE, "utf-8");
      await sandbox.writeFsFile({ path: `${remoteDir}/agenta.js` }, bundleSource);
    } catch (err) {
      log(`pi extension upload skipped: ${(err as Error).message}`);
    }
  }
}
/**
 * The environment the daemon is born with. The local provider merges this into the
 * `sandbox-agent server` subprocess, which passes it to the ACP adapter and then to
 * the harness. This is also where per-invoke trace/secret injection would go for a
 * warm-daemon model; under one-daemon-per-invoke the in-process tracer handles spans,
 * so this only needs to make the adapters and harness resolvable + authed.
 *
 * Only a fixed allowlist of variables is copied from `process.env`; nothing else from
 * the runner's environment is forwarded by this function.
 *
 * @param harness - Harness id. Not read by the current body: the same env is built for
 *   every harness. Kept for interface stability.
 * @returns A fresh env map containing `PATH`, `PI_ACP_PI_COMMAND`, and any of
 *   `PI_CODING_AGENT_DIR`, `HOME` and the provider auth variables that are set.
 */
function buildDaemonEnv(harness: string): Record<string, string> {
  const env: Record<string, string> = {};

  // Adapters (pi-acp, claude-agent-acp) and the pi CLI live in our node_modules/.bin;
  // claude CLI is on the inherited PATH. Prepend ours, keep the inherited PATH.
  // PATH entries are separated by ";" on Windows (where paths themselves contain ":")
  // and by ":" elsewhere, so the separator must follow the platform.
  const pathSeparator = process.platform === "win32" ? ";" : ":";
  const extra = process.env.AGENTA_RIVET_ADAPTER_PATH;
  env.PATH = [ADAPTER_BIN_DIR, extra, process.env.PATH].filter(Boolean).join(pathSeparator);

  // Pi: point pi-acp at our pi bin and the agent dir that carries auth.json.
  env.PI_ACP_PI_COMMAND =
    process.env.AGENTA_RIVET_PI_COMMAND ?? join(ADAPTER_BIN_DIR, "pi");
  const piAgentDir = process.env.PI_CODING_AGENT_DIR;
  if (piAgentDir) env.PI_CODING_AGENT_DIR = piAgentDir;

  // Keep HOME so harness logins (~/.pi/agent, ~/.claude) resolve.
  if (process.env.HOME) env.HOME = process.env.HOME;

  // Harness LLM auth passed as launch env, never written into the agent filesystem.
  const forwardedAuthKeys = [
    "OPENAI_API_KEY",
    "ANTHROPIC_API_KEY",
    "ANTHROPIC_AUTH_TOKEN",
    "CLAUDE_CODE_OAUTH_TOKEN",
    "CLAUDE_CONFIG_DIR",
    "GEMINI_API_KEY",
  ];
  for (const key of forwardedAuthKeys) {
    const value = process.env[key];
    if (value) env[key] = value; // unset/empty keys are omitted entirely
  }

  return env;
}
/**
 * Render an arbitrary value as text for the replayed transcript. Never throws for
 * ordinary values and always returns a string.
 *
 * - `undefined` / `null` become the empty string.
 * - Strings pass through untouched (no extra quoting).
 * - Everything else is JSON-encoded. When `JSON.stringify` cannot produce text (it
 *   returns `undefined` for functions, symbols and `toJSON()` results of `undefined`,
 *   and throws for cycles and bigint), the value falls back to `String(value)`.
 *
 * @param value - Tool input/output or any other payload.
 * @returns A textual rendering of `value`.
 */
function safeJson(value: unknown): string {
  if (value === undefined || value === null) return "";
  if (typeof value === "string") return value;
  try {
    // Typed as possibly-undefined on purpose: the lib signature claims `string`, but
    // the runtime returns `undefined` for non-serializable top-level values.
    const encoded: string | undefined = JSON.stringify(value);
    return encoded ?? String(value);
  } catch {
    return String(value);
  }
}
/**
 * Render one message for the replayed transcript, INCLUDING resolved tool turns. Under the
 * cold model the harness rebuilds context from this text, and ACP prompt content blocks
 * cannot carry tool calls/results — so a resolved interaction (an approved tool that ran, a
 * client-fulfilled tool) is encoded here as text, letting the model resume from the result
 * instead of re-asking.
 *
 * Rendering rules, one line per block, joined with "\n" (empty renderings are dropped):
 * - `text` with a string `text` passes through verbatim;
 * - `tool_call` becomes `[called <toolName>(<input>)]`;
 * - `tool_result` becomes `[<toolName> returned: <output>]`, or `error:` when `isError`;
 * - `image` becomes `[image]`; `resource` becomes `[resource: <uri>]` or `[resource]`;
 * - any other block type (and falsy array entries) is skipped.
 *
 * @param content - A plain string, an array of content blocks, or nothing.
 * @returns The transcript text; "" for empty/missing content.
 */
export function messageTranscript(content: string | ContentBlock[] | undefined): string {
  if (!content) return "";
  if (typeof content === "string") return content;
  const parts: string[] = [];
  for (const block of content) {
    if (!block) continue;
    if (block.type === "text" && typeof block.text === "string") {
      parts.push(block.text);
    } else if (block.type === "tool_call") {
      parts.push(`[called ${block.toolName ?? "tool"}(${safeJson(block.input)})]`);
    } else if (block.type === "tool_result") {
      const body = safeJson(block.output);
      parts.push(`[${block.toolName ?? "tool"} ${block.isError ? "error" : "returned"}: ${body}]`);
    } else if (block.type === "image") {
      parts.push("[image]");
    } else if (block.type === "resource") {
      parts.push(block.uri ? `[resource: ${block.uri}]` : "[resource]");
    }
  }
  return parts.filter(Boolean).join("\n");
}

/**
 * The text sent over ACP for this turn. Each invoke is a cold sandbox, so prior turns
 * are replayed as transcript context ahead of the latest user message — this is the
 * "persisted message history replayed" model, with the client/playground holding the
 * history. Capped by AGENTA_AGENT_HISTORY_MAX_CHARS so replay tokens stay bounded.
 *
 * The cap keeps the most recent characters (the tail) of the transcript. A cap that is
 * not a number of at least 1 (unset-to-empty, "0", negative, non-numeric) falls back to
 * the 24000 default: passing such a value straight to `slice(-n)` would keep the whole
 * transcript (`-0`), never truncate (`NaN`), or cut the head instead of the tail
 * (negative), all of which defeat the bound.
 *
 * @param request - The run request; history comes from its prior messages.
 * @returns The latest prompt alone when there is no renderable history, otherwise the
 *   capped transcript followed by the latest prompt.
 */
export function buildTurnText(request: AgentRunRequest): string {
  const latest = resolvePrompt(request);
  // Messages that render to nothing (e.g. only unknown block types) are not replayed.
  const history = priorMessages(request).filter((m) => messageTranscript(m.content));
  if (history.length === 0) return latest;

  const defaultMaxChars = 24000;
  const configured = Number(process.env.AGENTA_AGENT_HISTORY_MAX_CHARS ?? defaultMaxChars);
  // `configured >= 1` is false for NaN, 0 and negatives; fractional caps round down.
  const maxChars = configured >= 1 ? Math.floor(configured) : defaultMaxChars;

  let transcript = history.map((m) => `${m.role}: ${messageTranscript(m.content)}`).join("\n");
  if (transcript.length > maxChars) transcript = transcript.slice(-maxChars);
  return (
    `Conversation so far:\n${transcript}\n\n` +
    `Continue the conversation. The user now says:\n${latest}`
  );
}
/**
 * Convert user-declared MCP servers (already resolved server-side, secrets injected into
 * `env`) into ACP stdio entries. Only `stdio` is delivered over ACP today; `http`/remote
 * carries no auth on the wire by design and is skipped. The per-server `tools` allowlist is
 * NOT enforced over ACP in v1 — the harness lists all of a server's tools — so it is dropped
 * with a log rather than silently implying a filter that does not happen.
 *
 * A server with no `transport` is treated as `stdio`. A stdio server without a `command`
 * cannot be launched and is skipped too, with its own log line so the reason is not
 * mistaken for a transport problem.
 *
 * @param servers - User-declared MCP server configs, or `undefined`.
 * @returns One ACP stdio entry per deliverable server, in input order; `env` is flattened
 *   to `{ name, value }` pairs with values coerced to strings.
 */
export function toAcpMcpServers(servers: McpServerConfig[] | undefined): McpServerStdio[] {
  const out: McpServerStdio[] = [];
  for (const s of servers ?? []) {
    const transport = s.transport ?? "stdio";
    if (transport !== "stdio") {
      log(`skipping non-stdio MCP server '${s.name ?? "?"}' (remote transport deferred)`);
      continue;
    }
    if (!s.command) {
      log(`skipping stdio MCP server '${s.name ?? "?"}': no command configured`);
      continue;
    }
    if (s.tools && s.tools.length > 0) {
      log(`MCP server '${s.name}': per-server tool allowlist not enforced over ACP (v1)`);
    }
    out.push({
      name: s.name,
      command: s.command,
      args: s.args ?? [],
      env: Object.entries(s.env ?? {}).map(([name, value]) => ({ name, value: String(value) })),
    });
  }
  return out;
}

/**
 * Pick the harness-specific model id for a requested name. Harnesses expose their own
 * ids (Pi: "openai-codex/gpt-5.5"; Claude: its own). Match exact, then by the id after
 * the provider prefix, so "gpt-5.5" resolves to "openai-codex/gpt-5.5".
 *
 * The "id after the prefix" is everything after the FIRST "/"; an id without "/" is
 * its own suffix. Matching order: exact id, then an allowed id whose suffix equals
 * `wanted`, then an allowed id whose suffix equals the suffix of `wanted`.
 *
 * @param allowed - Model ids the harness accepts.
 * @param wanted - Requested model name, possibly unqualified.
 * @returns The matching allowed id, or `undefined` when `wanted` is empty or unmatched.
 */
function pickModel(allowed: string[], wanted?: string): string | undefined {
  if (!wanted) return undefined;
  if (allowed.includes(wanted)) return wanted;
  const suffix = (id: string): string => id.slice(id.indexOf("/") + 1);
  const wantedSuffix = suffix(wanted);
  return (
    allowed.find((id) => suffix(id) === wanted) ??
    allowed.find((id) => suffix(id) === wantedSuffix) ??
    undefined
  );
}
/**
 * Extract the allowed model ids listed in a rejection message.
 *
 * Looks for `Allowed values: a, b, c` running to the end of the message (of
 * `err.message` when present, otherwise of `String(err)`), splits on commas and trims
 * each entry.
 *
 * @param err - The value thrown by the session when it rejected a model id.
 * @returns The listed ids, or `[]` when the message carries no such list.
 */
function allowedFromError(err: unknown): string[] {
  const text = String((err as Error)?.message ?? err);
  const found = /Allowed values:\s*(.+?)\s*$/.exec(text);
  if (found === null) return [];
  const ids: string[] = [];
  for (const piece of found[1].split(",")) {
    const id = piece.trim();
    if (id) ids.push(id);
  }
  return ids;
}

/**
 * Apply the requested model to a session, normalizing to the harness's own id.
 *
 * The value is tried as given first. If the session rejects it, the allowed ids are
 * taken from the rejection message, or from the session's config options when the
 * message lists none, and a normalized match is retried once. A retry only happens
 * when the match differs from what was already rejected.
 *
 * @param session - Session exposing `setModel` (and config options for the fallback).
 * @param wanted - Requested model name; nothing is done when empty.
 * @returns The id that was actually set, or `undefined` when nothing was requested or
 *   no id could be set (the harness then keeps its default; the run does not fail).
 */
async function applyModel(session: any, wanted?: string): Promise<string | undefined> {
  if (!wanted) return undefined;
  try {
    await session.setModel(wanted);
    return wanted;
  } catch (rejection) {
    let candidates = allowedFromError(rejection);
    if (candidates.length === 0) candidates = await allowedModels(session);

    const normalized = pickModel(candidates, wanted);
    const worthRetrying = Boolean(normalized) && normalized !== wanted;
    if (worthRetrying) {
      try {
        await session.setModel(normalized);
        return normalized;
      } catch {
        // fall through to harness default
      }
    }
    log(`model '${wanted}' not settable (${(rejection as Error).message}); using harness default`);
    return undefined;
  }
}
+ ...secrets, + }; + // Point pi-acp at the `pi` we install into the sandbox (the image lacks it). + if (DAYTONA_PI_INSTALL) { + env.PI_ACP_PI_COMMAND = `${DAYTONA_PI_INSTALL_DIR}/node_modules/.bin/pi`; + } + return env; +} + +/** + * Build the rivet sandbox provider for the requested axis. + * + * Daytona needs an image that carries both the rivet daemon and the harness CLI. Rivet's + * `-full` image ships the daemon and the ACP adapters but NOT the `pi` CLI, so we run + * from a pre-baked snapshot (`AGENTA_RIVET_DAYTONA_SNAPSHOT`, default `agenta-rivet-pi`, + * built by poc/build_rivet_snapshot.py) that adds `pi`; this avoids a ~150s per-invoke + * `npm install pi`. `AGENTA_RIVET_DAYTONA_IMAGE` overrides with a plain image instead. The + * code-evaluator DAYTONA_SNAPSHOT is intentionally NOT reused (it has no daemon). The + * provider key comes from the vault env; Pi's OAuth login is only uploaded when no key. + */ +function buildSandboxProvider( + sandboxId: string, + env: Record, + binaryPath: string | undefined, + piExtEnv: Record, + secrets: Record, +) { + if (sandboxId === "daytona") { + const snapshot = process.env.AGENTA_RIVET_DAYTONA_SNAPSHOT; + const image = process.env.AGENTA_RIVET_DAYTONA_IMAGE; + const target = process.env.DAYTONA_TARGET; + return daytona({ + ...(image ? { image } : {}), + create: { + // The rivet provider always sets a default `image`, which Daytona turns into a + // build entry that conflicts with `snapshot`. Spreading image:undefined last + // suppresses that so the snapshot is used as-is. + ...(snapshot ? { snapshot, image: undefined } : {}), + ...(target ? { target } : {}), + envVars: daytonaEnvVars(piExtEnv, secrets), + ephemeral: true, + } as any, + }); + } + // local: spawn `sandbox-agent server` on this host with the daemon env merged in. + const logMode = (process.env.AGENTA_RIVET_DAEMON_LOG ?? 
/** In-sandbox Pi agent dir on the rivet `-full` image (daemon runs as user `sandbox`). */
const DAYTONA_PI_DIR = process.env.AGENTA_RIVET_DAYTONA_PI_DIR ?? "/home/sandbox/.pi/agent";
// The rivet `-full` image ships the pi-acp adapter but NOT the `pi` CLI, so by default we
// install it into the sandbox at session time and point pi-acp at it. A snapshot that
// pre-installs `pi` should set AGENTA_RIVET_DAYTONA_INSTALL_PI=false (faster, no per-run
// npm install). Version mirrors the wrapper's pinned Pi.
const DAYTONA_PI_INSTALL_DIR = "/home/sandbox/.agenta-pi";
const DAYTONA_PI_INSTALL = process.env.AGENTA_RIVET_DAYTONA_INSTALL_PI !== "false";
const DAYTONA_PI_VERSION = process.env.AGENTA_RIVET_PI_VERSION ?? "0.79.4";

/**
 * Install the pinned `pi` CLI package into a sandbox with `npm install`, run inside
 * `DAYTONA_PI_INSTALL_DIR` (created first) with a 180 s timeout.
 *
 * Best-effort: a non-zero (or missing) exit code logs the last 400 characters of
 * stderr, and a thrown error is logged; neither propagates to the caller.
 *
 * @param sandbox - Sandbox handle exposing `mkdirFs` and `runProcess`.
 */
async function installPiInSandbox(sandbox: any): Promise<void> {
  const packageSpec = `@earendil-works/pi-coding-agent@${DAYTONA_PI_VERSION}`;
  try {
    await sandbox.mkdirFs({ path: DAYTONA_PI_INSTALL_DIR });
    const outcome = await sandbox.runProcess({
      command: "npm",
      args: ["install", "--no-fund", "--no-audit", packageSpec],
      cwd: DAYTONA_PI_INSTALL_DIR,
      timeoutMs: 180_000,
    });
    const succeeded = outcome?.exitCode === 0;
    if (!succeeded) {
      log(`pi install in sandbox exit=${outcome?.exitCode}: ${String(outcome?.stderr).slice(-400)}`);
    }
  } catch (err) {
    log(`pi install in sandbox skipped: ${(err as Error).message}`);
  }
}
/**
 * Upload the local Pi login into a Daytona sandbox so the remote Pi authenticates with
 * the dev's ChatGPT/Codex OAuth (it auto-refreshes from the token in auth.json). Must
 * `mkdirFs` the parent first (a fresh sandbox lacks it) and pass a string body — a
 * missing dir or a stream body is what produced the earlier "Stream Error". Best-effort:
 * with no local login the remote run falls back to any provider key in the sandbox env.
 *
 * The local login is read from `PI_CODING_AGENT_DIR`, or `$HOME/.pi/agent` when that is
 * unset. `auth.json` is required (nothing is uploaded without it); `settings.json` is
 * uploaded alongside when it exists. Upload failures are logged, never thrown.
 *
 * @param sandbox - Sandbox handle exposing `mkdirFs` and `writeFsFile`.
 */
async function uploadPiAuthToSandbox(sandbox: any): Promise<void> {
  const loginDir = process.env.PI_CODING_AGENT_DIR || join(process.env.HOME ?? "", ".pi/agent");
  const localAuth = join(loginDir, "auth.json");
  if (!existsSync(localAuth)) return;

  // Reads the local file as text and writes it under the in-sandbox Pi dir.
  const pushFile = async (localPath: string, remoteName: string): Promise<void> => {
    const body = readFileSync(localPath, "utf-8");
    await sandbox.writeFsFile({ path: `${DAYTONA_PI_DIR}/${remoteName}` }, body);
  };

  try {
    await sandbox.mkdirFs({ path: DAYTONA_PI_DIR });
    await pushFile(localAuth, "auth.json");
    const localSettings = join(loginDir, "settings.json");
    if (existsSync(localSettings)) {
      await pushFile(localSettings, "settings.json");
    }
  } catch (err) {
    log(`pi auth upload skipped: ${(err as Error).message}`);
  }
}
/**
 * Turn a harness/SDK error into one clear line for the caller (the playground shows it
 * verbatim), instead of dumping a full ACP/JS stack. Recognizes the common harness auth
 * failures so the user sees what to fix.
 *
 * Classification runs against the whole message, in this order:
 * 1. "credit balance is too low" -> insufficient-credit hint;
 * 2. "authentication required", "invalid api key", "unauthorized", or a standalone
 *    `401` -> authentication hint;
 * 3. otherwise the first line of the message, or "agent run failed" when it is empty.
 *
 * The status code is matched as a whole token (`\b401\b`): a bare substring match would
 * also fire on ports, ids or timestamps that merely contain those digits (e.g.
 * `127.0.0.1:14010`) and mislabel an unrelated failure as an auth problem.
 *
 * @param err - The thrown value; `Error.message` is used when available.
 * @param harness - Harness id; prefixes the hint and selects which key is suggested
 *   ("claude" -> Anthropic, anything else -> OpenAI).
 * @returns A single-line, user-facing description of the failure.
 */
function conciseError(err: unknown, harness: string): string {
  const raw = err instanceof Error ? err.message : String(err);
  const msg = raw.split("\n")[0].trim();
  const keyHint =
    harness === "claude" ? "the project's Anthropic key" : "the project's OpenAI key";
  if (/credit balance is too low/i.test(raw)) {
    return `${harness}: the model provider account has insufficient credit (check ${keyHint}).`;
  }
  if (/authentication required|invalid api key|\b401\b|unauthorized/i.test(raw)) {
    return `${harness}: model authentication failed — add ${keyHint} to the project vault, or log in (OAuth).`;
  }
  return msg || "agent run failed";
}
/**
 * Map a rivet `AgentInfo` to our capability flags. Falls back to a per-harness static
 * guess when the probe is unavailable, so tool delivery and tracing still pick a sane
 * path. Rivet has no `usage` capability flag (usage rides on `usage_update` events), so
 * `usage` is reported as `true` on both paths.
 *
 * With probed capabilities, every flag is the truthiness of the probed value, except
 * `textMessages`, which defaults to `true` only when the probe omits it.
 *
 * @param harness - Harness id; only consulted for the static fallback.
 * @param info - Probe result whose `capabilities` object is read; may be `undefined`.
 * @returns The capability flags for this harness.
 */
function mapCapabilities(harness: string, info: any): HarnessCapabilities {
  const probed = info?.capabilities;
  if (!probed) {
    // Static fallback by harness id: pi-acp does not forward MCP, Claude/Codex do.
    const notPi = harness !== "pi";
    return {
      textMessages: true,
      images: false,
      fileAttachments: false,
      mcpTools: notPi,
      toolCalls: true,
      reasoning: true,
      planMode: notPi,
      permissions: notPi,
      streamingDeltas: true,
      sessionLifecycle: true,
      usage: true,
    };
  }
  const flag = (name: string): boolean => Boolean(probed[name]);
  return {
    textMessages: probed.textMessages ?? true,
    images: flag("images"),
    fileAttachments: flag("fileAttachments"),
    mcpTools: flag("mcpTools"),
    toolCalls: flag("toolCalls"),
    reasoning: flag("reasoning"),
    planMode: flag("planMode"),
    permissions: flag("permissions"),
    streamingDeltas: flag("streamingDeltas"),
    sessionLifecycle: flag("sessionLifecycle"),
    usage: true,
  };
}

/**
 * Probe the harness's capabilities from the daemon via `sandbox.getAgent(harness,
 * { config: true })`. Best-effort: any failure yields the static per-harness fallback
 * instead of an error.
 *
 * @param sandbox - Sandbox handle exposing `getAgent`.
 * @param harness - Harness id to probe.
 * @returns The mapped capability flags.
 */
async function probeCapabilities(
  sandbox: any,
  harness: string,
): Promise<HarnessCapabilities> {
  try {
    const agentInfo = await sandbox.getAgent(harness, { config: true });
    return mapCapabilities(harness, agentInfo);
  } catch {
    return mapCapabilities(harness, undefined);
  }
}
}; + } + // What we actually send over ACP: the latest turn, with prior turns replayed as + // context when this is a continued conversation. + const turnText = buildTurnText(request); + + const isPi = harness === "pi"; + const isDaytona = sandboxId === "daytona"; + + // Provider API keys resolved from the vault (OPENAI_API_KEY/ANTHROPIC_API_KEY/...). + // Present => the harness authenticates with the key; absent => it uses its own login + // (OAuth: local Codex / a mounted-or-uploaded auth.json). + const secrets = request.secrets ?? {}; + const harnessKeyVar = harness === "claude" ? "ANTHROPIC_API_KEY" : "OPENAI_API_KEY"; + const hasApiKey = !!secrets[harnessKeyVar]; + + // Session cwd holds AGENTS.md. Local: a host temp dir. Daytona: an in-sandbox path + // (the host path would not exist on the remote sandbox). + const cwd = isDaytona + ? `/home/sandbox/agenta-${randomBytes(6).toString("hex")}` + : mkdtempSync(join(tmpdir(), "agenta-rivet-")); + const agentsMd = request.agentsMd?.trim(); + + const toolSpecsForRun = (request.customTools as ResolvedToolSpec[]) ?? []; + const executableToolSpecsForRun = executableToolSpecs(toolSpecsForRun); + const relayDir = `${cwd}/.agenta-tools`; + const useToolRelay = executableToolSpecsForRun.length > 0; + + // Pi writes its run totals here on agent_end; we read them back and return them so the + // caller can roll them onto the workflow span (separate OTLP batch, see piExtension). + const usageOutPath = isPi ? `${cwd}/.agenta-usage.json` : undefined; + + const env = buildDaemonEnv(harness); + Object.assign(env, secrets); // local daemon inherits the provider keys + // Pi self-instruments locally: propagate the trace context + public tool metadata into Pi + // via the Agenta extension. Tool execution always relays back to this runner, which keeps + // private specs, scoped env, callback endpoints, and callback auth in memory. + const piExtEnv = isPi + ? 
buildPiExtensionEnv(request, !isDaytona, { relayDir, usageOutPath }) + : {}; + Object.assign(env, piExtEnv); // local daemon inherits it; daytona gets it via envVars + // undefined is fine: the local provider runs its own resolution and errors clearly. + const binaryPath = resolveDaemonBinary(); + + // For local Pi, install the extension into the agent dir Pi loads from. + const localPiAgentDir = process.env.PI_CODING_AGENT_DIR; + if (isPi && !isDaytona && localPiAgentDir) installPiExtensionLocal(localPiAgentDir); + + // Pi's system-prompt overrides (systemPrompt / appendSystemPrompt) are honored on the + // in-process Pi engine via the resource loader. The ACP path drives Pi through pi-acp, + // which gives us no per-run hook to set them (a project SYSTEM.md is trust-gated, and CLI + // flags can't be set per session here), so they are not delivered yet. Warn rather than + // drop them silently. AGENTS.md still applies on this path regardless. + if (isPi && (request.systemPrompt?.trim() || request.appendSystemPrompt?.trim())) { + log("systemPrompt/appendSystemPrompt are not yet delivered on the ACP (rivet) Pi path; ignored"); + } + + log(`harness=${harness} sandbox=${sandboxId} cwd=${cwd}`); + + // Persist events in-process so a follow-up turn can resume by session id. + const persist = new InMemorySessionPersistDriver(); + const sandbox = await SandboxAgent.start({ + sandbox: buildSandboxProvider(sandboxId, env, binaryPath, piExtEnv, secrets), + persist, + // Propagate caller cancellation (a client disconnect on the streaming HTTP edge) so an + // in-flight run aborts instead of finishing unobserved. The `finally` still disposes. + ...(signal ? { signal } : {}), + // Daytona's preview proxy authenticates with a per-sandbox cookie; carry it across + // requests so ACP calls after the first don't 401. Harmless for local. + ...(isDaytona ? 
{ fetch: createCookieFetch() } : {}), + }); + + // Pi traces itself via the extension under the propagated traceparent; for other + // harnesses we build the span tree here from the ACP event stream. Created below, once + // the model is resolved, so the chat span carries the harness's actual model rather + // than the requested one. Declared here so the catch can flush a partial trace. + let otel: ReturnType | undefined; + // Daytona tool relay loop (started once the session exists, stopped after the prompt). + let toolRelay: { stop: () => Promise } | undefined; + + try { + // On Daytona, push the harness login, the extension, and AGENTS.md into the remote + // sandbox via the filesystem API (nothing secret is baked into the image). Locally + // these use the host filesystem and the harness's own login (PI_CODING_AGENT_DIR). + if (isDaytona) { + if (isPi) { + // With a provider API key the harness authenticates via env; only fall back to + // uploading the Codex/OAuth login when no key is available. + if (!hasApiKey) await uploadPiAuthToSandbox(sandbox); + await uploadPiExtensionToSandbox(sandbox, DAYTONA_PI_DIR); + if (DAYTONA_PI_INSTALL) await installPiInSandbox(sandbox); + } + await sandbox.mkdirFs({ path: cwd }).catch(() => {}); + if (useToolRelay) await sandbox.mkdirFs({ path: relayDir }).catch(() => {}); + if (agentsMd) await sandbox.writeFsFile({ path: `${cwd}/AGENTS.md` }, agentsMd); + } else { + if (useToolRelay) mkdirSync(relayDir, { recursive: true }); + if (agentsMd) writeFileSync(join(cwd, "AGENTS.md"), agentsMd, "utf-8"); + } + + // Probe what this harness supports and branch on capabilities, not on the harness + // name. Tool delivery: Pi loads our extension (native tools, set up above); any other + // harness takes tools over MCP only when it advertises `mcpTools` (pi-acp does not + // forward MCP, Claude/Codex do). + const capabilities = await probeCapabilities(sandbox, harness); + const toolSpecs = (request.customTools as ResolvedToolSpec[]) ?? 
[]; + const userMcpCount = request.mcpServers?.length ?? 0; + // MCP delivery is gated on `mcpTools`: pi-acp does not forward MCP, Claude/Codex do. The + // synthesized `agenta-tools` server (gateway/code tools) and the user-declared servers + // ride the same gate. + const mcpServers = + !isPi && capabilities.mcpTools + ? [ + ...buildToolMcpServers( + toolSpecs, + request.toolCallback as ToolCallbackContext | undefined, + relayDir, + ), + ...toAcpMcpServers(request.mcpServers), + ] + : []; + if (!isPi && (toolSpecs.length > 0 || userMcpCount > 0) && !capabilities.mcpTools) { + log( + `harness '${harness}' lacks MCP support; ${toolSpecs.length} tool(s) and ` + + `${userMcpCount} user MCP server(s) not delivered`, + ); + } + + const session = await sandbox.createSession({ + agent: harness, + cwd, + sessionInit: { cwd, mcpServers }, + }); + const sessionId = resolveRunSessionId(request, session.id); + + // Resolve the model first: when the harness rejects the requested id and keeps its + // own default (e.g. Claude ignores "gpt-5.5"), `model` is undefined and the chat span + // is labelled "chat" instead of falsely claiming the requested model. + const model = await applyModel(session, request.model); + + const run = createRivetOtel({ + harness, + model, + traceparent: request.trace?.traceparent, + baggage: request.trace?.baggage, + endpoint: request.trace?.endpoint, + authorization: request.trace?.authorization, + captureContent: request.trace?.captureContent, + emitSpans: !isPi || isDaytona, + emit, + }); + otel = run; + + run.start({ + prompt, + sessionId, + messages: [...priorMessages(request), { role: "user", content: prompt }], + }); + + session.onEvent((event: any) => { + const payload = event?.payload; + const update = payload?.params?.update ?? payload?.update; + if (update) run.handleUpdate(update); + }); + + // Permission gating, behind the Responder seam. Pi never gates; a permission-gating + // harness (e.g. 
Claude) raises a request, which we (a) surface as an `interaction_request` + // event so the egress can project it (Vercel `tool-approval-request`) and the trace can + // record it, and (b) resolve via the responder. The headless `PolicyResponder` keeps the + // prior behavior: auto-allow trusted backend tools, or deny per `permissionPolicy` / + // AGENTA_RIVET_DENY_PERMISSIONS. A cross-turn responder (true HITL) slots in here later + // without touching the harness. Tools are backend-resolved and trusted; the run is headless. + const responder: Responder = new PolicyResponder(policyFromRequest(request.permissionPolicy)); + session.onPermissionRequest((req: any) => { + const id = String(req?.id ?? ""); + const availableReplies: string[] = req?.availableReplies ?? []; + run.emitEvent({ + type: "interaction_request", + id, // ACP permission id -> Vercel approvalId + kind: "permission", + payload: { + // toolCallId of the gated tool, so the cross-turn approval reply correlates back to + // its tool call (and the #6 resume finds it). `toolCall` is the ACP ToolCallUpdate. + toolCallId: req?.toolCall?.toolCallId, + toolCall: req?.toolCall, + availableReplies, + options: req?.options, + }, + }); + void responder + .onPermission({ id, availableReplies, raw: req }) + .then((decision) => { + if (!req?.id) return; + return session.respondPermission(req.id, decisionToReply(decision, availableReplies) as any); + }) + .catch(() => {}); + }); + + if (useToolRelay) { + toolRelay = startToolRelay( + isDaytona ? sandboxRelayHost(sandbox) : localRelayHost(), + relayDir, + toolSpecsForRun, + request.toolCallback as ToolCallbackContext | undefined, + ); + } + + const result = await session.prompt([{ type: "text", text: turnText }]); + await toolRelay?.stop(); + const stopReason = (result as any)?.stopReason; + log(`prompt stopReason=${stopReason}`); + + // Usage: Pi writes its totals to a file via the extension. 
Other harnesses report the + // input/output token split on the PromptResponse and the cost on ACP `usage_update`, + // so combine the two (the stream alone carries no per-call token split). Read and stamp + // this before finish/flush so exported spans and final events carry the final usage. + let usage = await readRunUsage(sandbox, usageOutPath, isDaytona); + if (!usage) { + const promptUsage = (result as any)?.usage; + const streamUsage = run.usage(); + const inputTokens = promptUsage?.inputTokens ?? streamUsage?.input ?? 0; + const outputTokens = promptUsage?.outputTokens ?? streamUsage?.output ?? 0; + const total = inputTokens + outputTokens || streamUsage?.total || 0; + const cost = streamUsage?.cost ?? 0; + usage = + total > 0 || cost > 0 + ? { input: inputTokens, output: outputTokens, total, cost } + : undefined; + } + run.setUsage(usage); + + const output = run.finish(); + await run.flush(); + + return { + ok: true, + output, + messages: output ? [{ role: "assistant", content: output }] : [], + // Streaming already delivered every event live, so the terminal result carries none + // (re-sending would double them on the consumer). + events: emit ? [] : run.events(), + usage, + stopReason, + // `streamingDeltas` advertises end-to-end live deltas, which is only true when a live + // sink is wired. The one-shot path reports false even when the harness produces deltas. + capabilities: { ...capabilities, streamingDeltas: !!emit && capabilities.streamingDeltas }, + sessionId, + model: model ?? 
request.model, + traceId: run.traceId(), + }; + } catch (err) { + otel?.finish(); + await otel?.flush().catch(() => {}); + return { ok: false, error: conciseError(err, harness) }; + } finally { + await toolRelay?.stop().catch(() => {}); + await sandbox.destroySandbox().catch(() => {}); + await sandbox.dispose().catch(() => {}); + rmSync(cwd, { recursive: true, force: true }); + } +} diff --git a/services/agent/src/extensions/agenta.ts b/services/agent/src/extensions/agenta.ts new file mode 100644 index 0000000000..85b88a79ad --- /dev/null +++ b/services/agent/src/extensions/agenta.ts @@ -0,0 +1,114 @@ +/** + * Agenta Pi extension (WP-8): tracing + tools, installed into Pi's agent dir and loaded + * by Pi when it runs under rivet (`pi --mode rpc` via pi-acp). + * + * This is how we keep WP-1/WP-2/WP-7 behavior on the rivet path: instead of a synthetic, + * coarse tracer in the runner, we propagate the caller's trace context INTO Pi and let + * Pi emit its real span tree (turn / chat / tool, with token usage) under that parent — + * and we deliver tools the Pi-native way (`registerTool`), each routing back to Agenta's + * /tools/call, rather than over MCP. Pi is highly customizable; this leans on that. + * + * Everything is read from the environment (injected at the daemon's birth). Tool env is + * intentionally public-only; execution relays back to the runner where private specs/auth + * remain in memory: + * AGENTA_TRACEPARENT W3C traceparent of the caller's /invoke span + * AGENTA_OTLP_ENDPOINT OTLP traces URL (e.g. 
https://host/api/otlp/v1/traces) + * AGENTA_OTLP_AUTHORIZATION Authorization header for the OTLP export + * AGENTA_CAPTURE_CONTENT "false" to drop prompt/completion/tool I/O from spans + * AGENTA_TOOL_PUBLIC_SPECS JSON [{ name, description, inputSchema }] + * AGENTA_TOOL_RELAY_DIR relay tool calls through the runner via files here + * + * Bundled self-contained (esbuild) so its OpenTelemetry deps resolve wherever Pi loads + * it (local, the docker sidecar, a Daytona snapshot). Default export is the Pi + * ExtensionFactory. + */ +import { writeFileSync } from "node:fs"; + +import type { ExtensionAPI } from "@earendil-works/pi-coding-agent"; + +import { createAgentaOtel } from "../tracing/otel.ts"; +import type { ResolvedToolSpec } from "../protocol.ts"; +import { EMPTY_OBJECT_SCHEMA } from "../tools/callback.ts"; +import { runResolvedTool } from "../tools/dispatch.ts"; + +function log(message: string): void { + process.stderr.write(`[agenta-pi-ext] ${message}\n`); +} + +/** Register public tool metadata as Pi tools whose execution relays to the runner. */ +function registerTools(pi: ExtensionAPI): void { + const raw = process.env.AGENTA_TOOL_PUBLIC_SPECS; + const relayDir = process.env.AGENTA_TOOL_RELAY_DIR; + if (!raw || !relayDir) return; + + let specs: ResolvedToolSpec[] = []; + try { + specs = JSON.parse(raw); + } catch (err) { + log(`bad AGENTA_TOOL_PUBLIC_SPECS: ${(err as Error).message}`); + return; + } + + let registered = 0; + for (const spec of specs) { + pi.registerTool({ + name: spec.name, + label: spec.name, + description: spec.description ?? spec.name, + // Pi accepts plain JSON Schema here (non-TypeBox validation path). + parameters: (spec.inputSchema as any) ?? 
EMPTY_OBJECT_SCHEMA, + async execute(toolCallId: string, params: unknown, signal?: AbortSignal) { + const text = await runResolvedTool(spec, params, { + toolCallId, + relayDir, + signal, + }); + return { + content: [{ type: "text", text }], + details: { toolName: spec.name }, + }; + }, + } as any); + registered += 1; + } + log(`registered ${registered} tool(s) -> relay ${relayDir}`); +} + +/** The Pi ExtensionFactory: tools + (env-driven) tracing + usage writeback. */ +const factory = (pi: ExtensionAPI): void => { + // Fully inert unless Agenta wired this run (so it is safe to install globally in a + // shared Pi agent dir — a normal `pi` session with no Agenta env does nothing). + const hasTracing = !!(process.env.AGENTA_TRACEPARENT || process.env.AGENTA_OTLP_ENDPOINT); + const hasTools = !!(process.env.AGENTA_TOOL_PUBLIC_SPECS && process.env.AGENTA_TOOL_RELAY_DIR); + const usageOut = process.env.AGENTA_USAGE_OUT; + if (!hasTracing && !hasTools && !usageOut) return; + + if (hasTools) registerTools(pi); + // Tracing exports the span tree (when the OTLP target is reachable, i.e. local runs). + // Usage accumulation is needed both for that export AND for the writeback the runner + // uses on Daytona (where the in-sandbox process can't reach Agenta's OTLP, so the + // runner traces from the event stream and only needs the token totals). So set up the + // otel state whenever either applies; only flush (export) when tracing is on. 
+ if (!hasTracing && !usageOut) return; + + const otel = createAgentaOtel({ + traceparent: process.env.AGENTA_TRACEPARENT, + endpoint: process.env.AGENTA_OTLP_ENDPOINT, + authorization: process.env.AGENTA_OTLP_AUTHORIZATION, + captureContent: process.env.AGENTA_CAPTURE_CONTENT !== "false", + }); + otel.register(pi); // lifecycle handlers (spans + usage accumulation) + + pi.on("agent_end", async () => { + if (hasTracing) await otel.flush(); // invoke_agent has a remote parent → flush by id + if (usageOut) { + try { + writeFileSync(usageOut, JSON.stringify(otel.usage()), "utf-8"); + } catch (err) { + log(`usage writeback skipped: ${(err as Error).message}`); + } + } + }); +}; + +export default factory; diff --git a/services/agent/src/responder.ts b/services/agent/src/responder.ts new file mode 100644 index 0000000000..6af4132841 --- /dev/null +++ b/services/agent/src/responder.ts @@ -0,0 +1,77 @@ +/** + * The interaction responder seam. + * + * A harness (the ACP "Agent") does not only emit tool calls. It also raises typed + * reverse-RPC interaction requests that something must answer: permission gates today, + * elicitation (input) and client-side tools later. Previously the rivet runner answered the + * permission gate inline with a hardcoded auto-approve. This module lifts that decision + * behind a `Responder` interface so it is pluggable: + * + * - `PolicyResponder` is the headless answer (a fixed `auto` / `deny` policy, no human). + * It reproduces the previous behavior exactly and is what `/invoke` uses. + * - A cross-turn responder (the `/messages` HITL path) slots in here later: it surfaces the + * request to the browser, ends the turn, and resolves on the next turn's reply. The + * harness adapter does not change when the responder does. + * + * Resolution is modeled as `allow` / `deny`; the adapter maps that onto the harness's + * available ACP replies via `decisionToReply`. 
+ */ + +export type PermissionPolicy = "auto" | "deny"; + +export type PermissionDecision = "allow" | "deny"; + +/** A permission gate raised by the harness, normalized from the ACP request. */ +export interface PermissionRequest { + /** The ACP permission id; reused as the `interaction_request` event id for reply matching. */ + id: string; + /** Replies the harness offers (e.g. "always" | "once" | "reject"). */ + availableReplies: string[]; + /** The original ACP request, for responders that want the tool-call detail. */ + raw?: unknown; +} + +/** + * Answers interaction requests the harness raises. Permission is the only kind wired today; + * `input` (elicitation) and `client_tool` are forward-looking and will extend this interface + * alongside the cross-turn responder. + */ +export interface Responder { + onPermission(request: PermissionRequest): Promise; +} + +/** Headless responder: a fixed policy, no human in the loop. */ +export class PolicyResponder implements Responder { + constructor(private readonly policy: PermissionPolicy) {} + + async onPermission(_request: PermissionRequest): Promise { + return this.policy === "deny" ? "deny" : "allow"; + } +} + +/** + * Resolve the permission policy with the same precedence as before: an explicit per-run + * `permissionPolicy: "deny"` or the `AGENTA_RIVET_DENY_PERMISSIONS` env flips to deny; the + * default is auto-allow, because backend-resolved tools are trusted and the run is headless. + */ +export function policyFromRequest(permissionPolicy?: string): PermissionPolicy { + if (permissionPolicy === "deny" || process.env.AGENTA_RIVET_DENY_PERMISSIONS === "true") { + return "deny"; + } + return "auto"; +} + +/** Map an allow/deny decision onto the harness's available ACP replies. */ +export function decisionToReply( + decision: PermissionDecision, + availableReplies: string[], +): string { + if (decision === "deny") { + return availableReplies.find((r) => r === "reject") ?? 
"reject"; + } + return ( + availableReplies.find((r) => r === "always") ?? + availableReplies.find((r) => r === "once") ?? + "once" + ); +} diff --git a/services/agent/src/server.ts b/services/agent/src/server.ts new file mode 100644 index 0000000000..aae23c4480 --- /dev/null +++ b/services/agent/src/server.ts @@ -0,0 +1,155 @@ +/** + * WP-2 Pi wrapper HTTP server: the HTTP transport for the Harness port. + * + * Same contract as the CLI, exposed over HTTP so the wrapper can run as its own + * container (a sidecar) that the Python service calls in-network: + * + * GET /health -> { status: "ok" } + * POST /run -> body is an AgentRunRequest, response is an AgentRunResult + * + * Uses Node's built-in http server (no framework dependency). Pi auth comes from + * PI_CODING_AGENT_DIR / ~/.pi/agent, mounted into the container. + */ +import { createServer, type IncomingMessage, type ServerResponse } from "node:http"; + +import type { + AgentRunRequest, + AgentRunResult, + EmitEvent, + StreamRecord, +} from "./protocol.ts"; +import { runPi } from "./engines/pi.ts"; +import { runRivet } from "./engines/rivet.ts"; + +const PORT = Number(process.env.PORT ?? 8765); + +// Select the engine. `rivet` drives a harness over ACP via a rivet daemon; `pi` is the +// legacy in-process Pi path. The request's explicit `backend` (set by the Python +// transport) wins; the AGENT_BACKEND env is the sidecar default; `auto` falls back to the +// request shape (a rivet request carries `harness`/`sandbox`). +const DEFAULT_BACKEND = (process.env.AGENT_BACKEND ?? "auto").toLowerCase(); + +function runAgent( + request: AgentRunRequest, + emit?: EmitEvent, + signal?: AbortSignal, +): Promise { + const backend = (request.backend ?? DEFAULT_BACKEND).toLowerCase(); + if (backend === "rivet") return runRivet(request, emit, signal); + if (backend === "pi") return runPi(request, emit); + return request.harness || request.sandbox + ? 
runRivet(request, emit, signal) + : runPi(request, emit); +} + +/** + * Stream a run as NDJSON: one `{kind:"event"}` line per event the moment it is built, then + * exactly one terminal `{kind:"result"}` line (success or failure). Selected by the caller + * with `Accept: application/x-ndjson`; the one-shot `/run` path is left untouched. + */ +async function runAndStream( + req: IncomingMessage, + res: ServerResponse, + request: AgentRunRequest, +): Promise { + res.writeHead(200, { + "content-type": "application/x-ndjson", + "cache-control": "no-cache", + "x-accel-buffering": "no", + connection: "keep-alive", + }); + + // A client disconnect aborts the in-flight run rather than letting it finish unobserved. + // Listen on the response, not the request: the request body is already fully read, so its + // `close` can fire early on a keep-alive connection. `res` `close` fires when the response + // connection ends — after a normal `res.end()` (harmless: the run is already done) or when + // the client drops mid-stream (the case we want to cancel). + const controller = new AbortController(); + res.on("close", () => controller.abort()); + + const writeRecord = (record: StreamRecord): void => { + if (res.writableEnded) return; + res.write(JSON.stringify(record) + "\n"); + }; + const emit: EmitEvent = (event) => writeRecord({ kind: "event", event }); + + let result: AgentRunResult; + try { + result = await runAgent(request, emit, controller.signal); + } catch (err) { + const message = err instanceof Error ? err.stack ?? err.message : String(err); + result = { ok: false, error: message }; + } + // Streaming delivered the events live, so don't echo them in the terminal record. 
+ writeRecord({ kind: "result", result: { ...result, events: [] } }); + res.end(); +} + +function send(res: ServerResponse, status: number, body: unknown): void { + const payload = JSON.stringify(body); + res.writeHead(status, { + "content-type": "application/json", + "content-length": Buffer.byteLength(payload), + }); + res.end(payload); +} + +async function readBody(req: IncomingMessage): Promise { + const chunks: Buffer[] = []; + for await (const chunk of req) { + chunks.push(chunk as Buffer); + } + return Buffer.concat(chunks).toString("utf8"); +} + +const server = createServer(async (req, res) => { + try { + if (req.method === "GET" && req.url === "/health") { + return send(res, 200, { status: "ok" }); + } + + if (req.method === "POST" && req.url === "/run") { + const raw = await readBody(req); + let request: AgentRunRequest; + try { + request = raw.trim() ? (JSON.parse(raw) as AgentRunRequest) : {}; + } catch (err) { + return send(res, 400, { ok: false, error: `Invalid JSON: ${String(err)}` }); + } + + const wantsStream = (req.headers["accept"] ?? "").includes( + "application/x-ndjson", + ); + if (wantsStream) { + await runAndStream(req, res, request); + return; + } + + const result = await runAgent(request); + return send(res, result.ok ? 200 : 500, result); + } + + return send(res, 404, { ok: false, error: "Not found" }); + } catch (err) { + const message = err instanceof Error ? err.stack ?? err.message : String(err); + return send(res, 500, { ok: false, error: message }); + } +}); + +// The rivet SDK can reject a background promise (e.g. an adapter install or the Daytona +// preview SSE failing) outside any awaited path. Node's default turns that into an +// uncaught exception that kills the whole process — taking every in-flight request with +// it (the caller sees "Server disconnected"). Log and keep serving instead; the failing +// run still returns its own error to its caller. 
+process.on("unhandledRejection", (reason) => { + process.stderr.write( + `[pi-wrapper] unhandledRejection: ${reason instanceof Error ? (reason.stack ?? reason.message) : String(reason)}\n`, + ); +}); +process.on("uncaughtException", (err) => { + process.stderr.write(`[pi-wrapper] uncaughtException: ${err.stack ?? err.message}\n`); +}); + +server.listen(PORT, () => { + process.stderr.write(`[pi-wrapper] http server listening on :${PORT}\n`); +}); diff --git a/services/agent/src/tools/dispatch.ts b/services/agent/src/tools/dispatch.ts index f948501265..fd68a87b72 100644 --- a/services/agent/src/tools/dispatch.ts +++ b/services/agent/src/tools/dispatch.ts @@ -54,7 +54,7 @@ export interface RunResolvedToolOpts { */ export async function relayToolCall( dir: string, - callRef: string, + toolName: string, toolCallId: string, params: unknown, signal?: AbortSignal, @@ -67,7 +67,7 @@ export async function relayToolCall( } catch { // The runner also creates it; a race here is harmless. } - writeFileSync(reqPath, JSON.stringify({ callRef, toolCallId, args: params ?? {} }), "utf-8"); + writeFileSync(reqPath, JSON.stringify({ toolName, toolCallId, args: params ?? {} }), "utf-8"); const deadline = Date.now() + RELAY_TIMEOUT_MS; while (Date.now() < deadline) { @@ -116,7 +116,7 @@ export async function runResolvedTool( } // callback (default): route back to Agenta's /tools/call (directly or via the Daytona relay). if (opts.relayDir) { - return relayToolCall(opts.relayDir, spec.callRef ?? "", opts.toolCallId, params, opts.signal); + return relayToolCall(opts.relayDir, spec.name, opts.toolCallId, params, opts.signal); } return callAgentaTool( opts.endpoint ?? 
"", diff --git a/services/agent/src/tools/mcp-bridge.ts b/services/agent/src/tools/mcp-bridge.ts index eaf5683a4d..c94230319b 100644 --- a/services/agent/src/tools/mcp-bridge.ts +++ b/services/agent/src/tools/mcp-bridge.ts @@ -3,20 +3,20 @@ * * The Pi engine (engines/pi.ts) injected resolved runnable tools (WP-7) as in-process Pi * customTools. Over ACP the harness only accepts tools through MCP, so the same - * resolved specs are exposed as an MCP server whose tool bodies POST back to Agenta's - * /tools/call (the provider key and connection auth stay server-side, exactly as in - * the Pi path). `buildToolMcpServers` returns the ACP `mcpServers` entry to attach to - * the session. + * resolved specs are exposed as an MCP server whose tool bodies relay back to the runner. + * The runner keeps private specs/auth in memory and performs the actual execution. + * `buildToolMcpServers` returns the ACP `mcpServers` entry to attach to the session. * - * Delivery: a stdio MCP bridge (mcp-server.ts) launched by the daemon. The specs and - * callback are passed to it as env, so nothing tool-specific is written to the - * agent-visible filesystem. + * Delivery: a stdio MCP bridge (mcp-server.ts) launched by the daemon. Its env carries + * only public tool metadata and the relay directory. It never receives scoped env, code, + * callback auth, or callback endpoints. */ import { existsSync } from "node:fs"; import { dirname, join } from "node:path"; import { fileURLToPath } from "node:url"; import type { ResolvedToolSpec, ToolCallbackContext } from "../protocol.ts"; +import { executableToolSpecs, publicToolSpecs } from "./public-spec.ts"; export type { ResolvedToolSpec, ToolCallbackContext } from "../protocol.ts"; @@ -56,47 +56,35 @@ export interface McpServerStdio { * filters them from tools/list), so they never justify attaching the bridge on their own. * - "Executable here" = non-client (`code` and `callback`). 
With zero executable specs we * return [] (the no-tools path stays untouched). - * - `code` tools run locally in mcp-server.ts (runCodeTool) and need NO callback endpoint, so - * we attach `agenta-tools` whenever there is at least one executable spec. - * - Only `callback` tools require `callback.endpoint`. If callback tools are present but the - * endpoint is missing, we do NOT drop the whole server (that would silently lose the `code` - * tools too): we still attach it and warn, naming the callback tools whose `tools/call` will - * fail. The endpoint/auth env entries are pushed only when the endpoint actually exists. + * - The bridge does not execute tools itself. It sends a request file to `relayDir`, and + * the runner executes the private resolved spec in memory. That keeps scoped env, code, + * callback auth, and callback endpoints out of child-process env. */ export function buildToolMcpServers( specs: ResolvedToolSpec[], - callback: ToolCallbackContext | undefined, + _callbackOrRelayDir?: ToolCallbackContext | string, + relayDir?: string, ): McpServerStdio[] { if (!specs || specs.length === 0) return []; // Absent kind defaults to `callback` (back-compat); `client` is the only non-executable kind. - const executable = specs.filter((s) => (s.kind ?? "callback") !== "client"); + const executable = executableToolSpecs(specs); if (executable.length === 0) return []; - // The callback subset is the only thing that needs the endpoint to function. - const callbackSpecs = executable.filter((s) => (s.kind ?? "callback") === "callback"); - const hasEndpoint = Boolean(callback?.endpoint); - - if (callbackSpecs.length > 0 && !hasEndpoint) { - const names = callbackSpecs.map((s) => s.name).join(", "); + const resolvedRelayDir = + typeof _callbackOrRelayDir === "string" ? 
_callbackOrRelayDir : relayDir; + if (!resolvedRelayDir) { + const names = executable.map((s) => s.name).join(", "); process.stderr.write( - `[tool-bridge] missing toolCallback endpoint: ${callbackSpecs.length} callback tool(s) ` + - `will fail (${names}); still attaching server for the other tool(s)\n`, + `[tool-bridge] missing tool relay directory: ${executable.length} tool(s) ` + + `will fail (${names})\n`, ); } - // Pass every executable spec; mcp-server.ts dispatches per kind (code runs locally, callback - // routes to the endpoint). const env: EnvVariable[] = [ - { name: "AGENTA_TOOL_SPECS", value: JSON.stringify(executable) }, + { name: "AGENTA_TOOL_PUBLIC_SPECS", value: JSON.stringify(publicToolSpecs(executable)) }, ]; - // Only carry the callback env when there is an endpoint to call back to. - if (hasEndpoint) { - env.push({ name: "AGENTA_TOOL_CALLBACK_ENDPOINT", value: callback!.endpoint }); - if (callback!.authorization) { - env.push({ name: "AGENTA_TOOL_CALLBACK_AUTH", value: callback!.authorization }); - } - } + if (resolvedRelayDir) env.push({ name: "AGENTA_TOOL_RELAY_DIR", value: resolvedRelayDir }); const { command, args } = bridgeLauncher(); return [{ name: "agenta-tools", command, args, env }]; diff --git a/services/agent/src/tools/mcp-server.ts b/services/agent/src/tools/mcp-server.ts index 98a240c50e..5628423c77 100644 --- a/services/agent/src/tools/mcp-server.ts +++ b/services/agent/src/tools/mcp-server.ts @@ -3,14 +3,13 @@ * * The harness only accepts tools over MCP when driven via ACP. This is a minimal, * dependency-free MCP stdio server that exposes the backend-resolved runnable tools - * (WP-7) and routes each tool call back through Agenta's /tools/call — so the Composio - * key and connection auth stay server-side, exactly as in the in-process Pi path. + * (WP-7) and relays each tool call back to the runner — so private specs/auth stay in + * runner memory, exactly as in the in-process Pi path. 
* - * Launched by the rivet daemon as a session MCP server (see mcp-bridge.ts). It reads - * everything from env so nothing tool-specific is written to the agent filesystem: - * AGENTA_TOOL_SPECS JSON array of { name, description, inputSchema, callRef } - * AGENTA_TOOL_CALLBACK_ENDPOINT full /tools/call URL - * AGENTA_TOOL_CALLBACK_AUTH Authorization header value (optional) + * Launched by the rivet daemon as a session MCP server (see mcp-bridge.ts). Its env + * contains only public tool metadata and the relay dir: + * AGENTA_TOOL_PUBLIC_SPECS JSON array of { name, description, inputSchema } + * AGENTA_TOOL_RELAY_DIR directory watched by the runner for tool requests * * Protocol: JSON-RPC 2.0 over stdio, newline-delimited (the MCP stdio framing). Handles * initialize, tools/list, tools/call; ignores notifications. stdout carries protocol @@ -22,9 +21,8 @@ import type { ResolvedToolSpec } from "../protocol.ts"; import { EMPTY_OBJECT_SCHEMA } from "./callback.ts"; import { runResolvedTool } from "./dispatch.ts"; -const SPECS: ResolvedToolSpec[] = JSON.parse(process.env.AGENTA_TOOL_SPECS ?? "[]"); -const ENDPOINT = process.env.AGENTA_TOOL_CALLBACK_ENDPOINT ?? ""; -const AUTH = process.env.AGENTA_TOOL_CALLBACK_AUTH; +const SPECS: ResolvedToolSpec[] = JSON.parse(process.env.AGENTA_TOOL_PUBLIC_SPECS ?? "[]"); +const RELAY_DIR = process.env.AGENTA_TOOL_RELAY_DIR; const SPEC_BY_NAME = new Map(SPECS.map((s) => [s.name, s])); const DEFAULT_PROTOCOL = "2025-06-18"; @@ -78,13 +76,12 @@ async function handle(message: any): Promise { return { jsonrpc: "2.0", id, error: { code: -32602, message: `unknown tool: ${name}` } }; } try { - // `code` runs the snippet locally (scoped secret env); everything else routes back to - // Agenta's /tools/call. A unique id per call so two parallel calls in the same - // millisecond don't collide (Date.now() would). + if (!RELAY_DIR) throw new Error("missing AGENTA_TOOL_RELAY_DIR"); + // The bridge only has public metadata. 
/**
 * The advertisement shape of a tool: the only fields handed to harness child
 * processes. Executor-private fields of a resolved spec are deliberately absent.
 */
export interface PublicToolSpec {
  name: string;
  description?: string;
  inputSchema?: Record<string, unknown> | null;
}

/**
 * Keep only the specs a runner child process may execute.
 *
 * `client` tools are browser-fulfilled, so they are dropped; a spec without a
 * `kind` is treated as `"callback"` and kept. Input order is preserved and the
 * input array is not modified.
 */
export function executableToolSpecs(specs: ResolvedToolSpec[]): ResolvedToolSpec[] {
  const runnable: ResolvedToolSpec[] = [];
  for (const candidate of specs) {
    const kind = candidate.kind ?? "callback";
    if (kind === "client") continue;
    runnable.push(candidate);
  }
  return runnable;
}

/** Project one resolved spec onto its public fields (name, description, inputSchema). */
export function publicToolSpec(spec: ResolvedToolSpec): PublicToolSpec {
  const { name, description, inputSchema } = spec;
  return { name, description, inputSchema };
}

/** Public metadata for every executable spec, in input order. */
export function publicToolSpecs(specs: ResolvedToolSpec[]): PublicToolSpec[] {
  const advertised: PublicToolSpec[] = [];
  for (const spec of executableToolSpecs(specs)) {
    advertised.push(publicToolSpec(spec));
  }
  return advertised;
}
/**
 * The three filesystem operations the relay loop needs, abstracted so the same
 * loop can run against the runner's own filesystem or a remote sandbox.
 */
export interface RelayHost {
  /** Names (not full paths) of the entries in `dir`. */
  list: (dir: string) => Promise<string[]>;
  /** Contents of the file at `path`, decoded as text. */
  read: (path: string) => Promise<string>;
  /** Write `contents` to `path`. */
  write: (path: string, contents: string) => Promise<void>;
}

/**
 * Relay host for child processes running on the same filesystem as the runner.
 *
 * - `list` returns `[]` when the directory does not exist yet.
 * - `read` decodes the file as UTF-8.
 * - `write` creates the parent directory (recursively) before writing.
 */
export function localRelayHost(): RelayHost {
  return {
    list: async (dir) => {
      if (!existsSync(dir)) return [];
      return readdirSync(dir);
    },
    read: async (path) => readFileSync(path, "utf-8"),
    write: async (path, contents) => {
      // Only create a parent when the path actually names one. With no "/" the
      // old `slice(0, lastIndexOf("/"))` became `slice(0, -1)` and created a stray
      // directory named after the truncated file name; with the only "/" at index
      // 0 it passed "" to mkdirSync. In both cases there is nothing to create.
      const lastSlash = path.lastIndexOf("/");
      if (lastSlash > 0) {
        mkdirSync(path.slice(0, lastSlash), { recursive: true });
      }
      writeFileSync(path, contents, "utf-8");
    },
  };
}

/**
 * Relay host for child processes running inside a Daytona sandbox.
 *
 * `sandbox` is untyped here; this host only calls its `runProcess`,
 * `readFsFile` and `writeFsFile` methods.
 */
export function sandboxRelayHost(sandbox: any): RelayHost {
  return {
    list: async (dir) => {
      // Directory listing goes through `ls -1`; a missing stdout yields no names.
      const listing = await sandbox.runProcess({
        command: "ls",
        args: ["-1", dir],
        timeoutMs: 10_000,
      });
      const names: string[] = [];
      for (const rawLine of String(listing?.stdout ?? "").split("\n")) {
        const name = rawLine.trim();
        if (name) names.push(name);
      }
      return names;
    },
    read: async (path) => {
      // The read result is either already a string or something TextDecoder accepts.
      const payload = await sandbox.readFsFile({ path });
      if (typeof payload === "string") return payload;
      return new TextDecoder().decode(payload);
    },
    write: async (path, contents) => {
      await sandbox.writeFsFile({ path }, contents);
    },
  };
}
id, - req.args, + const spec = specsByName.get(req.toolName); + if (!spec) throw new Error(`unknown tool '${req.toolName}'`); + const text = await executeRelayedTool( + spec, + { ...req, toolCallId: req.toolCallId ?? id }, + callback, ); res = { ok: true, text }; } catch (err) { res = { ok: false, error: err instanceof Error ? err.message : String(err) }; } try { - await sandbox.writeFsFile( - { path: `${relayDir}/${id}${RELAY_RES_SUFFIX}` }, - JSON.stringify(res), - ); + await host.write(`${relayDir}/${id}${RELAY_RES_SUFFIX}`, JSON.stringify(res)); } catch { // The extension will time out and surface a tool error; nothing else to do here. } @@ -88,15 +156,7 @@ export function startToolRelay( const loop = (async () => { while (active) { try { - const ls = await sandbox.runProcess({ - command: "ls", - args: ["-1", relayDir], - timeoutMs: 10_000, - }); - const names = String(ls?.stdout ?? "") - .split("\n") - .map((s) => s.trim()) - .filter(Boolean); + const names = await host.list(relayDir); for (const name of names) { if (!name.endsWith(RELAY_REQ_SUFFIX) || seen.has(name)) continue; seen.add(name); diff --git a/services/agent/src/tracing/otel.ts b/services/agent/src/tracing/otel.ts new file mode 100644 index 0000000000..d022095a42 --- /dev/null +++ b/services/agent/src/tracing/otel.ts @@ -0,0 +1,1026 @@ +/** + * agenta-otel — a Pi extension that turns Pi's `pi.on(...)` lifecycle events into + * OpenTelemetry spans and exports them (OTLP/HTTP protobuf) to Agenta. + * + * This is the service build of the WP-1 POC extension + * (docs/design/agent-workflows/scratch/wp-1-pi-tracing/poc/agenta-otel.ts). It keeps the + * span tree and the load-bearing attribute choices identical, and adds three + * things the service needs that the single-run POC did not: + * + * 1. Per-run state. The POC kept span state in module globals because it ran one + * prompt at a time. 
The service may drive several runs in one process (the + * HTTP sidecar), so all per-run state lives in the closure returned by + * `createAgentaOtel`. The shared tracer/provider/exporters stay module-level. + * 2. Cross-boundary trace context. The caller (the Agenta Python service) passes a + * W3C `traceparent`. When present, `invoke_agent` is started as a CHILD of that + * remote span, so the whole agent run joins the same trace as the `/invoke` + * request — the agent's work becomes part of the response trace, the way + * completion/chat nest their LLM spans under the workflow span. + * 3. Per-trace export target. The OTLP endpoint and `Authorization` header come + * from the run config (the caller's host + credentials), falling back to env. + * Each trace is exported with its own target, so a shared process can serve + * more than one project. + * + * Span tree (per user prompt), unchanged from the POC: + * invoke_agent (openinference.span.kind = AGENT) + * turn N (CHAIN) + * chat (LLM) — the provider request for that turn + * execute_tool (TOOL) — each tool the turn ran + * + * Config (read lazily from the environment for the fallback target): + * AGENTA_HOST, AGENTA_API_KEY — fallback exporter endpoint + auth + * OTEL_SERVICE_NAME — resource service.name (default "pi-agent") + */ +import type { ExtensionAPI } from "@earendil-works/pi-coding-agent"; +import { + context, + ROOT_CONTEXT, + trace, + TraceFlags, + SpanStatusCode, + type Context, + type Span, + type SpanContext, +} from "@opentelemetry/api"; +import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-proto"; +import { Resource } from "@opentelemetry/resources"; +import type { + ReadableSpan, + SpanExporter, + SpanProcessor, +} from "@opentelemetry/sdk-trace-base"; +import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; +import { ATTR_SERVICE_NAME } from "@opentelemetry/semantic-conventions"; + +import type { AgentEvent, AgentUsage, EmitEvent } from "../protocol.ts"; + +// 
// ---------------------------------------------------------------------------
// Shared, process-wide tracing infrastructure
// ---------------------------------------------------------------------------

/** Destination for one trace's spans: an OTLP endpoint plus an optional Authorization header. */
interface ExportTarget {
  endpoint: string;
  authorization?: string;
}

/** traceId (hex) -> where that trace's spans should be exported. Set on agent_start. */
const traceTargets = new Map<string, ExportTarget>();

/** One exporter per distinct endpoint+auth pair, so exports reuse it instead of rebuilding. */
const exporterCache = new Map<string, OTLPTraceExporter>();

/** Cache key for a target: endpoint and authorization joined by a newline. */
function targetKey(target: ExportTarget): string {
  const auth = target.authorization ?? "";
  return [target.endpoint, auth].join("\n");
}

/** Return the cached exporter for `target`, building and caching it on first use. */
function getExporter(target: ExportTarget): OTLPTraceExporter {
  const cacheKey = targetKey(target);
  const cached = exporterCache.get(cacheKey);
  if (cached) return cached;

  // The Authorization header is only sent when the target carries one.
  const headers = target.authorization
    ? { Authorization: target.authorization }
    : {};
  const created = new OTLPTraceExporter({
    url: target.endpoint,
    headers,
    timeoutMillis: 10_000,
  });
  exporterCache.set(cacheKey, created);
  return created;
}

/**
 * Fallback target built from the environment, for a trace started without an
 * explicit one. An unset or empty AGENTA_HOST falls back to the cloud host and
 * trailing slashes are stripped; an unset or empty AGENTA_API_KEY means no
 * Authorization header.
 */
function defaultTarget(): ExportTarget {
  const configuredHost = process.env.AGENTA_HOST || "https://cloud.agenta.ai";
  const host = configuredHost.replace(/\/+$/, "");
  const apiKey = process.env.AGENTA_API_KEY || "";
  const authorization = apiKey ? `ApiKey ${apiKey}` : undefined;
  return { endpoint: `${host}/api/otlp/v1/traces`, authorization };
}
/**
 * Buffer a trace's spans and export them in ONE OTLP batch. Agenta computes
 * cumulative (rolled-up) token/cost metrics per ingest batch, so a trace split
 * across batches loses the root aggregation. Two completion signals:
 *   - the root span ends (standalone run: invoke_agent IS the root), or
 *   - the run flushes explicitly by trace id (cross-boundary run: invoke_agent
 *     has a remote parent that never ends in this process, so root-end never fires).
 */
class TraceBatchProcessor implements SpanProcessor {
  /** traceId -> spans that have ended but are not exported yet. */
  private readonly buffers = new Map<string, ReadableSpan[]>();

  onStart(): void {}

  /** Buffer an ended span; when it has no parent id, export its whole trace. */
  onEnd(span: ReadableSpan): void {
    const traceId = span.spanContext().traceId;
    const spans = this.buffers.get(traceId) ?? [];
    spans.push(span);
    this.buffers.set(traceId, spans);
    // No parent in this process => this is the local root and the trace is done.
    if (!span.parentSpanId) {
      // onEnd cannot await, so the flush is fire-and-forget. Attach a handler so
      // a throw while building or invoking the exporter is logged instead of
      // surfacing as an unhandled promise rejection.
      void this.flush(traceId).catch((err: unknown) => {
        console.error(`[agenta-otel] flush of trace ${traceId} failed:`, err);
      });
    }
  }

  /**
   * Export and drop one trace's buffered spans. Resolves once the export
   * callback fires, whether or not the export succeeded; a reported export
   * error is logged to stderr. Resolves immediately when nothing is buffered.
   */
  flush(traceId: string): Promise<void> {
    const spans = this.buffers.get(traceId);
    if (!spans || spans.length === 0) return Promise.resolve();
    this.buffers.delete(traceId);
    // The per-trace target is consumed with the buffer; without one, use env.
    const target = traceTargets.get(traceId) ?? defaultTarget();
    traceTargets.delete(traceId);
    return new Promise<void>((resolve) =>
      getExporter(target).export(orderParentFirst(spans), (result) => {
        if (result.error) {
          console.error(
            `[agenta-otel] export of trace ${traceId} to ${target.endpoint} failed:`,
            result.error,
          );
        }
        resolve();
      }),
    );
  }

  /** Flush every trace that still has buffered spans. */
  forceFlush(): Promise<void> {
    return Promise.all(
      [...this.buffers.keys()].map((traceId) => this.flush(traceId)),
    ).then(() => undefined);
  }

  /** Flush everything, then shut down every cached exporter. */
  shutdown(): Promise<void> {
    return this.forceFlush().then(async () => {
      await Promise.all(
        [...exporterCache.values()].map((exporter) => exporter.shutdown()),
      );
    });
  }
}

/** Process-wide provider and its batch processor; created lazily by ensureProvider. */
let provider: NodeTracerProvider | undefined;
let processor: TraceBatchProcessor | undefined;

/**
 * Create and register the shared tracer provider on first call; later calls are
 * no-ops. The service name comes from OTEL_SERVICE_NAME (default "pi-agent").
 */
function ensureProvider(): void {
  if (provider) return;
  processor = new TraceBatchProcessor();
  provider = new NodeTracerProvider({
    resource: new Resource({
      [ATTR_SERVICE_NAME]: process.env.OTEL_SERVICE_NAME || "pi-agent",
    }),
  });
  provider.addSpanProcessor(processor);
  provider.register();
}
/** Flush one trace's spans to Agenta. Call after a run whose root has a remote parent. */
export async function flushTrace(traceId?: string): Promise<void> {
  // Nothing to do before tracing is initialised or when the run has no trace id.
  if (processor && traceId) {
    await processor.flush(traceId);
  }
}

/** Flush and shut down all exporters. Call once on process exit, not per run. */
export async function shutdownTracing(): Promise<void> {
  if (!provider) return;
  try {
    await provider.forceFlush();
    await provider.shutdown();
  } finally {
    // Reset module state even when the flush or shutdown throws.
    exporterCache.clear();
    processor = undefined;
    provider = undefined;
  }
}

/**
 * Order spans parent-before-child (preorder DFS). Agenta stores timestamps at
 * millisecond resolution and builds its roll-up tree by sorting on start_time,
 * attaching a span only if its parent is already seen. A parent-first request
 * order keeps parents ahead of children on same-millisecond ties.
 *
 * A span whose parent is not in `spans` counts as a root. Siblings keep their
 * input order.
 */
function orderParentFirst(spans: ReadableSpan[]): ReadableSpan[] {
  const batchIds = new Set<string>();
  for (const span of spans) batchIds.add(span.spanContext().spanId);

  // Split the batch into roots and per-parent child lists.
  const childrenByParent = new Map<string, ReadableSpan[]>();
  const roots: ReadableSpan[] = [];
  for (const span of spans) {
    const parentId = span.parentSpanId;
    if (!parentId || !batchIds.has(parentId)) {
      roots.push(span);
      continue;
    }
    const siblings = childrenByParent.get(parentId);
    if (siblings) siblings.push(span);
    else childrenByParent.set(parentId, [span]);
  }

  // Preorder walk with an explicit stack; entries are pushed in reverse so they
  // pop in input order.
  const ordered: ReadableSpan[] = [];
  const pending = [...roots].reverse();
  for (let current = pending.pop(); current !== undefined; current = pending.pop()) {
    ordered.push(current);
    const children = childrenByParent.get(current.spanContext().spanId);
    if (!children) continue;
    for (let i = children.length - 1; i >= 0; i--) pending.push(children[i]);
  }

  // Any spans not reached (defensive) get appended so nothing is dropped.
  if (ordered.length !== spans.length) {
    const placed = new Set(ordered);
    for (const span of spans) {
      if (!placed.has(span)) ordered.push(span);
    }
  }
  return ordered;
}
/**
 * Build a parent Context from a W3C traceparent string, or undefined if
 * absent/invalid.
 *
 * Only version `00` with lowercase hex fields is accepted. All-zero trace or
 * parent ids are rejected as well: W3C Trace Context defines them as invalid,
 * so they must not be used as a remote parent.
 */
function parentContext(traceparent?: string): Context | undefined {
  if (!traceparent) return undefined;
  const match = /^00-([0-9a-f]{32})-([0-9a-f]{16})-([0-9a-f]{2})$/.exec(
    traceparent.trim(),
  );
  if (!match) return undefined;
  const [, traceId, spanId, flags] = match;
  if (/^0+$/.test(traceId) || /^0+$/.test(spanId)) return undefined;
  const spanContext: SpanContext = {
    traceId,
    spanId,
    // Honor the incoming sampled bit as-is: an unsampled parent produces
    // TraceFlags.NONE, it is NOT upgraded to sampled.
    // NOTE(review): the provider here is built without an explicit sampler, so
    // children of an unsampled parent are presumably not recorded - confirm
    // that is the intended behavior.
    traceFlags: (parseInt(flags, 16) & 1) === 1 ? TraceFlags.SAMPLED : TraceFlags.NONE,
    isRemote: true,
  };
  return trace.setSpanContext(ROOT_CONTEXT, spanContext);
}

// ---------------------------------------------------------------------------
// Per-run config + content helpers
// ---------------------------------------------------------------------------

/** One run's tracing config. Mutated by the runner after the session is created. */
export interface RunConfig {
  /** OTLP traces endpoint for this run's trace (falls back to env). */
  endpoint?: string;
  /** Authorization header value for the OTLP export (falls back to env ApiKey). */
  authorization?: string;
  /** W3C traceparent from the caller; nests invoke_agent under that span. */
  traceparent?: string;
  /** W3C baggage from the caller (carried for future use). */
  baggage?: string;
  /** Drop prompt/completion/tool I/O from spans when false. */
  captureContent: boolean;
  /** Pi session id, set after createAgentSession so spans carry session.id. */
  sessionId?: string;
  /** Resolved provider, set after the model is picked. */
  provider?: string;
  /** Resolved model id, set after the model is picked. */
  requestModel?: string;
  /** Filled by the extension on agent_start so the runner can flush/return it. */
  traceId?: string;
}
/**
 * ag.data.inputs must be a dict, so emit input.value as a JSON object string.
 * A non-object (raw string) would be relocated to ag.unsupported by Agenta.
 * Writes nothing when `capture` is false.
 */
function setInputs(
  span: Span,
  obj: Record<string, unknown>,
  capture: boolean,
): void {
  if (capture) {
    const serialized = JSON.stringify(obj);
    span.setAttribute("input.value", serialized);
    span.setAttribute("input.mime_type", "application/json");
  }
}

/** Map a message role to its OpenInference role: "toolResult" becomes "tool", others pass through. */
function oiRole(role: string): string {
  if (role === "toolResult") return "tool"; // otherwise user | assistant | system | tool
  return role;
}

/**
 * Plain text of a message: string content is returned as-is; array content is
 * the concatenation of its `text` blocks; anything else is "".
 */
function messageText(msg: any): string {
  const content = msg?.content;
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return "";
  const pieces: unknown[] = [];
  for (const block of content) {
    if (block?.type === "text") pieces.push(block.text);
  }
  return pieces.join("");
}

/**
 * Emit OpenInference structured messages so Agenta renders a proper message
 * thread. `llm.input_messages.*` -> ag.data.inputs.prompt.*,
 * `llm.output_messages.*` -> ag.data.outputs.completion.*.
 *
 * Per message: role, then text content (when non-empty), then tool_call_id (for
 * tool results that carry one), then one tool_calls entry per `toolCall` block.
 * Writes nothing when `capture` is false or `messages` is not an array.
 */
function emitMessages(
  span: Span,
  prefix: string,
  messages: any[],
  capture: boolean,
): void {
  if (!capture) return;
  if (!Array.isArray(messages)) return;
  messages.forEach((message, position) => {
    const base = `${prefix}.${position}.message`;
    span.setAttribute(`${base}.role`, oiRole(message.role));

    const text = messageText(message);
    if (text) span.setAttribute(`${base}.content`, text);

    if (message.role === "toolResult" && message.toolCallId) {
      span.setAttribute(`${base}.tool_call_id`, message.toolCallId);
    }

    if (!Array.isArray(message.content)) return;
    // Tool calls are numbered among themselves, not by their block position.
    let callIndex = 0;
    for (const block of message.content) {
      if (block?.type !== "toolCall") continue;
      const tc = `${base}.tool_calls.${callIndex}.tool_call`;
      callIndex += 1;
      if (block.id) span.setAttribute(`${tc}.id`, block.id);
      span.setAttribute(`${tc}.function.name`, block.name);
      span.setAttribute(
        `${tc}.function.arguments`,
        JSON.stringify(block.arguments ?? {}),
      );
    }
  });
}

/**
 * Text of a tool result. Strings pass through; arrays contribute their `text`
 * blocks; an object with a truthy `content` is unwrapped recursively; any other
 * truthy value is JSON-serialized; falsy values give "".
 */
function toolResultText(result: any): string {
  if (!result) return "";
  if (typeof result === "string") return result;
  if (Array.isArray(result)) {
    const pieces: unknown[] = [];
    for (const entry of result) {
      if (entry?.type === "text") pieces.push(entry.text);
    }
    return pieces.join("");
  }
  return result.content ? toolResultText(result.content) : JSON.stringify(result);
}

/** Text of the last assistant message in `messages`, or "" when there is none. */
function lastAssistantText(messages: any): string {
  if (!Array.isArray(messages)) return "";
  const newestFirst = [...messages].reverse();
  const latest = newestFirst.find((m: any) => m?.role === "assistant");
  return latest === undefined ? "" : messageText(latest);
}
// ---------------------------------------------------------------------------
// Extension factory (one per run; state is closure-scoped)
// ---------------------------------------------------------------------------

export interface AgentaOtel {
  /** Register with DefaultResourceLoader.extensionFactories. */
  register: (pi: ExtensionAPI) => void;
  /** Mutable config; set sessionId/provider/requestModel after the session exists. */
  config: RunConfig;
  /** Flush this run's trace to Agenta. Await before the process/response ends. */
  flush: () => Promise<void>;
  /** Run totals (tokens + cost) summed across turns, for roll-up onto the parent. */
  usage: () => { input: number; output: number; total: number; cost: number };
}

/**
 * Build a tracing extension scoped to a single agent run. Pass `register` to the
 * resource loader, fill in `config.sessionId`/`provider`/`requestModel` once the
 * session and model are resolved, then `await flush()` after the prompt completes.
 *
 * @param init Initial run config. `captureContent` defaults to true; every other
 *   field, including `baggage`, is copied onto the returned `config` as given.
 * @returns The `register` hook, the mutable `config`, a `flush` for this run's
 *   trace, and a `usage` accessor returning a copy of the run totals.
 */
export function createAgentaOtel(
  init: Partial<RunConfig> & { captureContent?: boolean },
): AgentaOtel {
  ensureProvider();

  const config: RunConfig = {
    endpoint: init.endpoint,
    authorization: init.authorization,
    traceparent: init.traceparent,
    // Carried as documented on RunConfig; previously dropped on construction.
    baggage: init.baggage,
    captureContent: init.captureContent !== false,
    sessionId: init.sessionId,
    provider: init.provider,
    requestModel: init.requestModel,
  };

  const tracer = trace.getTracer("agenta-pi-otel", "0.1.0");

  // Per-run span state — closure-scoped so concurrent runs never collide.
  let agentSpan: Span | undefined;
  let agentCtx: Context | undefined;
  let pendingPrompt: string | undefined;
  let currentTurn: { span: Span; ctx: Context; index?: number } | undefined;
  let llmSpan: Span | undefined;
  let lastContextMessages: any[] | undefined;
  const toolSpans = new Map<string, Span>();
  // Tool spans started without a toolCallId: no end event can be matched to
  // them, so they are kept here and ended when the run ends.
  const unkeyedToolSpans: Span[] = [];
  // Run totals, summed across every assistant turn. Stamped on the agent span and
  // returned so the caller can roll them up onto the workflow span in its own process
  // (the agent and workflow spans are exported in separate OTLP batches, so Agenta's
  // per-batch cumulative roll-up cannot bridge them on its own).
  const runUsage = { input: 0, output: 0, total: 0, cost: 0 };

  /** Add one assistant message's token/cost usage to the run totals. */
  function accumulateUsage(msg: any): void {
    const u = msg?.usage;
    if (!u) return;
    const input = u.input ?? 0;
    const output = u.output ?? 0;
    runUsage.input += input;
    runUsage.output += output;
    runUsage.total += u.totalTokens ?? input + output;
    if (u.cost?.total != null) runUsage.cost += u.cost.total;
  }

  /**
   * End every child span still open (chat, tools, turn), innermost first. A span
   * is only handed to the batch processor when it ends, so anything left open
   * when the agent span ends would miss the trace's export batch.
   */
  function endOpenChildSpans(): void {
    if (llmSpan) {
      llmSpan.end();
      llmSpan = undefined;
    }
    for (const span of toolSpans.values()) span.end();
    toolSpans.clear();
    for (const span of unkeyedToolSpans) span.end();
    unkeyedToolSpans.length = 0;
    if (currentTurn) {
      currentTurn.span.end();
      currentTurn = undefined;
    }
  }

  const register = (pi: ExtensionAPI): void => {
    pi.on("before_agent_start", async (event: any) => {
      pendingPrompt = event?.prompt;
    });

    pi.on("agent_start", async () => {
      // Nest under the caller's workflow span when a traceparent was supplied,
      // so the whole run joins the /invoke trace; otherwise start a fresh root.
      const parent = parentContext(config.traceparent);
      agentSpan = tracer.startSpan("invoke_agent", undefined, parent);
      agentSpan.setAttribute("openinference.span.kind", "AGENT");
      agentSpan.setAttribute("gen_ai.operation.name", "invoke_agent");
      agentSpan.setAttribute("gen_ai.agent.name", "pi");
      if (config.sessionId) {
        agentSpan.setAttribute("session.id", config.sessionId);
        agentSpan.setAttribute("gen_ai.conversation.id", config.sessionId);
      }
      setInputs(agentSpan, { prompt: pendingPrompt ?? "" }, config.captureContent);

      // Record where this trace is exported; run config wins over the env fallback.
      const traceId = agentSpan.spanContext().traceId;
      config.traceId = traceId;
      const fallback = defaultTarget();
      traceTargets.set(traceId, {
        endpoint: config.endpoint ?? fallback.endpoint,
        authorization: config.authorization ?? fallback.authorization,
      });
      agentCtx = trace.setSpan(parent ?? context.active(), agentSpan);
    });

    // The messages handed to the next LLM call — the chat span's input.
    pi.on("context", async (event: any) => {
      if (Array.isArray(event?.messages)) lastContextMessages = event.messages;
    });

    pi.on("turn_start", async (event: any) => {
      const parent = agentCtx ?? context.active();
      const name = event?.turnIndex != null ? `turn ${event.turnIndex}` : "turn";
      const span = tracer.startSpan(name, undefined, parent);
      span.setAttribute("openinference.span.kind", "CHAIN");
      if (event?.turnIndex != null) span.setAttribute("pi.turn.index", event.turnIndex);
      currentTurn = { span, ctx: trace.setSpan(parent, span), index: event?.turnIndex };
    });

    pi.on("before_provider_request", async (_event: any, ctx: any) => {
      const parent = currentTurn?.ctx ?? agentCtx ?? context.active();
      // Model and provider resolved on the run config win over the event context.
      const modelId = config.requestModel ?? ctx?.model?.id;
      const providerName = config.provider ?? ctx?.model?.provider;
      llmSpan = tracer.startSpan(modelId ? `chat ${modelId}` : "chat", undefined, parent);
      llmSpan.setAttribute("openinference.span.kind", "LLM");
      llmSpan.setAttribute("gen_ai.operation.name", "chat");
      if (providerName) llmSpan.setAttribute("gen_ai.system", providerName);
      if (modelId) llmSpan.setAttribute("gen_ai.request.model", modelId);
      if (lastContextMessages)
        emitMessages(llmSpan, "llm.input_messages", lastContextMessages, config.captureContent);
    });

    pi.on("message_end", async (event: any) => {
      const msg = event?.message;
      // Only an assistant message closes the open chat span.
      if (!msg || msg.role !== "assistant" || !llmSpan) return;
      applyAssistant(llmSpan, msg, config.captureContent);
      accumulateUsage(msg);
      llmSpan.end();
      llmSpan = undefined;
    });

    pi.on("tool_execution_start", async (event: any) => {
      const parent = currentTurn?.ctx ?? agentCtx ?? context.active();
      const name = event?.toolName ? `execute_tool ${event.toolName}` : "execute_tool";
      const span = tracer.startSpan(name, undefined, parent);
      span.setAttribute("openinference.span.kind", "TOOL");
      span.setAttribute("gen_ai.operation.name", "execute_tool");
      if (event?.toolName) span.setAttribute("gen_ai.tool.name", event.toolName);
      if (event?.toolCallId) span.setAttribute("gen_ai.tool.call.id", event.toolCallId);
      setInputs(span, (event?.args as Record<string, unknown>) ?? {}, config.captureContent);
      if (event?.toolCallId) toolSpans.set(event.toolCallId, span);
      else unkeyedToolSpans.push(span);
    });

    pi.on("tool_execution_end", async (event: any) => {
      const span = event?.toolCallId ? toolSpans.get(event.toolCallId) : undefined;
      if (!span) return;
      setOutput(span, toolResultText(event?.result), config.captureContent);
      if (event?.isError) span.setStatus({ code: SpanStatusCode.ERROR });
      span.end();
      toolSpans.delete(event.toolCallId);
    });

    pi.on("turn_end", async (event: any) => {
      // Safety net: if the LLM span is still open (no assistant message_end seen),
      // close it from the turn's assistant message.
      if (llmSpan && event?.message) {
        applyAssistant(llmSpan, event.message, config.captureContent);
        accumulateUsage(event.message);
        llmSpan.end();
        llmSpan = undefined;
      }
      if (currentTurn) {
        currentTurn.span.end();
        currentTurn = undefined;
      }
    });

    pi.on("agent_end", async (event: any) => {
      if (!agentSpan) return;
      // Children must end before the agent span so they are buffered with it.
      endOpenChildSpans();
      setOutput(agentSpan, lastAssistantText(event?.messages), config.captureContent);
      // Stamp the run total on the agent span so it shows the agent's tokens/cost even
      // though Agenta cannot roll the per-turn LLM spans up across batches.
      if (runUsage.total > 0) {
        agentSpan.setAttribute("gen_ai.usage.input_tokens", runUsage.input);
        agentSpan.setAttribute("gen_ai.usage.output_tokens", runUsage.output);
        agentSpan.setAttribute("gen_ai.usage.prompt_tokens", runUsage.input);
        agentSpan.setAttribute("gen_ai.usage.completion_tokens", runUsage.output);
        agentSpan.setAttribute("gen_ai.usage.total_tokens", runUsage.total);
        if (runUsage.cost > 0) agentSpan.setAttribute("gen_ai.usage.cost", runUsage.cost);
      }
      agentSpan.end();
      agentSpan = undefined;
      agentCtx = undefined;
      lastContextMessages = undefined;
    });
  };

  return {
    register,
    config,
    flush: () => flushTrace(config.traceId),
    usage: () => ({ ...runUsage }),
  };
}
/** Text of an ACP ContentBlock (the shape carried by message/thought chunks). */
function acpBlockText(block: any): string {
  if (!block) return "";
  if (typeof block === "string") return block;
  const isTextBlock = block.type === "text" && typeof block.text === "string";
  return isTextBlock ? block.text : "";
}

/**
 * Text of an ACP tool_call `content` array (ToolCallContent[]). Each entry is
 * read through its nested `content` when it has one, otherwise as a block
 * itself; entries without text contribute nothing. A string passes through.
 */
function acpToolContentText(content: any): string {
  if (!content) return "";
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return "";
  let combined = "";
  for (const entry of content) {
    combined += acpBlockText(entry?.content ?? entry);
  }
  return combined;
}

/**
 * Strip the pi-acp startup banner that some setups emit as the first agent message
 * chunk (a "pi vX.Y.Z" / "## Context" / file list / "New version available" prelude,
 * surfaced ahead of the real answer). Removes only a leading run of those marker lines
 * so a genuine answer is never touched.
 *
 * Blank lines extend the leading run but do not start one: text that opens with
 * blank lines only is returned unchanged.
 */
function stripStartupBanner(text: string): string {
  const markers = [
    /^pi v\d+\.\d+\.\d+/,
    /^## Context\b/,
    /^-\s+\/.*AGENTS\.md\s*$/,
    /^New version available:/,
    /^Run: `npm/,
  ];
  const isBanner = (line: string): boolean => {
    const trimmed = line.trim();
    if (trimmed === "" || trimmed === "---") return true;
    return markers.some((marker) => marker.test(line));
  };

  const lines = text.split("\n");
  let consumed = 0;
  let sawMarker = false;
  for (const line of lines) {
    if (!isBanner(line)) break;
    if (line.trim() !== "") sawMarker = true;
    consumed += 1;
  }
  if (!sawMarker) return text;
  return lines.slice(consumed).join("\n").trim();
}
 */
function splitModel(model?: string): { provider?: string; id?: string } {
  if (!model) return {};
  // Only the FIRST slash separates provider from id; later slashes stay in the id.
  const slash = model.indexOf("/");
  if (slash === -1) return { id: model };
  return { provider: model.slice(0, slash), id: model.slice(slash + 1) };
}

// NOTE(review): `Partial` appears here without a type argument — presumably dropped when
// this text was extracted; confirm against the real file. `createRivetOtel` reads
// `init.endpoint`, `init.authorization`, `init.traceparent` and `init.sessionId`, none of
// which are declared below, so they presumably come from that base type.
export interface RivetOtelInit extends Partial {
  /** Content capture flag handed to the span helpers; anything other than `false` enables it. */
  captureContent?: boolean;
  /** Harness id ("pi" / "claude"); becomes gen_ai.agent.name. */
  harness?: string;
  /** Resolved model id ("openai-codex/gpt-5.5"); set on the LLM span. */
  model?: string;
  /**
   * Emit the span tree from the ACP event stream. Default true. Set false when the
   * harness instruments itself (e.g. Pi via the agenta extension propagates the trace
   * context and emits its own real turn/chat/tool spans) — then this only accumulates
   * the reply text and builds no spans, so the two do not double up.
   */
  emitSpans?: boolean;
  /**
   * Live event sink. When set, each `AgentEvent` is flushed here the moment it is built
   * (in addition to being recorded in `events[]`), and the text/reasoning blocks are
   * emitted as `*_start`/`*_delta`/`*_end` lifecycle events rather than coalesced at the
   * end. When unset (the one-shot path), only the coalesced `message`/`thought` land in
   * `events[]`. This split is what keeps a delta'd block from being re-sent in full.
   */
  emit?: EmitEvent;
}

export interface RivetOtel {
  /** Start the invoke_agent (AGENT) span as a child of the caller's traceparent. */
  start(input: { prompt?: string; messages?: any[]; sessionId?: string }): void;
  /** Feed one ACP `session/update` payload (the `update` object). */
  handleUpdate(update: any): void;
  /**
   * Record an event the ACP stream does not carry (e.g. an `interaction_request` raised via
   * the permission callback). Routes through the same choke point as stream events, so it
   * lands in both the live sink and the batch `events()` log in build order.
   */
  emitEvent(event: AgentEvent): void;
  /**
   * End all open spans. Returns the accumulated assistant text (trimmed, with any
   * startup banner stripped).
   */
  finish(): string;
  /** Set final run usage before finish/flush so events and exported spans carry final totals. */
  setUsage(usage: AgentUsage | undefined): void;
  /** Flush this run's trace to Agenta (invoke_agent has a remote parent). */
  // NOTE(review): `Promise` has no type argument here — presumably lost in extraction; confirm.
  flush(): Promise;
  /** Trace id of the run (the caller's trace when a traceparent was passed). */
  traceId(): string | undefined;
  /** Accumulated assistant output text so far (raw: not trimmed, banner not stripped). */
  output(): string;
  /** The structured event log built from the ACP stream (tool calls, usage, final message). */
  events(): AgentEvent[];
  /** Run token/cost totals from the stream, when the harness reported `usage_update`. */
  usage(): AgentUsage | undefined;
}

/**
 * Build an ACP-event-driven tracer scoped to a single rivet run. Call `start` once,
 * `handleUpdate` for every ACP session update, then `finish` + `await flush`.
 *
 * All run state lives in this closure. Two independent outputs are produced:
 *  - an event log (`events()`, plus the live `init.emit` sink when one is set), built
 *    regardless of `emitSpans`;
 *  - a span tree (invoke_agent > turn 0 > chat / execute_tool), built only when
 *    `init.emitSpans` is not `false`.
 */
export function createRivetOtel(init: RivetOtelInit): RivetOtel {
  ensureProvider();

  // Both flags default to on: only an explicit `false` disables them.
  const capture = init.captureContent !== false;
  const emitSpans = init.emitSpans !== false;
  const endpoint = init.endpoint ?? defaultTarget().endpoint;
  const authorization = init.authorization ?? defaultTarget().authorization;
  const { provider, id: modelId } = splitModel(init.model);
  const tracer = trace.getTracer("agenta-rivet-otel", "0.1.0");

  let agentSpan: Span | undefined;
  let agentCtx: Context | undefined;
  let turnSpan: Span | undefined;
  let turnCtx: Context | undefined;
  let llmSpan: Span | undefined;
  let runTraceId: string | undefined;
  // Run-long cumulative assistant text / reasoning text (never reset between blocks).
  let accumulated = "";
  let reasoningAccumulated = "";
  let usage: AgentUsage | undefined;
  const events: AgentEvent[] = [];
  // Open tool calls keyed by ACP toolCallId; `span` is undefined in span-less mode.
  const toolSpans = new Map();

  // Live emission. `record` is the single choke point for every event: it appends to the
  // result log and, on the streaming path, flushes the event the moment it is built — so
  // the live order is byte-identical to `events[]`. A sink failure never aborts the run.
  const sink = init.emit;
  function record(event: AgentEvent): void {
    events.push(event);
    if (sink) {
      try {
        sink(event);
      } catch {
        // a downstream sink error must not break the agent run
      }
    }
  }

  /** Copy token/cost totals onto a span; no-op when no usage is known. */
  function stampUsage(span: Span, u: AgentUsage | undefined): void {
    if (!u) return;
    // Input/output are written under both the input/output and prompt/completion names.
    span.setAttribute("gen_ai.usage.input_tokens", u.input);
    span.setAttribute("gen_ai.usage.output_tokens", u.output);
    span.setAttribute("gen_ai.usage.prompt_tokens", u.input);
    span.setAttribute("gen_ai.usage.completion_tokens", u.output);
    span.setAttribute("gen_ai.usage.total_tokens", u.total);
    if (u.cost > 0) span.setAttribute("gen_ai.usage.cost", u.cost);
  }

  /**
   * Install the final run usage. On the one-shot path the LAST recorded `usage` event is
   * overwritten in place (so the log keeps a single final figure at its original
   * position); on the streaming path, or when no usage event exists yet, a new event is
   * recorded instead (an already-flushed event cannot be taken back).
   */
  function setUsage(finalUsage: AgentUsage | undefined): void {
    if (!finalUsage) return;
    usage = finalUsage;
    const event: AgentEvent = { type: "usage", ...finalUsage };
    if (!sink) {
      const index = events.findLastIndex((e) => e.type === "usage");
      if (index !== -1) {
        events[index] = event;
        return;
      }
    }
    record(event);
  }

  // Text/reasoning block lifecycle (streaming path only). At most one block of each kind is
  // open; each gets a stable, monotonic id. `*Emitted` tracks the total text delivered as
  // deltas across the whole run (NOT per block) — `accumulated` is run-long, so the next
  // delta is always its remainder. Block boundaries (a tool call between two text runs) only
  // insert start/end markers; they must not reset the counter, or the second block would
  // re-emit the first block's text.
  let textBlockId: string | undefined;
  let textEmitted = "";
  let anyTextDelta = false;
  let reasoningBlockId: string | undefined;
  let reasoningEmitted = "";
  // One counter is shared by text and reasoning ids (e.g. msg-0, msg-1, reason-2).
  let blockSeq = 0;
  const nextId = (prefix: string): string => `${prefix}-${blockSeq++}`;

  /** Emit `message_end` for the open text block, if any. */
  function closeText(): void {
    if (textBlockId === undefined) return;
    record({ type: "message_end", id: textBlockId });
    textBlockId = undefined;
  }

  /** Emit `reasoning_end` for the open reasoning block, if any. */
  function closeReasoning(): void {
    if (reasoningBlockId === undefined) return;
    record({ type: "reasoning_end", id: reasoningBlockId });
    reasoningBlockId = undefined;
  }

  /** Open (if needed) the assistant text block and emit the pure delta up to `target`. */
  function streamText(target: string): void {
    closeReasoning(); // a text chunk ends any open reasoning run (blocks never overlap)
    // If `target` does not extend what was already sent, the whole of it is sent as the delta.
    const delta = target.startsWith(textEmitted)
      ? target.slice(textEmitted.length)
      : target;
    if (!delta) return;
    if (textBlockId === undefined) {
      textBlockId = nextId("msg");
      record({ type: "message_start", id: textBlockId });
    }
    record({ type: "message_delta", id: textBlockId, delta });
    textEmitted = target.startsWith(textEmitted) ? target : textEmitted + delta;
    anyTextDelta = true;
  }

  /** Open (if needed) the reasoning block and emit the pure delta up to `target`. */
  function streamReasoning(target: string): void {
    closeText(); // a reasoning chunk ends any open text run
    const delta = target.startsWith(reasoningEmitted)
      ? target.slice(reasoningEmitted.length)
      : target;
    if (!delta) return;
    if (reasoningBlockId === undefined) {
      reasoningBlockId = nextId("reason");
      record({ type: "reasoning_start", id: reasoningBlockId });
    }
    record({ type: "reasoning_delta", id: reasoningBlockId, delta });
    reasoningEmitted = target.startsWith(reasoningEmitted) ? target : reasoningEmitted + delta;
  }

  function start(input: { prompt?: string; messages?: any[]; sessionId?: string }): void {
    // Span-less mode (harness self-instruments): only track the trace id so the run can
    // report it; the harness emits the spans under the propagated parent.
    if (!emitSpans) {
      // Only a version-`00` traceparent is recognised; anything else leaves the id undefined.
      const m = /^00-([0-9a-f]{32})-/.exec(init.traceparent ?? "");
      runTraceId = m ? m[1] : undefined;
      return;
    }
    const parent = parentContext(init.traceparent);
    agentSpan = tracer.startSpan("invoke_agent", undefined, parent);
    agentSpan.setAttribute("openinference.span.kind", "AGENT");
    agentSpan.setAttribute("gen_ai.operation.name", "invoke_agent");
    agentSpan.setAttribute("gen_ai.agent.name", init.harness ?? "agent");
    // The per-call session id wins over the one supplied at construction.
    const sessionId = input.sessionId ?? init.sessionId;
    if (sessionId) {
      agentSpan.setAttribute("session.id", sessionId);
      agentSpan.setAttribute("gen_ai.conversation.id", sessionId);
    }
    setInputs(agentSpan, { prompt: input.prompt ?? "" }, capture);

    // Register where this trace is exported so `flush` can find its target by trace id.
    runTraceId = agentSpan.spanContext().traceId;
    traceTargets.set(runTraceId, { endpoint, authorization });
    agentCtx = trace.setSpan(parent ?? context.active(), agentSpan);

    // A single turn and a single chat span are opened up front for the whole run.
    turnSpan = tracer.startSpan("turn 0", undefined, agentCtx);
    turnSpan.setAttribute("openinference.span.kind", "CHAIN");
    turnSpan.setAttribute("pi.turn.index", 0);
    turnCtx = trace.setSpan(agentCtx, turnSpan);

    llmSpan = tracer.startSpan(modelId ? `chat ${modelId}` : "chat", undefined, turnCtx);
    llmSpan.setAttribute("openinference.span.kind", "LLM");
    llmSpan.setAttribute("gen_ai.operation.name", "chat");
    if (provider) llmSpan.setAttribute("gen_ai.system", provider);
    if (modelId) llmSpan.setAttribute("gen_ai.request.model", modelId);
    // Fall back to a single synthetic user message when no message list was supplied.
    const inputMessages =
      input.messages && input.messages.length
        ? input.messages
        : [{ role: "user", content: input.prompt ?? "" }];
    emitMessages(llmSpan, "llm.input_messages", inputMessages, capture);
  }

  function handleUpdate(update: any): void {
    const kind = update?.sessionUpdate;
    if (!kind) return;

    if (kind === "agent_message_chunk") {
      const t = acpBlockText(update.content);
      if (!t) return;
      // Pi streams pure deltas; Claude streams deltas plus a cumulative snapshot.
      // Replace when a chunk is a superset of what we have, append otherwise.
      if (t.startsWith(accumulated)) accumulated = t;
      else accumulated += t;
      // Live deltas run independent of span emission (text, not a span), so they flow even
      // when the harness self-instruments (emitSpans=false). `accumulated` is the cumulative
      // text, so the pure delta is its tail past what we already sent.
      // NOTE(review): the raw `accumulated` is streamed here, while `finish()` returns it
      // with the startup banner stripped — so a banner, if present, still reaches the live
      // sink. Confirm that is intended.
      if (sink) streamText(accumulated);
      return;
    }

    if (kind === "agent_thought_chunk") {
      const t = acpBlockText(update.content);
      if (!t) return;
      // Same replace-or-append rule as assistant text.
      if (t.startsWith(reasoningAccumulated)) reasoningAccumulated = t;
      else reasoningAccumulated += t;
      if (sink) streamReasoning(reasoningAccumulated);
      return;
    }

    if (kind === "tool_call") {
      const id = update.toolCallId;
      if (!id) return;
      // A tool call ends any open text/reasoning block (keeps streamed block boundaries
      // clean across text -> tool -> text interleaving). No-op on the one-shot path.
      closeText();
      closeReasoning();
      const name = update.title || update.kind || "tool";
      let span: Span | undefined;
      if (emitSpans && turnCtx) {
        span = tracer.startSpan(`execute_tool ${name}`, undefined, turnCtx);
        span.setAttribute("openinference.span.kind", "TOOL");
        span.setAttribute("gen_ai.operation.name", "execute_tool");
        span.setAttribute("gen_ai.tool.name", String(name));
        span.setAttribute("gen_ai.tool.call.id", String(id));
        // NOTE(review): `Record` has no type arguments here — presumably lost in extraction.
        if (update.rawInput != null)
          setInputs(span, update.rawInput as Record, capture);
      }
      // The entry is stored even without a span so the matching result event is still emitted.
      toolSpans.set(id, { span, name: String(name) });
      record({ type: "tool_call", id: String(id), name: String(name), input: update.rawInput });
      // A tool_call can arrive already completed (status set up front).
      maybeCloseTool(id, update);
      return;
    }

    if (kind === "tool_call_update") {
      maybeCloseTool(update.toolCallId, update);
      return;
    }

    if (kind === "usage_update") {
      // ACP usage_update carries only `used` (context tokens) and `cost.amount`. The
      // per-call input/output split is NOT on the stream; it rides on the PromptResponse,
      // which the rivet engine reads. Keep total + cost here and leave the split to the caller.
      const cost = update.cost?.amount;
      const total = update.used;
      // Fields absent from this update keep their previous value (0 when never set).
      usage = {
        input: usage?.input ?? 0,
        output: usage?.output ?? 0,
        total: typeof total === "number" ? total : usage?.total ?? 0,
        cost: typeof cost === "number" ? cost : usage?.cost ?? 0,
      };
      // Every usage_update records its own event.
      record({ type: "usage", ...usage });
    }
  }

  /** Close a tool span when the update marks it completed or failed. */
  function maybeCloseTool(id: string | undefined, update: any): void {
    if (!id) return;
    const entry = toolSpans.get(id);
    // Unknown (or already closed) tool call: nothing to do.
    if (!entry) return;
    const status = update?.status;
    if (status !== "completed" && status !== "failed") return;
    // Prefer the structured `content`; fall back to `rawOutput`.
    const out = acpToolContentText(update.content) || acpToolContentText(update.rawOutput);
    if (entry.span) {
      setOutput(entry.span, out, capture);
      if (status === "failed") entry.span.setStatus({ code: SpanStatusCode.ERROR });
      entry.span.end();
    }
    toolSpans.delete(id);
    // NOTE(review): `tool_call` records `String(id)` while this records `id` as received —
    // identical for string ids; confirm toolCallId is always a string.
    record({ type: "tool_result", id, output: out, isError: status === "failed" });
  }

  // NOTE(review): finish() has no guard against being called twice — each call records
  // another `done` (and, on the streaming path with no deltas, another synthesized block).
  // Confirm callers invoke it exactly once.
  function finish(): string {
    const text = stripStartupBanner(accumulated.trim());
    // The event log is independent of span emission, so build its tail either way.
    closeText();
    closeReasoning();
    if (sink) {
      // Streaming path: the block deltas were already flushed, so do NOT re-emit the
      // coalesced message (that would double it). If the harness produced no token deltas
      // at all but there is text, synthesize a minimal start/delta/end so the consumer
      // always sees one uniform block shape regardless of harness streaming support.
      if (text && !anyTextDelta) {
        const id = nextId("msg");
        record({ type: "message_start", id });
        record({ type: "message_delta", id, delta: text });
        record({ type: "message_end", id });
      }
    } else {
      // One-shot path: coalesced events only (no per-token granularity to recover).
      if (text) record({ type: "message", text });
      const reasoning = reasoningAccumulated.trim();
      if (reasoning) record({ type: "thought", text: reasoning });
    }
    record({ type: "done" });
    // Span-less mode stops here: there are no spans to close.
    if (!emitSpans) return text;
    if (llmSpan) {
      emitMessages(
        llmSpan,
        "llm.output_messages",
        [{ role: "assistant", content: text }],
        capture,
      );
      stampUsage(llmSpan, usage);
      llmSpan.end();
      llmSpan = undefined;
    }
    // Tool calls that never reported completed/failed: end their spans as-is (no output,
    // no status, and no `tool_result` event).
    for (const { span } of toolSpans.values()) span?.end();
    toolSpans.clear();
    if (turnSpan) {
      turnSpan.end();
      turnSpan = undefined;
    }
    if (agentSpan) {
      setOutput(agentSpan, text, capture);
      stampUsage(agentSpan, usage);
      agentSpan.end();
      agentSpan = undefined;
    }
    agentCtx = undefined;
    turnCtx = undefined;
    return text;
  }

  return {
    start,
    handleUpdate,
    emitEvent: record,
    finish,
    setUsage,
    flush: () => flushTrace(runTraceId),
    traceId: () => runTraceId,
    // Raw accumulated text (finish() is what trims and strips the banner).
    output: () => accumulated,
    // The live internal array and usage object are returned, not copies.
    events: () => events,
    usage: () => usage,
  };
}
diff --git a/services/agent/test/continuation.test.ts b/services/agent/test/continuation.test.ts
new file mode 100644
index 0000000000..c9f9d4356c
--- /dev/null
+++ b/services/agent/test/continuation.test.ts
@@ -0,0 +1,66 @@
+/**
+ * Unit tests for the cross-turn HITL continuation substrate.
+ *
+ * Under the cold model the harness rebuilds context from the replayed transcript, and ACP
+ * prompt content blocks cannot carry tool calls/results. So a resolved interaction (an
+ * approved tool that ran, a client-fulfilled tool) must survive into the replay as text.
+ * `messageTranscript` encodes tool turns; `buildTurnText` keeps them in the replayed history.
+ * + * Run: pnpm exec tsx test/continuation.test.ts + */ +import assert from "node:assert/strict"; + +import { messageTranscript, buildTurnText } from "../src/engines/rivet.ts"; +import { + resolveRunSessionId, + type AgentRunRequest, + type ContentBlock, +} from "../src/protocol.ts"; + +// --- messageTranscript ------------------------------------------------------- +assert.equal(messageTranscript("hello"), "hello"); +assert.equal(messageTranscript([{ type: "text", text: "a" }, { type: "text", text: "b" }]), "a\nb"); +assert.equal( + messageTranscript([{ type: "tool_call", toolName: "getWeather", input: { city: "Paris" } }]), + '[called getWeather({"city":"Paris"})]', +); +assert.equal( + messageTranscript([{ type: "tool_result", toolName: "getWeather", output: { temp: 24 } }]), + '[getWeather returned: {"temp":24}]', +); +assert.equal( + messageTranscript([{ type: "tool_result", toolName: "send", output: "boom", isError: true }]), + "[send error: boom]", +); + +// --- session id metadata ------------------------------------------------------ +assert.equal( + resolveRunSessionId({ sessionId: "sess_platform" }, "runner-ephemeral"), + "sess_platform", +); +assert.equal(resolveRunSessionId({}, "runner-ephemeral"), "runner-ephemeral"); + +// --- buildTurnText keeps a resolved tool turn in the replay ------------------ +{ + const req: AgentRunRequest = { + messages: [ + { role: "user", content: "weather in Paris?" }, + { + role: "assistant", + content: [{ type: "tool_call", toolName: "getWeather", input: { city: "Paris" } } as ContentBlock], + }, + { + role: "tool", + content: [{ type: "tool_result", toolName: "getWeather", output: { temp: 24 } } as ContentBlock], + }, + { role: "user", content: "and tomorrow?" 
}, + ], + }; + const text = buildTurnText(req); + assert.ok(text.includes("called getWeather"), "tool call survives replay"); + assert.ok(text.includes("getWeather returned"), "tool result survives replay"); + assert.ok(text.includes("and tomorrow?"), "latest user prompt is the live turn"); + assert.ok(text.startsWith("Conversation so far:"), "transcript header present"); +} + +console.log("continuation.test.ts: all assertions passed"); diff --git a/services/agent/test/responder.test.ts b/services/agent/test/responder.test.ts new file mode 100644 index 0000000000..e06ae43e00 --- /dev/null +++ b/services/agent/test/responder.test.ts @@ -0,0 +1,84 @@ +/** + * Unit tests for the interaction responder seam and the otel `emitEvent` hook. + * + * Covers the behavior parity of the responder (it replaces the old inline auto-approve in + * rivet.ts) and that an out-of-stream event (an `interaction_request`) routed through + * `emitEvent` lands in both the live sink and the batch `events()` log. No harness, no + * network. 
+ * + * Run: pnpm exec tsx test/responder.test.ts + */ +import assert from "node:assert/strict"; + +import { createRivetOtel } from "../src/tracing/otel.ts"; +import type { AgentEvent } from "../src/protocol.ts"; +import { + PolicyResponder, + decisionToReply, + policyFromRequest, +} from "../src/responder.ts"; + +// --- policyFromRequest ------------------------------------------------------- +{ + delete process.env.AGENTA_RIVET_DENY_PERMISSIONS; + assert.equal(policyFromRequest(undefined), "auto"); + assert.equal(policyFromRequest("auto"), "auto"); + assert.equal(policyFromRequest("deny"), "deny"); + + process.env.AGENTA_RIVET_DENY_PERMISSIONS = "true"; + assert.equal(policyFromRequest(undefined), "deny", "env forces deny"); + assert.equal(policyFromRequest("auto"), "deny", "env overrides auto"); + delete process.env.AGENTA_RIVET_DENY_PERMISSIONS; +} + +// --- decisionToReply (parity with the old inline mapping) -------------------- +{ + assert.equal(decisionToReply("allow", ["always", "once", "reject"]), "always"); + assert.equal(decisionToReply("allow", ["once", "reject"]), "once"); + assert.equal(decisionToReply("allow", []), "once", "allow falls back to once"); + assert.equal(decisionToReply("deny", ["always", "once", "reject"]), "reject"); + assert.equal(decisionToReply("deny", []), "reject", "deny falls back to reject"); +} + +// --- PolicyResponder --------------------------------------------------------- +{ + const auto = new PolicyResponder("auto"); + const deny = new PolicyResponder("deny"); + const req = { id: "p1", availableReplies: ["once", "reject"] }; + assert.equal(await auto.onPermission(req), "allow"); + assert.equal(await deny.onPermission(req), "deny"); +} + +// --- emitEvent: streaming path (sink + batch) -------------------------------- +{ + const emitted: AgentEvent[] = []; + const run = createRivetOtel({ harness: "claude", model: "anthropic/x", emit: (e) => emitted.push(e) }); + run.start({ prompt: "hi" }); + const interaction: AgentEvent 
= { + type: "interaction_request", + id: "p1", + kind: "permission", + payload: { availableReplies: ["once", "reject"] }, + }; + run.emitEvent(interaction); + + const live = emitted.find((e) => e.type === "interaction_request"); + assert.ok(live, "interaction_request flushed to the live sink"); + assert.equal((live as any).id, "p1"); + assert.ok( + run.events().some((e) => e.type === "interaction_request"), + "interaction_request also recorded in the batch log", + ); +} + +// --- emitEvent: one-shot path (batch only) ----------------------------------- +{ + const run = createRivetOtel({ harness: "claude", model: "anthropic/x" }); + run.start({ prompt: "hi" }); + run.emitEvent({ type: "data", name: "weather", data: { temp: 24 } }); + const ev = run.events().find((e) => e.type === "data"); + assert.ok(ev, "data event recorded with no live sink"); + assert.equal((ev as any).name, "weather"); +} + +console.log("responder.test.ts: all assertions passed"); diff --git a/services/agent/test/stream-events.test.ts b/services/agent/test/stream-events.test.ts new file mode 100644 index 0000000000..f27e31fc23 --- /dev/null +++ b/services/agent/test/stream-events.test.ts @@ -0,0 +1,148 @@ +/** + * Unit test for the createRivetOtel delta/lifecycle state machine. + * + * Drives `handleUpdate` with a hand-built ACP `session/update` sequence (Claude-style + * cumulative text snapshots, a tool call between two text runs, a reasoning run) and asserts + * the streaming and one-shot event shapes. No harness, no network: spans are built offline + * and never flushed. 
+ * + * Run: pnpm exec tsx test/stream-events.test.ts + */ +import assert from "node:assert/strict"; + +import { createRivetOtel } from "../src/tracing/otel.ts"; +import type { AgentEvent } from "../src/protocol.ts"; + +const textChunk = (text: string) => ({ + sessionUpdate: "agent_message_chunk", + content: { type: "text", text }, +}); +const thoughtChunk = (text: string) => ({ + sessionUpdate: "agent_thought_chunk", + content: { type: "text", text }, +}); +const toolCall = (id: string, title: string, rawInput: unknown) => ({ + sessionUpdate: "tool_call", + toolCallId: id, + title, + rawInput, +}); +const toolDone = (id: string, text: string) => ({ + sessionUpdate: "tool_call_update", + toolCallId: id, + status: "completed", + content: [{ content: { type: "text", text } }], +}); +const usage = () => ({ sessionUpdate: "usage_update", used: 100, cost: { amount: 0.01 } }); + +// The same ACP sequence drives both modes: two text runs around a tool call, then reasoning. +function drive(run: ReturnType): void { + run.start({ prompt: "weather in Paris?" }); + run.handleUpdate(textChunk("Hello ")); // pure delta + run.handleUpdate(textChunk("Hello world")); // cumulative snapshot (Claude-style) + run.handleUpdate(toolCall("call_1", "getWeather", { city: "Paris" })); + run.handleUpdate(toolDone("call_1", "sunny")); + run.handleUpdate(textChunk("Hello world It is sunny.")); // resumes after the tool + run.handleUpdate(thoughtChunk("thinking...")); + run.handleUpdate(usage()); +} + +const types = (events: AgentEvent[]) => events.map((e) => e.type); +const ofType = (events: AgentEvent[], t: T) => + events.filter((e) => e.type === t) as Extract[]; + +// --- Scenario 1: streaming (emit set) --------------------------------------- +{ + const emitted: AgentEvent[] = []; + const run = createRivetOtel({ harness: "claude", model: "anthropic/x", emit: (e) => emitted.push(e) }); + drive(run); + const finalText = run.finish(); + + // No coalesced text events on the streaming path. 
+ assert.equal(ofType(emitted, "message").length, 0, "no coalesced message when streaming"); + assert.equal(ofType(emitted, "thought").length, 0, "no coalesced thought when streaming"); + + // Exactly one terminal done. + assert.equal(ofType(emitted, "done").length, 1, "exactly one done"); + + // Two text blocks (split by the tool call), one reasoning block, balanced start/end. + const mStart = ofType(emitted, "message_start"); + const mEnd = ofType(emitted, "message_end"); + assert.equal(mStart.length, 2, "two message_start"); + assert.equal(mEnd.length, 2, "two message_end"); + assert.deepEqual(mStart.map((e) => e.id), ["msg-0", "msg-1"], "stable monotonic text ids"); + const rStart = ofType(emitted, "reasoning_start"); + const rEnd = ofType(emitted, "reasoning_end"); + assert.equal(rStart.length, 1, "one reasoning_start"); + assert.equal(rEnd.length, 1, "one reasoning_end"); + + // Deltas are pure and reconstruct the full text, with no overlap/repeat. + const text = ofType(emitted, "message_delta").map((e) => e.delta).join(""); + assert.equal(text, "Hello world It is sunny.", "concatenated deltas == full text"); + assert.equal(text, finalText, "deltas match finish() output"); + const reasoning = ofType(emitted, "reasoning_delta").map((e) => e.delta).join(""); + assert.equal(reasoning, "thinking...", "concatenated reasoning deltas"); + + // Ordering invariant: each block's start precedes its deltas precede its end; tool result + // lands before the second text block opens. + const seq = types(emitted); + assert.ok(seq.indexOf("message_end") < seq.indexOf("tool_call"), "first text block closes before the tool call"); + assert.ok(seq.indexOf("tool_result") < seq.lastIndexOf("message_start"), "tool result precedes the second text block"); + for (const id of ["msg-0", "msg-1", "reason-2"]) { + const idxs = emitted + .map((e, i) => ((e as any).id === id ? 
{ i, t: e.type } : null)) + .filter(Boolean) as { i: number; t: string }[]; + assert.ok(idxs[0].t.endsWith("_start"), `${id} starts with *_start`); + assert.ok(idxs[idxs.length - 1].t.endsWith("_end"), `${id} ends with *_end`); + } +} + +// --- Scenario 2: one-shot (no emit) ----------------------------------------- +{ + const run = createRivetOtel({ harness: "claude", model: "anthropic/x" }); + drive(run); + const finalText = run.finish(); + const events = run.events(); + + // Coalesced text/thought, no delta lifecycle events. + const messages = ofType(events, "message"); + assert.equal(messages.length, 1, "one coalesced message"); + assert.equal(messages[0].text, "Hello world It is sunny.", "coalesced text == final"); + assert.equal(messages[0].text, finalText); + assert.equal(ofType(events, "thought").length, 1, "one coalesced thought"); + for (const t of ["message_start", "message_delta", "message_end", "reasoning_start", "reasoning_delta", "reasoning_end"]) { + assert.equal(events.filter((e) => e.type === t).length, 0, `no ${t} on the one-shot path`); + } + + // The structured tool/usage events are still present, with exactly one done. 
+ assert.equal(ofType(events, "tool_call").length, 1, "tool_call present"); + assert.equal(ofType(events, "tool_result").length, 1, "tool_result present"); + assert.equal(ofType(events, "usage").length, 1, "usage present"); + assert.equal(ofType(events, "done").length, 1, "exactly one done"); +} + +// --- Scenario 3: span-less mode still records ACP events --------------------- +{ + const run = createRivetOtel({ harness: "pi", model: "openai-codex/x", emitSpans: false }); + drive(run); + run.setUsage({ input: 4, output: 6, total: 10, cost: 0.02 }); + const finalText = run.finish(); + const events = run.events(); + + assert.equal(finalText, "Hello world It is sunny."); + assert.equal(ofType(events, "message").length, 1, "message present without spans"); + assert.equal(ofType(events, "thought").length, 1, "thought present without spans"); + assert.equal(ofType(events, "tool_call").length, 1, "tool_call present without spans"); + assert.equal(ofType(events, "tool_result").length, 1, "tool_result present without spans"); + const usageEvents = ofType(events, "usage"); + assert.equal(usageEvents.length, 1, "usage present without spans"); + assert.deepEqual( + usageEvents[0], + { type: "usage", input: 4, output: 6, total: 10, cost: 0.02 }, + "final usage replaces stream-only usage before done", + ); + assert.equal(ofType(events, "done").length, 1, "exactly one done without spans"); + assert.ok(types(events).indexOf("usage") < types(events).indexOf("done"), "usage precedes done"); +} + +console.log("stream-events.test.ts: all assertions passed"); diff --git a/services/agent/test/tool-bridge.test.ts b/services/agent/test/tool-bridge.test.ts index 1838177918..4dac2b3f9d 100644 --- a/services/agent/test/tool-bridge.test.ts +++ b/services/agent/test/tool-bridge.test.ts @@ -22,28 +22,42 @@ function envValue( return env.find((e) => e.name === name)?.value; } -// code-only specs + NO callback -> one server, with AGENTA_TOOL_SPECS but no endpoint (F4). 
+const relayDir = "/tmp/agenta-tools"; + +// code-only specs + no callback -> one server, with public specs and relay dir. { const specs: ResolvedToolSpec[] = [ - { name: "adder", kind: "code", runtime: "python", code: "def main(**k): return 1" }, + { + name: "adder", + description: "Add numbers", + kind: "code", + runtime: "python", + code: "def main(**k): return 1", + env: { PRIVATE: "secret" }, + }, ]; - const out = buildToolMcpServers(specs, undefined); + const out = buildToolMcpServers(specs, relayDir); assert.equal(out.length, 1, "code-only run still attaches the server"); assert.equal(out[0].name, "agenta-tools"); assert.ok( - envValue(out[0].env, "AGENTA_TOOL_SPECS") !== undefined, - "AGENTA_TOOL_SPECS is set", + envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS") !== undefined, + "AGENTA_TOOL_PUBLIC_SPECS is set", ); assert.equal( envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"), undefined, "no endpoint env for code-only run", ); - // The full executable spec list round-trips through AGENTA_TOOL_SPECS. - assert.deepEqual(JSON.parse(envValue(out[0].env, "AGENTA_TOOL_SPECS")!), specs); + assert.equal(envValue(out[0].env, "AGENTA_TOOL_RELAY_DIR"), relayDir); + assert.equal(envValue(out[0].env, "AGENTA_TOOL_CALLBACK_AUTH"), undefined); + assert.equal(envValue(out[0].env, "AGENTA_TOOL_SPECS"), undefined); + // Only public metadata round-trips; private executor fields stay runner-side. + assert.deepEqual(JSON.parse(envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS")!), [ + { name: "adder", description: "Add numbers" }, + ]); } -// callback specs + a callback with endpoint -> one server carrying endpoint (+ auth). +// callback specs + a callback with endpoint -> still no endpoint/auth in child env. 
{ const specs: ResolvedToolSpec[] = [ { name: "search", kind: "callback", callRef: "composio.search" }, @@ -52,30 +66,31 @@ function envValue( endpoint: "https://agenta.example/tools/call", authorization: "Bearer tok", }; - const out = buildToolMcpServers(specs, callback); + const out = buildToolMcpServers(specs, callback, relayDir); assert.equal(out.length, 1); assert.equal( envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"), - "https://agenta.example/tools/call", - "endpoint env set for callback tools", + undefined, + "endpoint env is never exposed to the bridge", ); assert.equal( envValue(out[0].env, "AGENTA_TOOL_CALLBACK_AUTH"), - "Bearer tok", - "auth env set when provided", + undefined, + "auth env is never exposed to the bridge", ); + assert.equal(envValue(out[0].env, "AGENTA_TOOL_RELAY_DIR"), relayDir); } -// callback spec + endpoint but no authorization -> no AUTH env entry. +// callback spec + endpoint but no authorization -> still only public metadata + relay dir. { const specs: ResolvedToolSpec[] = [ { name: "search", kind: "callback", callRef: "composio.search" }, ]; - const out = buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }); + const out = buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }, relayDir); assert.equal(out.length, 1); assert.equal( envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"), - "https://agenta.example/tools/call", + undefined, ); assert.equal( envValue(out[0].env, "AGENTA_TOOL_CALLBACK_AUTH"), @@ -87,11 +102,11 @@ function envValue( // absent kind defaults to callback (back-compat): endpoint still wired when present. 
{ const specs: ResolvedToolSpec[] = [{ name: "legacy", callRef: "composio.legacy" }]; - const out = buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }); + const out = buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }, relayDir); assert.equal(out.length, 1, "back-compat (no kind) attaches as a callback tool"); assert.equal( envValue(out[0].env, "AGENTA_TOOL_CALLBACK_ENDPOINT"), - "https://agenta.example/tools/call", + undefined, ); } @@ -101,7 +116,7 @@ function envValue( { name: "adder", kind: "code", runtime: "python", code: "def main(**k): return 1" }, { name: "search", kind: "callback", callRef: "composio.search" }, ]; - const out = buildToolMcpServers(specs, undefined); + const out = buildToolMcpServers(specs, relayDir); assert.notDeepEqual(out, [], "mixed run with no endpoint must not return []"); assert.equal(out.length, 1, "still attaches the server so the code tool works"); assert.equal( @@ -109,8 +124,11 @@ function envValue( undefined, "endpoint env omitted when missing", ); - // Both executable specs are still passed to the bridge. - assert.deepEqual(JSON.parse(envValue(out[0].env, "AGENTA_TOOL_SPECS")!), specs); + // Both executable specs are advertised, but only as public metadata. + assert.deepEqual(JSON.parse(envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS")!), [ + { name: "adder" }, + { name: "search" }, + ]); } // empty specs -> []. @@ -126,7 +144,7 @@ assert.deepEqual(buildToolMcpServers([], undefined), [], "empty specs -> []"); ); // Even with an endpoint, client-only stays empty. 
assert.deepEqual( - buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }), + buildToolMcpServers(specs, { endpoint: "https://agenta.example/tools/call" }, relayDir), [], "client-only -> [] even with an endpoint", ); @@ -138,9 +156,9 @@ assert.deepEqual(buildToolMcpServers([], undefined), [], "empty specs -> []"); { name: "confirm", kind: "client" }, { name: "adder", kind: "code", runtime: "python", code: "def main(**k): return 1" }, ]; - const out = buildToolMcpServers(specs, undefined); + const out = buildToolMcpServers(specs, relayDir); assert.equal(out.length, 1, "executable spec attaches the server"); - const passed: ResolvedToolSpec[] = JSON.parse(envValue(out[0].env, "AGENTA_TOOL_SPECS")!); + const passed: ResolvedToolSpec[] = JSON.parse(envValue(out[0].env, "AGENTA_TOOL_PUBLIC_SPECS")!); assert.deepEqual( passed.map((s) => s.name), ["adder"],