From 3ac8f259a349bcd243376ab840a2e83343cab5fc Mon Sep 17 00:00:00 2001 From: Samuel Huber Date: Sat, 28 Mar 2026 12:38:15 +0100 Subject: [PATCH 1/7] feat(observability): complete agent trace OTEL verification --- .gitignore | 2 + docs/guides/agent-trace-otel-verification.mdx | 200 ++ docs/guides/monitoring-logs.mdx | 3 +- observability/docker-compose.otel.yml | 65 +- .../provisioning/datasources/datasources.yml | 20 + observability/loki/loki-config.yaml | 35 + observability/otel-collector-config.yml | 10 +- scripts/obs-reset.sh | 28 + scripts/obs-wait-healthy.sh | 77 + scripts/verify-observability.sh | 216 +++ src/RunOptions.ts | 1 + src/SmithersEvent.ts | 19 + src/agent-trace.ts | 1675 +++++++++++++++++ src/agents/BaseCliAgent.ts | 33 +- src/agents/PiAgent.ts | 21 + src/cli/index.ts | 43 +- src/effect/child-process.ts | 16 + src/effect/logging.ts | 54 +- src/effect/runtime.ts | 4 + src/engine/approvals.ts | 4 + src/engine/index.ts | 66 +- src/events.ts | 11 +- src/index.ts | 10 + src/observability/index.ts | 18 +- src/tools/logToolCall.ts | 19 +- tests/agent-trace.test.tsx | 1198 ++++++++++++ workflows/agent-trace-otel-demo.tsx | 315 ++++ workflows/agent-trace-otel-loop.tsx | 451 +++++ workflows/real-agent-trace-check.tsx | 63 + 29 files changed, 4636 insertions(+), 41 deletions(-) create mode 100644 docs/guides/agent-trace-otel-verification.mdx create mode 100644 observability/loki/loki-config.yaml create mode 100755 scripts/obs-reset.sh create mode 100755 scripts/obs-wait-healthy.sh create mode 100755 scripts/verify-observability.sh create mode 100644 src/agent-trace.ts create mode 100644 tests/agent-trace.test.tsx create mode 100644 workflows/agent-trace-otel-demo.tsx create mode 100644 workflows/agent-trace-otel-loop.tsx create mode 100644 workflows/real-agent-trace-check.tsx diff --git a/.gitignore b/.gitignore index 8c1b2ea6..4f57d598 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,8 @@ deno.lock reference/ codex/ ghostty/ +observability/data/ 
+tmp/verification/ # Worktree script SQLite artifacts scripts/worktree-feature/*.db* diff --git a/docs/guides/agent-trace-otel-verification.mdx b/docs/guides/agent-trace-otel-verification.mdx new file mode 100644 index 00000000..2838d1eb --- /dev/null +++ b/docs/guides/agent-trace-otel-verification.mdx @@ -0,0 +1,200 @@ +--- +title: Agent Trace OTEL Log Verification +description: Reproducible local verification for canonical agent trace OTEL log export with Loki. +--- + +## Local stack + +Start the local stack: + +```bash +smithers observability --detach +``` + +Expected endpoints: + +- Grafana: `http://localhost:3001` +- Prometheus: `http://localhost:9090` +- Tempo: `http://localhost:3200` +- Loki: `http://localhost:3100` +- OTEL collector HTTP: `http://localhost:4318` + +Validate the stack: + +```bash +docker compose -f observability/docker-compose.otel.yml ps +curl -sf http://localhost:3100/ready +curl -sf http://localhost:3001/api/datasources | jq 'map({name,type,url})' +docker logs observability-otel-collector-1 | tail -n 80 +``` + +## Enable OTEL export + +```bash +export SMITHERS_OTEL_ENABLED=1 +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 +export OTEL_SERVICE_NAME=smithers-dev +export SMITHERS_LOG_FORMAT=json +``` + +## Demo workflow + +Use the built-in reproducible workflow at `workflows/agent-trace-otel-demo.tsx`. 
+It emits: + +- one Pi-like high-fidelity attempt with text deltas, thinking deltas, tool lifecycle, usage, and a redaction fixture +- one SDK-style final-only attempt +- stable run annotations for Loki queries + +Run the success case: + +```bash +smithers run workflows/agent-trace-otel-demo.tsx \ + --run-id agent-trace-otel-demo \ + --annotations '{"custom.demo":true,"custom.ticket":"OBS-123"}' +``` + +Run the malformed JSON failure case: + +```bash +smithers run workflows/agent-trace-otel-demo.tsx \ + --run-id agent-trace-otel-demo-fail \ + --input '{"failureMode":"malformed-json"}' \ + --annotations '{"custom.demo":true,"custom.ticket":"OBS-ERR"}' +``` + +Optional durable-local verification: + +```bash +jq 'select(.type == "AgentTraceEvent" or .type == "AgentTraceSummary")' \ + .smithers/executions/agent-trace-otel-demo/logs/stream.ndjson + +find .smithers/executions/agent-trace-otel-demo/logs/agent-trace -type f -maxdepth 1 -print +``` + +## Loki queries + +Smithers OTEL attributes are exposed to Loki as sanitized structured metadata fields such as: + +- `run_id` +- `workflow_path` +- `node_id` +- `node_attempt` +- `agent_family` +- `agent_capture_mode` +- `trace_completeness` +- `event_kind` + +Use `{service_name="smithers-dev"}` as the stream selector, then filter on structured metadata in the LogQL pipeline. Use `| json` to inspect the structured log body. 
+
+All events for one run:
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo"
+```
+
+One node attempt:
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | node_id="pi-rich-trace" | node_attempt="1"
+```
+
+Thinking deltas only:
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | event_kind="assistant.thinking.delta"
+```
+
+Tool execution only (backtick string: `\.` is an invalid escape inside double-quoted LogQL strings):
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | event_kind=~`tool\.execution\..*`
+```
+
+Capture errors only:
+
+```logql
+{service_name="smithers-dev"} | event_kind="capture.error"
+```
+
+Inspect the structured JSON body for one run:
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | json
+```
+
+Redaction proof query:
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" |= "REDACTED"
+```
+
+## API query examples
+
+Equivalent direct Loki API checks:
+
+```bash
+curl -sG 'http://localhost:3100/loki/api/v1/query_range' \
+  --data-urlencode 'query={service_name="smithers-dev"} | run_id="agent-trace-otel-demo"' \
+  --data-urlencode 'limit=200' | jq '.data.result[] | {stream, values: (.values | length)}'
+
+curl -sG 'http://localhost:3100/loki/api/v1/query_range' \
+  --data-urlencode 'query={service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | event_kind="assistant.thinking.delta"' \
+  --data-urlencode 'limit=20' | jq '.data.result[]?.values[]?[1]'
+```
+
+## Tempo trace checks
+
+Tempo search should show Smithers spans once a workflow has run:
+
+```bash
+curl -s http://localhost:3200/api/search | jq .
+curl -s http://localhost:3200/api/search/tags | jq .
+curl -s 'http://localhost:3200/api/search/tag/service.name/values' | jq .
+curl -s 'http://localhost:3200/api/search/tag/runId/values' | jq .
+``` + +Inspect one trace directly: + +```bash +TRACE_ID=$(curl -s http://localhost:3200/api/search \ + | jq -r '.traces[] | select(.rootTraceName=="engine:run-workflow") | .traceID' \ + | head -n 1) + +curl -s http://localhost:3200/api/traces/$TRACE_ID | jq . +``` + +Expected trace attributes include at least: + +- `service.name = smithers-dev` +- `runId = ` +- `workflowPath = ` + +## Verification checklist + +- stack starts successfully in Docker +- Loki is present and queryable +- collector logs pipeline is active +- Pi traces show text deltas, thinking deltas, tool execution lifecycle, final message, usage, and run/node/attempt correlation +- second agent family is exported with truthful `final-only` completeness classification +- Gemini `stream-json` attempts preserve structured deltas truthfully +- Kimi `stream-json` attempts preserve structured tool lifecycle truthfully +- malformed or truncated structured streams emit `capture.error` and classify as `capture-failed` +- artifact write failures emit `capture.warning` and degrade to `partial-observed` without losing durable DB truth +- Tempo search shows Smithers spans and trace attributes including `runId` +- Prometheus is still scraping the collector successfully +- secrets are redacted from canonical events, OTEL log bodies, and persisted trace artifacts + +## Automated coverage + +The fixture-style and workflow-backed tests in `tests/agent-trace.test.tsx` now cover: + +- Pi rich success and malformed JSON failure +- SDK truthful `final-only` +- Claude Code structured stream preservation +- Codex structured delta + usage preservation +- Gemini structured `stream-json` and coarse `cli-json` classification +- Kimi structured `stream-json` tool lifecycle preservation +- truncated structured stream failure classification +- artifact write failure classification +- OTEL body/attribute shaping for Loki query fields diff --git a/docs/guides/monitoring-logs.mdx b/docs/guides/monitoring-logs.mdx index 
9e355de9..ea086c94 100644 --- a/docs/guides/monitoring-logs.mdx +++ b/docs/guides/monitoring-logs.mdx @@ -58,8 +58,9 @@ That starts: - OTLP collector on `http://localhost:4318` - Prometheus on `http://localhost:9090` -- Grafana on `http://localhost:3000` +- Grafana on `http://localhost:3001` - Tempo on `http://localhost:3200` +- Loki on `http://localhost:3100` The collector exports metrics on `:8889`, Prometheus scrapes that endpoint, and Grafana comes pre-provisioned with a Smithers dashboard. diff --git a/observability/docker-compose.otel.yml b/observability/docker-compose.otel.yml index 7dd0e55d..510b3f23 100644 --- a/observability/docker-compose.otel.yml +++ b/observability/docker-compose.otel.yml @@ -2,7 +2,8 @@ version: "3.9" services: otel-collector: - image: otel/opentelemetry-collector-contrib:latest + image: otel/opentelemetry-collector-contrib:0.148.0 + restart: unless-stopped command: ["--config", "/etc/otelcol/config.yml"] volumes: - ./otel-collector-config.yml:/etc/otelcol/config.yml:ro @@ -11,28 +12,71 @@ services: - "4318:4318" # HTTP (Smithers sends here) - "8889:8889" # Prometheus exporter scrape endpoint depends_on: - - tempo + tempo: + condition: service_healthy + loki: + condition: service_healthy + healthcheck: + test: ["CMD", "/otelcol-contrib", "--version"] + interval: 10s + timeout: 5s + retries: 12 + start_period: 10s + + loki: + image: grafana/loki:3.3.2 + restart: unless-stopped + command: ["-config.file=/etc/loki/local-config.yaml"] + volumes: + - ./loki/loki-config.yaml:/etc/loki/local-config.yaml:ro + - ./data/loki:/loki + ports: + - "3100:3100" + healthcheck: + test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:3100/ready"] + interval: 10s + timeout: 5s + retries: 18 + start_period: 10s prometheus: image: prom/prometheus:v3.2.0 + restart: unless-stopped volumes: - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./data/prometheus:/prometheus ports: - "9090:9090" depends_on: - - otel-collector + otel-collector: + 
condition: service_healthy + healthcheck: + test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:9090/-/ready"] + interval: 10s + timeout: 5s + retries: 18 + start_period: 10s tempo: image: grafana/tempo:2.7.0 + restart: unless-stopped command: ["-config.file=/etc/tempo/tempo.yml"] volumes: - ./tempo/tempo.yml:/etc/tempo/tempo.yml:ro + - ./data/tempo:/tmp/tempo ports: - "3200:3200" # Tempo query API - "4317" # OTLP gRPC (internal) + healthcheck: + test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:3200/api/search"] + interval: 10s + timeout: 5s + retries: 24 + start_period: 20s grafana: image: grafana/grafana:11.5.0 + restart: unless-stopped environment: - GF_SECURITY_ADMIN_PASSWORD=admin - GF_AUTH_ANONYMOUS_ENABLED=true @@ -40,8 +84,19 @@ services: volumes: - ./grafana/provisioning:/etc/grafana/provisioning:ro - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + - ./data/grafana:/var/lib/grafana ports: - "3001:3000" depends_on: - - prometheus - - tempo + prometheus: + condition: service_healthy + tempo: + condition: service_healthy + loki: + condition: service_healthy + healthcheck: + test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:3000/api/health"] + interval: 10s + timeout: 5s + retries: 24 + start_period: 20s diff --git a/observability/grafana/provisioning/datasources/datasources.yml b/observability/grafana/provisioning/datasources/datasources.yml index 8b8c6244..3c885539 100644 --- a/observability/grafana/provisioning/datasources/datasources.yml +++ b/observability/grafana/provisioning/datasources/datasources.yml @@ -14,6 +14,15 @@ datasources: - name: traceID datasourceUid: tempo + - name: Loki + type: loki + uid: loki + access: proxy + url: http://loki:3100 + editable: false + jsonData: + maxLines: 1000 + - name: Tempo type: tempo uid: tempo @@ -21,6 +30,17 @@ datasources: url: http://tempo:3200 editable: false jsonData: + tracesToLogs: + datasourceUid: loki + tags: ["service.name"] + mappedTags: + - key: service.name + value: service.name + 
mapTagNamesEnabled: false
+        spanStartTimeShift: "-1h"
+        spanEndTimeShift: "1h"
+        filterByTraceID: false
+        filterBySpanID: false
       tracesToMetrics:
         datasourceUid: prometheus
         spanStartTimeShift: "-1h"
diff --git a/observability/loki/loki-config.yaml b/observability/loki/loki-config.yaml
new file mode 100644
index 00000000..eb20f384
--- /dev/null
+++ b/observability/loki/loki-config.yaml
@@ -0,0 +1,35 @@
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+
+common:
+  instance_addr: 127.0.0.1
+  path_prefix: /loki
+  storage:
+    filesystem:
+      chunks_directory: /loki/chunks
+      rules_directory: /loki/rules
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: tsdb
+      object_store: filesystem
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h
+
+limits_config:
+  allow_structured_metadata: true
+
+ruler:
+  alertmanager_url: http://localhost:9093
+
+analytics:
+  reporting_enabled: false
diff --git a/observability/otel-collector-config.yml b/observability/otel-collector-config.yml
index 68f29b27..daa29aef 100644
--- a/observability/otel-collector-config.yml
+++ b/observability/otel-collector-config.yml
@@ -10,10 +10,12 @@ exporters:
   prometheus:
     endpoint: "0.0.0.0:8889"
     namespace: smithers
-  otlp/tempo:
+  otlp/tempo:
     endpoint: "tempo:4317"
     tls:
       insecure: true
+  otlphttp/loki:
+    endpoint: "http://loki:3100/otlp"
 
 processors:
   batch:
@@ -29,4 +31,8 @@ service:
     traces:
       receivers: [otlp]
       processors: [batch]
-      exporters: [otlp/tempo]
+      exporters: [otlp/tempo]
+    logs:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [otlphttp/loki]
diff --git a/scripts/obs-reset.sh b/scripts/obs-reset.sh
new file mode 100755
index 00000000..1dfdf798
--- /dev/null
+++ b/scripts/obs-reset.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
+COMPOSE_FILE="$ROOT_DIR/observability/docker-compose.otel.yml"
+
+if !
command -v docker >/dev/null 2>&1; then + echo "docker is required" >&2 + exit 1 +fi + +DATA_DIR="$ROOT_DIR/observability/data" + +echo "[obs-reset] bringing stack down and removing volumes..." +docker compose -f "$COMPOSE_FILE" down -v --remove-orphans || true + +echo "[obs-reset] resetting local data dirs..." +rm -rf "$DATA_DIR" +mkdir -p "$DATA_DIR"/loki "$DATA_DIR"/tempo "$DATA_DIR"/prometheus "$DATA_DIR"/grafana +chmod -R 0777 "$DATA_DIR" + +echo "[obs-reset] starting fresh stack..." +docker compose -f "$COMPOSE_FILE" up -d + +echo "[obs-reset] waiting for health..." +"$ROOT_DIR/scripts/obs-wait-healthy.sh" + +echo "[obs-reset] stack is healthy" diff --git a/scripts/obs-wait-healthy.sh b/scripts/obs-wait-healthy.sh new file mode 100755 index 00000000..57587e0a --- /dev/null +++ b/scripts/obs-wait-healthy.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) +COMPOSE_FILE="$ROOT_DIR/observability/docker-compose.otel.yml" +TIMEOUT_SECONDS="${1:-240}" +START_TS=$(date +%s) + +services=(loki tempo otel-collector prometheus grafana) + +status_of() { + local container=$1 + docker inspect "$container" --format '{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}' 2>/dev/null || echo "missing" +} + +container_name() { + local service=$1 + docker compose -f "$COMPOSE_FILE" ps -q "$service" | head -n 1 +} + +echo "[obs-wait] timeout=${TIMEOUT_SECONDS}s" + +while true; do + all_ready=1 + for service in "${services[@]}"; do + cid=$(container_name "$service") + if [[ -z "$cid" ]]; then + echo "[obs-wait] $service: missing" + all_ready=0 + continue + fi + state=$(status_of "$cid") + echo "[obs-wait] $service: $state" + if [[ "$state" != "healthy" && "$state" != "running" ]]; then + all_ready=0 + fi + if [[ "$service" != "otel-collector" && "$state" != "healthy" ]]; then + all_ready=0 + fi + done + + if [[ "$all_ready" == "1" ]]; then + break + fi + + now=$(date +%s) + if (( now - 
START_TS >= TIMEOUT_SECONDS )); then + echo "[obs-wait] timed out waiting for health" >&2 + docker compose -f "$COMPOSE_FILE" ps >&2 || true + exit 1 + fi + + sleep 5 + echo + done + +echo "[obs-wait] endpoint checks" +curl --max-time 10 -fsS http://localhost:3100/ready >/dev/null +curl --max-time 10 -fsS http://localhost:9090/-/ready >/dev/null +curl --max-time 20 -fsS http://localhost:3001/api/health >/dev/null + +# Tempo can report not-ready briefly even when query/search already works, so allow retries. +tempo_ready=0 +for _ in $(seq 1 12); do + if curl --max-time 10 -fsS http://localhost:3200/ready >/dev/null; then + tempo_ready=1 + break + fi + sleep 5 +done + +if [[ "$tempo_ready" != "1" ]]; then + echo "[obs-wait] tempo never became ready" >&2 + exit 1 +fi + +echo "[obs-wait] healthy" diff --git a/scripts/verify-observability.sh b/scripts/verify-observability.sh new file mode 100755 index 00000000..e5353fac --- /dev/null +++ b/scripts/verify-observability.sh @@ -0,0 +1,216 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd) +COMPOSE_FILE="$ROOT_DIR/observability/docker-compose.otel.yml" +STAMP=$(date +%Y%m%d-%H%M%S) +OUT_DIR="$ROOT_DIR/tmp/verification/$STAMP" +WORKFLOW_PATH="$ROOT_DIR/workflows/agent-trace-otel-demo.tsx" +RUN_ID="agent-trace-otel-demo-$STAMP" +FAIL_RUN_ID="agent-trace-otel-demo-fail-$STAMP" +mkdir -p "$OUT_DIR" + +export SMITHERS_OTEL_ENABLED=1 +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 +export OTEL_SERVICE_NAME=smithers-dev +export SMITHERS_LOG_FORMAT=json + +run_success() { + bun run "$ROOT_DIR/src/cli/index.ts" run "$WORKFLOW_PATH" --run-id "$RUN_ID" --annotations '{"custom.demo":true,"custom.ticket":"OBS-123"}' +} + +run_failure() { + bun run "$ROOT_DIR/src/cli/index.ts" run "$WORKFLOW_PATH" --run-id "$FAIL_RUN_ID" --input '{"failureMode":"malformed-json"}' --annotations '{"custom.demo":true,"custom.ticket":"OBS-ERR"}' +} + +retry_until() { + local name=$1 + local attempts=$2 + local validator=$3 + local command=$4 + local delay=2 + local out="$OUT_DIR/$name.json" + + for attempt in $(seq 1 "$attempts"); do + echo "[verify] $name attempt $attempt/$attempts" + eval "$command" > "$out.tmp" + if jq -e "$validator" "$out.tmp" >/dev/null 2>&1; then + mv "$out.tmp" "$out" + return 0 + fi + sleep "$delay" + delay=$((delay * 2)) + done + + echo "[verify] failed: $name" >&2 + cat "$out.tmp" >&2 || true + return 1 +} + +loki_query() { + local query=$1 + local limit=${2:-200} + curl --max-time 30 -sG 'http://localhost:3100/loki/api/v1/query_range' \ + --data-urlencode "query=$query" \ + --data-urlencode "limit=$limit" +} + +tempo_search() { + curl --max-time 30 -s 'http://localhost:3200/api/search' +} + +tempo_trace() { + local trace_id=$1 + curl --max-time 30 -s "http://localhost:3200/api/traces/$trace_id" +} + +append_json() { + local title=$1 + local path=$2 + { + echo + echo "## $title" + echo + echo '```json' + cat "$path" + echo + echo '```' + } >> "$OUT_DIR/report.md" +} + +echo "[verify] reset stack" +"$ROOT_DIR/scripts/obs-reset.sh" + +echo 
"[verify] capture stack state" +docker compose -f "$COMPOSE_FILE" ps > "$OUT_DIR/compose-ps.txt" +curl --max-time 20 -s http://localhost:3001/api/health | jq . > "$OUT_DIR/grafana-health.json" +curl --max-time 20 -s http://localhost:3001/api/datasources | jq 'map({name,type,url,uid})' > "$OUT_DIR/grafana-datasources.json" +curl --max-time 20 -s http://localhost:9090/api/v1/query --data-urlencode 'query=up' | jq . > "$OUT_DIR/prometheus-up.json" + +echo "[verify] run success workflow" +run_success | tee "$OUT_DIR/success-run.txt" + +echo "[verify] run failure workflow" +set +e +run_failure | tee "$OUT_DIR/failure-run.txt" +FAIL_EXIT=$? +set -e +printf '%s\n' "$FAIL_EXIT" > "$OUT_DIR/failure-exit-code.txt" + +sleep 6 + +retry_until loki-all-events 6 '.streams >= 2' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\"' 200 | jq '{streams:(.data.result|length), counts:(.data.result|map(.values|length)), lines:[.data.result[]?.values[]?[1]][0:5]}'" + +retry_until loki-node-attempt 6 '.streams >= 10' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"pi-rich-trace\" | node_attempt=\"1\"' 200 | jq '{streams:(.data.result|length), counts:(.data.result|map(.values|length))}'" + +retry_until loki-claude-structured 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"claude-structured-trace\"' 50 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]][0:10]}'" + +retry_until loki-gemini-structured 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"gemini-structured-trace\"' 50 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]][0:10]}'" + +retry_until loki-codex-structured 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"codex-structured-trace\"' 50 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]][0:10]}'" + 
+retry_until loki-thinking 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | event_kind=\"assistant.thinking.delta\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-tool-start 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | event_kind=\"tool.execution.start\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-tool-update 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | event_kind=\"tool.execution.update\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-tool-end 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | event_kind=\"tool.execution.end\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-sdk-final-only 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"sdk-final-only\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-claude-text-delta 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"claude-structured-trace\" | event_kind=\"assistant.text.delta\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-gemini-text-delta 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"gemini-structured-trace\" | event_kind=\"assistant.text.delta\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-claude-usage 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"claude-structured-trace\" | event_kind=\"usage\"' 20 | jq '{streams:(.data.result|length), 
lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-gemini-usage 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"gemini-structured-trace\" | event_kind=\"usage\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-codex-text-delta 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"codex-structured-trace\" | event_kind=\"assistant.text.delta\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-codex-usage 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"codex-structured-trace\" | event_kind=\"usage\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-capture-errors 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$FAIL_RUN_ID\" | event_kind=\"capture.error\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-redaction-presence 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" |= \"REDACTED_SECRET\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-secret-absence 6 '.streams == 0' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" |= \"sk_demo_secret_123456789\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until tempo-search 6 '.smithers | length >= 1' \ + "tempo_search | jq '{smithers:[.traces[] | select(.rootServiceName==\"smithers-dev\") | {traceID, rootServiceName, rootTraceName}]}'" + +TRACE_ID="" +for candidate in $(jq -r '.smithers[]?.traceID' "$OUT_DIR/tempo-search.json"); do + if tempo_trace "$candidate" \ + | jq '{resourceAttrs:[.batches[].resource.attributes], spans:[.batches[].scopeSpans[].spans[] | {name, 
attrs:.attributes}]}' > "$OUT_DIR/tempo-trace-candidate.json.tmp"; then + if jq -e ".spans | any(.attrs | any(.key==\"runId\" and .value.stringValue==\"$RUN_ID\"))" "$OUT_DIR/tempo-trace-candidate.json.tmp" >/dev/null 2>&1; then + TRACE_ID="$candidate" + mv "$OUT_DIR/tempo-trace-candidate.json.tmp" "$OUT_DIR/tempo-trace-with-runid.json" + break + fi + fi +done + +if [[ -z "$TRACE_ID" ]]; then + echo "[verify] failed: unable to find Tempo trace for success run id $RUN_ID" >&2 + exit 1 +fi + +cat > "$OUT_DIR/report.md" <; }; diff --git a/src/SmithersEvent.ts b/src/SmithersEvent.ts index 6b25e4ff..d69a79c0 100644 --- a/src/SmithersEvent.ts +++ b/src/SmithersEvent.ts @@ -1,4 +1,5 @@ import type { RunStatus } from "./RunStatus"; +import type { AgentTraceSummary, CanonicalAgentTraceEvent } from "./agent-trace"; export type SmithersEvent = | { type: "RunStarted"; runId: string; timestampMs: number } @@ -194,4 +195,22 @@ export type SmithersEvent = cacheWriteTokens?: number; reasoningTokens?: number; timestampMs: number; + } + | { + type: "AgentTraceEvent"; + runId: string; + nodeId: string; + iteration: number; + attempt: number; + trace: CanonicalAgentTraceEvent; + timestampMs: number; + } + | { + type: "AgentTraceSummary"; + runId: string; + nodeId: string; + iteration: number; + attempt: number; + summary: AgentTraceSummary; + timestampMs: number; }; diff --git a/src/agent-trace.ts b/src/agent-trace.ts new file mode 100644 index 00000000..2544906c --- /dev/null +++ b/src/agent-trace.ts @@ -0,0 +1,1675 @@ +import { appendFile, mkdir, writeFile } from "node:fs/promises"; +import { join } from "node:path"; +import type { SmithersEvent } from "./SmithersEvent"; +import type { EventBus } from "./events"; +import { logErrorAwait, logInfoAwait, logWarningAwait } from "./effect/logging"; +import { nowMs } from "./utils/time"; + +export type AgentFamily = + | "pi" + | "codex" + | "claude-code" + | "gemini" + | "kimi" + | "openai" + | "anthropic" + | "amp" + | "forge" + | 
"unknown"; + +export type AgentCaptureMode = + | "sdk-events" + | "rpc-events" + | "cli-json-stream" + | "cli-json" + | "cli-text" + | "artifact-import"; + +export type TraceCompleteness = + | "full-observed" + | "partial-observed" + | "final-only" + | "capture-failed"; + +export type CanonicalAgentTraceEventKind = + | "session.start" + | "session.end" + | "turn.start" + | "turn.end" + | "message.start" + | "message.update" + | "message.end" + | "assistant.text.delta" + | "assistant.thinking.delta" + | "assistant.message.final" + | "tool.execution.start" + | "tool.execution.update" + | "tool.execution.end" + | "tool.result" + | "retry.start" + | "retry.end" + | "compaction.start" + | "compaction.end" + | "stderr" + | "stdout" + | "usage" + | "capture.warning" + | "capture.error" + | "artifact.created"; + +export type CanonicalAgentTraceEvent = { + traceVersion: "1"; + runId: string; + workflowPath?: string; + workflowHash?: string; + nodeId: string; + iteration: number; + attempt: number; + timestampMs: number; + event: { + sequence: number; + kind: CanonicalAgentTraceEventKind; + phase: + | "agent" + | "turn" + | "message" + | "tool" + | "session" + | "capture" + | "artifact"; + }; + source: { + agentFamily: AgentFamily; + captureMode: AgentCaptureMode; + rawType?: string; + observed: boolean; + }; + traceCompleteness: TraceCompleteness; + payload: Record | null; + raw: unknown; + redaction: { + applied: boolean; + ruleIds: string[]; + }; + annotations: Record; +}; + +export type AgentTraceSummary = { + traceVersion: "1"; + runId: string; + workflowPath?: string; + workflowHash?: string; + nodeId: string; + iteration: number; + attempt: number; + traceStartedAtMs: number; + traceFinishedAtMs: number; + agentFamily: AgentFamily; + agentId?: string; + model?: string; + captureMode: AgentCaptureMode; + traceCompleteness: TraceCompleteness; + unsupportedEventKinds: CanonicalAgentTraceEventKind[]; + missingExpectedEventKinds: CanonicalAgentTraceEventKind[]; + 
rawArtifactRefs: string[]; +}; + +export type AgentTraceSmithersEvent = { + type: "AgentTraceEvent"; + runId: string; + nodeId: string; + iteration: number; + attempt: number; + trace: CanonicalAgentTraceEvent; + timestampMs: number; +}; + +export type AgentTraceSummarySmithersEvent = { + type: "AgentTraceSummary"; + runId: string; + nodeId: string; + iteration: number; + attempt: number; + summary: AgentTraceSummary; + timestampMs: number; +}; + +export type AgentTraceCapabilityProfile = { + sessionMetadata: boolean; + assistantTextDeltas: boolean; + visibleThinkingDeltas: boolean; + finalAssistantMessage: boolean; + toolExecutionStart: boolean; + toolExecutionUpdate: boolean; + toolExecutionEnd: boolean; + retryEvents: boolean; + compactionEvents: boolean; + rawStderrDiagnostics: boolean; + persistedSessionArtifact: boolean; +}; + +export const agentTraceCapabilities: Record< + AgentFamily, + AgentTraceCapabilityProfile +> = { + pi: { + sessionMetadata: true, + assistantTextDeltas: true, + visibleThinkingDeltas: true, + finalAssistantMessage: true, + toolExecutionStart: true, + toolExecutionUpdate: true, + toolExecutionEnd: true, + retryEvents: true, + compactionEvents: true, + rawStderrDiagnostics: true, + persistedSessionArtifact: true, + }, + codex: { + sessionMetadata: false, + assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: false, + toolExecutionUpdate: false, + toolExecutionEnd: false, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: true, + persistedSessionArtifact: false, + }, + "claude-code": { + sessionMetadata: false, + assistantTextDeltas: true, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: false, + toolExecutionUpdate: false, + toolExecutionEnd: false, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: true, + persistedSessionArtifact: false, + }, + gemini: { + sessionMetadata: false, + 
assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: false, + toolExecutionUpdate: false, + toolExecutionEnd: false, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: true, + persistedSessionArtifact: false, + }, + kimi: { + sessionMetadata: false, + assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: false, + toolExecutionUpdate: false, + toolExecutionEnd: false, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: true, + persistedSessionArtifact: false, + }, + openai: { + sessionMetadata: false, + assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: true, + toolExecutionUpdate: false, + toolExecutionEnd: true, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: false, + persistedSessionArtifact: false, + }, + anthropic: { + sessionMetadata: false, + assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: true, + toolExecutionUpdate: false, + toolExecutionEnd: true, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: false, + persistedSessionArtifact: false, + }, + amp: { + sessionMetadata: false, + assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: true, + toolExecutionUpdate: false, + toolExecutionEnd: true, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: true, + persistedSessionArtifact: false, + }, + forge: { + sessionMetadata: false, + assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: true, + toolExecutionUpdate: false, + toolExecutionEnd: true, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: true, + persistedSessionArtifact: false, + }, + unknown: { + sessionMetadata: 
false, + assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: true, + toolExecutionUpdate: false, + toolExecutionEnd: true, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: true, + persistedSessionArtifact: false, + }, +}; + +const capabilityKindMap: Array< + [keyof AgentTraceCapabilityProfile, CanonicalAgentTraceEventKind[]] +> = [ + ["sessionMetadata", ["session.start", "session.end"]], + ["assistantTextDeltas", ["assistant.text.delta"]], + ["visibleThinkingDeltas", ["assistant.thinking.delta"]], + ["finalAssistantMessage", ["assistant.message.final"]], + ["toolExecutionStart", ["tool.execution.start"]], + ["toolExecutionUpdate", ["tool.execution.update"]], + ["toolExecutionEnd", ["tool.execution.end", "tool.result"]], + ["retryEvents", ["retry.start", "retry.end"]], + ["compactionEvents", ["compaction.start", "compaction.end"]], + ["rawStderrDiagnostics", ["stderr"]], + ["persistedSessionArtifact", ["artifact.created"]], +]; + +export function resolveAgentTraceCapabilities( + agentFamily: AgentFamily, + captureMode: AgentCaptureMode, +): AgentTraceCapabilityProfile { + const base = { + ...agentTraceCapabilities[agentFamily], + // Smithers persists a canonical NDJSON trace artifact for every successful + // flush regardless of the upstream agent family. 
+ persistedSessionArtifact: true, + }; + + if (captureMode === "sdk-events" || captureMode === "cli-text") { + return base; + } + + if (agentFamily === "codex") { + return { + ...base, + assistantTextDeltas: captureMode === "cli-json-stream", + toolExecutionStart: captureMode === "cli-json-stream", + toolExecutionUpdate: captureMode === "cli-json-stream", + toolExecutionEnd: captureMode === "cli-json-stream", + }; + } + + if (agentFamily === "claude-code") { + return { + ...base, + toolExecutionStart: captureMode === "cli-json-stream", + toolExecutionUpdate: captureMode === "cli-json-stream", + toolExecutionEnd: captureMode === "cli-json-stream", + }; + } + + if (agentFamily === "gemini") { + return { + ...base, + assistantTextDeltas: captureMode === "cli-json-stream", + toolExecutionStart: captureMode === "cli-json-stream", + toolExecutionUpdate: captureMode === "cli-json-stream", + toolExecutionEnd: captureMode === "cli-json-stream", + }; + } + + if (agentFamily === "kimi") { + return { + ...base, + assistantTextDeltas: captureMode === "cli-json-stream", + toolExecutionStart: captureMode === "cli-json-stream", + toolExecutionUpdate: captureMode === "cli-json-stream", + toolExecutionEnd: captureMode === "cli-json-stream", + }; + } + + return base; +} + +function unsupportedKindsForCapabilities( + profile: AgentTraceCapabilityProfile, +): CanonicalAgentTraceEventKind[] { + const kinds: CanonicalAgentTraceEventKind[] = []; + for (const [field, mappedKinds] of capabilityKindMap) { + if (!profile[field]) kinds.push(...mappedKinds); + } + return kinds; +} + +export function detectAgentFamily(agent: any): AgentFamily { + const constructorName = String(agent?.constructor?.name ?? "").toLowerCase(); + const idName = String(agent?.id ?? "").toLowerCase(); + const name = + constructorName && constructorName !== "object" + ? 
`${constructorName} ${idName}` + : idName; + if (name.includes("pi")) return "pi"; + if (name.includes("codex")) return "codex"; + if (name.includes("claude")) return "claude-code"; + if (name.includes("gemini")) return "gemini"; + if (name.includes("kimi")) return "kimi"; + if (name.includes("openai")) return "openai"; + if (name.includes("anthropic")) return "anthropic"; + if (name.includes("amp")) return "amp"; + if (name.includes("forge")) return "forge"; + return "unknown"; +} + +export function detectCaptureMode(agent: any): AgentCaptureMode { + const family = detectAgentFamily(agent); + const mode = agent?.opts?.mode ?? agent?.mode; + if (family === "pi") { + if (mode === "rpc") return "rpc-events"; + if (mode === "json") return "cli-json-stream"; + return "cli-text"; + } + const outputFormat = agent?.opts?.outputFormat ?? agent?.outputFormat; + if (family === "openai" || family === "anthropic") return "sdk-events"; + if (outputFormat === "stream-json") return "cli-json-stream"; + if (outputFormat === "json" || agent?.opts?.json) return "cli-json"; + return "cli-text"; +} + +function normalizeAnnotations( + annotations?: Record, +): Record { + const normalized: Record = {}; + for (const [key, value] of Object.entries(annotations ?? 
{})) { + if (["string", "number", "boolean"].includes(typeof value)) { + normalized[key] = value as string | number | boolean; + } + } + return normalized; +} + +function redactValue(value: unknown): { + value: unknown; + applied: boolean; + ruleIds: string[]; +} { + const rules: Array<{ id: string; pattern: RegExp; replace: string }> = [ + { + id: "api-key", + pattern: /\b(?:sk|pk)_[A-Za-z0-9_-]{8,}\b/g, + replace: "[REDACTED_API_KEY]", + }, + { + id: "bearer-token", + pattern: /Bearer\s+[A-Za-z0-9._-]{8,}/gi, + replace: "Bearer [REDACTED_TOKEN]", + }, + { + id: "auth-header", + pattern: /"authorization"\s*:\s*"[^"]+"/gi, + replace: '"authorization":"[REDACTED]"', + }, + { + id: "cookie-header", + pattern: /"cookie"\s*:\s*"[^"]+"/gi, + replace: '"cookie":"[REDACTED]"', + }, + { + id: "secret-ish", + pattern: /\b(?:api[_-]?key|token|secret|password)=([^\s"']+)/gi, + replace: "$&" as unknown as string, + }, + ]; + const input = + typeof value === "string" ? value : JSON.stringify(value ?? 
null); + let next = input; + const applied = new Set(); + for (const rule of rules) { + const replaced = next.replace(rule.pattern, (match) => { + applied.add(rule.id); + if (rule.id === "secret-ish") { + const idx = match.indexOf("="); + return `${match.slice(0, idx + 1)}[REDACTED_SECRET]`; + } + return rule.replace; + }); + next = replaced; + } + if (applied.size === 0) return { value, applied: false, ruleIds: [] }; + if (typeof value === "string") + return { value: next, applied: true, ruleIds: [...applied] }; + try { + return { value: JSON.parse(next), applied: true, ruleIds: [...applied] }; + } catch { + return { value: next, applied: true, ruleIds: [...applied] }; + } +} + +function kindPhase( + kind: CanonicalAgentTraceEventKind, +): CanonicalAgentTraceEvent["event"]["phase"] { + if (kind.startsWith("session.")) return "session"; + if (kind.startsWith("turn.")) return "turn"; + if (kind.startsWith("message.") || kind.startsWith("assistant.")) + return "message"; + if (kind.startsWith("tool.")) return "tool"; + if (kind.startsWith("artifact.")) return "artifact"; + return "capture"; +} + +export function canonicalTraceEventToOtelLogRecord( + event: CanonicalAgentTraceEvent, + context?: { agentId?: string; model?: string }, +): { + body: string; + attributes: Record; + severity: "INFO" | "WARN" | "ERROR"; +} { + const attributes: Record = { + "smithers.trace.version": event.traceVersion, + "run.id": event.runId, + "workflow.path": event.workflowPath, + "workflow.hash": event.workflowHash, + "node.id": event.nodeId, + "node.iteration": event.iteration, + "node.attempt": event.attempt, + "agent.family": event.source.agentFamily, + "agent.id": context?.agentId, + "agent.model": context?.model, + "agent.capture_mode": event.source.captureMode, + "trace.completeness": event.traceCompleteness, + "event.kind": event.event.kind, + "event.phase": event.event.phase, + "event.sequence": event.event.sequence, + "source.raw_type": event.source.rawType, + "source.observed": 
event.source.observed, + }; + for (const [key, value] of Object.entries(event.annotations)) { + attributes[key.startsWith("custom.") ? key : `custom.${key}`] = value; + } + + const severity = + event.event.kind === "capture.error" + ? "ERROR" + : event.event.kind === "capture.warning" || event.event.kind === "stderr" + ? "WARN" + : "INFO"; + + return { + body: JSON.stringify({ + payload: event.payload, + raw: event.raw, + redaction: event.redaction, + annotations: event.annotations, + }), + attributes, + severity, + }; +} + +type TraceCollectorOptions = { + eventBus: EventBus; + runId: string; + workflowPath?: string | null; + workflowHash?: string | null; + nodeId: string; + iteration: number; + attempt: number; + agent: any; + agentId?: string; + model?: string; + logDir?: string; + annotations?: Record; +}; + +export class AgentTraceCollector { + private readonly eventBus: EventBus; + private readonly runId: string; + private readonly workflowPath?: string; + private readonly workflowHash?: string; + private readonly nodeId: string; + private readonly iteration: number; + private readonly attempt: number; + private readonly agentFamily: AgentFamily; + private readonly captureMode: AgentCaptureMode; + private readonly agentId?: string; + private readonly model?: string; + private readonly annotations: Record; + private readonly logDir?: string; + private readonly capabilities: AgentTraceCapabilityProfile; + private readonly startedAtMs = nowMs(); + private readonly events: CanonicalAgentTraceEvent[] = []; + private readonly rawArtifactRefs: string[] = []; + private readonly seenKinds = new Set(); + private readonly expectedKinds = new Set(); + private readonly failures: string[] = []; + private readonly warnings: string[] = []; + private sequence = 0; + private stdoutBuffer = ""; + private stderrBuffer = ""; + private assistantTextBuffer = ""; + private finalText: string | null = null; + private listener?: (event: SmithersEvent) => void; + + constructor(opts: 
TraceCollectorOptions) { + this.eventBus = opts.eventBus; + this.runId = opts.runId; + this.workflowPath = opts.workflowPath ?? undefined; + this.workflowHash = opts.workflowHash ?? undefined; + this.nodeId = opts.nodeId; + this.iteration = opts.iteration; + this.attempt = opts.attempt; + this.agentFamily = detectAgentFamily(opts.agent); + this.captureMode = detectCaptureMode(opts.agent); + this.capabilities = resolveAgentTraceCapabilities( + this.agentFamily, + this.captureMode, + ); + this.agentId = opts.agentId; + this.model = opts.model; + this.annotations = normalizeAnnotations(opts.annotations); + this.logDir = opts.logDir; + + const profile = this.capabilities; + if (profile.sessionMetadata && this.agentFamily === "pi") { + this.expectedKinds.add("session.start"); + this.expectedKinds.add("session.end"); + this.expectedKinds.add("turn.start"); + this.expectedKinds.add("turn.end"); + } + if (profile.finalAssistantMessage) + this.expectedKinds.add("assistant.message.final"); + } + + begin() { + this.listener = (event) => this.observeSmithersEvent(event); + this.eventBus.on("event", this.listener); + } + + endListener() { + if (this.listener) this.eventBus.off("event", this.listener); + this.listener = undefined; + } + + onStdout(text: string) { + this.processChunk("stdout", text); + } + + onStderr(text: string) { + this.processChunk("stderr", text); + } + + observeResult(result: any) { + const text = String(result?.text ?? "").trim(); + if ( + text && + (!this.finalText || + (!this.seenKinds.has("assistant.text.delta") && + !this.seenKinds.has("assistant.message.final"))) + ) { + this.finalText = text; + } + if (this.captureMode === "sdk-events" && text) { + this.push( + "assistant.message.final", + { text }, + text, + false, + undefined, + true, + ); + } + const usage = result?.usage ?? result?.totalUsage; + if (this.hasMeaningfulUsage(usage) && !this.seenKinds.has("usage")) { + this.push( + "usage", + { + inputTokens: usage.inputTokens ?? 
usage.promptTokens ?? 0, + outputTokens: usage.outputTokens ?? usage.completionTokens ?? 0, + cacheReadTokens: + usage.inputTokenDetails?.cacheReadTokens ?? + usage.cacheReadTokens ?? + undefined, + cacheWriteTokens: + usage.inputTokenDetails?.cacheWriteTokens ?? + usage.cacheWriteTokens ?? + undefined, + reasoningTokens: + usage.outputTokenDetails?.reasoningTokens ?? + usage.reasoningTokens ?? + undefined, + }, + usage, + false, + "usage", + true, + ); + } + } + + observeError(error: unknown) { + this.failures.push(error instanceof Error ? error.message : String(error)); + this.push( + "capture.error", + { error: this.failures.at(-1) }, + { error: this.failures.at(-1) }, + false, + "error", + true, + ); + } + + async flush() { + this.endListener(); + const finishedAtMs = nowMs(); + this.flushStructuredBuffers(); + if ( + this.captureMode !== "sdk-events" && + !this.seenKinds.has("assistant.message.final") && + this.finalText + ) { + this.push( + "assistant.message.final", + { text: this.finalText }, + this.finalText, + false, + undefined, + true, + ); + } + if ( + (this.captureMode === "cli-json-stream" || + this.captureMode === "rpc-events") && + this.events.length > 0 && + !this.seenKinds.has("assistant.message.final") && + this.failures.length === 0 + ) { + this.warnings.push( + "structured stream ended without a terminal assistant message", + ); + this.push( + "capture.warning", + { reason: "missing-terminal-event" }, + { reason: "missing-terminal-event" }, + false, + "capture", + true, + ); + } + + let traceCompleteness = this.resolveCompleteness(); + let missingExpectedEventKinds = [...this.expectedKinds].filter( + (kind) => !this.seenKinds.has(kind), + ); + let summary: AgentTraceSummary = { + traceVersion: "1", + runId: this.runId, + workflowPath: this.workflowPath, + workflowHash: this.workflowHash, + nodeId: this.nodeId, + iteration: this.iteration, + attempt: this.attempt, + traceStartedAtMs: this.startedAtMs, + traceFinishedAtMs: finishedAtMs, + 
agentFamily: this.agentFamily, + agentId: this.agentId, + model: this.model, + captureMode: this.captureMode, + traceCompleteness, + unsupportedEventKinds: unsupportedKindsForCapabilities( + this.capabilities, + ).filter((kind) => !this.seenKinds.has(kind)), + missingExpectedEventKinds, + rawArtifactRefs: this.rawArtifactRefs, + }; + + const persistedArtifact = await this.persistNdjson(summary); + if (persistedArtifact.ok && persistedArtifact.file) { + const artifactPath = persistedArtifact.file; + this.rawArtifactRefs.push(artifactPath); + this.push( + "artifact.created", + { + artifactKind: "agent-trace.ndjson", + artifactPath, + contentType: "application/x-ndjson", + }, + { + artifactKind: "agent-trace.ndjson", + artifactPath, + contentType: "application/x-ndjson", + }, + false, + "artifact", + true, + ); + summary = { ...summary, rawArtifactRefs: [...this.rawArtifactRefs] }; + await this.rewriteNdjson(artifactPath, summary); + } else if (!persistedArtifact.ok) { + this.warnings.push(persistedArtifact.error); + this.push( + "capture.warning", + { reason: "artifact-write-failed", error: persistedArtifact.error }, + { reason: "artifact-write-failed", error: persistedArtifact.error }, + false, + "artifact", + true, + ); + traceCompleteness = this.resolveCompleteness(); + missingExpectedEventKinds = [...this.expectedKinds].filter( + (kind) => !this.seenKinds.has(kind), + ); + summary = { + ...summary, + traceCompleteness, + missingExpectedEventKinds, + rawArtifactRefs: [...this.rawArtifactRefs], + }; + } + + for (const event of this.events) { + event.traceCompleteness = traceCompleteness; + const smithersEvent: AgentTraceSmithersEvent = { + type: "AgentTraceEvent", + runId: this.runId, + nodeId: this.nodeId, + iteration: this.iteration, + attempt: this.attempt, + trace: event, + timestampMs: event.timestampMs, + }; + await this.eventBus.emitEventQueued( + smithersEvent as unknown as SmithersEvent, + ); + const record = canonicalTraceEventToOtelLogRecord(event, { + 
agentId: this.agentId, + model: this.model, + }); + if (record.severity === "ERROR") { + await logErrorAwait(record.body, record.attributes, "agent-trace"); + } else if (record.severity === "WARN") { + await logWarningAwait(record.body, record.attributes, "agent-trace"); + } else { + await logInfoAwait(record.body, record.attributes, "agent-trace"); + } + } + + await this.eventBus.emitEventQueued({ + type: "AgentTraceSummary", + runId: this.runId, + nodeId: this.nodeId, + iteration: this.iteration, + attempt: this.attempt, + summary, + timestampMs: finishedAtMs, + } as unknown as SmithersEvent); + } + + private processChunk(stream: "stdout" | "stderr", text: string) { + if (stream === "stderr") { + this.stderrBuffer += text; + this.push("stderr", { text }, text, true, stream, true); + return; + } + this.stdoutBuffer += text; + if (this.captureMode === "cli-text" || this.captureMode === "sdk-events") { + this.push("stdout", { text }, text, true, stream, true); + return; + } + const lines = this.stdoutBuffer.split(/\r?\n/); + this.stdoutBuffer = lines.pop() ?? 
""; + for (const line of lines) { + if (!line.trim()) continue; + this.processStructuredStdoutLine(line); + } + } + + private flushStructuredBuffers() { + if (this.captureMode === "cli-text" || this.captureMode === "sdk-events") { + this.stdoutBuffer = ""; + this.stderrBuffer = ""; + return; + } + + const line = this.stdoutBuffer.trim(); + this.stdoutBuffer = ""; + this.stderrBuffer = ""; + if (!line) return; + + this.failures.push(`truncated structured stream: ${line.slice(0, 200)}`); + this.push( + "capture.error", + { reason: "truncated-json-stream", linePreview: line.slice(0, 200) }, + line, + true, + "stdout", + true, + ); + } + + private processStructuredStdoutLine(line: string) { + let parsed: any; + try { + parsed = JSON.parse(line); + } catch { + this.failures.push(`malformed upstream JSON: ${line.slice(0, 200)}`); + this.push( + "capture.error", + { linePreview: line.slice(0, 200), reason: "malformed-json" }, + line, + true, + "stdout", + true, + ); + return; + } + if (this.agentFamily === "pi") { + this.processPiEvent(parsed); + return; + } + this.processGenericStructuredEvent(parsed); + } + + private processGenericStructuredEvent(parsed: any) { + const rawType = + typeof parsed?.type === "string" ? parsed.type : "structured"; + + if (this.agentFamily === "claude-code" && rawType === "assistant") { + const text = this.extractGenericMessageText(parsed?.message ?? 
parsed); + if (typeof text === "string" && text) { + this.appendAssistantText(text); + this.push( + "assistant.text.delta", + { text }, + parsed, + true, + rawType, + true, + ); + } else { + this.push("stdout", { eventType: rawType }, parsed, true, rawType, true); + } + const usage = parsed?.message?.usage; + if (usage && !this.seenKinds.has("usage")) { + this.push("usage", usage, parsed, true, rawType, true); + } + return; + } + + if (this.agentFamily === "claude-code" && rawType === "result") { + const usage = parsed?.usage; + if (usage && !this.seenKinds.has("usage")) { + this.push("usage", usage, parsed, true, rawType, true); + } + const text = this.extractGenericMessageText(parsed); + if (typeof text === "string" && text) { + this.setFinalAssistantText(text); + } + return; + } + + if (this.agentFamily === "gemini" && rawType === "message") { + const role = parsed?.role; + const text = this.extractGenericMessageText(parsed); + if (role === "assistant" && typeof text === "string" && text) { + if (parsed?.delta) { + this.appendAssistantText(text); + } else { + this.setFinalAssistantText(text); + } + this.push( + parsed?.delta ? "assistant.text.delta" : "assistant.message.final", + { text }, + parsed, + true, + rawType, + true, + ); + return; + } + } + + if (this.agentFamily === "gemini" && rawType === "result" && parsed?.stats) { + const usage = { + inputTokens: parsed.stats.input_tokens ?? 
parsed.stats.input, + outputTokens: parsed.stats.output_tokens, + totalTokens: parsed.stats.total_tokens, + }; + if (this.hasMeaningfulUsage(usage) && !this.seenKinds.has("usage")) { + this.push("usage", usage, parsed, true, rawType, true); + } + return; + } + + if ( + ["message_start", "assistant_message_start", "response.started"].includes( + rawType, + ) + ) { + this.push( + "message.start", + this.extractGenericMessagePayload(parsed), + parsed, + true, + rawType, + true, + ); + return; + } + + if ( + [ + "message_delta", + "assistant_message.delta", + "assistant_message_delta", + "response.output_text.delta", + "content_block_delta", + ].includes(rawType) + ) { + const text = this.extractGenericDeltaText(parsed); + if (typeof text === "string" && text) { + this.appendAssistantText(text); + this.push( + "assistant.text.delta", + { text }, + parsed, + true, + rawType, + true, + ); + return; + } + } + + if ( + [ + "thinking_delta", + "reasoning_delta", + "response.reasoning.delta", + ].includes(rawType) + ) { + const text = this.extractGenericDeltaText(parsed); + if (typeof text === "string" && text) { + this.push( + "assistant.thinking.delta", + { text }, + parsed, + true, + rawType, + true, + ); + return; + } + } + + if ( + [ + "message_end", + "assistant_message_end", + "response.completed", + "message_stop", + ].includes(rawType) + ) { + this.push( + "message.end", + this.extractGenericMessagePayload(parsed), + parsed, + true, + rawType, + true, + ); + const text = this.extractGenericMessageText(parsed); + if (text) { + this.setFinalAssistantText(text); + this.push( + "assistant.message.final", + { text }, + parsed, + true, + rawType, + true, + ); + } + if (parsed?.usage) { + this.push("usage", parsed.usage, parsed, true, rawType, true); + } + return; + } + + if ( + ["tool_call_start", "tool_execution_start", "tool_call.started"].includes( + rawType, + ) + ) { + this.push( + "tool.execution.start", + this.extractGenericToolPayload(parsed), + parsed, + true, + 
rawType, + true, + ); + return; + } + + if ( + [ + "tool_call_delta", + "tool_call_update", + "tool_execution_update", + "tool_call.delta", + ].includes(rawType) + ) { + this.push( + "tool.execution.update", + this.extractGenericToolPayload(parsed), + parsed, + true, + rawType, + true, + ); + return; + } + + if ( + ["tool_call_end", "tool_execution_end", "tool_call.completed"].includes( + rawType, + ) + ) { + this.push( + "tool.execution.end", + this.extractGenericToolPayload(parsed), + parsed, + true, + rawType, + true, + ); + return; + } + + if (["tool_result", "tool_result.completed"].includes(rawType)) { + this.push( + "tool.result", + this.extractGenericToolPayload(parsed), + parsed, + true, + rawType, + true, + ); + return; + } + + if (rawType === "turn.completed" && parsed?.usage) { + this.push("usage", parsed.usage, parsed, true, rawType, true); + const text = this.extractGenericMessageText(parsed); + if (text) { + this.setFinalAssistantText(text); + this.push( + "assistant.message.final", + { text }, + parsed, + true, + rawType, + true, + ); + } + return; + } + + this.push("stdout", { eventType: rawType }, parsed, true, rawType, true); + } + + private processPiEvent(parsed: any) { + const type = String(parsed?.type ?? 
"unknown"); + switch (type) { + case "session": + case "agent_start": + this.push( + "session.start", + this.extractPiPayload(parsed), + parsed, + true, + type, + true, + ); + return; + case "agent_end": + this.push( + "session.end", + this.extractPiPayload(parsed), + parsed, + true, + type, + true, + ); + return; + case "turn_start": + this.push( + "turn.start", + this.extractPiPayload(parsed), + parsed, + true, + type, + true, + ); + return; + case "turn_end": { + this.push( + "turn.end", + this.extractPiPayload(parsed), + parsed, + true, + type, + true, + ); + const finalText = this.extractText(parsed?.message); + if (finalText) { + this.setFinalAssistantText(finalText); + this.push( + "assistant.message.final", + { text: finalText }, + parsed?.message, + true, + type, + true, + ); + } + if (parsed?.message?.usage) { + this.push( + "usage", + parsed.message.usage, + parsed.message.usage, + true, + "usage", + true, + ); + } + return; + } + case "message_start": + this.push( + "message.start", + this.extractPiPayload(parsed), + parsed, + true, + type, + true, + ); + return; + case "message_end": { + this.push( + "message.end", + this.extractPiPayload(parsed), + parsed, + true, + type, + true, + ); + const finalText = this.extractText(parsed?.message); + if (parsed?.message?.role === "assistant" && finalText) { + this.setFinalAssistantText(finalText); + this.push( + "assistant.message.final", + { text: finalText }, + parsed?.message, + true, + type, + true, + ); + } + return; + } + case "message_update": { + const evt = parsed?.assistantMessageEvent; + if (evt?.type === "text_delta" && typeof evt.delta === "string") { + this.appendAssistantText(evt.delta); + this.push( + "assistant.text.delta", + { text: evt.delta }, + parsed, + true, + evt.type, + true, + ); + return; + } + if ( + (evt?.type === "thinking_delta" || evt?.type === "reasoning_delta") && + typeof evt.delta === "string" + ) { + this.push( + "assistant.thinking.delta", + { text: evt.delta }, + parsed, + 
true, + evt.type, + true, + ); + return; + } + this.push( + "message.update", + this.extractPiPayload(parsed), + parsed, + true, + type, + true, + ); + return; + } + case "tool_execution_start": + this.push( + "tool.execution.start", + this.extractPiToolPayload(parsed), + parsed, + true, + type, + true, + ); + return; + case "tool_execution_update": + this.push( + "tool.execution.update", + this.extractPiToolPayload(parsed), + parsed, + true, + type, + true, + ); + return; + case "tool_execution_end": + this.push( + "tool.execution.end", + this.extractPiToolPayload(parsed), + parsed, + true, + type, + true, + ); + return; + case "auto_compaction_start": + this.push( + "compaction.start", + this.extractPiPayload(parsed), + parsed, + true, + type, + true, + ); + return; + case "auto_compaction_end": + this.push( + "compaction.end", + this.extractPiPayload(parsed), + parsed, + true, + type, + true, + ); + return; + case "auto_retry_start": + this.push( + "retry.start", + this.extractPiPayload(parsed), + parsed, + true, + type, + true, + ); + return; + case "auto_retry_end": + this.push( + "retry.end", + this.extractPiPayload(parsed), + parsed, + true, + type, + true, + ); + return; + default: + this.push("stdout", { eventType: type }, parsed, true, type, true); + return; + } + } + + private extractGenericDeltaText(parsed: any): string | undefined { + const candidates = [ + parsed?.delta?.text, + parsed?.delta, + parsed?.text, + parsed?.content_block?.text, + parsed?.contentBlock?.text, + parsed?.message?.text, + parsed?.message?.content, + parsed?.output_text, + ]; + for (const candidate of candidates) { + if (typeof candidate === "string" && candidate) return candidate; + } + return undefined; + } + + private extractGenericMessageText(parsed: any): string | undefined { + const message = + parsed?.message ?? parsed?.response ?? parsed?.item ?? 
parsed; + if (typeof message?.text === "string") return message.text; + if (typeof message?.content === "string") return message.content; + if (typeof parsed?.result === "string") return parsed.result; + if (typeof parsed?.text === "string") return parsed.text; + if (Array.isArray(message?.content)) { + const text = message.content + .map((part: any) => { + if (typeof part === "string") return part; + if (typeof part?.text === "string") return part.text; + if (typeof part?.content === "string") return part.content; + if (typeof part?.output_text === "string") return part.output_text; + return ""; + }) + .join(""); + if (text) return text; + } + if (Array.isArray(parsed?.output)) { + const text = parsed.output + .map((part: any) => { + if (typeof part?.text === "string") return part.text; + if (typeof part?.content === "string") return part.content; + if (typeof part?.output_text === "string") return part.output_text; + return ""; + }) + .join(""); + if (text) return text; + } + return undefined; + } + + private appendAssistantText(text: string) { + this.assistantTextBuffer += text; + this.finalText = this.assistantTextBuffer; + } + + private setFinalAssistantText(text: string) { + this.assistantTextBuffer = text; + this.finalText = text; + } + + private extractGenericMessagePayload(parsed: any): Record { + const payload: Record = {}; + const role = + parsed?.message?.role ?? parsed?.role ?? parsed?.response?.role; + if (typeof role === "string") payload.role = role; + const text = this.extractGenericMessageText(parsed); + if (text) payload.text = text; + if (parsed?.id) payload.id = parsed.id; + return payload; + } + + private extractGenericToolPayload(parsed: any): Record { + const tool = + parsed?.tool ?? + parsed?.toolCall ?? + parsed?.tool_call ?? + parsed?.toolExecution ?? + parsed; + return { + toolCallId: tool?.id ?? tool?.toolCallId ?? parsed?.id, + toolName: tool?.name ?? tool?.toolName ?? parsed?.toolName, + argsPreview: tool?.args ?? tool?.arguments ?? 
parsed?.args, + resultPreview: tool?.result ?? parsed?.result, + isError: Boolean(tool?.isError ?? parsed?.isError ?? parsed?.error), + }; + } + + private extractPiPayload(parsed: any): Record { + const payload: Record = {}; + if (parsed?.message?.role) payload.role = parsed.message.role; + const text = this.extractText(parsed?.message); + if (text) payload.text = text; + if (parsed?.id) payload.id = parsed.id; + return payload; + } + + private extractPiToolPayload(parsed: any): Record { + const tool = parsed?.toolExecution ?? parsed?.tool ?? parsed; + return { + toolCallId: tool?.id ?? tool?.toolCallId ?? parsed?.id, + toolName: tool?.name ?? tool?.toolName ?? parsed?.toolName, + argsPreview: tool?.args ?? tool?.arguments ?? parsed?.args, + resultPreview: tool?.result ?? parsed?.result, + isError: Boolean(tool?.isError ?? parsed?.isError ?? parsed?.error), + }; + } + + private extractText(message: any): string | undefined { + if (typeof message?.text === "string") return message.text; + if (typeof message?.content === "string") return message.content; + if (Array.isArray(message?.content)) { + const parts = message.content + .map((part: any) => { + if (typeof part === "string") return part; + if (typeof part?.text === "string") return part.text; + if (typeof part?.content === "string") return part.content; + return ""; + }) + .join(""); + return parts || undefined; + } + return undefined; + } + + private observeSmithersEvent(event: SmithersEvent) { + const sameAttempt = + (event as any).runId === this.runId && + (event as any).nodeId === this.nodeId && + (event as any).iteration === this.iteration && + (event as any).attempt === this.attempt; + if (!sameAttempt) return; + if (event.type === "ToolCallStarted") { + this.push( + "tool.execution.start", + { + toolCallId: `${event.toolName}:${event.seq}`, + toolName: event.toolName, + }, + event, + false, + event.type, + true, + ); + } + if (event.type === "ToolCallFinished") { + this.push( + "tool.execution.end", + { 
+ toolCallId: `${event.toolName}:${event.seq}`, + toolName: event.toolName, + isError: event.status === "error", + }, + event, + false, + event.type, + true, + ); + } + if (event.type === "TokenUsageReported") { + this.push( + "usage", + { + model: event.model, + agent: event.agent, + inputTokens: event.inputTokens, + outputTokens: event.outputTokens, + cacheReadTokens: event.cacheReadTokens, + cacheWriteTokens: event.cacheWriteTokens, + reasoningTokens: event.reasoningTokens, + }, + event, + false, + event.type, + true, + ); + } + } + + private resolveCompleteness(): TraceCompleteness { + if (this.failures.length > 0) return "capture-failed"; + + const missing = [...this.expectedKinds].filter( + (kind) => !this.seenKinds.has(kind), + ); + if (missing.length > 0 || this.warnings.length > 0) + return "partial-observed"; + + const richKinds = new Set([ + "session.start", + "session.end", + "turn.start", + "turn.end", + "message.start", + "message.update", + "message.end", + "assistant.text.delta", + "assistant.thinking.delta", + "tool.execution.start", + "tool.execution.update", + "tool.execution.end", + "tool.result", + "retry.start", + "retry.end", + "compaction.start", + "compaction.end", + ]); + const sawRichStructure = [...this.seenKinds].some((kind) => + richKinds.has(kind), + ); + + if (this.captureMode === "sdk-events" || this.captureMode === "cli-text") { + return sawRichStructure ? 
"partial-observed" : "final-only"; + } + + if (!sawRichStructure) return "final-only"; + return "full-observed"; + } + + private hasMeaningfulUsage(usage: any): boolean { + if (!usage || typeof usage !== "object") return false; + + const values = [ + usage.inputTokens, + usage.promptTokens, + usage.outputTokens, + usage.completionTokens, + usage.cacheReadTokens, + usage.cacheWriteTokens, + usage.reasoningTokens, + usage.inputTokenDetails?.cacheReadTokens, + usage.inputTokenDetails?.cacheWriteTokens, + usage.outputTokenDetails?.reasoningTokens, + ]; + + return values.some( + (value) => typeof value === "number" && Number.isFinite(value) && value > 0, + ); + } + + private push( + kind: CanonicalAgentTraceEventKind, + payload: Record | null, + raw: unknown, + observed: boolean, + rawType?: string, + recordSeen = false, + ) { + const redactedPayload = redactValue(payload); + const redactedRaw = redactValue(raw); + const event: CanonicalAgentTraceEvent = { + traceVersion: "1", + runId: this.runId, + workflowPath: this.workflowPath, + workflowHash: this.workflowHash, + nodeId: this.nodeId, + iteration: this.iteration, + attempt: this.attempt, + timestampMs: nowMs(), + event: { + sequence: this.sequence++, + kind, + phase: kindPhase(kind), + }, + source: { + agentFamily: this.agentFamily, + captureMode: this.captureMode, + rawType, + observed, + }, + traceCompleteness: "partial-observed", + payload: redactedPayload.value as Record | null, + raw: redactedRaw.value, + redaction: { + applied: redactedPayload.applied || redactedRaw.applied, + ruleIds: [ + ...new Set([...redactedPayload.ruleIds, ...redactedRaw.ruleIds]), + ], + }, + annotations: this.annotations, + }; + this.events.push(event); + if (recordSeen) this.seenKinds.add(kind); + } + + private async persistNdjson( + summary: AgentTraceSummary, + ): Promise<{ ok: true; file?: string } | { ok: false; error: string }> { + if (!this.logDir) return { ok: true }; + const dir = join(this.logDir, "agent-trace"); + const file 
= join( + dir, + `${this.nodeId}-${this.iteration}-${this.attempt}.ndjson`, + ); + const lines = this.events + .map((event) => JSON.stringify(event)) + .concat(JSON.stringify({ summary })); + try { + await mkdir(dir, { recursive: true }); + await appendFile(file, `${lines.join("\n")}\n`, "utf8"); + return { ok: true, file }; + } catch (error) { + return { + ok: false, + error: error instanceof Error ? error.message : String(error), + }; + } + } + + private async rewriteNdjson(file: string, summary: AgentTraceSummary) { + const lines = this.events + .map((event) => JSON.stringify(event)) + .concat(JSON.stringify({ summary })); + await writeFile(file, `${lines.join("\n")}\n`, "utf8"); + } +} diff --git a/src/agents/BaseCliAgent.ts b/src/agents/BaseCliAgent.ts index 40f1bd3b..b7c451a8 100644 --- a/src/agents/BaseCliAgent.ts +++ b/src/agents/BaseCliAgent.ts @@ -58,6 +58,7 @@ type RunRpcCommandOptions = { idleTimeoutMs?: number; signal?: AbortSignal; maxOutputBytes?: number; + onStdout?: (chunk: string) => void; onStderr?: (chunk: string) => void; onExtensionUiRequest?: (request: PiExtensionUiRequest) => | Promise @@ -594,7 +595,19 @@ export function runCommandEffect( agentArgs: args.join(" "), cwd, }), + Effect.annotateSpans({ + agentCommand: command, + agentArgs: args.join(" "), + cwd, + }), Effect.withLogSpan(`agent:${command}`), + Effect.withSpan(`agent:${command}`, { + attributes: { + agentCommand: command, + agentArgs: args.join(" "), + cwd, + }, + }), ); } @@ -613,7 +626,7 @@ export function runRpcCommandEffect(command: string, args: string[], options: Ru exitCode: number | null; usage?: any; }, Error> { - const { cwd, env, prompt, timeoutMs, idleTimeoutMs, signal, maxOutputBytes, onStderr, onExtensionUiRequest } = options; + const { cwd, env, prompt, timeoutMs, idleTimeoutMs, signal, maxOutputBytes, onStdout, onStderr, onExtensionUiRequest } = options; return Effect.async<{ text: string; output: unknown; @@ -823,8 +836,10 @@ export function 
runRpcCommandEffect(command: string, args: string[], options: Ru }); }); - child.stdout?.on("data", () => { + child.stdout?.on("data", (chunk) => { inactivity.reset(); + const text = chunk.toString("utf8"); + onStdout?.(text); }); child.stderr?.on("data", (chunk) => { @@ -882,7 +897,21 @@ export function runRpcCommandEffect(command: string, args: string[], options: Ru cwd, rpc: true, }), + Effect.annotateSpans({ + agentCommand: command, + agentArgs: args.join(" "), + cwd, + rpc: true, + }), Effect.withLogSpan(`agent:${command}:rpc`), + Effect.withSpan(`agent:${command}:rpc`, { + attributes: { + agentCommand: command, + agentArgs: args.join(" "), + cwd, + rpc: true, + }, + }), ); } diff --git a/src/agents/PiAgent.ts b/src/agents/PiAgent.ts index a3f71e2c..9506a5ac 100644 --- a/src/agents/PiAgent.ts +++ b/src/agents/PiAgent.ts @@ -220,6 +220,27 @@ export class PiAgent extends BaseCliAgent { } throw err; } + + // RPC mode + const rpcResult = await runRpcCommand("pi", args, { + cwd, + env, + prompt, + timeoutMs: callTimeouts.totalMs, + idleTimeoutMs: callTimeouts.idleMs, + signal: options?.abortSignal, + maxOutputBytes: this.maxOutputBytes ?? getToolContext()?.maxOutputBytes, + onStdout: options?.onStdout, + onStderr: options?.onStderr, + onExtensionUiRequest: this.opts.onExtensionUiRequest, + }); + + return buildGenerateResult( + rpcResult.text, + rpcResult.output, + this.opts.model ?? 
"pi", + rpcResult.usage, + ); } protected async buildCommand(_params: { diff --git a/src/cli/index.ts b/src/cli/index.ts index 6d830e3c..a1310d7a 100755 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -11,7 +11,7 @@ import { ensureSmithersTables } from "../db/ensure"; import { SmithersDb } from "../db/adapter"; import { buildContext } from "../context"; import { fromPromise } from "../effect/interop"; -import { runPromise } from "../effect/runtime"; +import { runPromise, shutdownRuntime } from "../effect/runtime"; import type { SmithersWorkflow } from "../SmithersWorkflow"; import { Smithers } from "../effect/builder"; import { revertToAttempt } from "../revert"; @@ -90,6 +90,30 @@ function parseJsonInput(raw: string | undefined, label: string, fail: FailFn) { } } +function parseAnnotations(raw: string | undefined, fail: FailFn) { + const parsed = parseJsonInput(raw, "annotations", fail); + if (parsed === undefined) return undefined; + if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { + return fail({ + code: "INVALID_ANNOTATIONS", + message: "Run annotations must be a flat JSON object of string/number/boolean values", + exitCode: 4, + }); + } + const annotations: Record = {}; + for (const [key, value] of Object.entries(parsed)) { + if (!["string", "number", "boolean"].includes(typeof value)) { + return fail({ + code: "INVALID_ANNOTATIONS", + message: `Run annotation ${key} must be a string, number, or boolean`, + exitCode: 4, + }); + } + annotations[key] = value as string | number | boolean; + } + return annotations; +} + function formatStatusExitCode(status: string | undefined) { if (status === "finished") return 0; if (status === "waiting-approval") return 3; @@ -219,6 +243,7 @@ const commonRunOptions = z.object({ maxOutputBytes: z.number().int().min(1).optional().describe("Max bytes a single tool call can return (default: 200000)"), toolTimeoutMs: z.number().int().min(1).optional().describe("Max wall-clock time per tool call in ms (default: 
60000)"), hot: z.boolean().default(false).describe("Enable hot module replacement for .tsx workflows"), + annotations: z.string().optional().describe("Run annotations as a flat JSON object of string/number/boolean values"), }); const runOptions = commonRunOptions.extend({ @@ -296,6 +321,7 @@ const cli = Cli.create({ const resolvedWorkflowPath = resolve(process.cwd(), workflowPath); const isToon = extname(resolvedWorkflowPath) === ".toon"; const input = parseJsonInput(options.input, "input", fail) ?? {}; + const annotations = parseAnnotations(options.annotations, fail); const runId = options.runId; const resume = Boolean(options.resume); @@ -388,6 +414,7 @@ const cli = Cli.create({ maxOutputBytes: options.maxOutputBytes, toolTimeoutMs: options.toolTimeoutMs, hot: options.hot, + annotations, onProgress, signal: abort.signal, }) @@ -397,6 +424,7 @@ const cli = Cli.create({ process.exitCode = formatStatusExitCode( typeof status === "string" ? status : undefined, ); + await shutdownRuntime(); return ok(result); } @@ -412,11 +440,13 @@ const cli = Cli.create({ maxOutputBytes: options.maxOutputBytes, toolTimeoutMs: options.toolTimeoutMs, hot: options.hot, + annotations, onProgress, signal: abort.signal, }); process.exitCode = formatStatusExitCode(result.status); + await shutdownRuntime(); return ok(result); } catch (err: any) { return fail({ @@ -443,6 +473,7 @@ const cli = Cli.create({ const resolvedWorkflowPath = resolve(process.cwd(), workflowPath); const isToon = extname(resolvedWorkflowPath) === ".toon"; const input = parseJsonInput(options.input, "input", fail) ?? 
{}; + const annotations = parseAnnotations(options.annotations, fail); const runId = options.runId; if (options.hot) { @@ -501,6 +532,7 @@ const cli = Cli.create({ maxOutputBytes: options.maxOutputBytes, toolTimeoutMs: options.toolTimeoutMs, hot: options.hot, + annotations, onProgress, signal: abort.signal, }) @@ -510,6 +542,7 @@ const cli = Cli.create({ process.exitCode = formatStatusExitCode( typeof status === "string" ? status : undefined, ); + await shutdownRuntime(); return ok(result); } @@ -525,11 +558,13 @@ const cli = Cli.create({ maxOutputBytes: options.maxOutputBytes, toolTimeoutMs: options.toolTimeoutMs, hot: options.hot, + annotations, onProgress, signal: abort.signal, }); process.exitCode = formatStatusExitCode(result.status); + await shutdownRuntime(); return ok(result); } catch (err: any) { return fail({ @@ -873,7 +908,7 @@ const cli = Cli.create({ }, }) .command("observability", { - description: "Start the local observability stack (Grafana, Prometheus, Tempo, OTLP Collector) via Docker Compose.", + description: "Start the local observability stack (Grafana, Prometheus, Tempo, Loki, OTLP Collector) via Docker Compose.", options: z.object({ detach: z.boolean().default(false).describe("Run containers in the background"), down: z.boolean().default(false).describe("Stop and remove the observability stack"), @@ -909,7 +944,9 @@ const cli = Cli.create({ : `[smithers] Starting observability stack...\n` + ` Grafana: http://localhost:3001\n` + ` Prometheus: http://localhost:9090\n` + - ` Tempo: http://localhost:3200\n`, + ` Tempo: http://localhost:3200\n` + + ` Loki: http://localhost:3100\n` + + ` OTEL HTTP: http://localhost:4318\n`, ); const child = spawn("docker", composeArgs, { diff --git a/src/effect/child-process.ts b/src/effect/child-process.ts index c00241f3..ffaa892c 100644 --- a/src/effect/child-process.ts +++ b/src/effect/child-process.ts @@ -172,6 +172,22 @@ export function spawnCaptureEffect( timeoutMs: timeoutMs ?? 
null, idleTimeoutMs: idleTimeoutMs ?? null, }), + Effect.annotateSpans({ + command, + args: args.join(" "), + cwd, + timeoutMs: timeoutMs ?? null, + idleTimeoutMs: idleTimeoutMs ?? null, + }), Effect.withLogSpan(`process:${command}`), + Effect.withSpan(`process:${command}`, { + attributes: { + command, + args: args.join(" "), + cwd, + timeoutMs: timeoutMs ?? null, + idleTimeoutMs: idleTimeoutMs ?? null, + }, + }), ); } diff --git a/src/effect/logging.ts b/src/effect/logging.ts index cbdac8a3..79a3bf26 100644 --- a/src/effect/logging.ts +++ b/src/effect/logging.ts @@ -1,9 +1,9 @@ import { Effect } from "effect"; -import { runFork } from "./runtime"; +import { runFork, runPromise } from "./runtime"; type LogAnnotations = Record | undefined; -function emitLog( +function withLogContext( effect: Effect.Effect, annotations?: LogAnnotations, span?: string, @@ -15,7 +15,23 @@ function emitLog( if (span) { program = program.pipe(Effect.withLogSpan(span)); } - void runFork(program); + return program; +} + +function emitLog( + effect: Effect.Effect, + annotations?: LogAnnotations, + span?: string, +) { + void runFork(withLogContext(effect, annotations, span)); +} + +async function emitLogAwait( + effect: Effect.Effect, + annotations?: LogAnnotations, + span?: string, +) { + await runPromise(withLogContext(effect, annotations, span)); } export function logDebug( @@ -26,6 +42,14 @@ export function logDebug( emitLog(Effect.logDebug(message), annotations, span); } +export async function logDebugAwait( + message: string, + annotations?: LogAnnotations, + span?: string, +) { + await emitLogAwait(Effect.logDebug(message), annotations, span); +} + export function logInfo( message: string, annotations?: LogAnnotations, @@ -34,6 +58,14 @@ export function logInfo( emitLog(Effect.logInfo(message), annotations, span); } +export async function logInfoAwait( + message: string, + annotations?: LogAnnotations, + span?: string, +) { + await emitLogAwait(Effect.logInfo(message), annotations, 
span); +} + export function logWarning( message: string, annotations?: LogAnnotations, @@ -42,6 +74,14 @@ export function logWarning( emitLog(Effect.logWarning(message), annotations, span); } +export async function logWarningAwait( + message: string, + annotations?: LogAnnotations, + span?: string, +) { + await emitLogAwait(Effect.logWarning(message), annotations, span); +} + export function logError( message: string, annotations?: LogAnnotations, @@ -49,3 +89,11 @@ export function logError( ) { emitLog(Effect.logError(message), annotations, span); } + +export async function logErrorAwait( + message: string, + annotations?: LogAnnotations, + span?: string, +) { + await emitLogAwait(Effect.logError(message), annotations, span); +} diff --git a/src/effect/runtime.ts b/src/effect/runtime.ts index b4aa8484..8a958506 100644 --- a/src/effect/runtime.ts +++ b/src/effect/runtime.ts @@ -50,4 +50,8 @@ export function runSync(effect: Effect.Effect) { return runtime.runSync(decorate(effect) as Effect.Effect); } +export async function shutdownRuntime() { + await runtime.dispose(); +} + export { SmithersRuntimeLayer }; diff --git a/src/engine/approvals.ts b/src/engine/approvals.ts index dbe84f8c..d1f5be03 100644 --- a/src/engine/approvals.ts +++ b/src/engine/approvals.ts @@ -47,7 +47,9 @@ export function approveNodeEffect( }); }).pipe( Effect.annotateLogs({ runId, nodeId, iteration, approvalStatus: "approved" }), + Effect.annotateSpans({ runId, nodeId, iteration, approvalStatus: "approved" }), Effect.withLogSpan("approval:grant"), + Effect.withSpan("approval:grant", { attributes: { runId, nodeId, iteration, approvalStatus: "approved" } }), ); } @@ -108,7 +110,9 @@ export function denyNodeEffect( }); }).pipe( Effect.annotateLogs({ runId, nodeId, iteration, approvalStatus: "denied" }), + Effect.annotateSpans({ runId, nodeId, iteration, approvalStatus: "denied" }), Effect.withLogSpan("approval:deny"), + Effect.withSpan("approval:deny", { attributes: { runId, nodeId, iteration, 
approvalStatus: "denied" } }), ); } diff --git a/src/engine/index.ts b/src/engine/index.ts index 589aa874..713232e5 100644 --- a/src/engine/index.ts +++ b/src/engine/index.ts @@ -54,6 +54,7 @@ import { spawn as nodeSpawn } from "node:child_process"; import { randomUUID } from "node:crypto"; import { platform } from "node:os"; import { withTaskRuntime } from "../effect/task-runtime"; +import { AgentTraceCollector } from "../agent-trace"; /** * Track which worktree paths have already been created this run so we don't @@ -1291,6 +1292,12 @@ async function executeTask( }, workflowName: string, cacheEnabled: boolean, + traceContext?: { + workflowPath?: string | null; + workflowHash?: string | null; + logDir?: string; + annotations?: Record; + }, signal?: AbortSignal, disabledAgents?: Set, ) { @@ -1495,6 +1502,21 @@ async function executeTask( let agentResult: any; if (effectiveAgent) { // Use fallback agent on retry attempts when available + const traceCollector = new AgentTraceCollector({ + eventBus, + runId, + workflowPath: traceContext?.workflowPath ?? null, + workflowHash: traceContext?.workflowHash ?? 
null, + nodeId: desc.nodeId, + iteration: desc.iteration, + attempt: attemptNo, + agent: effectiveAgent, + agentId: (effectiveAgent as any).id, + model: (effectiveAgent as any).model, + logDir: traceContext?.logDir, + annotations: traceContext?.annotations, + }); + traceCollector.begin(); const result = await runWithToolContext( { db: adapter, @@ -1545,6 +1567,8 @@ async function executeTask( stream, timestampMs: nowMs(), }); + if (stream === "stdout") traceCollector.onStdout(text); + else traceCollector.onStderr(text); }; return (effectiveAgent as any).generate({ options: undefined as any, @@ -1556,7 +1580,15 @@ async function executeTask( outputSchema: desc.outputSchema, }); }, - ); + ).then(async (result) => { + traceCollector.observeResult(result); + await traceCollector.flush(); + return result; + }, async (error) => { + traceCollector.observeError(error); + await traceCollector.flush(); + throw error; + }); agentResult = result; @@ -2299,12 +2331,15 @@ export function renderFrameEffect( ctx: any, opts?: { baseRootDir?: string }, ) { + const spanAttributes = { + runId: ctx?.runId ?? "", + iteration: ctx?.iteration ?? 0, + }; return fromPromise("render frame", () => renderFrameAsync(workflow, ctx, opts)).pipe( - Effect.annotateLogs({ - runId: ctx?.runId ?? "", - iteration: ctx?.iteration ?? 0, - }), + Effect.annotateLogs(spanAttributes), + Effect.annotateSpans(spanAttributes), Effect.withLogSpan("engine:render-frame"), + Effect.withSpan("engine:render-frame", { attributes: spanAttributes }), ); } @@ -3090,6 +3125,12 @@ async function runWorkflowBody( toolConfig, workflowName, cacheEnabled, + { + workflowPath: resolvedWorkflowPath ?? opts.workflowPath ?? null, + workflowHash: runMetadata.workflowHash, + logDir: logDir ?? 
undefined, + annotations: opts.annotations, + }, runAbortController.signal, disabledAgents, ).finally(() => inflight.delete(p)); @@ -3164,14 +3205,17 @@ export function runWorkflowEffect( workflow: SmithersWorkflow, opts: RunOptions, ) { + const spanAttributes = { + runId: opts.runId ?? "", + workflowPath: opts.workflowPath ?? "", + maxConcurrency: opts.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY, + hot: Boolean(opts.hot), + }; return fromPromise("run workflow", () => runWorkflowAsync(workflow, opts)).pipe( - Effect.annotateLogs({ - runId: opts.runId ?? "", - workflowPath: opts.workflowPath ?? "", - maxConcurrency: opts.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY, - hot: Boolean(opts.hot), - }), + Effect.annotateLogs(spanAttributes), + Effect.annotateSpans(spanAttributes), Effect.withLogSpan("engine:run-workflow"), + Effect.withSpan("engine:run-workflow", { attributes: spanAttributes }), ); } diff --git a/src/events.ts b/src/events.ts index 99256de6..b7ff5ccc 100644 --- a/src/events.ts +++ b/src/events.ts @@ -30,7 +30,9 @@ export class EventBus extends EventEmitter { } }).pipe( Effect.annotateLogs({ runId: event.runId, eventType: event.type }), + Effect.annotateSpans({ runId: event.runId, eventType: event.type }), Effect.withLogSpan(`event:${event.type}`), + Effect.withSpan(`event:${event.type}`, { attributes: { runId: event.runId, eventType: event.type } }), ); } @@ -44,7 +46,9 @@ export class EventBus extends EventEmitter { yield* this.persistEffect(event); }).pipe( Effect.annotateLogs({ runId: event.runId, eventType: event.type }), + Effect.annotateSpans({ runId: event.runId, eventType: event.type }), Effect.withLogSpan(`event:${event.type}:persist`), + Effect.withSpan(`event:${event.type}:persist`, { attributes: { runId: event.runId, eventType: event.type } }), ); } @@ -65,7 +69,10 @@ export class EventBus extends EventEmitter { this.persistError = null; throw err; } - }).pipe(Effect.withLogSpan("event:flush")); + }).pipe( + Effect.withLogSpan("event:flush"), + 
Effect.withSpan("event:flush"), + ); } async flush(): Promise { @@ -83,7 +90,9 @@ export class EventBus extends EventEmitter { } }).pipe( Effect.annotateLogs({ runId: event.runId, eventType: event.type }), + Effect.annotateSpans({ runId: event.runId, eventType: event.type }), Effect.withLogSpan("event:persist"), + Effect.withSpan("event:persist", { attributes: { runId: event.runId, eventType: event.type } }), ); } diff --git a/src/index.ts b/src/index.ts index 893f9ace..3e1a48ba 100644 --- a/src/index.ts +++ b/src/index.ts @@ -14,6 +14,15 @@ export type { SmithersCtx } from "./SmithersCtx"; export type { OutputAccessor, InferRow, InferOutputEntry } from "./OutputAccessor"; export type { SmithersEvent } from "./SmithersEvent"; export type { SmithersError } from "./SmithersError"; +export type { + AgentCaptureMode, + AgentFamily, + AgentTraceCapabilityProfile, + AgentTraceSummary, + CanonicalAgentTraceEvent, + CanonicalAgentTraceEventKind, + TraceCompleteness, +} from "./agent-trace"; export { SmithersError as SmithersErrorInstance, isSmithersError, errorToJson } from "./utils/errors"; export type { SmithersErrorCode } from "./utils/errors"; export type { @@ -88,6 +97,7 @@ export type { export { createSmithers } from "./create"; export type { CreateSmithersApi } from "./create"; export { runWorkflow, renderFrame } from "./engine"; +export { agentTraceCapabilities, detectAgentFamily, detectCaptureMode } from "./agent-trace"; // Tools export { tools, read, write, edit, grep, bash } from "./tools/index"; diff --git a/src/observability/index.ts b/src/observability/index.ts index b29835ff..bae08048 100644 --- a/src/observability/index.ts +++ b/src/observability/index.ts @@ -373,11 +373,19 @@ function makeService( return { options, annotate: (attributes) => Effect.void.pipe(Effect.annotateLogs(attributes)), - withSpan: (name, effect, attributes) => - (attributes && Object.keys(attributes).length > 0 - ? 
effect.pipe(Effect.annotateLogs(attributes)) - : effect - ).pipe(Effect.withLogSpan(name)), + withSpan: (name, effect, attributes) => { + let program = effect; + if (attributes && Object.keys(attributes).length > 0) { + program = program.pipe( + Effect.annotateLogs(attributes), + Effect.annotateSpans(attributes), + ); + } + return program.pipe( + Effect.withLogSpan(name), + Effect.withSpan(name, attributes ? { attributes } : undefined), + ); + }, }; } diff --git a/src/tools/logToolCall.ts b/src/tools/logToolCall.ts index 9e5fe2bb..34b06f37 100644 --- a/src/tools/logToolCall.ts +++ b/src/tools/logToolCall.ts @@ -37,6 +37,14 @@ export function logToolCallEffect( status, timestampMs: finished, }); + const spanAttributes = { + runId: ctx.runId, + nodeId: ctx.nodeId, + iteration: ctx.iteration, + attempt: ctx.attempt, + toolName, + toolStatus: status, + }; return Metric.update(toolDuration, durationMs).pipe( Effect.andThen( ctx.db.insertToolCallEffect({ @@ -54,15 +62,10 @@ export function logToolCallEffect( errorJson, }), ), - Effect.annotateLogs({ - runId: ctx.runId, - nodeId: ctx.nodeId, - iteration: ctx.iteration, - attempt: ctx.attempt, - toolName, - toolStatus: status, - }), + Effect.annotateLogs(spanAttributes), + Effect.annotateSpans(spanAttributes), Effect.withLogSpan(`tool:${toolName}:log`), + Effect.withSpan(`tool:${toolName}:log`, { attributes: spanAttributes }), ); } diff --git a/tests/agent-trace.test.tsx b/tests/agent-trace.test.tsx new file mode 100644 index 00000000..64f6cf6f --- /dev/null +++ b/tests/agent-trace.test.tsx @@ -0,0 +1,1198 @@ +/** @jsxImportSource smithers */ +import { describe, expect, test } from "bun:test"; +import { mkdtempSync, readFileSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { Workflow, Task, runWorkflow } from "../src"; +import { canonicalTraceEventToOtelLogRecord } from "../src/agent-trace"; +import { SmithersDb } from "../src/db/adapter"; +import { 
runPromise } from "../src/effect/runtime"; +import { + logToolCallEffect, + logToolCallStartEffect, +} from "../src/tools/logToolCall"; +import { createTestSmithers } from "./helpers"; +import { z } from "zod"; + +async function listRunEvents(db: any, runId: string) { + const adapter = new SmithersDb(db as any); + return adapter.listEvents(runId, -1, 500); +} + +describe("agent trace capture", () => { + test("captures high-fidelity Pi canonical trace events with ordering and redaction", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-pi"; + + const piLikeAgent: any = { + id: "pi-agent-test", + opts: { mode: "json" }, + generate: async (args: { onStdout?: (text: string) => void }) => { + args.onStdout?.( + JSON.stringify({ type: "session", id: "sess-1" }) + "\n", + ); + args.onStdout?.(JSON.stringify({ type: "turn_start" }) + "\n"); + args.onStdout?.( + JSON.stringify({ + type: "message_start", + message: { role: "assistant", content: [] }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message_update", + assistantMessageEvent: { + type: "thinking_delta", + delta: "thinking secret=sk_abc123456789", + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message_update", + assistantMessageEvent: { type: "text_delta", delta: "hello" }, + }) + "\n", + ); + await runPromise(logToolCallStartEffect("bash")); + await runPromise( + logToolCallEffect( + "bash", + { cmd: "echo secret=sk_live_123456789" }, + { ok: true }, + "success", + ), + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_start", + toolExecution: { id: "t1", name: "bash", args: { cmd: "echo hi" } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_end", + toolExecution: { id: "t1", name: "bash", result: { ok: true } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn_end", + message: { + role: 
"assistant", + content: [{ type: "text", text: "Final answer" }], + stopReason: "stop", + }, + }) + "\n", + ); + args.onStdout?.(JSON.stringify({ type: "agent_end" }) + "\n"); + return { + text: '{"answer":"Final answer"}', + output: { answer: "Final answer" }, + }; + }, + }; + + const workflow = smithers(() => ( + + + do the thing + + + )); + + const result = await runWorkflow(workflow, { + input: {}, + runId, + workflowPath: "/tmp/pi-trace.tsx", + annotations: { "custom.demo": true, ticket: 123 }, + }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + + expect(summary.agentFamily).toBe("pi"); + expect(summary.captureMode).toBe("cli-json-stream"); + expect(summary.traceCompleteness).toBe("full-observed"); + + const kinds = traceEvents.map((event: any) => event.event.kind); + expect(kinds).toContain("session.start"); + expect(kinds).toContain("turn.start"); + expect(kinds).toContain("assistant.thinking.delta"); + expect(kinds).toContain("assistant.text.delta"); + expect(kinds).toContain("tool.execution.start"); + expect(kinds).toContain("tool.execution.end"); + expect(kinds).toContain("assistant.message.final"); + + const sequences = traceEvents.map((event: any) => event.event.sequence); + expect(sequences).toEqual([...sequences].sort((a, b) => a - b)); + expect(new Set(sequences).size).toBe(sequences.length); + + const redacted = JSON.stringify(traceEvents); + expect(redacted).not.toContain("sk_abc123456789"); + expect(redacted).not.toContain("sk_live_123456789"); + expect(traceEvents.some((event: any) => event.redaction.applied)).toBe( + true, + ); + expect( + traceEvents.every( + (event: any) => event.annotations["custom.demo"] === true, + ), + 
).toBe(true); + + cleanup(); + }); + + test("classifies malformed Pi JSON as capture-failed", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-pi-fail"; + + const piLikeAgent: any = { + id: "pi-agent-failure", + opts: { mode: "json" }, + generate: async (args: { onStdout?: (text: string) => void }) => { + args.onStdout?.("{not json}\n"); + throw new Error("subprocess exits early"); + }, + }; + + const workflow = smithers(() => ( + + + fail please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("failed"); + + const events = await listRunEvents(db, runId); + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + + expect(summary.traceCompleteness).toBe("capture-failed"); + expect( + traceEvents.some((event: any) => event.event.kind === "capture.error"), + ).toBe(true); + + cleanup(); + }); + + test("truthfully classifies sdk agents as final-only", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-sdk"; + + class OpenAIAgentFake { + id = "openai-sdk-fake"; + tools = {}; + async generate() { + return { + text: '{"answer":"sdk final"}', + output: { answer: "sdk final" }, + usage: { inputTokens: 10, outputTokens: 5 }, + }; + } + } + + const workflow = smithers(() => ( + + + sdk final only + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! 
+ .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.agentFamily).toBe("openai"); + expect(summary.captureMode).toBe("sdk-events"); + expect(summary.traceCompleteness).toBe("final-only"); + expect(summary.unsupportedEventKinds).toContain("assistant.thinking.delta"); + expect( + traceEvents.some( + (event: any) => event.event.kind === "assistant.message.final", + ), + ).toBe(true); + expect(traceEvents.some((event: any) => event.event.kind === "usage")).toBe( + true, + ); + + cleanup(); + }); + + test("preserves structured Claude stream-json deltas truthfully", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-claude-structured"; + + class ClaudeCodeAgentFake { + id = "claude-code-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "message_start", + message: { role: "assistant" }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message_delta", + delta: { text: "claude delta" }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message_end", + message: { role: "assistant", content: "claude final" }, + usage: { input_tokens: 4, output_tokens: 2 }, + }) + "\n", + ); + return { + text: '{"answer":"claude final"}', + output: { answer: "claude final" }, + }; + } + } + + const workflow = smithers(() => ( + + + structured claude please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! 
+ .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.agentFamily).toBe("claude-code"); + expect(summary.captureMode).toBe("cli-json-stream"); + expect(summary.traceCompleteness).toBe("full-observed"); + expect( + traceEvents.some((event: any) => event.event.kind === "message.start"), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "claude delta", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "assistant.message.final", + ), + ).toBe(true); + expect(traceEvents.some((event: any) => event.event.kind === "usage")).toBe( + true, + ); + cleanup(); + }); + + test("normalizes real Claude stream-json assistant/result events", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-claude-real-schema"; + + class ClaudeCodeAgentRealSchemaFake { + id = "claude-code-real-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "system", + subtype: "init", + session_id: "sess-1", + model: "claude-opus-4-6[1m]", + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "assistant", + message: { + role: "assistant", + content: [{ type: "text", text: "claude real final" }], + usage: { + input_tokens: 3, + cache_read_input_tokens: 4, + output_tokens: 2, + }, + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "result", + subtype: "success", + result: "claude real final", + usage: { + input_tokens: 3, + cache_read_input_tokens: 4, + output_tokens: 20, + }, + }) + "\n", + ); + return { + text: '{"answer":"claude real final"}', + output: { answer: "claude real final" }, + }; + } + } + + 
const workflow = smithers(() => ( + + + structured claude real schema please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.traceCompleteness).toBe("full-observed"); + expect(summary.unsupportedEventKinds).not.toContain("artifact.created"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "claude real final", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.message.final" && + event.payload.text === "claude real final", + ), + ).toBe(true); + expect(traceEvents.filter((event: any) => event.event.kind === "usage").length).toBe(1); + + cleanup(); + }); + + test("preserves structured Codex completion usage and final message truthfully", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-codex-structured"; + + class CodexAgentFake { + id = "codex-fake"; + opts = { outputFormat: "stream-json", json: true }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "assistant_message.delta", + delta: { text: "codex delta" }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn.completed", + usage: { input_tokens: 8, output_tokens: 3 }, + message: { role: "assistant", content: "codex final" }, + }) + "\n", + ); + return { + text: '{"answer":"codex final"}', + output: { answer: "codex final" }, + }; + } + } + + const workflow = smithers(() => ( + + + structured 
codex please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.agentFamily).toBe("codex"); + expect(summary.captureMode).toBe("cli-json-stream"); + expect(summary.traceCompleteness).toBe("full-observed"); + expect(summary.unsupportedEventKinds).not.toContain("artifact.created"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "codex delta", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "assistant.message.final", + ), + ).toBe(true); + expect(traceEvents.some((event: any) => event.event.kind === "usage")).toBe( + true, + ); + expect(summary.unsupportedEventKinds).not.toContain("assistant.text.delta"); + cleanup(); + }); + + test("classifies structured Gemini stream-json traces truthfully", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-gemini-structured"; + + class GeminiAgentFake { + id = "gemini-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "response.output_text.delta", + delta: { text: "gemini delta" }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "response.completed", + response: { + role: "assistant", + content: [{ text: "gemini final" }], + }, + usage: { input_tokens: 6, output_tokens: 4 }, + }) + "\n", + ); + return { + text: '{"answer":"gemini final"}', + output: { answer: "gemini final" }, + 
}; + } + } + + const workflow = smithers(() => ( + + + structured gemini please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.agentFamily).toBe("gemini"); + expect(summary.captureMode).toBe("cli-json-stream"); + expect(summary.traceCompleteness).toBe("full-observed"); + expect(summary.unsupportedEventKinds).not.toContain("assistant.text.delta"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "gemini delta", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "assistant.message.final", + ), + ).toBe(true); + + cleanup(); + }); + + test("normalizes real Gemini stream-json message/result events", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-gemini-real-schema"; + + class GeminiAgentRealSchemaFake { + id = "gemini-real-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "init", + session_id: "sess-1", + model: "gemini-3-flash-preview", + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message", + role: "assistant", + content: "gemini ", + delta: true, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message", + role: "assistant", + content: "real final", + delta: true, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "result", + status: "success", + stats: { + input_tokens: 10, + 
output_tokens: 5, + total_tokens: 15, + }, + }) + "\n", + ); + return { + text: '{"answer":"gemini real final"}', + output: { answer: "gemini real final" }, + }; + } + } + + const workflow = smithers(() => ( + + + structured gemini real schema please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.traceCompleteness).toBe("full-observed"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "gemini ", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "real final", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.message.final" && + event.payload.text === "gemini real final", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "usage" && + event.payload.inputTokens === 10 && + event.payload.outputTokens === 5, + ), + ).toBe(true); + + cleanup(); + }); + + test("classifies Gemini cli-json coarse output as final-only", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-gemini-final-only"; + + class GeminiAgentJsonFake { + id = "gemini-json-fake"; + opts = { outputFormat: "json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + text: "gemini coarse final", + stats: { models: { gemini: { tokens: { input: 3, output: 2 } } } }, + }) + "\n", + ); + 
return { + text: '{"answer":"gemini coarse final"}', + output: { answer: "gemini coarse final" }, + }; + } + } + + const workflow = smithers(() => ( + + + coarse gemini please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.agentFamily).toBe("gemini"); + expect(summary.captureMode).toBe("cli-json"); + expect(summary.traceCompleteness).toBe("final-only"); + expect(summary.unsupportedEventKinds).toContain("assistant.text.delta"); + expect( + traceEvents.some((event: any) => event.event.kind === "usage"), + ).toBe(false); + + cleanup(); + }); + + test("preserves structured Kimi stream-json tool lifecycle truthfully", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-kimi-structured"; + + class KimiAgentFake { + id = "kimi-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "assistant_message.delta", + delta: { text: "kimi delta" }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_start", + toolExecution: { + id: "tool-1", + name: "bash", + args: { cmd: "echo hi" }, + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_update", + toolExecution: { + id: "tool-1", + name: "bash", + result: { status: "running" }, + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_end", + toolExecution: { id: "tool-1", name: "bash", result: { ok: true } }, + }) + "\n", + 
); + args.onStdout?.( + JSON.stringify({ + type: "response.completed", + response: { role: "assistant", content: [{ text: "kimi final" }] }, + }) + "\n", + ); + return { + text: '{"answer":"kimi final"}', + output: { answer: "kimi final" }, + }; + } + } + + const workflow = smithers(() => ( + + + structured kimi please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.agentFamily).toBe("kimi"); + expect(summary.captureMode).toBe("cli-json-stream"); + expect(summary.traceCompleteness).toBe("full-observed"); + expect(summary.unsupportedEventKinds).not.toContain("tool.execution.start"); + expect(summary.unsupportedEventKinds).not.toContain("tool.execution.update"); + expect(summary.unsupportedEventKinds).not.toContain("tool.execution.end"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "kimi delta", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "tool.execution.start" && + event.payload.toolCallId === "tool-1", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "tool.execution.update", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "tool.execution.end", + ), + ).toBe(true); + + cleanup(); + }); + + test("sdk traces with smithers-observed tool lifecycle degrade to partial-observed instead of final-only", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = 
"agent-trace-sdk-tools"; + + class OpenAIAgentWithToolsFake { + id = "openai-sdk-tools-fake"; + tools = {}; + async generate() { + const seq = await runPromise(logToolCallStartEffect("bash")); + await runPromise( + logToolCallEffect( + "bash", + { cmd: "echo hi" }, + { ok: true }, + "success", + undefined, + undefined, + typeof seq === "number" ? seq : undefined, + ), + ); + return { + text: '{"answer":"sdk tool final"}', + output: { answer: "sdk tool final" }, + }; + } + } + + const workflow = smithers(() => ( + + + sdk tools please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.traceCompleteness).toBe("partial-observed"); + expect( + traceEvents.some( + (event: any) => event.event.kind === "tool.execution.start", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "tool.execution.end", + ), + ).toBe(true); + + cleanup(); + }); + + test("classifies truncated structured streams as capture-failed", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-truncated-stream"; + + class CodexAgentTruncatedFake { + id = "codex-truncated-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "assistant_message.delta", + delta: { text: "partial" }, + }) + "\n", + ); + args.onStdout?.( + '{"type":"assistant_message.delta","delta":{"text":"unterminated"}', + ); + throw new Error("subprocess exits early"); + } + } + + const 
workflow = smithers(() => ( + + + truncated codex please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("failed"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.traceCompleteness).toBe("capture-failed"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "capture.error" && + event.payload.reason === "truncated-json-stream", + ), + ).toBe(true); + + cleanup(); + }); + + test("records artifact write failures as capture warnings while durable DB truth remains", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-artifact-write-fail"; + const tmp = mkdtempSync(join(tmpdir(), "smithers-agent-trace-")); + const badLogPath = join(tmp, "not-a-directory"); + writeFileSync(badLogPath, "occupied"); + + class GeminiAgentArtifactFake { + id = "gemini-artifact-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "response.output_text.delta", + delta: { text: "artifact delta" }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "response.completed", + response: { + role: "assistant", + content: [{ text: "artifact final" }], + }, + }) + "\n", + ); + return { + text: '{"answer":"artifact final"}', + output: { answer: "artifact final" }, + }; + } + } + + const workflow = smithers(() => ( + + + artifact write failure please + + + )); + + const result = await runWorkflow(workflow, { + input: {}, + runId, + logDir: badLogPath, + }); + expect(result.status).toBe("finished"); + + const 
events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.traceCompleteness).toBe("partial-observed"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "capture.warning" && + event.payload.reason === "artifact-write-failed", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "assistant.message.final", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "artifact.created", + ), + ).toBe(false); + + cleanup(); + }); + + test("records persisted trace artifacts with a canonical artifact.created event", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-artifact-created"; + const logDir = mkdtempSync(join(tmpdir(), "smithers-agent-trace-artifact-")); + + class PiArtifactAgentFake { + id = "pi-artifact-fake"; + opts = { mode: "json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.(JSON.stringify({ type: "session", id: "sess-1" }) + "\n"); + args.onStdout?.( + JSON.stringify({ + type: "turn_end", + message: { + role: "assistant", + content: [{ type: "text", text: "artifact ok" }], + }, + }) + "\n", + ); + args.onStdout?.(JSON.stringify({ type: "agent_end" }) + "\n"); + return { + text: '{"answer":"artifact ok"}', + output: { answer: "artifact ok" }, + }; + } + } + + const workflow = smithers(() => ( + + + artifact created please + + + )); + + const result = await runWorkflow(workflow, { + input: {}, + runId, + logDir, + }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + 
events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.rawArtifactRefs).toHaveLength(1); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "artifact.created" && + event.payload.artifactPath === summary.rawArtifactRefs[0], + ), + ).toBe(true); + const artifactText = readFileSync(summary.rawArtifactRefs[0], "utf8"); + expect(artifactText).toContain('"assistant.message.final"'); + expect(artifactText).toContain('"artifact.created"'); + expect(artifactText).toContain(summary.rawArtifactRefs[0]); + + cleanup(); + }); + + test("shapes OTEL log records with canonical query attributes and redacted body", () => { + const record = canonicalTraceEventToOtelLogRecord( + { + traceVersion: "1", + runId: "run-123", + workflowPath: "workflows/demo.tsx", + workflowHash: "abc123", + nodeId: "pi-rich-trace", + iteration: 0, + attempt: 1, + timestampMs: Date.now(), + event: { + sequence: 7, + kind: "assistant.thinking.delta", + phase: "message", + }, + source: { + agentFamily: "pi", + captureMode: "cli-json-stream", + rawType: "thinking_delta", + observed: true, + }, + traceCompleteness: "full-observed", + payload: { text: "thought [REDACTED_SECRET]" }, + raw: { text: "thought [REDACTED_SECRET]" }, + redaction: { applied: true, ruleIds: ["secret-ish"] }, + annotations: { "custom.demo": true, ticket: "OBS-1" }, + }, + { + agentId: "pi-agent", + model: "gpt-5.4", + }, + ); + + expect(record.severity).toBe("INFO"); + expect(record.attributes["run.id"]).toBe("run-123"); + expect(record.attributes["node.id"]).toBe("pi-rich-trace"); + expect(record.attributes["node.attempt"]).toBe(1); + expect(record.attributes["event.kind"]).toBe("assistant.thinking.delta"); + expect(record.attributes["custom.ticket"]).toBe("OBS-1"); + 
expect(record.body).toContain("REDACTED_SECRET"); + expect(record.body).not.toContain("sk_live_"); + }); +}); diff --git a/workflows/agent-trace-otel-demo.tsx b/workflows/agent-trace-otel-demo.tsx new file mode 100644 index 00000000..92d23e1e --- /dev/null +++ b/workflows/agent-trace-otel-demo.tsx @@ -0,0 +1,315 @@ +/** @jsxImportSource smithers */ +import { runPromise } from "../src/effect/runtime"; +import { logToolCallEffect, logToolCallStartEffect } from "../src/tools/logToolCall"; +import { createSmithers, Task, Workflow } from "../src"; +import { z } from "zod"; + +const PiOutput = z.object({ + answer: z.string(), +}); + +const ClaudeOutput = z.object({ + answer: z.string(), +}); + +const CodexOutput = z.object({ + answer: z.string(), +}); + +const GeminiOutput = z.object({ + answer: z.string(), +}); + +const SdkOutput = z.object({ + answer: z.string(), +}); + +const { smithers, outputs } = createSmithers( + { + piResult: PiOutput, + claudeResult: ClaudeOutput, + codexResult: CodexOutput, + geminiResult: GeminiOutput, + sdkResult: SdkOutput, + }, + { + dbPath: "./workflows/agent-trace-otel-demo.db", + journalMode: "DELETE", + }, +); + +function createPiDemoAgent(failureMode?: string) { + return { + id: failureMode ? 
`pi-observability-demo-${failureMode}` : "pi-observability-demo", + model: "demo-pi", + opts: { mode: "json" }, + generate: async (args: { onStdout?: (text: string) => void }) => { + if (failureMode === "malformed-json") { + args.onStdout?.("{not valid json}\n"); + throw new Error("demo malformed-json failure"); + } + args.onStdout?.(JSON.stringify({ type: "session", id: "demo-session" }) + "\n"); + args.onStdout?.(JSON.stringify({ type: "turn_start", id: "turn-1" }) + "\n"); + args.onStdout?.( + JSON.stringify({ + type: "message_start", + message: { role: "assistant", content: [] }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message_update", + assistantMessageEvent: { + type: "thinking_delta", + delta: "Planning demo work with token=sk_demo_secret_123456789", + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message_update", + assistantMessageEvent: { + type: "text_delta", + delta: "Working through the repository. ", + }, + }) + "\n", + ); + const seq = await runPromise(logToolCallStartEffect("bash")); + await runPromise( + logToolCallEffect( + "bash", + { command: "echo sk_live_secret_123456789" }, + { ok: true, stdout: "done" }, + "success", + undefined, + undefined, + typeof seq === "number" ? 
seq : undefined, + ), + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_start", + toolExecution: { + id: "tool-1", + name: "bash", + args: { command: "echo hi" }, + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_update", + toolExecution: { + id: "tool-1", + name: "bash", + result: { status: "running" }, + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_end", + toolExecution: { + id: "tool-1", + name: "bash", + result: { ok: true }, + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn_end", + message: { + role: "assistant", + content: [{ type: "text", text: "Demo PI answer" }], + usage: { inputTokens: 11, outputTokens: 7 }, + }, + }) + "\n", + ); + args.onStdout?.(JSON.stringify({ type: "agent_end", id: "demo-session" }) + "\n"); + return { + text: '{"answer":"Demo PI answer"}', + output: { answer: "Demo PI answer" }, + usage: { inputTokens: 11, outputTokens: 7 }, + }; + }, + } as any; +} + +function createClaudeDemoAgent() { + return { + id: "claude-observability-demo", + model: "demo-claude", + opts: { outputFormat: "stream-json" }, + generate: async (args: { onStdout?: (text: string) => void }) => { + args.onStdout?.( + JSON.stringify({ + type: "system", + subtype: "init", + session_id: "claude-demo-session", + model: "claude-opus-demo", + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "assistant", + message: { + role: "assistant", + content: [{ type: "text", text: "Claude demo answer" }], + usage: { + input_tokens: 13, + cache_read_input_tokens: 2, + output_tokens: 5, + }, + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "result", + subtype: "success", + result: "Claude demo answer", + usage: { + input_tokens: 13, + cache_read_input_tokens: 2, + output_tokens: 5, + }, + }) + "\n", + ); + return { + text: '{"answer":"Claude demo answer"}', + output: { answer: "Claude demo answer" }, + }; + }, + } as any; +} + +function 
createGeminiDemoAgent() { + return { + id: "gemini-observability-demo", + model: "demo-gemini", + opts: { outputFormat: "stream-json" }, + generate: async (args: { onStdout?: (text: string) => void }) => { + args.onStdout?.( + JSON.stringify({ + type: "init", + session_id: "gemini-demo-session", + model: "gemini-demo", + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message", + role: "assistant", + content: "Gemini ", + delta: true, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message", + role: "assistant", + content: "demo answer", + delta: true, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "result", + status: "success", + stats: { + input_tokens: 9, + output_tokens: 4, + total_tokens: 13, + }, + }) + "\n", + ); + return { + text: '{"answer":"Gemini demo answer"}', + output: { answer: "Gemini demo answer" }, + }; + }, + } as any; +} + +function createCodexDemoAgent() { + return { + id: "codex-observability-demo", + model: "demo-codex", + opts: { outputFormat: "stream-json", json: true }, + generate: async (args: { onStdout?: (text: string) => void }) => { + args.onStdout?.( + JSON.stringify({ + type: "assistant_message.delta", + delta: { text: "Codex demo answer" }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn.completed", + usage: { input_tokens: 12, output_tokens: 6 }, + message: { role: "assistant", content: "Codex demo answer" }, + }) + "\n", + ); + return { + text: '{"answer":"Codex demo answer"}', + output: { answer: "Codex demo answer" }, + }; + }, + } as any; +} + +class OpenAIAgentDemo { + id = "openai-demo-agent"; + model = "demo-sdk"; + tools = {}; + + async generate() { + return { + text: '{"answer":"Demo SDK answer"}', + output: { answer: "Demo SDK answer" }, + usage: { inputTokens: 5, outputTokens: 3 }, + }; + } +} + +export default smithers((ctx) => { + const failureMode = typeof ctx.input.failureMode === "string" ? 
ctx.input.failureMode : undefined; + const piAgent = createPiDemoAgent(failureMode); + + return ( + + + {`Emit a PI-like trace for observability verification. failureMode=${failureMode ?? "none"}`} + + {!failureMode && ( + <> + + {`Emit a Claude-like structured trace after the PI demo task completes.`} + + + {`Emit a Gemini-like structured trace after the Claude demo task completes.`} + + + {`Emit a Codex-like structured trace after the Gemini demo task completes.`} + + + {`Return a final-only SDK response after the structured PI, Claude, Gemini, and Codex demo tasks complete.`} + + + )} + + ); +}); diff --git a/workflows/agent-trace-otel-loop.tsx b/workflows/agent-trace-otel-loop.tsx new file mode 100644 index 00000000..3182f85d --- /dev/null +++ b/workflows/agent-trace-otel-loop.tsx @@ -0,0 +1,451 @@ +/** @jsxImportSource smithers */ +import { spawnSync } from "node:child_process"; +import { + createSmithers, + PiAgent, + Workflow, + Task, + Sequence, + Ralph, +} from "smithers"; +import { z } from "zod"; + +const DEFAULT_VALIDATION_COMMANDS = [ + "bun run typecheck", + "bun run test", +]; +const DEFAULT_PI_PROVIDER = "openai-codex"; +const DEFAULT_PI_MODEL = "gpt-5.4"; + +const ImplementOutput = z.object({ + summary: z.string(), + filesChanged: z.array(z.string()).default([]), + notableDecisions: z.array(z.string()).default([]), + remainingRisks: z.array(z.string()).default([]), +}); + +const ValidateOutput = z.object({ + allPassed: z.boolean(), + summary: z.string(), + gitStatus: z.string(), + gitDiffStat: z.string(), + results: z.array( + z.object({ + command: z.string(), + exitCode: z.number().int(), + passed: z.boolean(), + stdout: z.string(), + stderr: z.string(), + }), + ), +}); + +const ReviewOutput = z.object({ + approved: z.boolean(), + summary: z.string(), + blockingFeedback: z.array(z.string()).default([]), + strengths: z.array(z.string()).default([]), + nextActions: z.array(z.string()).default([]), +}); + +const FinalOutput = z.object({ + approved: 
z.boolean(), + reviewRounds: z.number().int().min(1), + implementationSummary: z.string(), + validationSummary: z.string(), + reviewSummary: z.string(), + blockingFeedback: z.array(z.string()).default([]), +}); + +const { smithers, outputs } = createSmithers( + { + implement: ImplementOutput, + validate: ValidateOutput, + review: ReviewOutput, + finalReport: FinalOutput, + }, + { + dbPath: "./workflows/agent-trace-otel-loop.db", + journalMode: "DELETE", + }, +); + +function createImplementerAgent(provider: string, model: string) { + return new PiAgent({ + provider, + model, + mode: "rpc", + yolo: true, + timeoutMs: 45 * 60 * 1000, + idleTimeoutMs: 5 * 60 * 1000, + thinking: "high", + appendSystemPrompt: [ + "You are the implementation agent for Smithers.", + "Work directly in the current repository.", + "Read the specification before changing code.", + "You must make the smallest coherent set of changes that satisfies the specification.", + "Do not claim verification you did not actually obtain.", + "Return exact changed file paths and a concise risk summary.", + ].join("\n"), + }); +} + +function createReviewerAgent(provider: string, model: string) { + return new PiAgent({ + provider, + model, + mode: "rpc", + yolo: true, + timeoutMs: 30 * 60 * 1000, + idleTimeoutMs: 5 * 60 * 1000, + thinking: "high", + appendSystemPrompt: [ + "You are an extremely strict implementation reviewer.", + "Your job is to reject weak, incomplete, under-verified, or cosmetically-correct changes.", + "Be direct and unsentimental.", + "Approve only if the implementation clearly satisfies the specification and the deterministic validation evidence is strong.", + "If validation is weak, incomplete, or missing, that is blocking feedback.", + "If logs/traces/verification claims are hand-wavy, that is blocking feedback.", + "Prefer a short list of high-severity blocking issues over a long list of minor nits.", + ].join("\n"), + }); +} + +function asString(value: unknown, fallback = ""): string 
{ + return typeof value === "string" ? value : fallback; +} + +function asStringArray(value: unknown): string[] { + return Array.isArray(value) ? value.filter((v): v is string => typeof v === "string") : []; +} + +function resolveValidationCommands(input: Record): string[] { + const commands = input.validationCommands; + const parsed = asStringArray(commands); + return parsed.length > 0 ? parsed : DEFAULT_VALIDATION_COMMANDS; +} + +function runShell(command: string) { + const result = spawnSync(command, { + shell: true, + cwd: process.cwd(), + env: process.env, + encoding: "utf8", + maxBuffer: 10 * 1024 * 1024, + }); + return { + command, + exitCode: result.status ?? (result.error ? 1 : 0), + passed: (result.status ?? 0) === 0 && !result.error, + stdout: (result.stdout ?? "").trim(), + stderr: [result.stderr ?? "", result.error?.message ?? ""].filter(Boolean).join("\n").trim(), + }; +} + +export default smithers((ctx) => { + const latestImplement = ctx.latest("implement", "implement"); + const latestValidate = ctx.latest("validate", "validate"); + const latestReview = ctx.latest("review", "review"); + + const approved = latestReview?.approved === true; + const piProvider = asString( + (ctx.input as Record).piProvider, + DEFAULT_PI_PROVIDER, + ); + const piModel = asString( + (ctx.input as Record).piModel, + DEFAULT_PI_MODEL, + ); + const implementer = createImplementerAgent(piProvider, piModel); + const reviewer = createReviewerAgent(piProvider, piModel); + const validationCommands = resolveValidationCommands(ctx.input as Record); + const branchName = asString((ctx.input as Record).branchName, "feat/agent-trace-otel-logs"); + const specPath = asString( + (ctx.input as Record).specPath, + "docs/concepts/agent-trace-otel-logs-spec.mdx", + ); + const composePath = asString( + (ctx.input as Record).composePath, + "observability/docker-compose.otel.yml", + ); + + return ( + + + + + + {`Read the spec first: ${specPath}. Treat it as the source of truth. 
The work is not “add some logs.” The work is: + +1. define and implement a canonical agent trace model +2. capture the fullest observable trace per agent family +3. export those canonical trace events as OTEL logs +4. verify end-to-end in Docker with Loki +5. prove correctness using the verification section in the spec + +Also note the current local stack in ${composePath} does not include Loki yet. It only starts collector, Prometheus, Tempo, and Grafana. Adding Loki and a collector logs pipeline is part of the assignment. + +What He Should Understand Before Coding +Read these first: + +- ${specPath} +- ${composePath} +- observability/otel-collector-config.yml +- docs/guides/monitoring-logs.mdx +- docs/runtime/events.mdx +- docs/integrations/pi-integration.mdx +- docs/integrations/cli-agents.mdx + +Then inspect the code paths that matter: + +- src/agents/PiAgent.ts +- src/agents/BaseCliAgent.ts +- src/events.ts +- src/effect/logging.ts +- src/observability/index.ts + +Expected Deliverable +He should not stop at implementation. He should hand back: + +- code +- updated Docker observability stack with Loki +- tests +- a reproducible verification procedure +- sample queries proving the system works + +Required Verification Environment +He should use Docker for the local stack: + +smithers observability --detach + +That CLI currently starts the stack from src/cli/index.ts:876, but he will need to extend it so the stack includes Loki too. + +Target local endpoints after his changes should be something like: + +- Grafana +- Prometheus +- Tempo +- Loki +- OTEL collector + +He should document exact ports if he changes them. + +How He Should Verify +Use the spec’s verification classes as the acceptance gate. In practice, he should prove all of this: + +1. Canonical trace capture works + Run a workflow with at least one agent that has a rich trace surface, preferably PiAgent. 
+ He must show: + - assistant text deltas + - visible thinking deltas if Pi emits them + - tool execution start/update/end + - final assistant message + - run/node/attempt correlation fields +2. OTEL logs are really exported + Not just local files. He must prove the canonical trace events land in Loki or another OTEL log backend through the collector. +3. Logs are queryable + In Grafana/Loki he must show queries that answer: + - all events for one run.id + - one node.id + attempt + - only assistant.thinking.delta + - only tool.execution.* + - only capture.error +4. Current metrics/traces still work + He must ensure adding logs does not break existing Prometheus/Tempo behavior. +5. Failure cases are classified + He should simulate at least: + - malformed upstream JSON + - collector unavailable + - subprocess exits early + - redaction case + and show the system reports capture.error, partial-observed, or capture-failed correctly. +6. Redaction works + He must include at least one fixture with a secret-like token and show it is absent from: + - canonical persisted payload + - OTEL log body + - artifacts if artifacts are enabled + +Recommended Test Matrix +Tell him to verify at least this matrix: + +- PiAgent: + - rich trace success case + - failure case + - redaction case +- one structured CLI agent: + - CodexAgent or ClaudeCodeAgent + - prove structured events are preserved if available + - otherwise prove truthful partial classification +- one SDK agent: + - OpenAIAgent or AnthropicAgent + - prove partial/final-only capture is explicit, not faked + +Suggested Demo Workflow +He should create or reuse one simple workflow specifically for observability verification: + +- one task using PiAgent +- one task using a second agent family +- at least one tool invocation +- stable run annotations +- easy-to-query workflow path + +Then run it with OTEL enabled, for example: + +export SMITHERS_OTEL_ENABLED=1 +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 +export 
OTEL_SERVICE_NAME=smithers-dev + +What “Done” Means +Use this language with him: + +Done means: + +- Loki is part of the local Docker stack +- the collector has a real logs pipeline +- canonical agent trace events exist +- at least Pi is high-fidelity per spec +- other agent families are truthfully classified +- there are automated tests for schema, ordering, fidelity, completeness, export, redaction, and failure handling +- there is a written verification section with commands and Grafana/Loki queries +- a reviewer can reproduce the verification from a clean checkout + +What He Should Return +Ask him to return: + +- the branch +- a short “verification report” with: + - commands run + - workflow used + - exact Grafana/Loki queries + - screenshots or copied query results + - which verification classes from the spec passed + - any explicitly deferred items + +Execution context: +- branch: ${branchName} +- PI provider/model: ${piProvider}/${piModel} +${latestReview ? `- Previous review summary: ${latestReview.summary}` : "- This is the first implementation pass."} +${latestReview?.blockingFeedback?.length ? `- Blocking feedback to address:\n${latestReview.blockingFeedback.map((item: string) => ` - ${item}`).join("\n")}` : ""} +${latestValidate ? `- Latest deterministic validation summary: ${latestValidate.summary}` : "- No deterministic validation results exist yet."} + +Return JSON only, with: +- summary +- filesChanged +- notableDecisions +- remainingRisks`} + + + + {() => { + const results = validationCommands.map(runShell); + const allPassed = results.every((result) => result.passed); + const gitStatus = runShell("git status --short").stdout; + const gitDiffStat = runShell("git diff --stat").stdout; + const failed = results.filter((result) => !result.passed); + + return { + allPassed, + summary: allPassed + ? 
`All deterministic validation commands passed (${results.length} commands).` + : `${failed.length} of ${results.length} deterministic validation commands failed.`, + gitStatus, + gitDiffStat, + results, + }; + }} + + + + {`Review the implementation brutally. + +This is a Smithers implementation-review loop for the OTEL agent trace logging work. + +Primary brief files: +- ${specPath} +- ${composePath} +- PI provider/model under review: ${piProvider}/${piModel} + +You are reviewing the current repository state, not an abstract plan. + +Implementation agent report: +${latestImplement ? JSON.stringify(latestImplement, null, 2) : "No implementation report was produced."} + +Deterministic validation report: +${latestValidate ? JSON.stringify(latestValidate, null, 2) : "No validation report was produced."} + +Your role: +- You are the backpressure mechanism. +- You are not a collaborator helping the implementer feel good. +- You are the reviewer responsible for preventing weak, premature, under-verified changes from passing. +- You must assume the implementer will take shortcuts unless the evidence proves otherwise. + +Verification duties: +- Audit whether the implementation actually satisfies the verification section of the spec. +- Require evidence, not intent. +- Treat missing end-to-end verification as a blocking failure. +- Treat unverified Docker + Loki + collector integration as a blocking failure. +- Treat absent or weak query examples as a blocking failure. +- Treat missing failure-mode validation as a blocking failure. +- Treat redaction claims without proof as a blocking failure. +- Treat “it should work” or “it is wired” language as evidence of incompleteness. + +Smithers coding-style duties: +- Enforce repository-local conventions over generic agent habits. +- Reject unnecessary abstraction, speculative generalization, and architecture without pressure. +- Reject code that hides truthfulness about agent fidelity. 
+- Reject code that collapses raw event boundaries into summaries when the spec requires preservation. +- Reject code that uses high-cardinality labels for large bodies. +- Reject implementation that weakens durable local truth in favor of OTEL-only export. +- Reject code that does not fit the existing Smithers runtime, observability, and event model cleanly. +- Reject hand-wavy docs that are not reproducible from a clean checkout. + +Specific rejection criteria: +- Reject any vague or partial implementation. +- Reject if the OTEL logs path is not actually verifiable with Docker + Loki + Grafana. +- Reject if Pi trace capture is not clearly higher fidelity than the other agents. +- Reject if unsupported agent capabilities are hand-waved instead of truthfully classified. +- Reject if deterministic validation is weak, missing, or ignored. +- Reject if the work appears to optimize appearance over correctness. +- Reject if tests cover only the happy path. +- Reject if the coworker handoff is missing commands, queries, or exact verification steps. + +Approval standard: +- Approve only if the implementation appears specification-complete, verification-complete, reproducible, and operationally believable. +- If there is any meaningful unresolved risk, set approved=false. +- Default to rejection unless the evidence is boringly strong. + +Feedback style: +- Be harsh, concise, and specific. +- Prioritize the highest-severity blockers. +- Do not soften critique. +- Do not pad with praise unless it is genuinely earned and still useful. + +Return JSON only, with: +- approved +- summary +- blockingFeedback +- strengths +- nextActions`} + + + + + + {() => ({ + approved: latestReview?.approved === true, + reviewRounds: ctx.iterationCount("review", "review"), + implementationSummary: latestImplement?.summary ?? "No implementation summary available.", + validationSummary: latestValidate?.summary ?? "No validation summary available.", + reviewSummary: latestReview?.summary ?? 
"No review summary available.", + blockingFeedback: latestReview?.blockingFeedback ?? [], + })} + + + + ); +}); diff --git a/workflows/real-agent-trace-check.tsx b/workflows/real-agent-trace-check.tsx new file mode 100644 index 00000000..686b9baa --- /dev/null +++ b/workflows/real-agent-trace-check.tsx @@ -0,0 +1,63 @@ +/** @jsxImportSource smithers */ +import { createSmithers, Task, Workflow } from "../src"; +import { ClaudeCodeAgent } from "../src/agents/ClaudeCodeAgent"; +import { AmpAgent } from "../src/agents/AmpAgent"; +import { GeminiAgent } from "../src/agents/GeminiAgent"; +import { z } from "zod"; + +const Output = z.object({ + answer: z.string(), +}); + +const { smithers, outputs } = createSmithers( + { + result: Output, + }, + { + dbPath: "./workflows/real-agent-trace-check.db", + journalMode: "DELETE", + }, +); + +function buildAgent(kind: string) { + switch (kind) { + case "claude": + return new ClaudeCodeAgent({ + yolo: true, + outputFormat: "stream-json", + maxOutputBytes: 512 * 1024, + }); + case "amp": + return new AmpAgent({ + yolo: true, + maxOutputBytes: 512 * 1024, + }); + case "gemini": + return new GeminiAgent({ + approvalMode: "yolo", + outputFormat: "stream-json", + maxOutputBytes: 512 * 1024, + }); + default: + throw new Error(`Unsupported agent kind: ${kind}`); + } +} + +export default smithers((ctx) => { + const agentKind = + typeof ctx.input.agent === "string" ? ctx.input.agent : "claude"; + + return ( + + + {`Respond briefly. State which agent you are, then end with JSON matching the required schema.`} + + + ); +}); From d34824c0ba09874e720e2eb3da50baf15d3ba725 Mon Sep 17 00:00:00 2001 From: Samuel Huber Date: Sat, 28 Mar 2026 15:37:16 +0100 Subject: [PATCH 2/7] fix(observability): tighten agent trace fidelity semantics The trace collector was overstating capture fidelity by treating synthesized terminal events as if they had been directly observed. 
It also kept inconsistent usage payloads and relied on fragile tool lifecycle identity reconstruction. This change separates directly observed events from backfilled canonical events so completeness reflects what Smithers actually captured for each attempt. It normalizes usage records to one schema, preserves later terminal usage data, and threads a stable toolCallId through Smithers tool events so start and finish records stay correlated. The result keeps coarse capture modes truthfully classified as final-only while marking real structured gaps as partial-observed, which makes the trace model easier to reason about and aligns it with the rest of the codebase's observability behavior. --- src/SmithersEvent.ts | 2 + src/agent-trace.ts | 306 ++++++++++++++++++++++++------------- src/tools/bash.ts | 7 +- src/tools/edit.ts | 17 ++- src/tools/grep.ts | 7 +- src/tools/logToolCall.ts | 7 +- src/tools/read.ts | 17 ++- src/tools/write.ts | 17 ++- tests/agent-trace.test.tsx | 46 +++++- 9 files changed, 306 insertions(+), 120 deletions(-) diff --git a/src/SmithersEvent.ts b/src/SmithersEvent.ts index d69a79c0..87e73384 100644 --- a/src/SmithersEvent.ts +++ b/src/SmithersEvent.ts @@ -109,6 +109,7 @@ export type SmithersEvent = nodeId: string; iteration: number; attempt: number; + toolCallId: string; toolName: string; seq: number; timestampMs: number; @@ -119,6 +120,7 @@ export type SmithersEvent = nodeId: string; iteration: number; attempt: number; + toolCallId: string; toolName: string; seq: number; status: "success" | "error"; diff --git a/src/agent-trace.ts b/src/agent-trace.ts index 2544906c..2efc2a05 100644 --- a/src/agent-trace.ts +++ b/src/agent-trace.ts @@ -148,6 +148,11 @@ export type AgentTraceCapabilityProfile = { persistedSessionArtifact: boolean; }; +type PushOptions = { + recordSeen?: boolean; + direct?: boolean; +}; + export const agentTraceCapabilities: Record< AgentFamily, AgentTraceCapabilityProfile @@ -566,6 +571,7 @@ export class AgentTraceCollector { 
private readonly events: CanonicalAgentTraceEvent[] = []; private readonly rawArtifactRefs: string[] = []; private readonly seenKinds = new Set(); + private readonly directKinds = new Set(); private readonly expectedKinds = new Set(); private readonly failures: string[] = []; private readonly warnings: string[] = []; @@ -641,33 +647,18 @@ export class AgentTraceCollector { text, false, undefined, - true, + { recordSeen: true, direct: true }, ); } - const usage = result?.usage ?? result?.totalUsage; - if (this.hasMeaningfulUsage(usage) && !this.seenKinds.has("usage")) { + const usage = this.normalizeUsage(result?.usage ?? result?.totalUsage); + if (usage) { this.push( "usage", - { - inputTokens: usage.inputTokens ?? usage.promptTokens ?? 0, - outputTokens: usage.outputTokens ?? usage.completionTokens ?? 0, - cacheReadTokens: - usage.inputTokenDetails?.cacheReadTokens ?? - usage.cacheReadTokens ?? - undefined, - cacheWriteTokens: - usage.inputTokenDetails?.cacheWriteTokens ?? - usage.cacheWriteTokens ?? - undefined, - reasoningTokens: - usage.outputTokenDetails?.reasoningTokens ?? - usage.reasoningTokens ?? 
- undefined, - }, + usage, usage, false, "usage", - true, + { recordSeen: true, direct: true }, ); } } @@ -680,7 +671,7 @@ export class AgentTraceCollector { { error: this.failures.at(-1) }, false, "error", - true, + { recordSeen: true, direct: true }, ); } @@ -693,14 +684,14 @@ export class AgentTraceCollector { !this.seenKinds.has("assistant.message.final") && this.finalText ) { - this.push( - "assistant.message.final", - { text: this.finalText }, - this.finalText, - false, - undefined, - true, - ); + this.push( + "assistant.message.final", + { text: this.finalText }, + this.finalText, + false, + undefined, + { recordSeen: true }, + ); } if ( (this.captureMode === "cli-json-stream" || @@ -718,13 +709,13 @@ export class AgentTraceCollector { { reason: "missing-terminal-event" }, false, "capture", - true, + { recordSeen: true, direct: true }, ); } let traceCompleteness = this.resolveCompleteness(); let missingExpectedEventKinds = [...this.expectedKinds].filter( - (kind) => !this.seenKinds.has(kind), + (kind) => !this.directKinds.has(kind), ); let summary: AgentTraceSummary = { traceVersion: "1", @@ -766,7 +757,7 @@ export class AgentTraceCollector { }, false, "artifact", - true, + { recordSeen: true, direct: true }, ); summary = { ...summary, rawArtifactRefs: [...this.rawArtifactRefs] }; await this.rewriteNdjson(artifactPath, summary); @@ -778,11 +769,11 @@ export class AgentTraceCollector { { reason: "artifact-write-failed", error: persistedArtifact.error }, false, "artifact", - true, + { recordSeen: true, direct: true }, ); traceCompleteness = this.resolveCompleteness(); missingExpectedEventKinds = [...this.expectedKinds].filter( - (kind) => !this.seenKinds.has(kind), + (kind) => !this.directKinds.has(kind), ); summary = { ...summary, @@ -833,12 +824,18 @@ export class AgentTraceCollector { private processChunk(stream: "stdout" | "stderr", text: string) { if (stream === "stderr") { this.stderrBuffer += text; - this.push("stderr", { text }, text, true, stream, 
true); + this.push("stderr", { text }, text, true, stream, { + recordSeen: true, + direct: true, + }); return; } this.stdoutBuffer += text; if (this.captureMode === "cli-text" || this.captureMode === "sdk-events") { - this.push("stdout", { text }, text, true, stream, true); + this.push("stdout", { text }, text, true, stream, { + recordSeen: true, + direct: true, + }); return; } const lines = this.stdoutBuffer.split(/\r?\n/); @@ -868,7 +865,7 @@ export class AgentTraceCollector { line, true, "stdout", - true, + { recordSeen: true, direct: true }, ); } @@ -884,7 +881,7 @@ export class AgentTraceCollector { line, true, "stdout", - true, + { recordSeen: true, direct: true }, ); return; } @@ -909,22 +906,31 @@ export class AgentTraceCollector { parsed, true, rawType, - true, + { recordSeen: true, direct: true }, ); } else { - this.push("stdout", { eventType: rawType }, parsed, true, rawType, true); + this.push("stdout", { eventType: rawType }, parsed, true, rawType, { + recordSeen: true, + direct: true, + }); } - const usage = parsed?.message?.usage; - if (usage && !this.seenKinds.has("usage")) { - this.push("usage", usage, parsed, true, rawType, true); + const usage = this.normalizeUsage(parsed?.message?.usage); + if (usage) { + this.push("usage", usage, parsed, true, rawType, { + recordSeen: true, + direct: true, + }); } return; } if (this.agentFamily === "claude-code" && rawType === "result") { - const usage = parsed?.usage; - if (usage && !this.seenKinds.has("usage")) { - this.push("usage", usage, parsed, true, rawType, true); + const usage = this.normalizeUsage(parsed?.usage); + if (usage) { + this.push("usage", usage, parsed, true, rawType, { + recordSeen: true, + direct: true, + }); } const text = this.extractGenericMessageText(parsed); if (typeof text === "string" && text) { @@ -948,20 +954,19 @@ export class AgentTraceCollector { parsed, true, rawType, - true, + { recordSeen: true, direct: true }, ); return; } } if (this.agentFamily === "gemini" && rawType === 
"result" && parsed?.stats) { - const usage = { - inputTokens: parsed.stats.input_tokens ?? parsed.stats.input, - outputTokens: parsed.stats.output_tokens, - totalTokens: parsed.stats.total_tokens, - }; - if (this.hasMeaningfulUsage(usage) && !this.seenKinds.has("usage")) { - this.push("usage", usage, parsed, true, rawType, true); + const usage = this.normalizeUsage(parsed.stats); + if (usage) { + this.push("usage", usage, parsed, true, rawType, { + recordSeen: true, + direct: true, + }); } return; } @@ -977,7 +982,7 @@ export class AgentTraceCollector { parsed, true, rawType, - true, + { recordSeen: true, direct: true }, ); return; } @@ -1000,7 +1005,7 @@ export class AgentTraceCollector { parsed, true, rawType, - true, + { recordSeen: true, direct: true }, ); return; } @@ -1021,7 +1026,7 @@ export class AgentTraceCollector { parsed, true, rawType, - true, + { recordSeen: true, direct: true }, ); return; } @@ -1041,7 +1046,7 @@ export class AgentTraceCollector { parsed, true, rawType, - true, + { recordSeen: true, direct: true }, ); const text = this.extractGenericMessageText(parsed); if (text) { @@ -1052,11 +1057,15 @@ export class AgentTraceCollector { parsed, true, rawType, - true, + { recordSeen: true, direct: true }, ); } - if (parsed?.usage) { - this.push("usage", parsed.usage, parsed, true, rawType, true); + const usage = this.normalizeUsage(parsed?.usage); + if (usage) { + this.push("usage", usage, parsed, true, rawType, { + recordSeen: true, + direct: true, + }); } return; } @@ -1072,8 +1081,9 @@ export class AgentTraceCollector { parsed, true, rawType, - true, + { recordSeen: true, direct: true }, ); + this.expectedKinds.add("tool.execution.end"); return; } @@ -1091,8 +1101,9 @@ export class AgentTraceCollector { parsed, true, rawType, - true, + { recordSeen: true, direct: true }, ); + this.expectedKinds.add("tool.execution.end"); return; } @@ -1107,7 +1118,7 @@ export class AgentTraceCollector { parsed, true, rawType, - true, + { recordSeen: true, 
direct: true }, ); return; } @@ -1119,13 +1130,19 @@ export class AgentTraceCollector { parsed, true, rawType, - true, + { recordSeen: true, direct: true }, ); return; } if (rawType === "turn.completed" && parsed?.usage) { - this.push("usage", parsed.usage, parsed, true, rawType, true); + const usage = this.normalizeUsage(parsed.usage); + if (usage) { + this.push("usage", usage, parsed, true, rawType, { + recordSeen: true, + direct: true, + }); + } const text = this.extractGenericMessageText(parsed); if (text) { this.setFinalAssistantText(text); @@ -1135,13 +1152,16 @@ export class AgentTraceCollector { parsed, true, rawType, - true, + { recordSeen: true, direct: true }, ); } return; } - this.push("stdout", { eventType: rawType }, parsed, true, rawType, true); + this.push("stdout", { eventType: rawType }, parsed, true, rawType, { + recordSeen: true, + direct: true, + }); } private processPiEvent(parsed: any) { @@ -1155,7 +1175,7 @@ export class AgentTraceCollector { parsed, true, type, - true, + { recordSeen: true, direct: true }, ); return; case "agent_end": @@ -1165,7 +1185,7 @@ export class AgentTraceCollector { parsed, true, type, - true, + { recordSeen: true, direct: true }, ); return; case "turn_start": @@ -1175,8 +1195,9 @@ export class AgentTraceCollector { parsed, true, type, - true, + { recordSeen: true, direct: true }, ); + this.expectedKinds.add("turn.end"); return; case "turn_end": { this.push( @@ -1185,7 +1206,7 @@ export class AgentTraceCollector { parsed, true, type, - true, + { recordSeen: true, direct: true }, ); const finalText = this.extractText(parsed?.message); if (finalText) { @@ -1196,17 +1217,18 @@ export class AgentTraceCollector { parsed?.message, true, type, - true, + { recordSeen: true, direct: true }, ); } - if (parsed?.message?.usage) { + const usage = this.normalizeUsage(parsed?.message?.usage); + if (usage) { this.push( "usage", - parsed.message.usage, + usage, parsed.message.usage, true, "usage", - true, + { recordSeen: true, 
direct: true }, ); } return; @@ -1218,7 +1240,7 @@ export class AgentTraceCollector { parsed, true, type, - true, + { recordSeen: true, direct: true }, ); return; case "message_end": { @@ -1228,7 +1250,7 @@ export class AgentTraceCollector { parsed, true, type, - true, + { recordSeen: true, direct: true }, ); const finalText = this.extractText(parsed?.message); if (parsed?.message?.role === "assistant" && finalText) { @@ -1239,7 +1261,7 @@ export class AgentTraceCollector { parsed?.message, true, type, - true, + { recordSeen: true, direct: true }, ); } return; @@ -1254,7 +1276,7 @@ export class AgentTraceCollector { parsed, true, evt.type, - true, + { recordSeen: true, direct: true }, ); return; } @@ -1268,7 +1290,7 @@ export class AgentTraceCollector { parsed, true, evt.type, - true, + { recordSeen: true, direct: true }, ); return; } @@ -1278,7 +1300,7 @@ export class AgentTraceCollector { parsed, true, type, - true, + { recordSeen: true, direct: true }, ); return; } @@ -1289,8 +1311,9 @@ export class AgentTraceCollector { parsed, true, type, - true, + { recordSeen: true, direct: true }, ); + this.expectedKinds.add("tool.execution.end"); return; case "tool_execution_update": this.push( @@ -1299,8 +1322,9 @@ export class AgentTraceCollector { parsed, true, type, - true, + { recordSeen: true, direct: true }, ); + this.expectedKinds.add("tool.execution.end"); return; case "tool_execution_end": this.push( @@ -1309,7 +1333,7 @@ export class AgentTraceCollector { parsed, true, type, - true, + { recordSeen: true, direct: true }, ); return; case "auto_compaction_start": @@ -1319,7 +1343,7 @@ export class AgentTraceCollector { parsed, true, type, - true, + { recordSeen: true, direct: true }, ); return; case "auto_compaction_end": @@ -1329,7 +1353,7 @@ export class AgentTraceCollector { parsed, true, type, - true, + { recordSeen: true, direct: true }, ); return; case "auto_retry_start": @@ -1339,7 +1363,7 @@ export class AgentTraceCollector { parsed, true, type, - true, + { 
recordSeen: true, direct: true }, ); return; case "auto_retry_end": @@ -1349,11 +1373,14 @@ export class AgentTraceCollector { parsed, true, type, - true, + { recordSeen: true, direct: true }, ); return; default: - this.push("stdout", { eventType: type }, parsed, true, type, true); + this.push("stdout", { eventType: type }, parsed, true, type, { + recordSeen: true, + direct: true, + }); return; } } @@ -1493,27 +1520,28 @@ export class AgentTraceCollector { this.push( "tool.execution.start", { - toolCallId: `${event.toolName}:${event.seq}`, + toolCallId: event.toolCallId, toolName: event.toolName, }, event, false, event.type, - true, + { recordSeen: true, direct: true }, ); + this.expectedKinds.add("tool.execution.end"); } if (event.type === "ToolCallFinished") { this.push( "tool.execution.end", { - toolCallId: `${event.toolName}:${event.seq}`, + toolCallId: event.toolCallId, toolName: event.toolName, isError: event.status === "error", }, event, false, event.type, - true, + { recordSeen: true, direct: true }, ); } if (event.type === "TokenUsageReported") { @@ -1531,7 +1559,7 @@ export class AgentTraceCollector { event, false, event.type, - true, + { recordSeen: true, direct: true }, ); } } @@ -1539,12 +1567,6 @@ export class AgentTraceCollector { private resolveCompleteness(): TraceCompleteness { if (this.failures.length > 0) return "capture-failed"; - const missing = [...this.expectedKinds].filter( - (kind) => !this.seenKinds.has(kind), - ); - if (missing.length > 0 || this.warnings.length > 0) - return "partial-observed"; - const richKinds = new Set([ "session.start", "session.end", @@ -1564,11 +1586,26 @@ export class AgentTraceCollector { "compaction.start", "compaction.end", ]); - const sawRichStructure = [...this.seenKinds].some((kind) => + const sawRichStructure = [...this.directKinds].some((kind) => richKinds.has(kind), ); - if (this.captureMode === "sdk-events" || this.captureMode === "cli-text") { + const coarseCaptureMode = + this.captureMode === 
"sdk-events" || + this.captureMode === "cli-text" || + this.captureMode === "cli-json"; + + if (!sawRichStructure && this.warnings.length === 0 && coarseCaptureMode) { + return "final-only"; + } + + const missing = [...this.expectedKinds].filter( + (kind) => !this.directKinds.has(kind), + ); + if (missing.length > 0 || this.warnings.length > 0) + return "partial-observed"; + + if (coarseCaptureMode) { return sawRichStructure ? "partial-observed" : "final-only"; } @@ -1597,13 +1634,71 @@ export class AgentTraceCollector { ); } + private normalizeUsage(usage: any): Record | null { + if (!usage || typeof usage !== "object") return null; + + const normalized = { + ...(typeof (usage.inputTokens ?? usage.promptTokens ?? usage.input_tokens ?? usage.input ?? usage.models?.gemini?.tokens?.input) === "number" + ? { + inputTokens: + usage.inputTokens ?? + usage.promptTokens ?? + usage.input_tokens ?? + usage.input ?? + usage.models?.gemini?.tokens?.input, + } + : {}), + ...(typeof (usage.outputTokens ?? usage.completionTokens ?? usage.output_tokens ?? usage.output ?? usage.models?.gemini?.tokens?.output) === "number" + ? { + outputTokens: + usage.outputTokens ?? + usage.completionTokens ?? + usage.output_tokens ?? + usage.output ?? + usage.models?.gemini?.tokens?.output, + } + : {}), + ...(typeof (usage.cacheReadTokens ?? usage.cache_read_input_tokens ?? usage.inputTokenDetails?.cacheReadTokens ?? usage.cache_read_tokens) === "number" + ? { + cacheReadTokens: + usage.cacheReadTokens ?? + usage.cache_read_input_tokens ?? + usage.inputTokenDetails?.cacheReadTokens ?? + usage.cache_read_tokens, + } + : {}), + ...(typeof (usage.cacheWriteTokens ?? usage.cache_write_input_tokens ?? usage.inputTokenDetails?.cacheWriteTokens ?? usage.cache_write_tokens) === "number" + ? { + cacheWriteTokens: + usage.cacheWriteTokens ?? + usage.cache_write_input_tokens ?? + usage.inputTokenDetails?.cacheWriteTokens ?? + usage.cache_write_tokens, + } + : {}), + ...(typeof (usage.reasoningTokens ?? 
usage.reasoning_tokens ?? usage.outputTokenDetails?.reasoningTokens) === "number" + ? { + reasoningTokens: + usage.reasoningTokens ?? + usage.reasoning_tokens ?? + usage.outputTokenDetails?.reasoningTokens, + } + : {}), + ...(typeof (usage.totalTokens ?? usage.total_tokens) === "number" + ? { totalTokens: usage.totalTokens ?? usage.total_tokens } + : {}), + }; + + return this.hasMeaningfulUsage(normalized) ? normalized : null; + } + private push( kind: CanonicalAgentTraceEventKind, payload: Record | null, raw: unknown, observed: boolean, rawType?: string, - recordSeen = false, + options: PushOptions = {}, ) { const redactedPayload = redactValue(payload); const redactedRaw = redactValue(raw); @@ -1639,7 +1734,8 @@ export class AgentTraceCollector { annotations: this.annotations, }; this.events.push(event); - if (recordSeen) this.seenKinds.add(kind); + if (options.recordSeen) this.seenKinds.add(kind); + if (options.direct) this.directKinds.add(kind); } private async persistNdjson( diff --git a/src/tools/bash.ts b/src/tools/bash.ts index 90cb9497..577f92c4 100644 --- a/src/tools/bash.ts +++ b/src/tools/bash.ts @@ -24,8 +24,11 @@ export function bashToolEffect( const allowNetwork = ctx?.allowNetwork ?? false; const started = nowMs(); let seq: number | undefined; + let toolCallId: string | undefined; return Effect.gen(function* () { - seq = yield* logToolCallStartEffect("bash", started); + const startedCall = yield* logToolCallStartEffect("bash", started); + seq = startedCall?.seq; + toolCallId = startedCall?.toolCallId; const cwd = opts?.cwd ? 
yield* fromSync("resolve sandbox path", () => resolveSandboxPath(root, opts.cwd!), @@ -78,6 +81,7 @@ export function bashToolEffect( undefined, started, seq, + toolCallId, ); return output; }).pipe( @@ -98,6 +102,7 @@ export function bashToolEffect( error, started, seq, + toolCallId, ), ), ); diff --git a/src/tools/edit.ts b/src/tools/edit.ts index 9436fdb4..72de673f 100644 --- a/src/tools/edit.ts +++ b/src/tools/edit.ts @@ -20,8 +20,11 @@ export function editToolEffect(path: string, patch: string) { const patchBytes = Buffer.byteLength(patch, "utf8"); const logInput = { path, patchBytes, patchHash: sha256Hex(patch) }; let seq: number | undefined; + let toolCallId: string | undefined; return Effect.gen(function* () { - seq = yield* logToolCallStartEffect("edit", started); + const startedCall = yield* logToolCallStartEffect("edit", started); + seq = startedCall?.seq; + toolCallId = startedCall?.toolCallId; const fs = yield* FileSystem.FileSystem; const resolved = yield* fromSync("resolve sandbox path", () => resolveSandboxPath(root, path), @@ -50,6 +53,7 @@ export function editToolEffect(path: string, patch: string) { undefined, started, seq, + toolCallId, ); return "ok"; }).pipe( @@ -61,7 +65,16 @@ export function editToolEffect(path: string, patch: string) { }), Effect.withLogSpan("tool:edit"), Effect.tapError((error) => - logToolCallEffect("edit", logInput, null, "error", error, started, seq), + logToolCallEffect( + "edit", + logInput, + null, + "error", + error, + started, + seq, + toolCallId, + ), ), ); } diff --git a/src/tools/grep.ts b/src/tools/grep.ts index c9a99957..7b4f3eb9 100644 --- a/src/tools/grep.ts +++ b/src/tools/grep.ts @@ -15,8 +15,11 @@ export function grepToolEffect(pattern: string, path?: string) { const root = ctx?.rootDir ?? 
process.cwd(); const started = nowMs(); let seq: number | undefined; + let toolCallId: string | undefined; return Effect.gen(function* () { - seq = yield* logToolCallStartEffect("grep", started); + const startedCall = yield* logToolCallStartEffect("grep", started); + seq = startedCall?.seq; + toolCallId = startedCall?.toolCallId; const resolvedRoot = yield* fromSync("resolve sandbox path", () => resolveSandboxPath(root, path ?? "."), ); @@ -45,6 +48,7 @@ export function grepToolEffect(pattern: string, path?: string) { undefined, started, seq, + toolCallId, ); return result.stdout; }).pipe( @@ -64,6 +68,7 @@ export function grepToolEffect(pattern: string, path?: string) { error, started, seq, + toolCallId, ), ), ); diff --git a/src/tools/logToolCall.ts b/src/tools/logToolCall.ts index 34b06f37..3110d893 100644 --- a/src/tools/logToolCall.ts +++ b/src/tools/logToolCall.ts @@ -14,11 +14,13 @@ export function logToolCallEffect( error?: unknown, startedAtMs?: number, seqOverride?: number, + toolCallIdOverride?: string, ) { const ctx = getToolContext(); if (!ctx) return Effect.void; const seq = typeof seqOverride === "number" ? seqOverride : nextToolSeq(ctx); + const toolCallId = toolCallIdOverride ?? `${toolName}:${seq}`; const started = startedAtMs ?? nowMs(); const finished = nowMs(); const durationMs = finished - started; @@ -32,6 +34,7 @@ export function logToolCallEffect( nodeId: ctx.nodeId, iteration: ctx.iteration, attempt: ctx.attempt, + toolCallId, toolName, seq, status, @@ -89,6 +92,7 @@ export function logToolCallStartEffect( const ctx = getToolContext(); if (!ctx) return Effect.succeed(undefined); const seq = nextToolSeq(ctx); + const toolCallId = `${toolName}:${seq}`; const started = startedAtMs ?? 
nowMs(); void ctx.emitEvent?.({ type: "ToolCallStarted", @@ -96,11 +100,12 @@ export function logToolCallStartEffect( nodeId: ctx.nodeId, iteration: ctx.iteration, attempt: ctx.attempt, + toolCallId, toolName, seq, timestampMs: started, }); - return Effect.succeed(seq); + return Effect.succeed({ seq, toolCallId }); } export function truncateToBytes(text: string, maxBytes: number): string { diff --git a/src/tools/read.ts b/src/tools/read.ts index bcd4cf23..34ddf58c 100644 --- a/src/tools/read.ts +++ b/src/tools/read.ts @@ -19,8 +19,11 @@ export function readToolEffect(path: string) { const root = ctx?.rootDir ?? process.cwd(); const started = nowMs(); let seq: number | undefined; + let toolCallId: string | undefined; return Effect.gen(function* () { - seq = yield* logToolCallStartEffect("read", started); + const startedCall = yield* logToolCallStartEffect("read", started); + seq = startedCall?.seq; + toolCallId = startedCall?.toolCallId; const fs = yield* FileSystem.FileSystem; const resolved = yield* fromSync("resolve sandbox path", () => resolveSandboxPath(root, path), @@ -41,6 +44,7 @@ export function readToolEffect(path: string) { undefined, started, seq, + toolCallId, ); return output; }).pipe( @@ -51,7 +55,16 @@ export function readToolEffect(path: string) { }), Effect.withLogSpan("tool:read"), Effect.tapError((error) => - logToolCallEffect("read", { path }, null, "error", error, started, seq), + logToolCallEffect( + "read", + { path }, + null, + "error", + error, + started, + seq, + toolCallId, + ), ), ); } diff --git a/src/tools/write.ts b/src/tools/write.ts index 3783f057..e5c73698 100644 --- a/src/tools/write.ts +++ b/src/tools/write.ts @@ -20,8 +20,11 @@ export function writeToolEffect(path: string, content: string) { const contentBytes = Buffer.byteLength(content, "utf8"); const logInput = { path, contentBytes, contentHash: sha256Hex(content) }; let seq: number | undefined; + let toolCallId: string | undefined; return Effect.gen(function* () { - seq = 
yield* logToolCallStartEffect("write", started); + const startedCall = yield* logToolCallStartEffect("write", started); + seq = startedCall?.seq; + toolCallId = startedCall?.toolCallId; const fs = yield* FileSystem.FileSystem; const resolved = yield* fromSync("resolve sandbox path", () => resolveSandboxPath(root, path), @@ -40,6 +43,7 @@ export function writeToolEffect(path: string, content: string) { undefined, started, seq, + toolCallId, ); return "ok"; }).pipe( @@ -51,7 +55,16 @@ export function writeToolEffect(path: string, content: string) { }), Effect.withLogSpan("tool:write"), Effect.tapError((error) => - logToolCallEffect("write", logInput, null, "error", error, started, seq), + logToolCallEffect( + "write", + logInput, + null, + "error", + error, + started, + seq, + toolCallId, + ), ), ); } diff --git a/tests/agent-trace.test.tsx b/tests/agent-trace.test.tsx index 64f6cf6f..60e8e5d8 100644 --- a/tests/agent-trace.test.tsx +++ b/tests/agent-trace.test.tsx @@ -55,13 +55,17 @@ describe("agent trace capture", () => { assistantMessageEvent: { type: "text_delta", delta: "hello" }, }) + "\n", ); - await runPromise(logToolCallStartEffect("bash")); + const startedCall = await runPromise(logToolCallStartEffect("bash")); await runPromise( logToolCallEffect( "bash", { cmd: "echo secret=sk_live_123456789" }, { ok: true }, "success", + undefined, + undefined, + startedCall?.seq, + startedCall?.toolCallId, ), ); args.onStdout?.( @@ -411,7 +415,7 @@ describe("agent trace capture", () => { .filter((event: any) => event.type === "AgentTraceEvent") .map((event: any) => JSON.parse(event.payloadJson).trace); - expect(summary.traceCompleteness).toBe("full-observed"); + expect(summary.traceCompleteness).toBe("partial-observed"); expect(summary.unsupportedEventKinds).not.toContain("artifact.created"); expect( traceEvents.some( @@ -427,7 +431,21 @@ describe("agent trace capture", () => { event.payload.text === "claude real final", ), ).toBe(true); - 
expect(traceEvents.filter((event: any) => event.event.kind === "usage").length).toBe(1); + expect(traceEvents.filter((event: any) => event.event.kind === "usage").length).toBe(2); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "usage" && + event.payload.outputTokens === 20, + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.message.final" && + event.source.observed === false, + ), + ).toBe(true); cleanup(); }); @@ -661,7 +679,7 @@ describe("agent trace capture", () => { .filter((event: any) => event.type === "AgentTraceEvent") .map((event: any) => JSON.parse(event.payloadJson).trace); - expect(summary.traceCompleteness).toBe("full-observed"); + expect(summary.traceCompleteness).toBe("partial-observed"); expect( traceEvents.some( (event: any) => @@ -683,6 +701,13 @@ describe("agent trace capture", () => { event.payload.text === "gemini real final", ), ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.message.final" && + event.source.observed === false, + ), + ).toBe(true); expect( traceEvents.some( (event: any) => @@ -876,7 +901,7 @@ describe("agent trace capture", () => { id = "openai-sdk-tools-fake"; tools = {}; async generate() { - const seq = await runPromise(logToolCallStartEffect("bash")); + const startedCall = await runPromise(logToolCallStartEffect("bash")); await runPromise( logToolCallEffect( "bash", @@ -885,7 +910,8 @@ describe("agent trace capture", () => { "success", undefined, undefined, - typeof seq === "number" ? 
seq : undefined, + startedCall?.seq, + startedCall?.toolCallId, ), ); return { @@ -930,6 +956,14 @@ describe("agent trace capture", () => { (event: any) => event.event.kind === "tool.execution.end", ), ).toBe(true); + const toolStart = traceEvents.find( + (event: any) => event.event.kind === "tool.execution.start", + ); + const toolEnd = traceEvents.find( + (event: any) => event.event.kind === "tool.execution.end", + ); + expect(toolStart?.payload.toolCallId).toBe("bash:1"); + expect(toolEnd?.payload.toolCallId).toBe("bash:1"); cleanup(); }); From 9814a6e0096e534aba36eafad41780b84d31a734 Mon Sep 17 00:00:00 2001 From: Samuel Huber Date: Sat, 28 Mar 2026 15:46:54 +0100 Subject: [PATCH 3/7] fix(observability): capture real Codex and Claude terminal traces truthfully Smithers was still under-reporting fidelity for real Codex and Claude CLI runs even after the earlier trace semantics cleanup. Codex emitted structured dotted JSONL events that we were treating as coarse text in practice, and Claude exposed an observed terminal result that we were only backfilling into a synthetic final message. This change makes Codex self-describe its stream-json mode, normalizes real dotted Codex events into canonical turn and final-message records, and treats Claude result events as directly observed final assistant output. It also hardens the observability verification script so publication checks assert the actual trace summary semantics for Claude and Codex instead of only checking that logs exist. That keeps the published bookmark aligned with the behavior we verified manually and protects the trace fidelity guarantees the PR is trying to establish. 
--- scripts/verify-observability.sh | 13 ++++ src/agent-trace.ts | 51 +++++++++++++++ src/agents/CodexAgent.ts | 5 ++ tests/agent-trace.test.tsx | 108 +++++++++++++++++++++++++++++++- 4 files changed, 175 insertions(+), 2 deletions(-) diff --git a/scripts/verify-observability.sh b/scripts/verify-observability.sh index e5353fac..54219ee2 100755 --- a/scripts/verify-observability.sh +++ b/scripts/verify-observability.sh @@ -78,6 +78,11 @@ append_json() { } >> "$OUT_DIR/report.md" } +trace_summary_json() { + local path=$1 + node -e "const fs=require('fs'); const rows=fs.readFileSync(process.argv[1],'utf8').trim().split('\n').map(JSON.parse); console.log(JSON.stringify(rows.at(-1).summary));" "$path" +} + echo "[verify] reset stack" "$ROOT_DIR/scripts/obs-reset.sh" @@ -147,6 +152,12 @@ retry_until loki-codex-text-delta 6 '.streams >= 1' \ retry_until loki-codex-usage 6 '.streams >= 1' \ "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"codex-structured-trace\" | event_kind=\"usage\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" +retry_until trace-summary-claude 6 '.captureMode == "cli-json-stream" and .traceCompleteness == "full-observed" and (.missingExpectedEventKinds | length) == 0' \ + "trace_summary_json '$ROOT_DIR/workflows/.smithers/executions/$RUN_ID/logs/agent-trace/claude-structured-trace-0-1.ndjson'" + +retry_until trace-summary-codex 6 '.captureMode == "cli-json-stream" and .traceCompleteness == "full-observed" and (.missingExpectedEventKinds | length) == 0' \ + "trace_summary_json '$ROOT_DIR/workflows/.smithers/executions/$RUN_ID/logs/agent-trace/codex-structured-trace-0-1.ndjson'" + retry_until loki-capture-errors 6 '.streams >= 1' \ "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$FAIL_RUN_ID\" | event_kind=\"capture.error\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" @@ -206,6 +217,8 @@ append_json loki-claude-usage 
"$OUT_DIR/loki-claude-usage.json" append_json loki-gemini-usage "$OUT_DIR/loki-gemini-usage.json" append_json loki-codex-text-delta "$OUT_DIR/loki-codex-text-delta.json" append_json loki-codex-usage "$OUT_DIR/loki-codex-usage.json" +append_json trace-summary-claude "$OUT_DIR/trace-summary-claude.json" +append_json trace-summary-codex "$OUT_DIR/trace-summary-codex.json" append_json loki-capture-errors "$OUT_DIR/loki-capture-errors.json" append_json loki-redaction-presence "$OUT_DIR/loki-redaction-presence.json" append_json loki-secret-absence "$OUT_DIR/loki-secret-absence.json" diff --git a/src/agent-trace.ts b/src/agent-trace.ts index 2efc2a05..3dd8ba1b 100644 --- a/src/agent-trace.ts +++ b/src/agent-trace.ts @@ -935,6 +935,14 @@ export class AgentTraceCollector { const text = this.extractGenericMessageText(parsed); if (typeof text === "string" && text) { this.setFinalAssistantText(text); + this.push( + "assistant.message.final", + { text }, + parsed, + true, + rawType, + { recordSeen: true, direct: true }, + ); } return; } @@ -971,6 +979,43 @@ export class AgentTraceCollector { return; } + if (this.agentFamily === "codex" && rawType === "thread.started") { + this.push("stdout", { eventType: rawType }, parsed, true, rawType, { + recordSeen: true, + direct: true, + }); + return; + } + + if (this.agentFamily === "codex" && rawType === "turn.started") { + this.push("turn.start", {}, parsed, true, rawType, { + recordSeen: true, + direct: true, + }); + this.expectedKinds.add("turn.end"); + return; + } + + if ( + this.agentFamily === "codex" && + rawType === "item.completed" && + parsed?.item?.type === "agent_message" + ) { + const text = this.extractGenericMessageText(parsed.item); + if (typeof text === "string" && text) { + this.setFinalAssistantText(text); + this.push( + "assistant.message.final", + { text }, + parsed, + true, + rawType, + { recordSeen: true, direct: true }, + ); + return; + } + } + if ( ["message_start", "assistant_message_start", 
"response.started"].includes( rawType, @@ -1143,6 +1188,12 @@ export class AgentTraceCollector { direct: true, }); } + if (this.agentFamily === "codex") { + this.push("turn.end", {}, parsed, true, rawType, { + recordSeen: true, + direct: true, + }); + } const text = this.extractGenericMessageText(parsed); if (text) { this.setFinalAssistantText(text); diff --git a/src/agents/CodexAgent.ts b/src/agents/CodexAgent.ts index 28d93733..d972cbbc 100644 --- a/src/agents/CodexAgent.ts +++ b/src/agents/CodexAgent.ts @@ -35,6 +35,11 @@ export class CodexAgent extends BaseCliAgent { private readonly opts: CodexAgentOptions; constructor(opts: CodexAgentOptions = {}) { + opts = { + ...opts, + json: opts.json ?? true, + outputFormat: opts.outputFormat ?? "stream-json", + }; super(opts); this.opts = opts; } diff --git a/tests/agent-trace.test.tsx b/tests/agent-trace.test.tsx index 60e8e5d8..77664e3e 100644 --- a/tests/agent-trace.test.tsx +++ b/tests/agent-trace.test.tsx @@ -5,6 +5,7 @@ import { tmpdir } from "node:os"; import { join } from "node:path"; import { Workflow, Task, runWorkflow } from "../src"; import { canonicalTraceEventToOtelLogRecord } from "../src/agent-trace"; +import { CodexAgent } from "../src/agents"; import { SmithersDb } from "../src/db/adapter"; import { runPromise } from "../src/effect/runtime"; import { @@ -415,7 +416,7 @@ describe("agent trace capture", () => { .filter((event: any) => event.type === "AgentTraceEvent") .map((event: any) => JSON.parse(event.payloadJson).trace); - expect(summary.traceCompleteness).toBe("partial-observed"); + expect(summary.traceCompleteness).toBe("full-observed"); expect(summary.unsupportedEventKinds).not.toContain("artifact.created"); expect( traceEvents.some( @@ -443,7 +444,8 @@ describe("agent trace capture", () => { traceEvents.some( (event: any) => event.event.kind === "assistant.message.final" && - event.source.observed === false, + event.source.observed === true && + event.source.rawType === "result", ), 
).toBe(true); @@ -527,6 +529,108 @@ describe("agent trace capture", () => { cleanup(); }); + test("captures real Codex dotted jsonl events as structured trace instead of cli-text fallback", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-codex-dotted-jsonl"; + + class CodexJsonlAgentFake extends CodexAgent { + constructor() { + super({ skipGitRepoCheck: true }); + } + + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "thread.started", + thread_id: "thread-1", + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn.started", + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "item.completed", + item: { + id: "item-1", + type: "agent_message", + text: "Codex dotted final", + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn.completed", + usage: { + input_tokens: 10, + cached_input_tokens: 2, + output_tokens: 4, + }, + }) + "\n", + ); + return { + text: '{"answer":"Codex dotted final"}', + output: { answer: "Codex dotted final" }, + }; + } + } + + const workflow = smithers(() => ( + + + structured codex dotted jsonl please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! 
+ .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.captureMode).toBe("cli-json-stream"); + expect(summary.traceCompleteness).toBe("full-observed"); + expect( + traceEvents.some((event: any) => event.event.kind === "turn.start"), + ).toBe(true); + expect( + traceEvents.some((event: any) => event.event.kind === "turn.end"), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.message.final" && + event.source.observed === true && + event.payload.text === "Codex dotted final", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "usage" && + event.payload.inputTokens === 10 && + event.payload.outputTokens === 4, + ), + ).toBe(true); + + cleanup(); + }); + test("classifies structured Gemini stream-json traces truthfully", async () => { const { smithers, outputs, db, cleanup } = createTestSmithers({ result: z.object({ answer: z.string() }), From aa39ee67f0cd50f8756b60fc226dbb8f94fccaa8 Mon Sep 17 00:00:00 2001 From: Samuel Huber Date: Sat, 28 Mar 2026 18:24:19 +0100 Subject: [PATCH 4/7] refactor(observability): reduce agent trace normalization verbosity --- src/agent-trace.ts | 787 +++++++++++------------------------ src/agents/BaseCliAgent.ts | 92 ++-- src/utils/text-extraction.ts | 25 ++ src/utils/usage.ts | 89 ++++ tests/agent-trace.test.tsx | 29 ++ 5 files changed, 414 insertions(+), 608 deletions(-) create mode 100644 src/utils/text-extraction.ts create mode 100644 src/utils/usage.ts diff --git a/src/agent-trace.ts b/src/agent-trace.ts index 3dd8ba1b..08332ccf 100644 --- a/src/agent-trace.ts +++ b/src/agent-trace.ts @@ -3,7 +3,9 @@ import { join } from "node:path"; import type { SmithersEvent } from "./SmithersEvent"; import type { EventBus } from "./events"; import { logErrorAwait, logInfoAwait, logWarningAwait } from "./effect/logging"; 
+import { extractTextFromJsonValue } from "./utils/text-extraction"; import { nowMs } from "./utils/time"; +import { normalizeTokenUsage } from "./utils/usage"; export type AgentFamily = | "pi" @@ -148,11 +150,6 @@ export type AgentTraceCapabilityProfile = { persistedSessionArtifact: boolean; }; -type PushOptions = { - recordSeen?: boolean; - direct?: boolean; -}; - export const agentTraceCapabilities: Record< AgentFamily, AgentTraceCapabilityProfile @@ -487,6 +484,87 @@ function kindPhase( return "capture"; } +type PayloadKind = "message" | "tool" | "pi" | "none"; + +type MappedStructuredEvent = { + kind: CanonicalAgentTraceEventKind; + payloadKind: PayloadKind; + expect?: CanonicalAgentTraceEventKind; +}; + +const piSimpleEventMap: Record = { + session: { kind: "session.start", payloadKind: "pi" }, + agent_start: { kind: "session.start", payloadKind: "pi" }, + agent_end: { kind: "session.end", payloadKind: "pi" }, + turn_start: { + kind: "turn.start", + payloadKind: "pi", + expect: "turn.end", + }, + message_start: { kind: "message.start", payloadKind: "pi" }, + tool_execution_start: { + kind: "tool.execution.start", + payloadKind: "tool", + expect: "tool.execution.end", + }, + tool_execution_update: { + kind: "tool.execution.update", + payloadKind: "tool", + expect: "tool.execution.end", + }, + tool_execution_end: { kind: "tool.execution.end", payloadKind: "tool" }, + auto_compaction_start: { kind: "compaction.start", payloadKind: "pi" }, + auto_compaction_end: { kind: "compaction.end", payloadKind: "pi" }, + auto_retry_start: { kind: "retry.start", payloadKind: "pi" }, + auto_retry_end: { kind: "retry.end", payloadKind: "pi" }, +}; + +const genericStructuredEventMap: Record = { + message_start: { kind: "message.start", payloadKind: "message" }, + assistant_message_start: { kind: "message.start", payloadKind: "message" }, + "response.started": { kind: "message.start", payloadKind: "message" }, + tool_call_start: { + kind: "tool.execution.start", + payloadKind: 
"tool", + expect: "tool.execution.end", + }, + tool_execution_start: { + kind: "tool.execution.start", + payloadKind: "tool", + expect: "tool.execution.end", + }, + "tool_call.started": { + kind: "tool.execution.start", + payloadKind: "tool", + expect: "tool.execution.end", + }, + tool_call_delta: { + kind: "tool.execution.update", + payloadKind: "tool", + expect: "tool.execution.end", + }, + tool_call_update: { + kind: "tool.execution.update", + payloadKind: "tool", + expect: "tool.execution.end", + }, + tool_execution_update: { + kind: "tool.execution.update", + payloadKind: "tool", + expect: "tool.execution.end", + }, + "tool_call.delta": { + kind: "tool.execution.update", + payloadKind: "tool", + expect: "tool.execution.end", + }, + tool_call_end: { kind: "tool.execution.end", payloadKind: "tool" }, + tool_execution_end: { kind: "tool.execution.end", payloadKind: "tool" }, + "tool_call.completed": { kind: "tool.execution.end", payloadKind: "tool" }, + tool_result: { kind: "tool.result", payloadKind: "tool" }, + "tool_result.completed": { kind: "tool.result", payloadKind: "tool" }, +}; + export function canonicalTraceEventToOtelLogRecord( event: CanonicalAgentTraceEvent, context?: { agentId?: string; model?: string }, @@ -641,37 +719,30 @@ export class AgentTraceCollector { this.finalText = text; } if (this.captureMode === "sdk-events" && text) { - this.push( + this.pushDerived( "assistant.message.final", { text }, text, - false, - undefined, - { recordSeen: true, direct: true }, ); } - const usage = this.normalizeUsage(result?.usage ?? result?.totalUsage); + const usage = normalizeTokenUsage(result?.usage ?? result?.totalUsage); if (usage) { - this.push( + this.pushDerived( "usage", usage, usage, - false, "usage", - { recordSeen: true, direct: true }, ); } } observeError(error: unknown) { this.failures.push(error instanceof Error ? 
error.message : String(error)); - this.push( + this.pushDerived( "capture.error", { error: this.failures.at(-1) }, { error: this.failures.at(-1) }, - false, "error", - { recordSeen: true, direct: true }, ); } @@ -684,14 +755,13 @@ export class AgentTraceCollector { !this.seenKinds.has("assistant.message.final") && this.finalText ) { - this.push( - "assistant.message.final", - { text: this.finalText }, - this.finalText, - false, - undefined, - { recordSeen: true }, - ); + this.pushDerived( + "assistant.message.final", + { text: this.finalText }, + this.finalText, + undefined, + false, + ); } if ( (this.captureMode === "cli-json-stream" || @@ -703,13 +773,11 @@ export class AgentTraceCollector { this.warnings.push( "structured stream ended without a terminal assistant message", ); - this.push( + this.pushDerived( "capture.warning", { reason: "missing-terminal-event" }, { reason: "missing-terminal-event" }, - false, "capture", - { recordSeen: true, direct: true }, ); } @@ -717,6 +785,7 @@ export class AgentTraceCollector { let missingExpectedEventKinds = [...this.expectedKinds].filter( (kind) => !this.directKinds.has(kind), ); + this.applyTraceCompleteness(traceCompleteness); let summary: AgentTraceSummary = { traceVersion: "1", runId: this.runId, @@ -743,7 +812,7 @@ export class AgentTraceCollector { if (persistedArtifact.ok && persistedArtifact.file) { const artifactPath = persistedArtifact.file; this.rawArtifactRefs.push(artifactPath); - this.push( + this.pushDerived( "artifact.created", { artifactKind: "agent-trace.ndjson", @@ -755,26 +824,24 @@ export class AgentTraceCollector { artifactPath, contentType: "application/x-ndjson", }, - false, "artifact", - { recordSeen: true, direct: true }, ); + this.applyTraceCompleteness(traceCompleteness); summary = { ...summary, rawArtifactRefs: [...this.rawArtifactRefs] }; await this.rewriteNdjson(artifactPath, summary); } else if (!persistedArtifact.ok) { this.warnings.push(persistedArtifact.error); - this.push( + 
this.pushDerived( "capture.warning", { reason: "artifact-write-failed", error: persistedArtifact.error }, { reason: "artifact-write-failed", error: persistedArtifact.error }, - false, "artifact", - { recordSeen: true, direct: true }, ); traceCompleteness = this.resolveCompleteness(); missingExpectedEventKinds = [...this.expectedKinds].filter( (kind) => !this.directKinds.has(kind), ); + this.applyTraceCompleteness(traceCompleteness); summary = { ...summary, traceCompleteness, @@ -783,8 +850,8 @@ export class AgentTraceCollector { }; } + this.applyTraceCompleteness(traceCompleteness); for (const event of this.events) { - event.traceCompleteness = traceCompleteness; const smithersEvent: AgentTraceSmithersEvent = { type: "AgentTraceEvent", runId: this.runId, @@ -824,18 +891,12 @@ export class AgentTraceCollector { private processChunk(stream: "stdout" | "stderr", text: string) { if (stream === "stderr") { this.stderrBuffer += text; - this.push("stderr", { text }, text, true, stream, { - recordSeen: true, - direct: true, - }); + this.pushObserved("stderr", { text }, text, stream); return; } this.stdoutBuffer += text; if (this.captureMode === "cli-text" || this.captureMode === "sdk-events") { - this.push("stdout", { text }, text, true, stream, { - recordSeen: true, - direct: true, - }); + this.pushObserved("stdout", { text }, text, stream); return; } const lines = this.stdoutBuffer.split(/\r?\n/); @@ -859,13 +920,11 @@ export class AgentTraceCollector { if (!line) return; this.failures.push(`truncated structured stream: ${line.slice(0, 200)}`); - this.push( + this.pushObserved( "capture.error", { reason: "truncated-json-stream", linePreview: line.slice(0, 200) }, line, - true, "stdout", - { recordSeen: true, direct: true }, ); } @@ -875,13 +934,11 @@ export class AgentTraceCollector { parsed = JSON.parse(line); } catch { this.failures.push(`malformed upstream JSON: ${line.slice(0, 200)}`); - this.push( + this.pushObserved( "capture.error", { linePreview: line.slice(0, 
200), reason: "malformed-json" }, line, - true, "stdout", - { recordSeen: true, direct: true }, ); return; } @@ -900,48 +957,35 @@ export class AgentTraceCollector { const text = this.extractGenericMessageText(parsed?.message ?? parsed); if (typeof text === "string" && text) { this.appendAssistantText(text); - this.push( + this.pushObserved( "assistant.text.delta", { text }, parsed, - true, rawType, - { recordSeen: true, direct: true }, ); } else { - this.push("stdout", { eventType: rawType }, parsed, true, rawType, { - recordSeen: true, - direct: true, - }); + this.pushObserved("stdout", { eventType: rawType }, parsed, rawType); } - const usage = this.normalizeUsage(parsed?.message?.usage); + const usage = normalizeTokenUsage(parsed?.message?.usage); if (usage) { - this.push("usage", usage, parsed, true, rawType, { - recordSeen: true, - direct: true, - }); + this.pushObserved("usage", usage, parsed, rawType); } return; } if (this.agentFamily === "claude-code" && rawType === "result") { - const usage = this.normalizeUsage(parsed?.usage); + const usage = normalizeTokenUsage(parsed?.usage); if (usage) { - this.push("usage", usage, parsed, true, rawType, { - recordSeen: true, - direct: true, - }); + this.pushObserved("usage", usage, parsed, rawType); } const text = this.extractGenericMessageText(parsed); if (typeof text === "string" && text) { this.setFinalAssistantText(text); - this.push( + this.pushObserved( "assistant.message.final", { text }, parsed, - true, rawType, - { recordSeen: true, direct: true }, ); } return; @@ -956,42 +1000,31 @@ export class AgentTraceCollector { } else { this.setFinalAssistantText(text); } - this.push( + this.pushObserved( parsed?.delta ? 
"assistant.text.delta" : "assistant.message.final", { text }, parsed, - true, rawType, - { recordSeen: true, direct: true }, ); return; } } if (this.agentFamily === "gemini" && rawType === "result" && parsed?.stats) { - const usage = this.normalizeUsage(parsed.stats); + const usage = normalizeTokenUsage(parsed.stats); if (usage) { - this.push("usage", usage, parsed, true, rawType, { - recordSeen: true, - direct: true, - }); + this.pushObserved("usage", usage, parsed, rawType); } return; } if (this.agentFamily === "codex" && rawType === "thread.started") { - this.push("stdout", { eventType: rawType }, parsed, true, rawType, { - recordSeen: true, - direct: true, - }); + this.pushObserved("stdout", { eventType: rawType }, parsed, rawType); return; } if (this.agentFamily === "codex" && rawType === "turn.started") { - this.push("turn.start", {}, parsed, true, rawType, { - recordSeen: true, - direct: true, - }); + this.pushObserved("turn.start", {}, parsed, rawType); this.expectedKinds.add("turn.end"); return; } @@ -1004,31 +1037,25 @@ export class AgentTraceCollector { const text = this.extractGenericMessageText(parsed.item); if (typeof text === "string" && text) { this.setFinalAssistantText(text); - this.push( + this.pushObserved( "assistant.message.final", { text }, parsed, - true, rawType, - { recordSeen: true, direct: true }, ); return; } } - if ( - ["message_start", "assistant_message_start", "response.started"].includes( - rawType, - ) - ) { - this.push( - "message.start", - this.extractGenericMessagePayload(parsed), + const mapped = genericStructuredEventMap[rawType]; + if (mapped) { + this.pushObserved( + mapped.kind, + this.extractMappedPayload(parsed, mapped.payloadKind), parsed, - true, rawType, - { recordSeen: true, direct: true }, ); + if (mapped.expect) this.expectedKinds.add(mapped.expect); return; } @@ -1044,13 +1071,11 @@ export class AgentTraceCollector { const text = this.extractGenericDeltaText(parsed); if (typeof text === "string" && text) { 
this.appendAssistantText(text); - this.push( + this.pushObserved( "assistant.text.delta", { text }, parsed, - true, rawType, - { recordSeen: true, direct: true }, ); return; } @@ -1065,13 +1090,11 @@ export class AgentTraceCollector { ) { const text = this.extractGenericDeltaText(parsed); if (typeof text === "string" && text) { - this.push( + this.pushObserved( "assistant.thinking.delta", { text }, parsed, - true, rawType, - { recordSeen: true, direct: true }, ); return; } @@ -1085,355 +1108,130 @@ export class AgentTraceCollector { "message_stop", ].includes(rawType) ) { - this.push( + this.pushObserved( "message.end", this.extractGenericMessagePayload(parsed), parsed, - true, rawType, - { recordSeen: true, direct: true }, ); const text = this.extractGenericMessageText(parsed); if (text) { this.setFinalAssistantText(text); - this.push( + this.pushObserved( "assistant.message.final", { text }, parsed, - true, rawType, - { recordSeen: true, direct: true }, ); } - const usage = this.normalizeUsage(parsed?.usage); + const usage = normalizeTokenUsage(parsed?.usage); if (usage) { - this.push("usage", usage, parsed, true, rawType, { - recordSeen: true, - direct: true, - }); + this.pushObserved("usage", usage, parsed, rawType); } return; } - if ( - ["tool_call_start", "tool_execution_start", "tool_call.started"].includes( - rawType, - ) - ) { - this.push( - "tool.execution.start", - this.extractGenericToolPayload(parsed), - parsed, - true, - rawType, - { recordSeen: true, direct: true }, - ); - this.expectedKinds.add("tool.execution.end"); - return; - } - - if ( - [ - "tool_call_delta", - "tool_call_update", - "tool_execution_update", - "tool_call.delta", - ].includes(rawType) - ) { - this.push( - "tool.execution.update", - this.extractGenericToolPayload(parsed), - parsed, - true, - rawType, - { recordSeen: true, direct: true }, - ); - this.expectedKinds.add("tool.execution.end"); - return; - } - - if ( - ["tool_call_end", "tool_execution_end", 
"tool_call.completed"].includes( - rawType, - ) - ) { - this.push( - "tool.execution.end", - this.extractGenericToolPayload(parsed), - parsed, - true, - rawType, - { recordSeen: true, direct: true }, - ); - return; - } - - if (["tool_result", "tool_result.completed"].includes(rawType)) { - this.push( - "tool.result", - this.extractGenericToolPayload(parsed), - parsed, - true, - rawType, - { recordSeen: true, direct: true }, - ); - return; - } - if (rawType === "turn.completed" && parsed?.usage) { - const usage = this.normalizeUsage(parsed.usage); + const usage = normalizeTokenUsage(parsed.usage); if (usage) { - this.push("usage", usage, parsed, true, rawType, { - recordSeen: true, - direct: true, - }); + this.pushObserved("usage", usage, parsed, rawType); } if (this.agentFamily === "codex") { - this.push("turn.end", {}, parsed, true, rawType, { - recordSeen: true, - direct: true, - }); + this.pushObserved("turn.end", {}, parsed, rawType); } const text = this.extractGenericMessageText(parsed); if (text) { this.setFinalAssistantText(text); - this.push( + this.pushObserved( "assistant.message.final", { text }, parsed, - true, rawType, - { recordSeen: true, direct: true }, ); } return; } - this.push("stdout", { eventType: rawType }, parsed, true, rawType, { - recordSeen: true, - direct: true, - }); + this.pushObserved("stdout", { eventType: rawType }, parsed, rawType); } private processPiEvent(parsed: any) { const type = String(parsed?.type ?? 
"unknown"); - switch (type) { - case "session": - case "agent_start": - this.push( - "session.start", - this.extractPiPayload(parsed), - parsed, - true, - type, - { recordSeen: true, direct: true }, - ); - return; - case "agent_end": - this.push( - "session.end", - this.extractPiPayload(parsed), - parsed, - true, - type, - { recordSeen: true, direct: true }, - ); - return; - case "turn_start": - this.push( - "turn.start", - this.extractPiPayload(parsed), - parsed, - true, - type, - { recordSeen: true, direct: true }, - ); - this.expectedKinds.add("turn.end"); - return; - case "turn_end": { - this.push( - "turn.end", - this.extractPiPayload(parsed), - parsed, - true, + const simple = piSimpleEventMap[type]; + if (simple) { + this.pushObserved( + simple.kind, + this.extractMappedPayload(parsed, simple.payloadKind), + parsed, + type, + ); + if (simple.expect) this.expectedKinds.add(simple.expect); + return; + } + + if (type === "turn_end") { + this.pushObserved("turn.end", this.extractPiPayload(parsed), parsed, type); + const finalText = this.extractGenericMessageText(parsed?.message); + if (finalText) { + this.setFinalAssistantText(finalText); + this.pushObserved( + "assistant.message.final", + { text: finalText }, + parsed?.message, type, - { recordSeen: true, direct: true }, ); - const finalText = this.extractText(parsed?.message); - if (finalText) { - this.setFinalAssistantText(finalText); - this.push( - "assistant.message.final", - { text: finalText }, - parsed?.message, - true, - type, - { recordSeen: true, direct: true }, - ); - } - const usage = this.normalizeUsage(parsed?.message?.usage); - if (usage) { - this.push( - "usage", - usage, - parsed.message.usage, - true, - "usage", - { recordSeen: true, direct: true }, - ); - } - return; } - case "message_start": - this.push( - "message.start", - this.extractPiPayload(parsed), - parsed, - true, - type, - { recordSeen: true, direct: true }, - ); - return; - case "message_end": { - this.push( - "message.end", - 
this.extractPiPayload(parsed), - parsed, - true, - type, - { recordSeen: true, direct: true }, - ); - const finalText = this.extractText(parsed?.message); - if (parsed?.message?.role === "assistant" && finalText) { - this.setFinalAssistantText(finalText); - this.push( - "assistant.message.final", - { text: finalText }, - parsed?.message, - true, - type, - { recordSeen: true, direct: true }, - ); - } - return; + const usage = normalizeTokenUsage(parsed?.message?.usage); + if (usage) { + this.pushObserved("usage", usage, parsed.message.usage, "usage"); } - case "message_update": { - const evt = parsed?.assistantMessageEvent; - if (evt?.type === "text_delta" && typeof evt.delta === "string") { - this.appendAssistantText(evt.delta); - this.push( - "assistant.text.delta", - { text: evt.delta }, - parsed, - true, - evt.type, - { recordSeen: true, direct: true }, - ); - return; - } - if ( - (evt?.type === "thinking_delta" || evt?.type === "reasoning_delta") && - typeof evt.delta === "string" - ) { - this.push( - "assistant.thinking.delta", - { text: evt.delta }, - parsed, - true, - evt.type, - { recordSeen: true, direct: true }, - ); - return; - } - this.push( - "message.update", - this.extractPiPayload(parsed), - parsed, - true, + return; + } + + if (type === "message_end") { + this.pushObserved("message.end", this.extractPiPayload(parsed), parsed, type); + const finalText = this.extractGenericMessageText(parsed?.message); + if (parsed?.message?.role === "assistant" && finalText) { + this.setFinalAssistantText(finalText); + this.pushObserved( + "assistant.message.final", + { text: finalText }, + parsed?.message, type, - { recordSeen: true, direct: true }, ); - return; } - case "tool_execution_start": - this.push( - "tool.execution.start", - this.extractPiToolPayload(parsed), - parsed, - true, - type, - { recordSeen: true, direct: true }, - ); - this.expectedKinds.add("tool.execution.end"); - return; - case "tool_execution_update": - this.push( - "tool.execution.update", 
- this.extractPiToolPayload(parsed), - parsed, - true, - type, - { recordSeen: true, direct: true }, - ); - this.expectedKinds.add("tool.execution.end"); - return; - case "tool_execution_end": - this.push( - "tool.execution.end", - this.extractPiToolPayload(parsed), - parsed, - true, - type, - { recordSeen: true, direct: true }, - ); - return; - case "auto_compaction_start": - this.push( - "compaction.start", - this.extractPiPayload(parsed), - parsed, - true, - type, - { recordSeen: true, direct: true }, - ); - return; - case "auto_compaction_end": - this.push( - "compaction.end", - this.extractPiPayload(parsed), - parsed, - true, - type, - { recordSeen: true, direct: true }, - ); - return; - case "auto_retry_start": - this.push( - "retry.start", - this.extractPiPayload(parsed), + return; + } + + if (type === "message_update") { + const evt = parsed?.assistantMessageEvent; + if (evt?.type === "text_delta" && typeof evt.delta === "string") { + this.appendAssistantText(evt.delta); + this.pushObserved( + "assistant.text.delta", + { text: evt.delta }, parsed, - true, - type, - { recordSeen: true, direct: true }, + evt.type, ); return; - case "auto_retry_end": - this.push( - "retry.end", - this.extractPiPayload(parsed), + } + if ( + (evt?.type === "thinking_delta" || evt?.type === "reasoning_delta") && + typeof evt.delta === "string" + ) { + this.pushObserved( + "assistant.thinking.delta", + { text: evt.delta }, parsed, - true, - type, - { recordSeen: true, direct: true }, + evt.type, ); return; - default: - this.push("stdout", { eventType: type }, parsed, true, type, { - recordSeen: true, - direct: true, - }); - return; + } + this.pushObserved("message.update", this.extractPiPayload(parsed), parsed, type); + return; } + + this.pushObserved("stdout", { eventType: type }, parsed, type); } private extractGenericDeltaText(parsed: any): string | undefined { @@ -1454,36 +1252,9 @@ export class AgentTraceCollector { } private extractGenericMessageText(parsed: any): string | 
undefined { - const message = - parsed?.message ?? parsed?.response ?? parsed?.item ?? parsed; - if (typeof message?.text === "string") return message.text; - if (typeof message?.content === "string") return message.content; - if (typeof parsed?.result === "string") return parsed.result; - if (typeof parsed?.text === "string") return parsed.text; - if (Array.isArray(message?.content)) { - const text = message.content - .map((part: any) => { - if (typeof part === "string") return part; - if (typeof part?.text === "string") return part.text; - if (typeof part?.content === "string") return part.content; - if (typeof part?.output_text === "string") return part.output_text; - return ""; - }) - .join(""); - if (text) return text; - } - if (Array.isArray(parsed?.output)) { - const text = parsed.output - .map((part: any) => { - if (typeof part?.text === "string") return part.text; - if (typeof part?.content === "string") return part.content; - if (typeof part?.output_text === "string") return part.output_text; - return ""; - }) - .join(""); - if (text) return text; - } - return undefined; + return extractTextFromJsonValue( + parsed?.message ?? parsed?.response ?? parsed?.item ?? parsed, + ); } private appendAssistantText(text: string) { @@ -1526,40 +1297,12 @@ export class AgentTraceCollector { private extractPiPayload(parsed: any): Record { const payload: Record = {}; if (parsed?.message?.role) payload.role = parsed.message.role; - const text = this.extractText(parsed?.message); + const text = this.extractGenericMessageText(parsed?.message); if (text) payload.text = text; if (parsed?.id) payload.id = parsed.id; return payload; } - private extractPiToolPayload(parsed: any): Record { - const tool = parsed?.toolExecution ?? parsed?.tool ?? parsed; - return { - toolCallId: tool?.id ?? tool?.toolCallId ?? parsed?.id, - toolName: tool?.name ?? tool?.toolName ?? parsed?.toolName, - argsPreview: tool?.args ?? tool?.arguments ?? parsed?.args, - resultPreview: tool?.result ?? 
parsed?.result, - isError: Boolean(tool?.isError ?? parsed?.isError ?? parsed?.error), - }; - } - - private extractText(message: any): string | undefined { - if (typeof message?.text === "string") return message.text; - if (typeof message?.content === "string") return message.content; - if (Array.isArray(message?.content)) { - const parts = message.content - .map((part: any) => { - if (typeof part === "string") return part; - if (typeof part?.text === "string") return part.text; - if (typeof part?.content === "string") return part.content; - return ""; - }) - .join(""); - return parts || undefined; - } - return undefined; - } - private observeSmithersEvent(event: SmithersEvent) { const sameAttempt = (event as any).runId === this.runId && @@ -1567,22 +1310,22 @@ export class AgentTraceCollector { (event as any).iteration === this.iteration && (event as any).attempt === this.attempt; if (!sameAttempt) return; + if (this.agentFamily === "pi") return; + if (event.type === "ToolCallStarted") { - this.push( + this.pushDerived( "tool.execution.start", { toolCallId: event.toolCallId, toolName: event.toolName, }, event, - false, event.type, - { recordSeen: true, direct: true }, ); this.expectedKinds.add("tool.execution.end"); } if (event.type === "ToolCallFinished") { - this.push( + this.pushDerived( "tool.execution.end", { toolCallId: event.toolCallId, @@ -1590,13 +1333,11 @@ export class AgentTraceCollector { isError: event.status === "error", }, event, - false, event.type, - { recordSeen: true, direct: true }, ); } if (event.type === "TokenUsageReported") { - this.push( + this.pushDerived( "usage", { model: event.model, @@ -1608,9 +1349,7 @@ export class AgentTraceCollector { reasoningTokens: event.reasoningTokens, }, event, - false, event.type, - { recordSeen: true, direct: true }, ); } } @@ -1664,83 +1403,14 @@ export class AgentTraceCollector { return "full-observed"; } - private hasMeaningfulUsage(usage: any): boolean { - if (!usage || typeof usage !== "object") 
return false; - - const values = [ - usage.inputTokens, - usage.promptTokens, - usage.outputTokens, - usage.completionTokens, - usage.cacheReadTokens, - usage.cacheWriteTokens, - usage.reasoningTokens, - usage.inputTokenDetails?.cacheReadTokens, - usage.inputTokenDetails?.cacheWriteTokens, - usage.outputTokenDetails?.reasoningTokens, - ]; - - return values.some( - (value) => typeof value === "number" && Number.isFinite(value) && value > 0, - ); - } - - private normalizeUsage(usage: any): Record | null { - if (!usage || typeof usage !== "object") return null; - - const normalized = { - ...(typeof (usage.inputTokens ?? usage.promptTokens ?? usage.input_tokens ?? usage.input ?? usage.models?.gemini?.tokens?.input) === "number" - ? { - inputTokens: - usage.inputTokens ?? - usage.promptTokens ?? - usage.input_tokens ?? - usage.input ?? - usage.models?.gemini?.tokens?.input, - } - : {}), - ...(typeof (usage.outputTokens ?? usage.completionTokens ?? usage.output_tokens ?? usage.output ?? usage.models?.gemini?.tokens?.output) === "number" - ? { - outputTokens: - usage.outputTokens ?? - usage.completionTokens ?? - usage.output_tokens ?? - usage.output ?? - usage.models?.gemini?.tokens?.output, - } - : {}), - ...(typeof (usage.cacheReadTokens ?? usage.cache_read_input_tokens ?? usage.inputTokenDetails?.cacheReadTokens ?? usage.cache_read_tokens) === "number" - ? { - cacheReadTokens: - usage.cacheReadTokens ?? - usage.cache_read_input_tokens ?? - usage.inputTokenDetails?.cacheReadTokens ?? - usage.cache_read_tokens, - } - : {}), - ...(typeof (usage.cacheWriteTokens ?? usage.cache_write_input_tokens ?? usage.inputTokenDetails?.cacheWriteTokens ?? usage.cache_write_tokens) === "number" - ? { - cacheWriteTokens: - usage.cacheWriteTokens ?? - usage.cache_write_input_tokens ?? - usage.inputTokenDetails?.cacheWriteTokens ?? - usage.cache_write_tokens, - } - : {}), - ...(typeof (usage.reasoningTokens ?? usage.reasoning_tokens ?? 
usage.outputTokenDetails?.reasoningTokens) === "number" - ? { - reasoningTokens: - usage.reasoningTokens ?? - usage.reasoning_tokens ?? - usage.outputTokenDetails?.reasoningTokens, - } - : {}), - ...(typeof (usage.totalTokens ?? usage.total_tokens) === "number" - ? { totalTokens: usage.totalTokens ?? usage.total_tokens } - : {}), - }; - - return this.hasMeaningfulUsage(normalized) ? normalized : null; + private extractMappedPayload( + parsed: any, + payloadKind: PayloadKind, + ): Record | null { + if (payloadKind === "message") return this.extractGenericMessagePayload(parsed); + if (payloadKind === "tool") return this.extractGenericToolPayload(parsed); + if (payloadKind === "pi") return this.extractPiPayload(parsed); + return {}; } private push( @@ -1749,7 +1419,7 @@ export class AgentTraceCollector { raw: unknown, observed: boolean, rawType?: string, - options: PushOptions = {}, + direct = true, ) { const redactedPayload = redactValue(payload); const redactedRaw = redactValue(raw); @@ -1785,8 +1455,33 @@ export class AgentTraceCollector { annotations: this.annotations, }; this.events.push(event); - if (options.recordSeen) this.seenKinds.add(kind); - if (options.direct) this.directKinds.add(kind); + this.seenKinds.add(kind); + if (direct) this.directKinds.add(kind); + } + + private pushObserved( + kind: CanonicalAgentTraceEventKind, + payload: Record | null, + raw: unknown, + rawType?: string, + ) { + this.push(kind, payload, raw, true, rawType); + } + + private pushDerived( + kind: CanonicalAgentTraceEventKind, + payload: Record | null, + raw: unknown, + rawType?: string, + direct = true, + ) { + this.push(kind, payload, raw, false, rawType, direct); + } + + private applyTraceCompleteness(traceCompleteness: TraceCompleteness) { + for (const event of this.events) { + event.traceCompleteness = traceCompleteness; + } } private async persistNdjson( diff --git a/src/agents/BaseCliAgent.ts b/src/agents/BaseCliAgent.ts index b7c451a8..3316c673 100644 --- 
a/src/agents/BaseCliAgent.ts +++ b/src/agents/BaseCliAgent.ts @@ -13,6 +13,8 @@ import { spawnCaptureEffect } from "../effect/child-process"; import { runPromise } from "../effect/runtime"; import { getToolContext } from "../tools/context"; import { SmithersError } from "../utils/errors"; +import { extractTextFromJsonValue } from "../utils/text-extraction"; +import { normalizeTokenUsage } from "../utils/usage"; import { launchDiagnostics, enrichReportWithErrorAnalysis } from "./diagnostics"; type TimeoutInput = number | { totalMs?: number; idleMs?: number } | undefined; @@ -201,30 +203,7 @@ export function tryParseJson(text: string): unknown | undefined { return undefined; } -export function extractTextFromJsonValue(value: any): string | undefined { - if (typeof value === "string") return value; - if (!value || typeof value !== "object") return undefined; - if (typeof value.text === "string") return value.text; - if (typeof value.content === "string") return value.content; - if (Array.isArray(value.content)) { - const parts = value.content - .map((part: any) => { - if (!part) return ""; - if (typeof part === "string") return part; - if (typeof part.text === "string") return part.text; - if (typeof part.content === "string") return part.content; - return ""; - }) - .join(""); - if (parts.trim()) return parts; - } - if (value.response) return extractTextFromJsonValue(value.response); - if (value.message) return extractTextFromJsonValue(value.message); - if (value.result) return extractTextFromJsonValue(value.result); - if (value.output) return extractTextFromJsonValue(value.output); - if (value.data) return extractTextFromJsonValue(value.data); - return undefined; -} +export { extractTextFromJsonValue }; function extractTextFromJsonPayload(raw: string): string | undefined { const trimmed = raw.trim(); @@ -315,6 +294,19 @@ export function extractUsageFromOutput(raw: string): CliUsageInfo | undefined { const usage: CliUsageInfo = {}; let found = false; + const addUsage 
= (next: CliUsageInfo | null | undefined) => { + if (!next) return false; + let changed = false; + for (const [key, value] of Object.entries(next) as Array< + [keyof CliUsageInfo, number | undefined] + >) { + if (typeof value !== "number" || value <= 0) continue; + usage[key] = (usage[key] ?? 0) + value; + changed = true; + } + return changed; + }; + for (const line of lines) { let parsed: any; try { parsed = JSON.parse(line); } catch { continue; } @@ -322,56 +314,34 @@ export function extractUsageFromOutput(raw: string): CliUsageInfo | undefined { // Claude Code stream-json: message_start contains input token counts if (parsed.type === "message_start" && parsed.message?.usage) { - const u = parsed.message.usage; - usage.inputTokens = (usage.inputTokens ?? 0) + (u.input_tokens ?? 0); - if (u.cache_read_input_tokens) { - usage.cacheReadTokens = (usage.cacheReadTokens ?? 0) + u.cache_read_input_tokens; - } - if (u.cache_creation_input_tokens) { - usage.cacheWriteTokens = (usage.cacheWriteTokens ?? 0) + u.cache_creation_input_tokens; - } - found = true; + const normalized = normalizeTokenUsage(parsed.message.usage); + found = addUsage({ + inputTokens: normalized?.inputTokens, + cacheReadTokens: normalized?.cacheReadTokens, + cacheWriteTokens: normalized?.cacheWriteTokens, + }) || found; continue; } // Claude Code stream-json: message_delta has output token count if (parsed.type === "message_delta" && parsed.usage) { - if (parsed.usage.output_tokens) { - usage.outputTokens = (usage.outputTokens ?? 0) + parsed.usage.output_tokens; - } - found = true; + const normalized = normalizeTokenUsage(parsed.usage); + found = addUsage({ + outputTokens: normalized?.outputTokens, + }) || found; continue; } // Codex --json: turn.completed event if (parsed.type === "turn.completed" && parsed.usage) { - const u = parsed.usage; - if (u.input_tokens) usage.inputTokens = (usage.inputTokens ?? 0) + u.input_tokens; - if (u.output_tokens) usage.outputTokens = (usage.outputTokens ?? 
0) + u.output_tokens; - if (u.cached_input_tokens) usage.cacheReadTokens = (usage.cacheReadTokens ?? 0) + u.cached_input_tokens; - found = true; + found = addUsage(normalizeTokenUsage(parsed.usage)) || found; continue; } // Generic: any event with a top-level "usage" containing token fields if (parsed.usage && typeof parsed.usage === "object") { - const u = parsed.usage; - const inTok = u.input_tokens ?? u.inputTokens ?? u.prompt_tokens ?? 0; - const outTok = u.output_tokens ?? u.outputTokens ?? u.completion_tokens ?? 0; - if (inTok > 0 || outTok > 0) { - usage.inputTokens = (usage.inputTokens ?? 0) + inTok; - usage.outputTokens = (usage.outputTokens ?? 0) + outTok; - if (u.cache_read_input_tokens || u.cacheReadTokens || u.cached_input_tokens) { - usage.cacheReadTokens = (usage.cacheReadTokens ?? 0) + - (u.cache_read_input_tokens ?? u.cacheReadTokens ?? u.cached_input_tokens ?? 0); - } - if (u.reasoning_tokens ?? u.reasoningTokens) { - usage.reasoningTokens = (usage.reasoningTokens ?? 0) + - (u.reasoning_tokens ?? u.reasoningTokens ?? 0); - } - found = true; - continue; - } + found = addUsage(normalizeTokenUsage(parsed.usage)) || found; + continue; } } @@ -382,9 +352,7 @@ export function extractUsageFromOutput(raw: string): CliUsageInfo | undefined { if (parsed?.stats?.models && typeof parsed.stats.models === "object") { for (const data of Object.values(parsed.stats.models as Record)) { if (data?.tokens) { - usage.inputTokens = (usage.inputTokens ?? 0) + (data.tokens.input ?? data.tokens.prompt ?? 0); - usage.outputTokens = (usage.outputTokens ?? 0) + (data.tokens.output ?? 
0); - found = true; + found = addUsage(normalizeTokenUsage(data.tokens)) || found; } } } diff --git a/src/utils/text-extraction.ts b/src/utils/text-extraction.ts new file mode 100644 index 00000000..14723dd5 --- /dev/null +++ b/src/utils/text-extraction.ts @@ -0,0 +1,25 @@ +export function extractTextFromJsonValue(value: any): string | undefined { + if (typeof value === "string") return value; + if (Array.isArray(value)) { + const text = value.map((item) => extractTextFromJsonValue(item) ?? "").join(""); + return text || undefined; + } + if (!value || typeof value !== "object") return undefined; + if (typeof value.text === "string") return value.text; + if (typeof value.content === "string") return value.content; + if (typeof value.output_text === "string") return value.output_text; + + if (Array.isArray(value.content)) { + const text = value.content + .map((part: any) => extractTextFromJsonValue(part) ?? "") + .join(""); + if (text) return text; + } + + for (const field of ["response", "message", "result", "output", "data", "item"]) { + const text = extractTextFromJsonValue(value[field]); + if (text) return text; + } + + return undefined; +} diff --git a/src/utils/usage.ts b/src/utils/usage.ts new file mode 100644 index 00000000..f02e33ef --- /dev/null +++ b/src/utils/usage.ts @@ -0,0 +1,89 @@ +export type NormalizedTokenUsage = { + inputTokens?: number; + outputTokens?: number; + cacheReadTokens?: number; + cacheWriteTokens?: number; + reasoningTokens?: number; + totalTokens?: number; +}; + +const usageFieldAliases: Record< + keyof NormalizedTokenUsage, + ReadonlyArray> +> = { + inputTokens: [ + ["inputTokens"], + ["promptTokens"], + ["prompt_tokens"], + ["input_tokens"], + ["input"], + ["models", "gemini", "tokens", "input"], + ], + outputTokens: [ + ["outputTokens"], + ["completionTokens"], + ["completion_tokens"], + ["output_tokens"], + ["output"], + ["models", "gemini", "tokens", "output"], + ], + cacheReadTokens: [ + ["cacheReadTokens"], + 
["cache_read_input_tokens"], + ["cached_input_tokens"], + ["cache_read_tokens"], + ["inputTokenDetails", "cacheReadTokens"], + ], + cacheWriteTokens: [ + ["cacheWriteTokens"], + ["cache_write_input_tokens"], + ["cache_creation_input_tokens"], + ["cache_write_tokens"], + ["inputTokenDetails", "cacheWriteTokens"], + ], + reasoningTokens: [ + ["reasoningTokens"], + ["reasoning_tokens"], + ["outputTokenDetails", "reasoningTokens"], + ], + totalTokens: [ + ["totalTokens"], + ["total_tokens"], + ], +}; + +function readUsagePath(value: unknown, path: ReadonlyArray): unknown { + let current = value; + for (const segment of path) { + if (!current || typeof current !== "object") return undefined; + current = (current as Record)[segment]; + } + return current; +} + +export function hasMeaningfulTokenUsage(usage: NormalizedTokenUsage): boolean { + return Object.values(usage).some( + (value) => typeof value === "number" && Number.isFinite(value) && value > 0, + ); +} + +export function normalizeTokenUsage( + usage: unknown, +): NormalizedTokenUsage | null { + if (!usage || typeof usage !== "object") return null; + + const normalized: NormalizedTokenUsage = {}; + for (const [field, aliases] of Object.entries(usageFieldAliases) as Array< + [keyof NormalizedTokenUsage, ReadonlyArray>] + >) { + for (const path of aliases) { + const value = readUsagePath(usage, path); + if (typeof value === "number") { + normalized[field] = value; + break; + } + } + } + + return hasMeaningfulTokenUsage(normalized) ? normalized : null; +} diff --git a/tests/agent-trace.test.tsx b/tests/agent-trace.test.tsx index 77664e3e..8ac98245 100644 --- a/tests/agent-trace.test.tsx +++ b/tests/agent-trace.test.tsx @@ -123,6 +123,16 @@ describe("agent trace capture", () => { events.find((event: any) => event.type === "AgentTraceSummary")! 
.payloadJson, ).summary; + const artifactEvent = traceEvents.find( + (event: any) => event.event.kind === "artifact.created", + ); + const persistedRows = readFileSync( + artifactEvent?.payload?.artifactPath, + "utf8", + ) + .trim() + .split("\n") + .map((line) => JSON.parse(line)); expect(summary.agentFamily).toBe("pi"); expect(summary.captureMode).toBe("cli-json-stream"); @@ -140,6 +150,24 @@ describe("agent trace capture", () => { const sequences = traceEvents.map((event: any) => event.event.sequence); expect(sequences).toEqual([...sequences].sort((a, b) => a - b)); expect(new Set(sequences).size).toBe(sequences.length); + expect( + traceEvents.filter( + (event: any) => event.event.kind === "tool.execution.start", + ), + ).toHaveLength(1); + expect( + traceEvents.filter( + (event: any) => event.event.kind === "tool.execution.end", + ), + ).toHaveLength(1); + expect( + persistedRows + .filter((row: any) => "event" in row) + .every((row: any) => row.traceCompleteness === "full-observed"), + ).toBe(true); + expect(persistedRows.at(-1)?.summary?.traceCompleteness).toBe( + "full-observed", + ); const redacted = JSON.stringify(traceEvents); expect(redacted).not.toContain("sk_abc123456789"); @@ -624,6 +652,7 @@ describe("agent trace capture", () => { (event: any) => event.event.kind === "usage" && event.payload.inputTokens === 10 && + event.payload.cacheReadTokens === 2 && event.payload.outputTokens === 4, ), ).toBe(true); From 2272c8f0785de03224ea337d072c7c3fd6c2d73b Mon Sep 17 00:00:00 2001 From: Samuel Huber Date: Sun, 29 Mar 2026 01:07:10 +0100 Subject: [PATCH 5/7] fix(observability): correlate derived trace events with raw ids --- src/agent-trace.ts | 91 +++++++++++++++-- tests/agent-trace.test.tsx | 199 ++++++++++++++++++++++++++++++++++++- 2 files changed, 277 insertions(+), 13 deletions(-) diff --git a/src/agent-trace.ts b/src/agent-trace.ts index 08332ccf..e0609765 100644 --- a/src/agent-trace.ts +++ b/src/agent-trace.ts @@ -84,6 +84,7 @@ export type 
CanonicalAgentTraceEvent = { agentFamily: AgentFamily; captureMode: AgentCaptureMode; rawType?: string; + rawEventId?: string; observed: boolean; }; traceCompleteness: TraceCompleteness; @@ -590,6 +591,7 @@ export function canonicalTraceEventToOtelLogRecord( "event.phase": event.event.phase, "event.sequence": event.event.sequence, "source.raw_type": event.source.rawType, + "source.raw_event_id": event.source.rawEventId, "source.observed": event.source.observed, }; for (const [key, value] of Object.entries(event.annotations)) { @@ -654,10 +656,12 @@ export class AgentTraceCollector { private readonly failures: string[] = []; private readonly warnings: string[] = []; private sequence = 0; + private rawEventSequence = 0; private stdoutBuffer = ""; private stderrBuffer = ""; private assistantTextBuffer = ""; private finalText: string | null = null; + private currentRawEventId?: string; private listener?: (event: SmithersEvent) => void; constructor(opts: TraceCollectorOptions) { @@ -710,6 +714,7 @@ export class AgentTraceCollector { observeResult(result: any) { const text = String(result?.text ?? "").trim(); + const rawEventId = this.nextRawEventId("result"); if ( text && (!this.finalText || @@ -723,6 +728,9 @@ export class AgentTraceCollector { "assistant.message.final", { text }, text, + undefined, + true, + rawEventId, ); } const usage = normalizeTokenUsage(result?.usage ?? result?.totalUsage); @@ -732,17 +740,22 @@ export class AgentTraceCollector { usage, usage, "usage", + true, + rawEventId, ); } } observeError(error: unknown) { this.failures.push(error instanceof Error ? 
error.message : String(error)); + const rawEventId = this.nextRawEventId("error"); this.pushDerived( "capture.error", { error: this.failures.at(-1) }, { error: this.failures.at(-1) }, "error", + true, + rawEventId, ); } @@ -753,7 +766,8 @@ export class AgentTraceCollector { if ( this.captureMode !== "sdk-events" && !this.seenKinds.has("assistant.message.final") && - this.finalText + this.finalText && + this.failures.length === 0 ) { this.pushDerived( "assistant.message.final", @@ -828,7 +842,30 @@ export class AgentTraceCollector { ); this.applyTraceCompleteness(traceCompleteness); summary = { ...summary, rawArtifactRefs: [...this.rawArtifactRefs] }; - await this.rewriteNdjson(artifactPath, summary); + try { + await this.rewriteNdjson(artifactPath, summary); + } catch (error) { + const message = + error instanceof Error ? error.message : String(error); + this.warnings.push(message); + this.pushDerived( + "capture.warning", + { reason: "artifact-rewrite-failed", error: message }, + { reason: "artifact-rewrite-failed", error: message }, + "artifact", + ); + traceCompleteness = this.resolveCompleteness(); + missingExpectedEventKinds = [...this.expectedKinds].filter( + (kind) => !this.directKinds.has(kind), + ); + this.applyTraceCompleteness(traceCompleteness); + summary = { + ...summary, + traceCompleteness, + missingExpectedEventKinds, + rawArtifactRefs: [...this.rawArtifactRefs], + }; + } } else if (!persistedArtifact.ok) { this.warnings.push(persistedArtifact.error); this.pushDerived( @@ -942,11 +979,23 @@ export class AgentTraceCollector { ); return; } + const rawType = + typeof parsed?.type === "string" ? 
parsed.type : "structured"; + const previousRawEventId = this.currentRawEventId; + this.currentRawEventId = this.nextRawEventId(rawType); if (this.agentFamily === "pi") { - this.processPiEvent(parsed); + try { + this.processPiEvent(parsed); + } finally { + this.currentRawEventId = previousRawEventId; + } return; } - this.processGenericStructuredEvent(parsed); + try { + this.processGenericStructuredEvent(parsed); + } finally { + this.currentRawEventId = previousRawEventId; + } } private processGenericStructuredEvent(parsed: any) { @@ -956,10 +1005,9 @@ export class AgentTraceCollector { if (this.agentFamily === "claude-code" && rawType === "assistant") { const text = this.extractGenericMessageText(parsed?.message ?? parsed); if (typeof text === "string" && text) { - this.appendAssistantText(text); this.pushObserved( - "assistant.text.delta", - { text }, + "message.update", + this.extractGenericMessagePayload(parsed?.message ?? parsed), parsed, rawType, ); @@ -1131,8 +1179,8 @@ export class AgentTraceCollector { return; } - if (rawType === "turn.completed" && parsed?.usage) { - const usage = normalizeTokenUsage(parsed.usage); + if (rawType === "turn.completed") { + const usage = normalizeTokenUsage(parsed?.usage); if (usage) { this.pushObserved("usage", usage, parsed, rawType); } @@ -1313,6 +1361,7 @@ export class AgentTraceCollector { if (this.agentFamily === "pi") return; if (event.type === "ToolCallStarted") { + const rawEventId = this.nextRawEventId(event.type); this.pushDerived( "tool.execution.start", { @@ -1321,10 +1370,13 @@ export class AgentTraceCollector { }, event, event.type, + true, + rawEventId, ); this.expectedKinds.add("tool.execution.end"); } if (event.type === "ToolCallFinished") { + const rawEventId = this.nextRawEventId(event.type); this.pushDerived( "tool.execution.end", { @@ -1334,9 +1386,12 @@ export class AgentTraceCollector { }, event, event.type, + true, + rawEventId, ); } if (event.type === "TokenUsageReported") { + const rawEventId = 
this.nextRawEventId(event.type); this.pushDerived( "usage", { @@ -1350,6 +1405,8 @@ export class AgentTraceCollector { }, event, event.type, + true, + rawEventId, ); } } @@ -1420,6 +1477,7 @@ export class AgentTraceCollector { observed: boolean, rawType?: string, direct = true, + rawEventId?: string, ) { const redactedPayload = redactValue(payload); const redactedRaw = redactValue(raw); @@ -1441,6 +1499,11 @@ export class AgentTraceCollector { agentFamily: this.agentFamily, captureMode: this.captureMode, rawType, + rawEventId: + rawEventId ?? + (observed + ? (this.currentRawEventId ?? this.nextRawEventId(rawType ?? kind)) + : undefined), observed, }, traceCompleteness: "partial-observed", @@ -1464,8 +1527,9 @@ export class AgentTraceCollector { payload: Record | null, raw: unknown, rawType?: string, + rawEventId?: string, ) { - this.push(kind, payload, raw, true, rawType); + this.push(kind, payload, raw, true, rawType, true, rawEventId); } private pushDerived( @@ -1474,8 +1538,9 @@ export class AgentTraceCollector { raw: unknown, rawType?: string, direct = true, + rawEventId?: string, ) { - this.push(kind, payload, raw, false, rawType, direct); + this.push(kind, payload, raw, false, rawType, direct, rawEventId); } private applyTraceCompleteness(traceCompleteness: TraceCompleteness) { @@ -1484,6 +1549,10 @@ export class AgentTraceCollector { } } + private nextRawEventId(rawType: string) { + return `${rawType}:${this.rawEventSequence++}`; + } + private async persistNdjson( summary: AgentTraceSummary, ): Promise<{ ok: true; file?: string } | { ok: false; error: string }> { diff --git a/tests/agent-trace.test.tsx b/tests/agent-trace.test.tsx index 8ac98245..6f189899 100644 --- a/tests/agent-trace.test.tsx +++ b/tests/agent-trace.test.tsx @@ -4,7 +4,10 @@ import { mkdtempSync, readFileSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { Workflow, Task, runWorkflow } from "../src"; -import { 
canonicalTraceEventToOtelLogRecord } from "../src/agent-trace"; +import { + AgentTraceCollector, + canonicalTraceEventToOtelLogRecord, +} from "../src/agent-trace"; import { CodexAgent } from "../src/agents"; import { SmithersDb } from "../src/db/adapter"; import { runPromise } from "../src/effect/runtime"; @@ -281,6 +284,12 @@ describe("agent trace capture", () => { expect(traceEvents.some((event: any) => event.event.kind === "usage")).toBe( true, ); + const finalMessage = traceEvents.find( + (event: any) => event.event.kind === "assistant.message.final", + ); + const usage = traceEvents.find((event: any) => event.event.kind === "usage"); + expect(finalMessage?.source.rawEventId).toBeTruthy(); + expect(finalMessage?.source.rawEventId).toBe(usage?.source.rawEventId); cleanup(); }); @@ -449,10 +458,17 @@ describe("agent trace capture", () => { expect( traceEvents.some( (event: any) => - event.event.kind === "assistant.text.delta" && + event.event.kind === "message.update" && event.payload.text === "claude real final", ), ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "claude real final", + ), + ).toBe(false); expect( traceEvents.some( (event: any) => @@ -476,6 +492,20 @@ describe("agent trace capture", () => { event.source.rawType === "result", ), ).toBe(true); + const assistantUpdate = traceEvents.find( + (event: any) => + event.event.kind === "message.update" && + event.source.rawType === "assistant", + ); + const assistantUsage = traceEvents.find( + (event: any) => + event.event.kind === "usage" && + event.source.rawType === "assistant", + ); + expect(assistantUpdate?.source.rawEventId).toBeTruthy(); + expect(assistantUpdate?.source.rawEventId).toBe( + assistantUsage?.source.rawEventId, + ); cleanup(); }); @@ -554,6 +584,85 @@ describe("agent trace capture", () => { true, ); expect(summary.unsupportedEventKinds).not.toContain("assistant.text.delta"); + const turnUsage = 
traceEvents.find( + (event: any) => event.event.kind === "usage", + ); + const turnEnd = traceEvents.find( + (event: any) => event.event.kind === "turn.end", + ); + const finalMessage = traceEvents.find( + (event: any) => event.event.kind === "assistant.message.final", + ); + expect(turnUsage?.source.rawEventId).toBeTruthy(); + expect(turnUsage?.source.rawEventId).toBe(turnEnd?.source.rawEventId); + expect(turnUsage?.source.rawEventId).toBe(finalMessage?.source.rawEventId); + cleanup(); + }); + + test("treats Codex turn.completed without usage as a terminal structured event", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-codex-turn-completed-no-usage"; + + class CodexNoUsageAgentFake { + id = "codex-no-usage-fake"; + opts = { outputFormat: "stream-json", json: true }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "turn.started", + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn.completed", + message: { role: "assistant", content: "codex final without usage" }, + }) + "\n", + ); + return { + text: '{"answer":"codex final without usage"}', + output: { answer: "codex final without usage" }, + }; + } + } + + const workflow = smithers(() => ( + + + codex without usage please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! 
+ .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.traceCompleteness).toBe("full-observed"); + expect( + traceEvents.some((event: any) => event.event.kind === "turn.end"), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.message.final" && + event.payload.text === "codex final without usage", + ), + ).toBe(true); + cleanup(); }); @@ -1097,6 +1206,9 @@ describe("agent trace capture", () => { ); expect(toolStart?.payload.toolCallId).toBe("bash:1"); expect(toolEnd?.payload.toolCallId).toBe("bash:1"); + expect(toolStart?.source.rawEventId).toBeTruthy(); + expect(toolEnd?.source.rawEventId).toBeTruthy(); + expect(toolStart?.source.rawEventId).not.toBe(toolEnd?.source.rawEventId); cleanup(); }); @@ -1157,6 +1269,11 @@ describe("agent trace capture", () => { event.payload.reason === "truncated-json-stream", ), ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "assistant.message.final", + ), + ).toBe(false); cleanup(); }); @@ -1246,6 +1363,82 @@ describe("agent trace capture", () => { cleanup(); }); + test("records artifact rewrite failures as capture warnings without failing the run", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-artifact-rewrite-fail"; + const originalRewriteNdjson = (AgentTraceCollector.prototype as any).rewriteNdjson; + (AgentTraceCollector.prototype as any).rewriteNdjson = async function () { + throw new Error("rewrite exploded"); + }; + + class PiAgentRewriteFake { + id = "pi-rewrite-fake"; + opts = { mode: "json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.(JSON.stringify({ type: "session", id: "sess-1" }) + "\n"); + args.onStdout?.(JSON.stringify({ type: "turn_start" }) + 
"\n"); + args.onStdout?.( + JSON.stringify({ + type: "message_end", + message: { + role: "assistant", + content: [{ type: "text", text: "rewrite final" }], + }, + }) + "\n", + ); + return { + text: '{"answer":"rewrite final"}', + output: { answer: "rewrite final" }, + }; + } + } + + try { + const workflow = smithers(() => ( + + + rewrite fail please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.traceCompleteness).toBe("partial-observed"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "capture.warning" && + event.payload.reason === "artifact-rewrite-failed", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "artifact.created", + ), + ).toBe(true); + } finally { + (AgentTraceCollector.prototype as any).rewriteNdjson = originalRewriteNdjson; + cleanup(); + } + }); + test("records persisted trace artifacts with a canonical artifact.created event", async () => { const { smithers, outputs, db, cleanup } = createTestSmithers({ result: z.object({ answer: z.string() }), @@ -1339,6 +1532,7 @@ describe("agent trace capture", () => { agentFamily: "pi", captureMode: "cli-json-stream", rawType: "thinking_delta", + rawEventId: "thinking_delta:0", observed: true, }, traceCompleteness: "full-observed", @@ -1358,6 +1552,7 @@ describe("agent trace capture", () => { expect(record.attributes["node.id"]).toBe("pi-rich-trace"); expect(record.attributes["node.attempt"]).toBe(1); expect(record.attributes["event.kind"]).toBe("assistant.thinking.delta"); + 
expect(record.attributes["source.raw_event_id"]).toBe("thinking_delta:0"); expect(record.attributes["custom.ticket"]).toBe("OBS-1"); expect(record.body).toContain("REDACTED_SECRET"); expect(record.body).not.toContain("sk_live_"); From 11a0c298b4c66cbcda2e239b0e7995f00d1cd902 Mon Sep 17 00:00:00 2001 From: Samuel Huber Date: Sun, 29 Mar 2026 01:20:36 +0100 Subject: [PATCH 6/7] fix(observability): verify Claude message update traces --- scripts/verify-observability.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/verify-observability.sh b/scripts/verify-observability.sh index 54219ee2..e4e1f194 100755 --- a/scripts/verify-observability.sh +++ b/scripts/verify-observability.sh @@ -134,8 +134,8 @@ retry_until loki-tool-end 6 '.streams >= 1' \ retry_until loki-sdk-final-only 6 '.streams >= 1' \ "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"sdk-final-only\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" -retry_until loki-claude-text-delta 6 '.streams >= 1' \ - "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"claude-structured-trace\" | event_kind=\"assistant.text.delta\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" +retry_until loki-claude-message-update 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"claude-structured-trace\" | event_kind=\"message.update\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" retry_until loki-gemini-text-delta 6 '.streams >= 1' \ "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"gemini-structured-trace\" | event_kind=\"assistant.text.delta\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" @@ -211,7 +211,7 @@ append_json loki-tool-start "$OUT_DIR/loki-tool-start.json" append_json loki-tool-update "$OUT_DIR/loki-tool-update.json" 
append_json loki-tool-end "$OUT_DIR/loki-tool-end.json" append_json loki-sdk-final-only "$OUT_DIR/loki-sdk-final-only.json" -append_json loki-claude-text-delta "$OUT_DIR/loki-claude-text-delta.json" +append_json loki-claude-message-update "$OUT_DIR/loki-claude-message-update.json" append_json loki-gemini-text-delta "$OUT_DIR/loki-gemini-text-delta.json" append_json loki-claude-usage "$OUT_DIR/loki-claude-usage.json" append_json loki-gemini-usage "$OUT_DIR/loki-gemini-usage.json" From ae12acc38369e0a40d6bbafd56967cad3b10435c Mon Sep 17 00:00:00 2001 From: Samuel Huber Date: Sun, 29 Mar 2026 01:31:27 +0100 Subject: [PATCH 7/7] refactor(agent-trace): split structured normalization from collector --- src/agent-trace.ts | 825 +++++++++++++++++++++---------------- src/agents/CodexAgent.ts | 1 + tests/agent-trace.test.tsx | 245 ++++++++++- 3 files changed, 704 insertions(+), 367 deletions(-) diff --git a/src/agent-trace.ts b/src/agent-trace.ts index e0609765..5c509a03 100644 --- a/src/agent-trace.ts +++ b/src/agent-trace.ts @@ -493,6 +493,18 @@ type MappedStructuredEvent = { expect?: CanonicalAgentTraceEventKind; }; +type NormalizedTraceEvent = { + kind: CanonicalAgentTraceEventKind; + payload: Record | null; + raw: unknown; + rawType?: string; +}; + +type NormalizedTraceBatch = { + events: NormalizedTraceEvent[]; + expectedKinds?: CanonicalAgentTraceEventKind[]; +}; + const piSimpleEventMap: Record = { session: { kind: "session.start", payloadKind: "pi" }, agent_start: { kind: "session.start", payloadKind: "pi" }, @@ -566,6 +578,419 @@ const genericStructuredEventMap: Record = { "tool_result.completed": { kind: "tool.result", payloadKind: "tool" }, }; +function extractGenericDeltaText(parsed: any): string | undefined { + const candidates = [ + parsed?.delta?.text, + parsed?.delta, + parsed?.text, + parsed?.content_block?.text, + parsed?.contentBlock?.text, + parsed?.message?.text, + parsed?.message?.content, + parsed?.output_text, + ]; + for (const candidate of 
candidates) { + if (typeof candidate === "string" && candidate) return candidate; + } + return undefined; +} + +function extractGenericMessageText(parsed: any): string | undefined { + return extractTextFromJsonValue( + parsed?.message ?? parsed?.response ?? parsed?.item ?? parsed, + ); +} + +function extractGenericMessagePayload(parsed: any): Record { + const payload: Record = {}; + const role = + parsed?.message?.role ?? parsed?.role ?? parsed?.response?.role; + if (typeof role === "string") payload.role = role; + const text = extractGenericMessageText(parsed); + if (text) payload.text = text; + if (parsed?.id) payload.id = parsed.id; + return payload; +} + +function extractGenericToolPayload(parsed: any): Record { + const tool = + parsed?.tool ?? + parsed?.toolCall ?? + parsed?.tool_call ?? + parsed?.toolExecution ?? + parsed; + return { + toolCallId: tool?.id ?? tool?.toolCallId ?? parsed?.id, + toolName: tool?.name ?? tool?.toolName ?? parsed?.toolName, + argsPreview: tool?.args ?? tool?.arguments ?? parsed?.args, + resultPreview: tool?.result ?? parsed?.result, + isError: Boolean(tool?.isError ?? parsed?.isError ?? 
parsed?.error), + }; +} + +function extractPiPayload(parsed: any): Record { + const payload: Record = {}; + if (parsed?.message?.role) payload.role = parsed.message.role; + const text = extractGenericMessageText(parsed?.message); + if (text) payload.text = text; + if (parsed?.id) payload.id = parsed.id; + return payload; +} + +function extractMappedPayload( + parsed: any, + payloadKind: PayloadKind, +): Record | null { + if (payloadKind === "message") return extractGenericMessagePayload(parsed); + if (payloadKind === "tool") return extractGenericToolPayload(parsed); + if (payloadKind === "pi") return extractPiPayload(parsed); + return {}; +} + +function buildNormalizedEvent( + kind: CanonicalAgentTraceEventKind, + payload: Record | null, + raw: unknown, + rawType?: string, +): NormalizedTraceEvent { + return { kind, payload, raw, rawType }; +} + +function normalizeMappedEvent( + parsed: any, + rawType: string, + mapped: MappedStructuredEvent, +): NormalizedTraceBatch { + return { + events: [ + buildNormalizedEvent( + mapped.kind, + extractMappedPayload(parsed, mapped.payloadKind), + parsed, + rawType, + ), + ], + expectedKinds: mapped.expect ? [mapped.expect] : undefined, + }; +} + +function normalizeClaudeStructuredEvent( + parsed: any, + rawType: string, +): NormalizedTraceBatch | null { + if (rawType === "assistant") { + const text = extractGenericMessageText(parsed?.message ?? parsed); + const events = text + ? [ + buildNormalizedEvent( + "message.update", + extractGenericMessagePayload(parsed?.message ?? 
parsed), + parsed, + rawType, + ), + ] + : [ + buildNormalizedEvent("stdout", { eventType: rawType }, parsed, rawType), + ]; + const usage = normalizeTokenUsage(parsed?.message?.usage); + if (usage) events.push(buildNormalizedEvent("usage", usage, parsed, rawType)); + return { events }; + } + + if (rawType === "result") { + const events: NormalizedTraceEvent[] = []; + const usage = normalizeTokenUsage(parsed?.usage); + if (usage) events.push(buildNormalizedEvent("usage", usage, parsed, rawType)); + const text = extractGenericMessageText(parsed); + if (text) { + events.push( + buildNormalizedEvent("assistant.message.final", { text }, parsed, rawType), + ); + } + return events.length > 0 ? { events } : null; + } + + return null; +} + +function normalizeGeminiStructuredEvent( + parsed: any, + rawType: string, +): NormalizedTraceBatch | null { + if (rawType === "message") { + const text = extractGenericMessageText(parsed); + if (parsed?.role === "assistant" && typeof text === "string" && text) { + return { + events: [ + buildNormalizedEvent( + parsed?.delta ? "assistant.text.delta" : "assistant.message.final", + { text }, + parsed, + rawType, + ), + ], + }; + } + } + + if (rawType === "result" && parsed?.stats) { + const usage = normalizeTokenUsage(parsed.stats); + return usage + ? 
{ events: [buildNormalizedEvent("usage", usage, parsed, rawType)] } + : null; + } + + return null; +} + +function normalizeCodexStructuredEvent( + parsed: any, + rawType: string, +): NormalizedTraceBatch | null { + if (rawType === "thread.started") { + return { + events: [buildNormalizedEvent("stdout", { eventType: rawType }, parsed, rawType)], + }; + } + + if (rawType === "turn.started") { + return { + events: [buildNormalizedEvent("turn.start", {}, parsed, rawType)], + expectedKinds: ["turn.end"], + }; + } + + if (rawType === "item.completed" && parsed?.item?.type === "agent_message") { + const text = extractGenericMessageText(parsed.item); + if (typeof text === "string" && text) { + return { + events: [ + buildNormalizedEvent( + "assistant.message.final", + { text }, + parsed, + rawType, + ), + ], + }; + } + } + + if (rawType === "turn.completed") { + const events: NormalizedTraceEvent[] = []; + const usage = normalizeTokenUsage(parsed?.usage); + if (usage) events.push(buildNormalizedEvent("usage", usage, parsed, rawType)); + events.push(buildNormalizedEvent("turn.end", {}, parsed, rawType)); + const text = extractGenericMessageText(parsed); + if (text) { + events.push( + buildNormalizedEvent("assistant.message.final", { text }, parsed, rawType), + ); + } + return { events }; + } + + return null; +} + +function normalizePiStructuredEvent( + parsed: any, + rawType: string, +): NormalizedTraceBatch | null { + const simple = piSimpleEventMap[rawType]; + if (simple) return normalizeMappedEvent(parsed, rawType, simple); + + if (rawType === "turn_end") { + const events: NormalizedTraceEvent[] = [ + buildNormalizedEvent("turn.end", extractPiPayload(parsed), parsed, rawType), + ]; + const text = extractGenericMessageText(parsed?.message); + if (text) { + events.push( + buildNormalizedEvent( + "assistant.message.final", + { text }, + parsed?.message, + rawType, + ), + ); + } + const usage = normalizeTokenUsage(parsed?.message?.usage); + if (usage) 
events.push(buildNormalizedEvent("usage", usage, parsed.message.usage, "usage")); + return { events }; + } + + if (rawType === "message_end") { + const events: NormalizedTraceEvent[] = [ + buildNormalizedEvent("message.end", extractPiPayload(parsed), parsed, rawType), + ]; + const text = extractGenericMessageText(parsed?.message); + if (parsed?.message?.role === "assistant" && text) { + events.push( + buildNormalizedEvent( + "assistant.message.final", + { text }, + parsed?.message, + rawType, + ), + ); + } + return { events }; + } + + if (rawType === "message_update") { + const evt = parsed?.assistantMessageEvent; + if (evt?.type === "text_delta" && typeof evt.delta === "string") { + return { + events: [ + buildNormalizedEvent( + "assistant.text.delta", + { text: evt.delta }, + parsed, + evt.type, + ), + ], + }; + } + if ( + (evt?.type === "thinking_delta" || evt?.type === "reasoning_delta") && + typeof evt.delta === "string" + ) { + return { + events: [ + buildNormalizedEvent( + "assistant.thinking.delta", + { text: evt.delta }, + parsed, + evt.type, + ), + ], + }; + } + return { + events: [ + buildNormalizedEvent("message.update", extractPiPayload(parsed), parsed, rawType), + ], + }; + } + + return { + events: [buildNormalizedEvent("stdout", { eventType: rawType }, parsed, rawType)], + }; +} + +function normalizeSharedStructuredEvent( + parsed: any, + rawType: string, +): NormalizedTraceBatch | null { + const mapped = genericStructuredEventMap[rawType]; + if (mapped) return normalizeMappedEvent(parsed, rawType, mapped); + + if ( + [ + "message_delta", + "assistant_message.delta", + "assistant_message_delta", + "response.output_text.delta", + "content_block_delta", + ].includes(rawType) + ) { + const text = extractGenericDeltaText(parsed); + if (typeof text === "string" && text) { + return { + events: [ + buildNormalizedEvent("assistant.text.delta", { text }, parsed, rawType), + ], + }; + } + } + + if ( + [ + "thinking_delta", + "reasoning_delta", + 
"response.reasoning.delta", + ].includes(rawType) + ) { + const text = extractGenericDeltaText(parsed); + if (typeof text === "string" && text) { + return { + events: [ + buildNormalizedEvent( + "assistant.thinking.delta", + { text }, + parsed, + rawType, + ), + ], + }; + } + } + + if ( + [ + "message_end", + "assistant_message_end", + "response.completed", + "message_stop", + ].includes(rawType) + ) { + const events: NormalizedTraceEvent[] = [ + buildNormalizedEvent( + "message.end", + extractGenericMessagePayload(parsed), + parsed, + rawType, + ), + ]; + const text = extractGenericMessageText(parsed); + if (text) { + events.push( + buildNormalizedEvent("assistant.message.final", { text }, parsed, rawType), + ); + } + const usage = normalizeTokenUsage(parsed?.usage); + if (usage) events.push(buildNormalizedEvent("usage", usage, parsed, rawType)); + return { events }; + } + + return null; +} + +function normalizeStructuredEvent( + agentFamily: AgentFamily, + parsed: any, + rawType: string, +): NormalizedTraceBatch { + if (agentFamily === "pi") { + return normalizePiStructuredEvent(parsed, rawType) ?? 
{ + events: [buildNormalizedEvent("stdout", { eventType: rawType }, parsed, rawType)], + }; + } + + if (agentFamily === "claude-code") { + const normalized = normalizeClaudeStructuredEvent(parsed, rawType); + if (normalized) return normalized; + } + + if (agentFamily === "gemini") { + const normalized = normalizeGeminiStructuredEvent(parsed, rawType); + if (normalized) return normalized; + } + + if (agentFamily === "codex") { + const normalized = normalizeCodexStructuredEvent(parsed, rawType); + if (normalized) return normalized; + } + + const shared = normalizeSharedStructuredEvent(parsed, rawType); + if (shared) return shared; + + return { + events: [buildNormalizedEvent("stdout", { eventType: rawType }, parsed, rawType)], + }; +} + export function canonicalTraceEventToOtelLogRecord( event: CanonicalAgentTraceEvent, context?: { agentId?: string; model?: string }, @@ -983,328 +1408,15 @@ export class AgentTraceCollector { typeof parsed?.type === "string" ? parsed.type : "structured"; const previousRawEventId = this.currentRawEventId; this.currentRawEventId = this.nextRawEventId(rawType); - if (this.agentFamily === "pi") { - try { - this.processPiEvent(parsed); - } finally { - this.currentRawEventId = previousRawEventId; - } - return; - } try { - this.processGenericStructuredEvent(parsed); + this.emitObservedBatch( + normalizeStructuredEvent(this.agentFamily, parsed, rawType), + ); } finally { this.currentRawEventId = previousRawEventId; } } - private processGenericStructuredEvent(parsed: any) { - const rawType = - typeof parsed?.type === "string" ? parsed.type : "structured"; - - if (this.agentFamily === "claude-code" && rawType === "assistant") { - const text = this.extractGenericMessageText(parsed?.message ?? parsed); - if (typeof text === "string" && text) { - this.pushObserved( - "message.update", - this.extractGenericMessagePayload(parsed?.message ?? 
parsed), - parsed, - rawType, - ); - } else { - this.pushObserved("stdout", { eventType: rawType }, parsed, rawType); - } - const usage = normalizeTokenUsage(parsed?.message?.usage); - if (usage) { - this.pushObserved("usage", usage, parsed, rawType); - } - return; - } - - if (this.agentFamily === "claude-code" && rawType === "result") { - const usage = normalizeTokenUsage(parsed?.usage); - if (usage) { - this.pushObserved("usage", usage, parsed, rawType); - } - const text = this.extractGenericMessageText(parsed); - if (typeof text === "string" && text) { - this.setFinalAssistantText(text); - this.pushObserved( - "assistant.message.final", - { text }, - parsed, - rawType, - ); - } - return; - } - - if (this.agentFamily === "gemini" && rawType === "message") { - const role = parsed?.role; - const text = this.extractGenericMessageText(parsed); - if (role === "assistant" && typeof text === "string" && text) { - if (parsed?.delta) { - this.appendAssistantText(text); - } else { - this.setFinalAssistantText(text); - } - this.pushObserved( - parsed?.delta ? 
"assistant.text.delta" : "assistant.message.final", - { text }, - parsed, - rawType, - ); - return; - } - } - - if (this.agentFamily === "gemini" && rawType === "result" && parsed?.stats) { - const usage = normalizeTokenUsage(parsed.stats); - if (usage) { - this.pushObserved("usage", usage, parsed, rawType); - } - return; - } - - if (this.agentFamily === "codex" && rawType === "thread.started") { - this.pushObserved("stdout", { eventType: rawType }, parsed, rawType); - return; - } - - if (this.agentFamily === "codex" && rawType === "turn.started") { - this.pushObserved("turn.start", {}, parsed, rawType); - this.expectedKinds.add("turn.end"); - return; - } - - if ( - this.agentFamily === "codex" && - rawType === "item.completed" && - parsed?.item?.type === "agent_message" - ) { - const text = this.extractGenericMessageText(parsed.item); - if (typeof text === "string" && text) { - this.setFinalAssistantText(text); - this.pushObserved( - "assistant.message.final", - { text }, - parsed, - rawType, - ); - return; - } - } - - const mapped = genericStructuredEventMap[rawType]; - if (mapped) { - this.pushObserved( - mapped.kind, - this.extractMappedPayload(parsed, mapped.payloadKind), - parsed, - rawType, - ); - if (mapped.expect) this.expectedKinds.add(mapped.expect); - return; - } - - if ( - [ - "message_delta", - "assistant_message.delta", - "assistant_message_delta", - "response.output_text.delta", - "content_block_delta", - ].includes(rawType) - ) { - const text = this.extractGenericDeltaText(parsed); - if (typeof text === "string" && text) { - this.appendAssistantText(text); - this.pushObserved( - "assistant.text.delta", - { text }, - parsed, - rawType, - ); - return; - } - } - - if ( - [ - "thinking_delta", - "reasoning_delta", - "response.reasoning.delta", - ].includes(rawType) - ) { - const text = this.extractGenericDeltaText(parsed); - if (typeof text === "string" && text) { - this.pushObserved( - "assistant.thinking.delta", - { text }, - parsed, - rawType, - ); 
- return; - } - } - - if ( - [ - "message_end", - "assistant_message_end", - "response.completed", - "message_stop", - ].includes(rawType) - ) { - this.pushObserved( - "message.end", - this.extractGenericMessagePayload(parsed), - parsed, - rawType, - ); - const text = this.extractGenericMessageText(parsed); - if (text) { - this.setFinalAssistantText(text); - this.pushObserved( - "assistant.message.final", - { text }, - parsed, - rawType, - ); - } - const usage = normalizeTokenUsage(parsed?.usage); - if (usage) { - this.pushObserved("usage", usage, parsed, rawType); - } - return; - } - - if (rawType === "turn.completed") { - const usage = normalizeTokenUsage(parsed?.usage); - if (usage) { - this.pushObserved("usage", usage, parsed, rawType); - } - if (this.agentFamily === "codex") { - this.pushObserved("turn.end", {}, parsed, rawType); - } - const text = this.extractGenericMessageText(parsed); - if (text) { - this.setFinalAssistantText(text); - this.pushObserved( - "assistant.message.final", - { text }, - parsed, - rawType, - ); - } - return; - } - - this.pushObserved("stdout", { eventType: rawType }, parsed, rawType); - } - - private processPiEvent(parsed: any) { - const type = String(parsed?.type ?? 
"unknown"); - const simple = piSimpleEventMap[type]; - if (simple) { - this.pushObserved( - simple.kind, - this.extractMappedPayload(parsed, simple.payloadKind), - parsed, - type, - ); - if (simple.expect) this.expectedKinds.add(simple.expect); - return; - } - - if (type === "turn_end") { - this.pushObserved("turn.end", this.extractPiPayload(parsed), parsed, type); - const finalText = this.extractGenericMessageText(parsed?.message); - if (finalText) { - this.setFinalAssistantText(finalText); - this.pushObserved( - "assistant.message.final", - { text: finalText }, - parsed?.message, - type, - ); - } - const usage = normalizeTokenUsage(parsed?.message?.usage); - if (usage) { - this.pushObserved("usage", usage, parsed.message.usage, "usage"); - } - return; - } - - if (type === "message_end") { - this.pushObserved("message.end", this.extractPiPayload(parsed), parsed, type); - const finalText = this.extractGenericMessageText(parsed?.message); - if (parsed?.message?.role === "assistant" && finalText) { - this.setFinalAssistantText(finalText); - this.pushObserved( - "assistant.message.final", - { text: finalText }, - parsed?.message, - type, - ); - } - return; - } - - if (type === "message_update") { - const evt = parsed?.assistantMessageEvent; - if (evt?.type === "text_delta" && typeof evt.delta === "string") { - this.appendAssistantText(evt.delta); - this.pushObserved( - "assistant.text.delta", - { text: evt.delta }, - parsed, - evt.type, - ); - return; - } - if ( - (evt?.type === "thinking_delta" || evt?.type === "reasoning_delta") && - typeof evt.delta === "string" - ) { - this.pushObserved( - "assistant.thinking.delta", - { text: evt.delta }, - parsed, - evt.type, - ); - return; - } - this.pushObserved("message.update", this.extractPiPayload(parsed), parsed, type); - return; - } - - this.pushObserved("stdout", { eventType: type }, parsed, type); - } - - private extractGenericDeltaText(parsed: any): string | undefined { - const candidates = [ - parsed?.delta?.text, - 
parsed?.delta, - parsed?.text, - parsed?.content_block?.text, - parsed?.contentBlock?.text, - parsed?.message?.text, - parsed?.message?.content, - parsed?.output_text, - ]; - for (const candidate of candidates) { - if (typeof candidate === "string" && candidate) return candidate; - } - return undefined; - } - - private extractGenericMessageText(parsed: any): string | undefined { - return extractTextFromJsonValue( - parsed?.message ?? parsed?.response ?? parsed?.item ?? parsed, - ); - } - private appendAssistantText(text: string) { this.assistantTextBuffer += text; this.finalText = this.assistantTextBuffer; @@ -1315,42 +1427,6 @@ export class AgentTraceCollector { this.finalText = text; } - private extractGenericMessagePayload(parsed: any): Record { - const payload: Record = {}; - const role = - parsed?.message?.role ?? parsed?.role ?? parsed?.response?.role; - if (typeof role === "string") payload.role = role; - const text = this.extractGenericMessageText(parsed); - if (text) payload.text = text; - if (parsed?.id) payload.id = parsed.id; - return payload; - } - - private extractGenericToolPayload(parsed: any): Record { - const tool = - parsed?.tool ?? - parsed?.toolCall ?? - parsed?.tool_call ?? - parsed?.toolExecution ?? - parsed; - return { - toolCallId: tool?.id ?? tool?.toolCallId ?? parsed?.id, - toolName: tool?.name ?? tool?.toolName ?? parsed?.toolName, - argsPreview: tool?.args ?? tool?.arguments ?? parsed?.args, - resultPreview: tool?.result ?? parsed?.result, - isError: Boolean(tool?.isError ?? parsed?.isError ?? 
parsed?.error), - }; - } - - private extractPiPayload(parsed: any): Record { - const payload: Record = {}; - if (parsed?.message?.role) payload.role = parsed.message.role; - const text = this.extractGenericMessageText(parsed?.message); - if (text) payload.text = text; - if (parsed?.id) payload.id = parsed.id; - return payload; - } - private observeSmithersEvent(event: SmithersEvent) { const sameAttempt = (event as any).runId === this.runId && @@ -1460,16 +1536,6 @@ export class AgentTraceCollector { return "full-observed"; } - private extractMappedPayload( - parsed: any, - payloadKind: PayloadKind, - ): Record | null { - if (payloadKind === "message") return this.extractGenericMessagePayload(parsed); - if (payloadKind === "tool") return this.extractGenericToolPayload(parsed); - if (payloadKind === "pi") return this.extractPiPayload(parsed); - return {}; - } - private push( kind: CanonicalAgentTraceEventKind, payload: Record | null, @@ -1549,6 +1615,41 @@ export class AgentTraceCollector { } } + private emitObservedBatch( + batch: NormalizedTraceBatch, + rawEventId = this.currentRawEventId, + ) { + for (const kind of batch.expectedKinds ?? 
[]) { + this.expectedKinds.add(kind); + } + for (const event of batch.events) { + this.observeNormalizedEvent(event); + this.pushObserved( + event.kind, + event.payload, + event.raw, + event.rawType, + rawEventId, + ); + } + } + + private observeNormalizedEvent(event: NormalizedTraceEvent) { + if ( + event.kind === "assistant.text.delta" && + typeof event.payload?.text === "string" + ) { + this.appendAssistantText(event.payload.text); + return; + } + if ( + event.kind === "assistant.message.final" && + typeof event.payload?.text === "string" + ) { + this.setFinalAssistantText(event.payload.text); + } + } + private nextRawEventId(rawType: string) { return `${rawType}:${this.rawEventSequence++}`; } diff --git a/src/agents/CodexAgent.ts b/src/agents/CodexAgent.ts index d972cbbc..b917b64c 100644 --- a/src/agents/CodexAgent.ts +++ b/src/agents/CodexAgent.ts @@ -16,6 +16,7 @@ type CodexAgentOptions = BaseCliAgentOptions & { disable?: string[]; image?: string[]; model?: string; + outputFormat?: "text" | "json" | "stream-json"; oss?: boolean; localProvider?: string; sandbox?: "read-only" | "workspace-write" | "danger-full-access"; diff --git a/tests/agent-trace.test.tsx b/tests/agent-trace.test.tsx index 6f189899..1a192952 100644 --- a/tests/agent-trace.test.tsx +++ b/tests/agent-trace.test.tsx @@ -8,7 +8,6 @@ import { AgentTraceCollector, canonicalTraceEventToOtelLogRecord, } from "../src/agent-trace"; -import { CodexAgent } from "../src/agents"; import { SmithersDb } from "../src/db/adapter"; import { runPromise } from "../src/effect/runtime"; import { @@ -230,6 +229,80 @@ describe("agent trace capture", () => { cleanup(); }); + test("preserves Pi message.update fallback and tool updates from one transcript", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-pi-update-fallback"; + + const piLikeAgent: any = { + id: "pi-agent-update-fallback", + opts: { mode: 
"json" }, + generate: async (args: { onStdout?: (text: string) => void }) => { + args.onStdout?.(JSON.stringify({ type: "session", id: "sess-2" }) + "\n"); + args.onStdout?.( + JSON.stringify({ + type: "message_update", + message: { role: "assistant", content: [{ type: "text", text: "partial state" }] }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_update", + toolExecution: { id: "tool-2", name: "bash", result: { status: "running" } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn_end", + message: { + role: "assistant", + content: [{ type: "text", text: "Pi final update" }], + }, + }) + "\n", + ); + args.onStdout?.(JSON.stringify({ type: "agent_end" }) + "\n"); + return { + text: '{"answer":"Pi final update"}', + output: { answer: "Pi final update" }, + }; + }, + }; + + const workflow = smithers(() => ( + + + pi update fallback please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect( + traceEvents.some( + (event: any) => + event.event.kind === "message.update" && + event.payload.text === "partial state", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "tool.execution.update" && + event.payload.toolCallId === "tool-2", + ), + ).toBe(true); + + cleanup(); + }); + test("truthfully classifies sdk agents as final-only", async () => { const { smithers, outputs, db, cleanup } = createTestSmithers({ result: z.object({ answer: z.string() }), @@ -510,6 +583,82 @@ describe("agent trace capture", () => { cleanup(); }); + test("preserves Claude structured tool lifecycle with shared raw event ids", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ 
answer: z.string() }), + }); + const runId = "agent-trace-claude-tool-lifecycle"; + + class ClaudeToolLifecycleFake { + id = "claude-tool-lifecycle-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "tool_call.started", + toolCall: { id: "tool-claude-1", name: "read", args: { path: "README.md" } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_call.completed", + toolCall: { id: "tool-claude-1", name: "read", result: { ok: true } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message_end", + message: { role: "assistant", content: "claude tool final" }, + }) + "\n", + ); + return { + text: '{"answer":"claude tool final"}', + output: { answer: "claude tool final" }, + }; + } + } + + const workflow = smithers(() => ( + + + structured claude tool lifecycle please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + const toolStart = traceEvents.find( + (event: any) => event.event.kind === "tool.execution.start", + ); + const toolEnd = traceEvents.find( + (event: any) => event.event.kind === "tool.execution.end", + ); + + expect(toolStart?.payload.toolCallId).toBe("tool-claude-1"); + expect(toolEnd?.payload.toolCallId).toBe("tool-claude-1"); + expect(toolStart?.source.rawEventId).not.toBe(toolEnd?.source.rawEventId); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.message.final" && + event.payload.text === "claude tool final", + ), + ).toBe(true); + + cleanup(); + }); + test("preserves structured Codex completion usage and final message truthfully", async () => { const { smithers, outputs, db, cleanup } = 
createTestSmithers({ result: z.object({ answer: z.string() }), @@ -666,16 +815,102 @@ describe("agent trace capture", () => { cleanup(); }); + test("preserves Codex structured tool lifecycle from one transcript", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-codex-tool-lifecycle"; + + class CodexToolLifecycleFake { + id = "codex-tool-lifecycle-fake"; + opts = { outputFormat: "stream-json", json: true }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "tool_call.started", + toolCall: { id: "codex-tool-1", name: "grep", args: { pattern: "trace" } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_call.delta", + toolCall: { id: "codex-tool-1", name: "grep", result: { status: "running" } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_call.completed", + toolCall: { id: "codex-tool-1", name: "grep", result: { ok: true } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn.completed", + message: { role: "assistant", content: "codex tool final" }, + }) + "\n", + ); + return { + text: '{"answer":"codex tool final"}', + output: { answer: "codex tool final" }, + }; + } + } + + const workflow = smithers(() => ( + + + codex tool lifecycle please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect( + traceEvents.some( + (event: any) => + event.event.kind === "tool.execution.start" && + event.payload.toolCallId === "codex-tool-1", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "tool.execution.update" && + 
event.payload.toolCallId === "codex-tool-1", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "tool.execution.end" && + event.payload.toolCallId === "codex-tool-1", + ), + ).toBe(true); + + cleanup(); + }); + test("captures real Codex dotted jsonl events as structured trace instead of cli-text fallback", async () => { const { smithers, outputs, db, cleanup } = createTestSmithers({ result: z.object({ answer: z.string() }), }); const runId = "agent-trace-codex-dotted-jsonl"; - class CodexJsonlAgentFake extends CodexAgent { - constructor() { - super({ skipGitRepoCheck: true }); - } + class CodexJsonlAgentFake { + readonly id = "codex-jsonl-fake"; + readonly opts = { outputFormat: "stream-json" as const, json: true }; + readonly family = "codex"; async generate(args: { onStdout?: (text: string) => void }) { args.onStdout?.(