diff --git a/.gitignore b/.gitignore index 8c1b2ea6..4f57d598 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,8 @@ deno.lock reference/ codex/ ghostty/ +observability/data/ +tmp/verification/ # Worktree script SQLite artifacts scripts/worktree-feature/*.db* diff --git a/docs/guides/agent-trace-otel-verification.mdx b/docs/guides/agent-trace-otel-verification.mdx new file mode 100644 index 00000000..2838d1eb --- /dev/null +++ b/docs/guides/agent-trace-otel-verification.mdx @@ -0,0 +1,200 @@ +--- +title: Agent Trace OTEL Log Verification +description: Reproducible local verification for canonical agent trace OTEL log export with Loki. +--- + +## Local stack + +Start the local stack: + +```bash +smithers observability --detach +``` + +Expected endpoints: + +- Grafana: `http://localhost:3001` +- Prometheus: `http://localhost:9090` +- Tempo: `http://localhost:3200` +- Loki: `http://localhost:3100` +- OTEL collector HTTP: `http://localhost:4318` + +Validate the stack: + +```bash +docker compose -f observability/docker-compose.otel.yml ps +curl -sf http://localhost:3100/ready +curl -sf http://localhost:3001/api/datasources | jq 'map({name,type,url})' +docker logs observability-otel-collector-1 | tail -n 80 +``` + +## Enable OTEL export + +```bash +export SMITHERS_OTEL_ENABLED=1 +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 +export OTEL_SERVICE_NAME=smithers-dev +export SMITHERS_LOG_FORMAT=json +``` + +## Demo workflow + +Use the built-in reproducible workflow at `workflows/agent-trace-otel-demo.tsx`. 
+It emits: + +- one Pi-like high-fidelity attempt with text deltas, thinking deltas, tool lifecycle, usage, and a redaction fixture +- one SDK-style final-only attempt +- stable run annotations for Loki queries + +Run the success case: + +```bash +smithers run workflows/agent-trace-otel-demo.tsx \ + --run-id agent-trace-otel-demo \ + --annotations '{"custom.demo":true,"custom.ticket":"OBS-123"}' +``` + +Run the malformed JSON failure case: + +```bash +smithers run workflows/agent-trace-otel-demo.tsx \ + --run-id agent-trace-otel-demo-fail \ + --input '{"failureMode":"malformed-json"}' \ + --annotations '{"custom.demo":true,"custom.ticket":"OBS-ERR"}' +``` + +Optional durable-local verification: + +```bash +jq 'select(.type == "AgentTraceEvent" or .type == "AgentTraceSummary")' \ + .smithers/executions/agent-trace-otel-demo/logs/stream.ndjson + +find .smithers/executions/agent-trace-otel-demo/logs/agent-trace -type f -maxdepth 1 -print +``` + +## Loki queries + +Smithers OTEL attributes are exposed to Loki as sanitized structured metadata fields such as: + +- `run_id` +- `workflow_path` +- `node_id` +- `node_attempt` +- `agent_family` +- `agent_capture_mode` +- `trace_completeness` +- `event_kind` + +Use `{service_name="smithers-dev"}` as the stream selector, then filter on structured metadata in the LogQL pipeline. Use `| json` to inspect the structured log body. 
+
All events for one run:

```logql
{service_name="smithers-dev"} | run_id="agent-trace-otel-demo"
```

One node attempt:

```logql
{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | node_id="pi-rich-trace" | node_attempt="1"
```

Thinking deltas only:

```logql
{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | event_kind="assistant.thinking.delta"
```

Tool execution only (note the backtick string: in LogQL, `\.` inside a double-quoted string is an invalid escape sequence, so regex matchers should use raw backtick strings):

```logql
{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | event_kind=~`tool\.execution\..*`
```

Capture errors only:

```logql
{service_name="smithers-dev"} | event_kind="capture.error"
```

Inspect the structured JSON body for one run:

```logql
{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | json
```

Redaction proof query:

```logql
{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" |= "REDACTED"
```

## API query examples

Equivalent direct Loki API checks:

```bash
curl -sG 'http://localhost:3100/loki/api/v1/query_range' \
  --data-urlencode 'query={service_name="smithers-dev"} | run_id="agent-trace-otel-demo"' \
  --data-urlencode 'limit=200' | jq '.data.result[] | {stream, values: (.values | length)}'

curl -sG 'http://localhost:3100/loki/api/v1/query_range' \
  --data-urlencode 'query={service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | event_kind="assistant.thinking.delta"' \
  --data-urlencode 'limit=20' | jq '.data.result[]?.values[]?[1]'
```

## Tempo trace checks

Tempo search should show Smithers spans once a workflow has run:

```bash
curl -s http://localhost:3200/api/search | jq .
curl -s http://localhost:3200/api/search/tags | jq .
curl -s 'http://localhost:3200/api/search/tag/service.name/values' | jq .
curl -s 'http://localhost:3200/api/search/tag/runId/values' | jq . 
+``` + +Inspect one trace directly: + +```bash +TRACE_ID=$(curl -s http://localhost:3200/api/search \ + | jq -r '.traces[] | select(.rootTraceName=="engine:run-workflow") | .traceID' \ + | head -n 1) + +curl -s http://localhost:3200/api/traces/$TRACE_ID | jq . +``` + +Expected trace attributes include at least: + +- `service.name = smithers-dev` +- `runId = ` +- `workflowPath = ` + +## Verification checklist + +- stack starts successfully in Docker +- Loki is present and queryable +- collector logs pipeline is active +- Pi traces show text deltas, thinking deltas, tool execution lifecycle, final message, usage, and run/node/attempt correlation +- second agent family is exported with truthful `final-only` completeness classification +- Gemini `stream-json` attempts preserve structured deltas truthfully +- Kimi `stream-json` attempts preserve structured tool lifecycle truthfully +- malformed or truncated structured streams emit `capture.error` and classify as `capture-failed` +- artifact write failures emit `capture.warning` and degrade to `partial-observed` without losing durable DB truth +- Tempo search shows Smithers spans and trace attributes including `runId` +- Prometheus is still scraping the collector successfully +- secrets are redacted from canonical events, OTEL log bodies, and persisted trace artifacts + +## Automated coverage + +The fixture-style and workflow-backed tests in `tests/agent-trace.test.tsx` now cover: + +- Pi rich success and malformed JSON failure +- SDK truthful `final-only` +- Claude Code structured stream preservation +- Codex structured delta + usage preservation +- Gemini structured `stream-json` and coarse `cli-json` classification +- Kimi structured `stream-json` tool lifecycle preservation +- truncated structured stream failure classification +- artifact write failure classification +- OTEL body/attribute shaping for Loki query fields diff --git a/docs/guides/monitoring-logs.mdx b/docs/guides/monitoring-logs.mdx index 
9e355de9..ea086c94 100644 --- a/docs/guides/monitoring-logs.mdx +++ b/docs/guides/monitoring-logs.mdx @@ -58,8 +58,9 @@ That starts: - OTLP collector on `http://localhost:4318` - Prometheus on `http://localhost:9090` -- Grafana on `http://localhost:3000` +- Grafana on `http://localhost:3001` - Tempo on `http://localhost:3200` +- Loki on `http://localhost:3100` The collector exports metrics on `:8889`, Prometheus scrapes that endpoint, and Grafana comes pre-provisioned with a Smithers dashboard. diff --git a/observability/docker-compose.otel.yml b/observability/docker-compose.otel.yml index 7dd0e55d..510b3f23 100644 --- a/observability/docker-compose.otel.yml +++ b/observability/docker-compose.otel.yml @@ -2,7 +2,8 @@ version: "3.9" services: otel-collector: - image: otel/opentelemetry-collector-contrib:latest + image: otel/opentelemetry-collector-contrib:0.148.0 + restart: unless-stopped command: ["--config", "/etc/otelcol/config.yml"] volumes: - ./otel-collector-config.yml:/etc/otelcol/config.yml:ro @@ -11,28 +12,71 @@ services: - "4318:4318" # HTTP (Smithers sends here) - "8889:8889" # Prometheus exporter scrape endpoint depends_on: - - tempo + tempo: + condition: service_healthy + loki: + condition: service_healthy + healthcheck: + test: ["CMD", "/otelcol-contrib", "--version"] + interval: 10s + timeout: 5s + retries: 12 + start_period: 10s + + loki: + image: grafana/loki:3.3.2 + restart: unless-stopped + command: ["-config.file=/etc/loki/local-config.yaml"] + volumes: + - ./loki/loki-config.yaml:/etc/loki/local-config.yaml:ro + - ./data/loki:/loki + ports: + - "3100:3100" + healthcheck: + test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:3100/ready"] + interval: 10s + timeout: 5s + retries: 18 + start_period: 10s prometheus: image: prom/prometheus:v3.2.0 + restart: unless-stopped volumes: - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./data/prometheus:/prometheus ports: - "9090:9090" depends_on: - - otel-collector + otel-collector: + 
condition: service_healthy + healthcheck: + test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:9090/-/ready"] + interval: 10s + timeout: 5s + retries: 18 + start_period: 10s tempo: image: grafana/tempo:2.7.0 + restart: unless-stopped command: ["-config.file=/etc/tempo/tempo.yml"] volumes: - ./tempo/tempo.yml:/etc/tempo/tempo.yml:ro + - ./data/tempo:/tmp/tempo ports: - "3200:3200" # Tempo query API - "4317" # OTLP gRPC (internal) + healthcheck: + test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:3200/api/search"] + interval: 10s + timeout: 5s + retries: 24 + start_period: 20s grafana: image: grafana/grafana:11.5.0 + restart: unless-stopped environment: - GF_SECURITY_ADMIN_PASSWORD=admin - GF_AUTH_ANONYMOUS_ENABLED=true @@ -40,8 +84,19 @@ services: volumes: - ./grafana/provisioning:/etc/grafana/provisioning:ro - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + - ./data/grafana:/var/lib/grafana ports: - "3001:3000" depends_on: - - prometheus - - tempo + prometheus: + condition: service_healthy + tempo: + condition: service_healthy + loki: + condition: service_healthy + healthcheck: + test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:3000/api/health"] + interval: 10s + timeout: 5s + retries: 24 + start_period: 20s diff --git a/observability/grafana/provisioning/datasources/datasources.yml b/observability/grafana/provisioning/datasources/datasources.yml index 8b8c6244..3c885539 100644 --- a/observability/grafana/provisioning/datasources/datasources.yml +++ b/observability/grafana/provisioning/datasources/datasources.yml @@ -14,6 +14,15 @@ datasources: - name: traceID datasourceUid: tempo + - name: Loki + type: loki + uid: loki + access: proxy + url: http://loki:3100 + editable: false + jsonData: + maxLines: 1000 + - name: Tempo type: tempo uid: tempo @@ -21,6 +30,17 @@ datasources: url: http://tempo:3200 editable: false jsonData: + tracesToLogs: + datasourceUid: loki + tags: ["service.name"] + mappedTags: + - key: service.name + value: service.name + 
mapTagNamesEnabled: false + spanStartTimeShift: "-1h" + spanEndTimeShift: "1h" + filterByTraceID: false + filterBySpanID: false tracesToMetrics: datasourceUid: prometheus spanStartTimeShift: "-1h" diff --git a/observability/loki/loki-config.yaml b/observability/loki/loki-config.yaml new file mode 100644 index 00000000..eb20f384 --- /dev/null +++ b/observability/loki/loki-config.yaml @@ -0,0 +1,35 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2020-10-24 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +limits_config: + allow_structured_metadata: true + +ruler: + alertmanager_url: http://localhost:9093 + +analytics: + reporting_enabled: false diff --git a/observability/otel-collector-config.yml b/observability/otel-collector-config.yml index 68f29b27..daa29aef 100644 --- a/observability/otel-collector-config.yml +++ b/observability/otel-collector-config.yml @@ -10,10 +10,12 @@ exporters: prometheus: endpoint: "0.0.0.0:8889" namespace: smithers - otlp/tempo: + otlp_grpc/tempo: endpoint: "tempo:4317" tls: insecure: true + otlp_http/loki: + endpoint: "http://loki:3100/otlp" processors: batch: @@ -29,4 +31,8 @@ service: traces: receivers: [otlp] processors: [batch] - exporters: [otlp/tempo] + exporters: [otlp_grpc/tempo] + logs: + receivers: [otlp] + processors: [batch] + exporters: [otlp_http/loki] diff --git a/scripts/obs-reset.sh b/scripts/obs-reset.sh new file mode 100755 index 00000000..1dfdf798 --- /dev/null +++ b/scripts/obs-reset.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) +COMPOSE_FILE="$ROOT_DIR/observability/docker-compose.otel.yml" + +if ! 
command -v docker >/dev/null 2>&1; then + echo "docker is required" >&2 + exit 1 +fi + +DATA_DIR="$ROOT_DIR/observability/data" + +echo "[obs-reset] bringing stack down and removing volumes..." +docker compose -f "$COMPOSE_FILE" down -v --remove-orphans || true + +echo "[obs-reset] resetting local data dirs..." +rm -rf "$DATA_DIR" +mkdir -p "$DATA_DIR"/loki "$DATA_DIR"/tempo "$DATA_DIR"/prometheus "$DATA_DIR"/grafana +chmod -R 0777 "$DATA_DIR" + +echo "[obs-reset] starting fresh stack..." +docker compose -f "$COMPOSE_FILE" up -d + +echo "[obs-reset] waiting for health..." +"$ROOT_DIR/scripts/obs-wait-healthy.sh" + +echo "[obs-reset] stack is healthy" diff --git a/scripts/obs-wait-healthy.sh b/scripts/obs-wait-healthy.sh new file mode 100755 index 00000000..57587e0a --- /dev/null +++ b/scripts/obs-wait-healthy.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) +COMPOSE_FILE="$ROOT_DIR/observability/docker-compose.otel.yml" +TIMEOUT_SECONDS="${1:-240}" +START_TS=$(date +%s) + +services=(loki tempo otel-collector prometheus grafana) + +status_of() { + local container=$1 + docker inspect "$container" --format '{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}' 2>/dev/null || echo "missing" +} + +container_name() { + local service=$1 + docker compose -f "$COMPOSE_FILE" ps -q "$service" | head -n 1 +} + +echo "[obs-wait] timeout=${TIMEOUT_SECONDS}s" + +while true; do + all_ready=1 + for service in "${services[@]}"; do + cid=$(container_name "$service") + if [[ -z "$cid" ]]; then + echo "[obs-wait] $service: missing" + all_ready=0 + continue + fi + state=$(status_of "$cid") + echo "[obs-wait] $service: $state" + if [[ "$state" != "healthy" && "$state" != "running" ]]; then + all_ready=0 + fi + if [[ "$service" != "otel-collector" && "$state" != "healthy" ]]; then + all_ready=0 + fi + done + + if [[ "$all_ready" == "1" ]]; then + break + fi + + now=$(date +%s) + if (( now - 
START_TS >= TIMEOUT_SECONDS )); then + echo "[obs-wait] timed out waiting for health" >&2 + docker compose -f "$COMPOSE_FILE" ps >&2 || true + exit 1 + fi + + sleep 5 + echo + done + +echo "[obs-wait] endpoint checks" +curl --max-time 10 -fsS http://localhost:3100/ready >/dev/null +curl --max-time 10 -fsS http://localhost:9090/-/ready >/dev/null +curl --max-time 20 -fsS http://localhost:3001/api/health >/dev/null + +# Tempo can report not-ready briefly even when query/search already works, so allow retries. +tempo_ready=0 +for _ in $(seq 1 12); do + if curl --max-time 10 -fsS http://localhost:3200/ready >/dev/null; then + tempo_ready=1 + break + fi + sleep 5 +done + +if [[ "$tempo_ready" != "1" ]]; then + echo "[obs-wait] tempo never became ready" >&2 + exit 1 +fi + +echo "[obs-wait] healthy" diff --git a/scripts/verify-observability.sh b/scripts/verify-observability.sh new file mode 100755 index 00000000..e4e1f194 --- /dev/null +++ b/scripts/verify-observability.sh @@ -0,0 +1,229 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd) +COMPOSE_FILE="$ROOT_DIR/observability/docker-compose.otel.yml" +STAMP=$(date +%Y%m%d-%H%M%S) +OUT_DIR="$ROOT_DIR/tmp/verification/$STAMP" +WORKFLOW_PATH="$ROOT_DIR/workflows/agent-trace-otel-demo.tsx" +RUN_ID="agent-trace-otel-demo-$STAMP" +FAIL_RUN_ID="agent-trace-otel-demo-fail-$STAMP" +mkdir -p "$OUT_DIR" + +export SMITHERS_OTEL_ENABLED=1 +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 +export OTEL_SERVICE_NAME=smithers-dev +export SMITHERS_LOG_FORMAT=json + +run_success() { + bun run "$ROOT_DIR/src/cli/index.ts" run "$WORKFLOW_PATH" --run-id "$RUN_ID" --annotations '{"custom.demo":true,"custom.ticket":"OBS-123"}' +} + +run_failure() { + bun run "$ROOT_DIR/src/cli/index.ts" run "$WORKFLOW_PATH" --run-id "$FAIL_RUN_ID" --input '{"failureMode":"malformed-json"}' --annotations '{"custom.demo":true,"custom.ticket":"OBS-ERR"}' +} + +retry_until() { + local name=$1 + local attempts=$2 + local validator=$3 + local command=$4 + local delay=2 + local out="$OUT_DIR/$name.json" + + for attempt in $(seq 1 "$attempts"); do + echo "[verify] $name attempt $attempt/$attempts" + eval "$command" > "$out.tmp" + if jq -e "$validator" "$out.tmp" >/dev/null 2>&1; then + mv "$out.tmp" "$out" + return 0 + fi + sleep "$delay" + delay=$((delay * 2)) + done + + echo "[verify] failed: $name" >&2 + cat "$out.tmp" >&2 || true + return 1 +} + +loki_query() { + local query=$1 + local limit=${2:-200} + curl --max-time 30 -sG 'http://localhost:3100/loki/api/v1/query_range' \ + --data-urlencode "query=$query" \ + --data-urlencode "limit=$limit" +} + +tempo_search() { + curl --max-time 30 -s 'http://localhost:3200/api/search' +} + +tempo_trace() { + local trace_id=$1 + curl --max-time 30 -s "http://localhost:3200/api/traces/$trace_id" +} + +append_json() { + local title=$1 + local path=$2 + { + echo + echo "## $title" + echo + echo '```json' + cat "$path" + echo + echo '```' + } >> "$OUT_DIR/report.md" +} + +trace_summary_json() { + local path=$1 + node -e "const 
fs=require('fs'); const rows=fs.readFileSync(process.argv[1],'utf8').trim().split('\n').map(JSON.parse); console.log(JSON.stringify(rows.at(-1).summary));" "$path" +} + +echo "[verify] reset stack" +"$ROOT_DIR/scripts/obs-reset.sh" + +echo "[verify] capture stack state" +docker compose -f "$COMPOSE_FILE" ps > "$OUT_DIR/compose-ps.txt" +curl --max-time 20 -s http://localhost:3001/api/health | jq . > "$OUT_DIR/grafana-health.json" +curl --max-time 20 -s http://localhost:3001/api/datasources | jq 'map({name,type,url,uid})' > "$OUT_DIR/grafana-datasources.json" +curl --max-time 20 -s http://localhost:9090/api/v1/query --data-urlencode 'query=up' | jq . > "$OUT_DIR/prometheus-up.json" + +echo "[verify] run success workflow" +run_success | tee "$OUT_DIR/success-run.txt" + +echo "[verify] run failure workflow" +set +e +run_failure | tee "$OUT_DIR/failure-run.txt" +FAIL_EXIT=$? +set -e +printf '%s\n' "$FAIL_EXIT" > "$OUT_DIR/failure-exit-code.txt" + +sleep 6 + +retry_until loki-all-events 6 '.streams >= 2' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\"' 200 | jq '{streams:(.data.result|length), counts:(.data.result|map(.values|length)), lines:[.data.result[]?.values[]?[1]][0:5]}'" + +retry_until loki-node-attempt 6 '.streams >= 10' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"pi-rich-trace\" | node_attempt=\"1\"' 200 | jq '{streams:(.data.result|length), counts:(.data.result|map(.values|length))}'" + +retry_until loki-claude-structured 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"claude-structured-trace\"' 50 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]][0:10]}'" + +retry_until loki-gemini-structured 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"gemini-structured-trace\"' 50 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]][0:10]}'" + +retry_until 
loki-codex-structured 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"codex-structured-trace\"' 50 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]][0:10]}'" + +retry_until loki-thinking 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | event_kind=\"assistant.thinking.delta\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-tool-start 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | event_kind=\"tool.execution.start\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-tool-update 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | event_kind=\"tool.execution.update\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-tool-end 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | event_kind=\"tool.execution.end\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-sdk-final-only 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"sdk-final-only\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-claude-message-update 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"claude-structured-trace\" | event_kind=\"message.update\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-gemini-text-delta 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"gemini-structured-trace\" | event_kind=\"assistant.text.delta\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + 
+retry_until loki-claude-usage 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"claude-structured-trace\" | event_kind=\"usage\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-gemini-usage 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"gemini-structured-trace\" | event_kind=\"usage\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-codex-text-delta 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"codex-structured-trace\" | event_kind=\"assistant.text.delta\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-codex-usage 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" | node_id=\"codex-structured-trace\" | event_kind=\"usage\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until trace-summary-claude 6 '.captureMode == "cli-json-stream" and .traceCompleteness == "full-observed" and (.missingExpectedEventKinds | length) == 0' \ + "trace_summary_json '$ROOT_DIR/workflows/.smithers/executions/$RUN_ID/logs/agent-trace/claude-structured-trace-0-1.ndjson'" + +retry_until trace-summary-codex 6 '.captureMode == "cli-json-stream" and .traceCompleteness == "full-observed" and (.missingExpectedEventKinds | length) == 0' \ + "trace_summary_json '$ROOT_DIR/workflows/.smithers/executions/$RUN_ID/logs/agent-trace/codex-structured-trace-0-1.ndjson'" + +retry_until loki-capture-errors 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$FAIL_RUN_ID\" | event_kind=\"capture.error\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-redaction-presence 6 '.streams >= 1' \ + "loki_query '{service_name=\"smithers-dev\"} | 
run_id=\"$RUN_ID\" |= \"REDACTED_SECRET\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until loki-secret-absence 6 '.streams == 0' \ + "loki_query '{service_name=\"smithers-dev\"} | run_id=\"$RUN_ID\" |= \"sk_demo_secret_123456789\"' 20 | jq '{streams:(.data.result|length), lines:[.data.result[]?.values[]?[1]]}'" + +retry_until tempo-search 6 '.smithers | length >= 1' \ + "tempo_search | jq '{smithers:[.traces[] | select(.rootServiceName==\"smithers-dev\") | {traceID, rootServiceName, rootTraceName}]}'" + +TRACE_ID="" +for candidate in $(jq -r '.smithers[]?.traceID' "$OUT_DIR/tempo-search.json"); do + if tempo_trace "$candidate" \ + | jq '{resourceAttrs:[.batches[].resource.attributes], spans:[.batches[].scopeSpans[].spans[] | {name, attrs:.attributes}]}' > "$OUT_DIR/tempo-trace-candidate.json.tmp"; then + if jq -e ".spans | any(.attrs | any(.key==\"runId\" and .value.stringValue==\"$RUN_ID\"))" "$OUT_DIR/tempo-trace-candidate.json.tmp" >/dev/null 2>&1; then + TRACE_ID="$candidate" + mv "$OUT_DIR/tempo-trace-candidate.json.tmp" "$OUT_DIR/tempo-trace-with-runid.json" + break + fi + fi +done + +if [[ -z "$TRACE_ID" ]]; then + echo "[verify] failed: unable to find Tempo trace for success run id $RUN_ID" >&2 + exit 1 +fi + +cat > "$OUT_DIR/report.md" <; }; diff --git a/src/SmithersEvent.ts b/src/SmithersEvent.ts index 6b25e4ff..87e73384 100644 --- a/src/SmithersEvent.ts +++ b/src/SmithersEvent.ts @@ -1,4 +1,5 @@ import type { RunStatus } from "./RunStatus"; +import type { AgentTraceSummary, CanonicalAgentTraceEvent } from "./agent-trace"; export type SmithersEvent = | { type: "RunStarted"; runId: string; timestampMs: number } @@ -108,6 +109,7 @@ export type SmithersEvent = nodeId: string; iteration: number; attempt: number; + toolCallId: string; toolName: string; seq: number; timestampMs: number; @@ -118,6 +120,7 @@ export type SmithersEvent = nodeId: string; iteration: number; attempt: number; + toolCallId: string; 
toolName: string; seq: number; status: "success" | "error"; @@ -194,4 +197,22 @@ export type SmithersEvent = cacheWriteTokens?: number; reasoningTokens?: number; timestampMs: number; + } + | { + type: "AgentTraceEvent"; + runId: string; + nodeId: string; + iteration: number; + attempt: number; + trace: CanonicalAgentTraceEvent; + timestampMs: number; + } + | { + type: "AgentTraceSummary"; + runId: string; + nodeId: string; + iteration: number; + attempt: number; + summary: AgentTraceSummary; + timestampMs: number; }; diff --git a/src/agent-trace.ts b/src/agent-trace.ts new file mode 100644 index 00000000..5c509a03 --- /dev/null +++ b/src/agent-trace.ts @@ -0,0 +1,1687 @@ +import { appendFile, mkdir, writeFile } from "node:fs/promises"; +import { join } from "node:path"; +import type { SmithersEvent } from "./SmithersEvent"; +import type { EventBus } from "./events"; +import { logErrorAwait, logInfoAwait, logWarningAwait } from "./effect/logging"; +import { extractTextFromJsonValue } from "./utils/text-extraction"; +import { nowMs } from "./utils/time"; +import { normalizeTokenUsage } from "./utils/usage"; + +export type AgentFamily = + | "pi" + | "codex" + | "claude-code" + | "gemini" + | "kimi" + | "openai" + | "anthropic" + | "amp" + | "forge" + | "unknown"; + +export type AgentCaptureMode = + | "sdk-events" + | "rpc-events" + | "cli-json-stream" + | "cli-json" + | "cli-text" + | "artifact-import"; + +export type TraceCompleteness = + | "full-observed" + | "partial-observed" + | "final-only" + | "capture-failed"; + +export type CanonicalAgentTraceEventKind = + | "session.start" + | "session.end" + | "turn.start" + | "turn.end" + | "message.start" + | "message.update" + | "message.end" + | "assistant.text.delta" + | "assistant.thinking.delta" + | "assistant.message.final" + | "tool.execution.start" + | "tool.execution.update" + | "tool.execution.end" + | "tool.result" + | "retry.start" + | "retry.end" + | "compaction.start" + | "compaction.end" + | "stderr" + | 
"stdout" + | "usage" + | "capture.warning" + | "capture.error" + | "artifact.created"; + +export type CanonicalAgentTraceEvent = { + traceVersion: "1"; + runId: string; + workflowPath?: string; + workflowHash?: string; + nodeId: string; + iteration: number; + attempt: number; + timestampMs: number; + event: { + sequence: number; + kind: CanonicalAgentTraceEventKind; + phase: + | "agent" + | "turn" + | "message" + | "tool" + | "session" + | "capture" + | "artifact"; + }; + source: { + agentFamily: AgentFamily; + captureMode: AgentCaptureMode; + rawType?: string; + rawEventId?: string; + observed: boolean; + }; + traceCompleteness: TraceCompleteness; + payload: Record | null; + raw: unknown; + redaction: { + applied: boolean; + ruleIds: string[]; + }; + annotations: Record; +}; + +export type AgentTraceSummary = { + traceVersion: "1"; + runId: string; + workflowPath?: string; + workflowHash?: string; + nodeId: string; + iteration: number; + attempt: number; + traceStartedAtMs: number; + traceFinishedAtMs: number; + agentFamily: AgentFamily; + agentId?: string; + model?: string; + captureMode: AgentCaptureMode; + traceCompleteness: TraceCompleteness; + unsupportedEventKinds: CanonicalAgentTraceEventKind[]; + missingExpectedEventKinds: CanonicalAgentTraceEventKind[]; + rawArtifactRefs: string[]; +}; + +export type AgentTraceSmithersEvent = { + type: "AgentTraceEvent"; + runId: string; + nodeId: string; + iteration: number; + attempt: number; + trace: CanonicalAgentTraceEvent; + timestampMs: number; +}; + +export type AgentTraceSummarySmithersEvent = { + type: "AgentTraceSummary"; + runId: string; + nodeId: string; + iteration: number; + attempt: number; + summary: AgentTraceSummary; + timestampMs: number; +}; + +export type AgentTraceCapabilityProfile = { + sessionMetadata: boolean; + assistantTextDeltas: boolean; + visibleThinkingDeltas: boolean; + finalAssistantMessage: boolean; + toolExecutionStart: boolean; + toolExecutionUpdate: boolean; + toolExecutionEnd: 
boolean; + retryEvents: boolean; + compactionEvents: boolean; + rawStderrDiagnostics: boolean; + persistedSessionArtifact: boolean; +}; + +export const agentTraceCapabilities: Record< + AgentFamily, + AgentTraceCapabilityProfile +> = { + pi: { + sessionMetadata: true, + assistantTextDeltas: true, + visibleThinkingDeltas: true, + finalAssistantMessage: true, + toolExecutionStart: true, + toolExecutionUpdate: true, + toolExecutionEnd: true, + retryEvents: true, + compactionEvents: true, + rawStderrDiagnostics: true, + persistedSessionArtifact: true, + }, + codex: { + sessionMetadata: false, + assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: false, + toolExecutionUpdate: false, + toolExecutionEnd: false, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: true, + persistedSessionArtifact: false, + }, + "claude-code": { + sessionMetadata: false, + assistantTextDeltas: true, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: false, + toolExecutionUpdate: false, + toolExecutionEnd: false, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: true, + persistedSessionArtifact: false, + }, + gemini: { + sessionMetadata: false, + assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: false, + toolExecutionUpdate: false, + toolExecutionEnd: false, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: true, + persistedSessionArtifact: false, + }, + kimi: { + sessionMetadata: false, + assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: false, + toolExecutionUpdate: false, + toolExecutionEnd: false, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: true, + persistedSessionArtifact: false, + }, + openai: { + sessionMetadata: false, + assistantTextDeltas: false, + visibleThinkingDeltas: 
false, + finalAssistantMessage: true, + toolExecutionStart: true, + toolExecutionUpdate: false, + toolExecutionEnd: true, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: false, + persistedSessionArtifact: false, + }, + anthropic: { + sessionMetadata: false, + assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: true, + toolExecutionUpdate: false, + toolExecutionEnd: true, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: false, + persistedSessionArtifact: false, + }, + amp: { + sessionMetadata: false, + assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: true, + toolExecutionUpdate: false, + toolExecutionEnd: true, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: true, + persistedSessionArtifact: false, + }, + forge: { + sessionMetadata: false, + assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: true, + toolExecutionUpdate: false, + toolExecutionEnd: true, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: true, + persistedSessionArtifact: false, + }, + unknown: { + sessionMetadata: false, + assistantTextDeltas: false, + visibleThinkingDeltas: false, + finalAssistantMessage: true, + toolExecutionStart: true, + toolExecutionUpdate: false, + toolExecutionEnd: true, + retryEvents: false, + compactionEvents: false, + rawStderrDiagnostics: true, + persistedSessionArtifact: false, + }, +}; + +const capabilityKindMap: Array< + [keyof AgentTraceCapabilityProfile, CanonicalAgentTraceEventKind[]] +> = [ + ["sessionMetadata", ["session.start", "session.end"]], + ["assistantTextDeltas", ["assistant.text.delta"]], + ["visibleThinkingDeltas", ["assistant.thinking.delta"]], + ["finalAssistantMessage", ["assistant.message.final"]], + ["toolExecutionStart", ["tool.execution.start"]], + ["toolExecutionUpdate", 
["tool.execution.update"]], + ["toolExecutionEnd", ["tool.execution.end", "tool.result"]], + ["retryEvents", ["retry.start", "retry.end"]], + ["compactionEvents", ["compaction.start", "compaction.end"]], + ["rawStderrDiagnostics", ["stderr"]], + ["persistedSessionArtifact", ["artifact.created"]], +]; + +export function resolveAgentTraceCapabilities( + agentFamily: AgentFamily, + captureMode: AgentCaptureMode, +): AgentTraceCapabilityProfile { + const base = { + ...agentTraceCapabilities[agentFamily], + // Smithers persists a canonical NDJSON trace artifact for every successful + // flush regardless of the upstream agent family. + persistedSessionArtifact: true, + }; + + if (captureMode === "sdk-events" || captureMode === "cli-text") { + return base; + } + + if (agentFamily === "codex") { + return { + ...base, + assistantTextDeltas: captureMode === "cli-json-stream", + toolExecutionStart: captureMode === "cli-json-stream", + toolExecutionUpdate: captureMode === "cli-json-stream", + toolExecutionEnd: captureMode === "cli-json-stream", + }; + } + + if (agentFamily === "claude-code") { + return { + ...base, + toolExecutionStart: captureMode === "cli-json-stream", + toolExecutionUpdate: captureMode === "cli-json-stream", + toolExecutionEnd: captureMode === "cli-json-stream", + }; + } + + if (agentFamily === "gemini") { + return { + ...base, + assistantTextDeltas: captureMode === "cli-json-stream", + toolExecutionStart: captureMode === "cli-json-stream", + toolExecutionUpdate: captureMode === "cli-json-stream", + toolExecutionEnd: captureMode === "cli-json-stream", + }; + } + + if (agentFamily === "kimi") { + return { + ...base, + assistantTextDeltas: captureMode === "cli-json-stream", + toolExecutionStart: captureMode === "cli-json-stream", + toolExecutionUpdate: captureMode === "cli-json-stream", + toolExecutionEnd: captureMode === "cli-json-stream", + }; + } + + return base; +} + +function unsupportedKindsForCapabilities( + profile: AgentTraceCapabilityProfile, +): 
CanonicalAgentTraceEventKind[] {
  const kinds: CanonicalAgentTraceEventKind[] = [];
  for (const [field, mappedKinds] of capabilityKindMap) {
    if (!profile[field]) kinds.push(...mappedKinds);
  }
  return kinds;
}

/**
 * Best-effort agent family detection from the agent's constructor name and
 * `id`, both lowercased and concatenated into a single haystack.
 *
 * The match is a plain substring test, so more specific names must be checked
 * before shorter ones: "anthropic" (and e.g. "api") contains the substring
 * "pi", so testing "pi" first misclassified Anthropic agents as the Pi
 * family. "pi" is therefore matched last.
 *
 * @param agent opaque agent instance; only `constructor.name` and `id` are read
 * @returns the detected family, or "unknown" when nothing matches
 */
export function detectAgentFamily(agent: any): AgentFamily {
  const constructorName = String(agent?.constructor?.name ?? "").toLowerCase();
  const idName = String(agent?.id ?? "").toLowerCase();
  const name =
    constructorName && constructorName !== "object"
      ? `${constructorName} ${idName}`
      : idName;
  if (name.includes("codex")) return "codex";
  if (name.includes("claude")) return "claude-code";
  if (name.includes("gemini")) return "gemini";
  if (name.includes("kimi")) return "kimi";
  if (name.includes("openai")) return "openai";
  if (name.includes("anthropic")) return "anthropic";
  if (name.includes("amp")) return "amp";
  if (name.includes("forge")) return "forge";
  // Last: "pi" is a substring of several other family/id names.
  if (name.includes("pi")) return "pi";
  return "unknown";
}

/**
 * Infers how this attempt's output will be captured.
 *
 * Pi agents select via `opts.mode`/`mode` ("rpc" → rpc-events, "json" →
 * cli-json-stream, otherwise cli-text). SDK families (openai/anthropic) are
 * always "sdk-events"; other CLIs are classified by their output format flag.
 */
export function detectCaptureMode(agent: any): AgentCaptureMode {
  const family = detectAgentFamily(agent);
  const mode = agent?.opts?.mode ?? agent?.mode;
  if (family === "pi") {
    if (mode === "rpc") return "rpc-events";
    if (mode === "json") return "cli-json-stream";
    return "cli-text";
  }
  const outputFormat = agent?.opts?.outputFormat ?? agent?.outputFormat;
  if (family === "openai" || family === "anthropic") return "sdk-events";
  if (outputFormat === "stream-json") return "cli-json-stream";
  if (outputFormat === "json" || agent?.opts?.json) return "cli-json";
  return "cli-text";
}

/**
 * Keeps only primitive (string/number/boolean) annotation values so they can
 * be attached safely to OTEL attributes and Loki structured metadata.
 * Type arguments restored here; bare `Record` does not compile under tsc.
 */
function normalizeAnnotations(
  annotations?: Record<string, unknown>,
): Record<string, string | number | boolean> {
  const normalized: Record<string, string | number | boolean> = {};
  for (const [key, value] of Object.entries(annotations ?? {})) {
    if (["string", "number", "boolean"].includes(typeof value)) {
      normalized[key] = value as string | number | boolean;
    }
  }
  return normalized;
}

function redactValue(value: unknown): {
  value: unknown;
  applied: boolean;
  ruleIds: string[];
} {
  const rules: Array<{ id: string; pattern: RegExp; replace: string }> = [
    {
      id: "api-key",
      pattern: /\b(?:sk|pk)_[A-Za-z0-9_-]{8,}\b/g,
      replace: "[REDACTED_API_KEY]",
    },
    {
      id: "bearer-token",
      pattern: /Bearer\s+[A-Za-z0-9._-]{8,}/gi,
      replace: "Bearer [REDACTED_TOKEN]",
    },
    {
      id: "auth-header",
      pattern: /"authorization"\s*:\s*"[^"]+"/gi,
      replace: '"authorization":"[REDACTED]"',
    },
    {
      id: "cookie-header",
      pattern: /"cookie"\s*:\s*"[^"]+"/gi,
      replace: '"cookie":"[REDACTED]"',
    },
    {
      id: "secret-ish",
      pattern: /\b(?:api[_-]?key|token|secret|password)=([^\s"']+)/gi,
      // NOTE(review): this replace string is never used — the replace callback
      // special-cases "secret-ish"; the cast looks vestigial. Confirm intent.
      replace: "$&" as unknown as string,
    },
  ];
  const input =
    typeof value === "string" ? value : JSON.stringify(value ??
null);
  let next = input;
  // Track which rule ids actually fired; `<string>` restored — `ruleIds` is
  // declared `string[]`, so an untyped Set would not type-check.
  const applied = new Set<string>();
  for (const rule of rules) {
    const replaced = next.replace(rule.pattern, (match) => {
      applied.add(rule.id);
      if (rule.id === "secret-ish") {
        // Keep the "key=" prefix, redact only the value.
        const idx = match.indexOf("=");
        return `${match.slice(0, idx + 1)}[REDACTED_SECRET]`;
      }
      return rule.replace;
    });
    next = replaced;
  }
  if (applied.size === 0) return { value, applied: false, ruleIds: [] };
  if (typeof value === "string")
    return { value: next, applied: true, ruleIds: [...applied] };
  try {
    // Non-string input was serialized for scanning; try to round-trip it back
    // into a structured value, falling back to the redacted string.
    return { value: JSON.parse(next), applied: true, ruleIds: [...applied] };
  } catch {
    return { value: next, applied: true, ruleIds: [...applied] };
  }
}

// Buckets a canonical event kind into the coarse phase recorded on the event.
function kindPhase(
  kind: CanonicalAgentTraceEventKind,
): CanonicalAgentTraceEvent["event"]["phase"] {
  if (kind.startsWith("session.")) return "session";
  if (kind.startsWith("turn.")) return "turn";
  if (kind.startsWith("message.") || kind.startsWith("assistant."))
    return "message";
  if (kind.startsWith("tool.")) return "tool";
  if (kind.startsWith("artifact.")) return "artifact";
  return "capture";
}

// Which payload extractor a mapped structured event should use.
type PayloadKind = "message" | "tool" | "pi" | "none";

// A table-driven mapping from an upstream raw event type to a canonical kind,
// optionally declaring a kind we then expect to see later in the stream.
type MappedStructuredEvent = {
  kind: CanonicalAgentTraceEventKind;
  payloadKind: PayloadKind;
  expect?: CanonicalAgentTraceEventKind;
};

type NormalizedTraceEvent = {
  kind: CanonicalAgentTraceEventKind;
  payload: Record<string, unknown> | null;
  raw: unknown;
  rawType?: string;
};

type NormalizedTraceBatch = {
  events: NormalizedTraceEvent[];
  expectedKinds?: CanonicalAgentTraceEventKind[];
};

// Pi CLI/RPC event types that map 1:1 onto canonical kinds.
const piSimpleEventMap: Record<string, MappedStructuredEvent> = {
  session: { kind: "session.start", payloadKind: "pi" },
  agent_start: { kind: "session.start", payloadKind: "pi" },
  agent_end: { kind: "session.end", payloadKind: "pi" },
  turn_start: {
    kind: "turn.start",
    payloadKind: "pi",
    expect: "turn.end",
  },
  message_start: { kind: "message.start", payloadKind: "pi" },
  tool_execution_start: {
    kind: "tool.execution.start",
    payloadKind: "tool",
    expect: "tool.execution.end",
  },
  tool_execution_update: {
    kind: "tool.execution.update",
    payloadKind: "tool",
    expect: "tool.execution.end",
  },
  tool_execution_end: { kind: "tool.execution.end", payloadKind: "tool" },
  auto_compaction_start: { kind: "compaction.start", payloadKind: "pi" },
  auto_compaction_end: { kind: "compaction.end", payloadKind: "pi" },
  auto_retry_start: { kind: "retry.start", payloadKind: "pi" },
  auto_retry_end: { kind: "retry.end", payloadKind: "pi" },
};

// Cross-vendor structured event types shared by several CLI/SDK streams.
const genericStructuredEventMap: Record<string, MappedStructuredEvent> = {
  message_start: { kind: "message.start", payloadKind: "message" },
  assistant_message_start: { kind: "message.start", payloadKind: "message" },
  "response.started": { kind: "message.start", payloadKind: "message" },
  tool_call_start: {
    kind: "tool.execution.start",
    payloadKind: "tool",
    expect: "tool.execution.end",
  },
  tool_execution_start: {
    kind: "tool.execution.start",
    payloadKind: "tool",
    expect: "tool.execution.end",
  },
  "tool_call.started": {
    kind: "tool.execution.start",
    payloadKind: "tool",
    expect: "tool.execution.end",
  },
  tool_call_delta: {
    kind: "tool.execution.update",
    payloadKind: "tool",
    expect: "tool.execution.end",
  },
  tool_call_update: {
    kind: "tool.execution.update",
    payloadKind: "tool",
    expect: "tool.execution.end",
  },
  tool_execution_update: {
    kind: "tool.execution.update",
    payloadKind: "tool",
    expect: "tool.execution.end",
  },
  "tool_call.delta": {
    kind: "tool.execution.update",
    payloadKind: "tool",
    expect: "tool.execution.end",
  },
  tool_call_end: { kind: "tool.execution.end", payloadKind: "tool" },
  tool_execution_end: { kind: "tool.execution.end", payloadKind: "tool" },
  "tool_call.completed": { kind: "tool.execution.end", payloadKind: "tool" },
  tool_result: { kind: "tool.result", payloadKind: "tool" },
  "tool_result.completed": { kind: "tool.result", payloadKind: "tool" },
};

function
extractGenericDeltaText(parsed: any): string | undefined {
  // First non-empty string among the common vendor delta locations wins.
  const candidates = [
    parsed?.delta?.text,
    parsed?.delta,
    parsed?.text,
    parsed?.content_block?.text,
    parsed?.contentBlock?.text,
    parsed?.message?.text,
    parsed?.message?.content,
    parsed?.output_text,
  ];
  for (const candidate of candidates) {
    if (typeof candidate === "string" && candidate) return candidate;
  }
  return undefined;
}

// Extracts message text from whichever wrapper key the vendor used.
function extractGenericMessageText(parsed: any): string | undefined {
  return extractTextFromJsonValue(
    parsed?.message ?? parsed?.response ?? parsed?.item ?? parsed,
  );
}

// Builds the canonical message payload (role/text/id) from a vendor event.
// `Record<string, unknown>` restored — bare `Record` does not compile.
function extractGenericMessagePayload(parsed: any): Record<string, unknown> {
  const payload: Record<string, unknown> = {};
  const role =
    parsed?.message?.role ?? parsed?.role ?? parsed?.response?.role;
  if (typeof role === "string") payload.role = role;
  const text = extractGenericMessageText(parsed);
  if (text) payload.text = text;
  if (parsed?.id) payload.id = parsed.id;
  return payload;
}

// Builds the canonical tool payload from whichever tool wrapper is present.
function extractGenericToolPayload(parsed: any): Record<string, unknown> {
  const tool =
    parsed?.tool ??
    parsed?.toolCall ??
    parsed?.tool_call ??
    parsed?.toolExecution ??
    parsed;
  return {
    toolCallId: tool?.id ?? tool?.toolCallId ?? parsed?.id,
    toolName: tool?.name ?? tool?.toolName ?? parsed?.toolName,
    argsPreview: tool?.args ?? tool?.arguments ?? parsed?.args,
    resultPreview: tool?.result ?? parsed?.result,
    isError: Boolean(tool?.isError ?? parsed?.isError ?? parsed?.error),
  };
}

// Pi events nest role/text under `message`; id lives at the top level.
function extractPiPayload(parsed: any): Record<string, unknown> {
  const payload: Record<string, unknown> = {};
  if (parsed?.message?.role) payload.role = parsed.message.role;
  const text = extractGenericMessageText(parsed?.message);
  if (text) payload.text = text;
  if (parsed?.id) payload.id = parsed.id;
  return payload;
}

// Dispatches to the payload extractor named by the mapping table entry.
function extractMappedPayload(
  parsed: any,
  payloadKind: PayloadKind,
): Record<string, unknown> | null {
  if (payloadKind === "message") return extractGenericMessagePayload(parsed);
  if (payloadKind === "tool") return extractGenericToolPayload(parsed);
  if (payloadKind === "pi") return extractPiPayload(parsed);
  return {};
}

function buildNormalizedEvent(
  kind: CanonicalAgentTraceEventKind,
  payload: Record<string, unknown> | null,
  raw: unknown,
  rawType?: string,
): NormalizedTraceEvent {
  return { kind, payload, raw, rawType };
}

// Turns a table-mapped vendor event into a one-event batch, propagating the
// table's expected follow-up kind (if any).
function normalizeMappedEvent(
  parsed: any,
  rawType: string,
  mapped: MappedStructuredEvent,
): NormalizedTraceBatch {
  return {
    events: [
      buildNormalizedEvent(
        mapped.kind,
        extractMappedPayload(parsed, mapped.payloadKind),
        parsed,
        rawType,
      ),
    ],
    expectedKinds: mapped.expect ? [mapped.expect] : undefined,
  };
}

// Claude Code stream-json events ("assistant" updates and the final "result").
function normalizeClaudeStructuredEvent(
  parsed: any,
  rawType: string,
): NormalizedTraceBatch | null {
  if (rawType === "assistant") {
    const text = extractGenericMessageText(parsed?.message ?? parsed);
    const events = text
      ? [
          buildNormalizedEvent(
            "message.update",
            extractGenericMessagePayload(parsed?.message ??
parsed),
            parsed,
            rawType,
          ),
        ]
      : // No extractable text: record the raw event type as opaque stdout.
        [
          buildNormalizedEvent("stdout", { eventType: rawType }, parsed, rawType),
        ];
    const usage = normalizeTokenUsage(parsed?.message?.usage);
    if (usage) events.push(buildNormalizedEvent("usage", usage, parsed, rawType));
    return { events };
  }

  if (rawType === "result") {
    // Terminal event: usage first, then the final assistant text if present.
    const events: NormalizedTraceEvent[] = [];
    const usage = normalizeTokenUsage(parsed?.usage);
    if (usage) events.push(buildNormalizedEvent("usage", usage, parsed, rawType));
    const text = extractGenericMessageText(parsed);
    if (text) {
      events.push(
        buildNormalizedEvent("assistant.message.final", { text }, parsed, rawType),
      );
    }
    return events.length > 0 ? { events } : null;
  }

  return null;
}

// Gemini stream events: assistant "message" (delta vs final depends on the
// `delta` flag) and a terminal "result" carrying usage stats.
function normalizeGeminiStructuredEvent(
  parsed: any,
  rawType: string,
): NormalizedTraceBatch | null {
  if (rawType === "message") {
    const text = extractGenericMessageText(parsed);
    if (parsed?.role === "assistant" && typeof text === "string" && text) {
      return {
        events: [
          buildNormalizedEvent(
            parsed?.delta ? "assistant.text.delta" : "assistant.message.final",
            { text },
            parsed,
            rawType,
          ),
        ],
      };
    }
  }

  if (rawType === "result" && parsed?.stats) {
    const usage = normalizeTokenUsage(parsed.stats);
    return usage
      ? { events: [buildNormalizedEvent("usage", usage, parsed, rawType)] }
      : null;
  }

  return null;
}

// Codex thread/turn/item lifecycle events.
function normalizeCodexStructuredEvent(
  parsed: any,
  rawType: string,
): NormalizedTraceBatch | null {
  if (rawType === "thread.started") {
    // Thread start has no canonical kind; keep it as opaque stdout.
    return {
      events: [buildNormalizedEvent("stdout", { eventType: rawType }, parsed, rawType)],
    };
  }

  if (rawType === "turn.started") {
    return {
      events: [buildNormalizedEvent("turn.start", {}, parsed, rawType)],
      expectedKinds: ["turn.end"],
    };
  }

  if (rawType === "item.completed" && parsed?.item?.type === "agent_message") {
    const text = extractGenericMessageText(parsed.item);
    if (typeof text === "string" && text) {
      return {
        events: [
          buildNormalizedEvent(
            "assistant.message.final",
            { text },
            parsed,
            rawType,
          ),
        ],
      };
    }
  }

  if (rawType === "turn.completed") {
    // Emission order: usage, turn.end, then any trailing final message.
    const events: NormalizedTraceEvent[] = [];
    const usage = normalizeTokenUsage(parsed?.usage);
    if (usage) events.push(buildNormalizedEvent("usage", usage, parsed, rawType));
    events.push(buildNormalizedEvent("turn.end", {}, parsed, rawType));
    const text = extractGenericMessageText(parsed);
    if (text) {
      events.push(
        buildNormalizedEvent("assistant.message.final", { text }, parsed, rawType),
      );
    }
    return { events };
  }

  return null;
}

// Pi structured events: simple 1:1 mappings first, then the composite
// turn_end / message_end / message_update shapes; anything else is stdout.
function normalizePiStructuredEvent(
  parsed: any,
  rawType: string,
): NormalizedTraceBatch | null {
  const simple = piSimpleEventMap[rawType];
  if (simple) return normalizeMappedEvent(parsed, rawType, simple);

  if (rawType === "turn_end") {
    const events: NormalizedTraceEvent[] = [
      buildNormalizedEvent("turn.end", extractPiPayload(parsed), parsed, rawType),
    ];
    const text = extractGenericMessageText(parsed?.message);
    if (text) {
      events.push(
        buildNormalizedEvent(
          "assistant.message.final",
          { text },
          parsed?.message,
          rawType,
        ),
      );
    }
    const usage = normalizeTokenUsage(parsed?.message?.usage);
    // Note: the usage event deliberately carries rawType "usage", unlike the
    // surrounding events which carry the upstream rawType.
    if (usage)
      events.push(buildNormalizedEvent("usage", usage, parsed.message.usage, "usage"));
    return { events };
  }

  if (rawType === "message_end") {
    const events: NormalizedTraceEvent[] = [
      buildNormalizedEvent("message.end", extractPiPayload(parsed), parsed, rawType),
    ];
    const text = extractGenericMessageText(parsed?.message);
    // Only assistant messages produce a final-message event.
    if (parsed?.message?.role === "assistant" && text) {
      events.push(
        buildNormalizedEvent(
          "assistant.message.final",
          { text },
          parsed?.message,
          rawType,
        ),
      );
    }
    return { events };
  }

  if (rawType === "message_update") {
    const evt = parsed?.assistantMessageEvent;
    if (evt?.type === "text_delta" && typeof evt.delta === "string") {
      return {
        events: [
          buildNormalizedEvent(
            "assistant.text.delta",
            { text: evt.delta },
            parsed,
            evt.type,
          ),
        ],
      };
    }
    if (
      (evt?.type === "thinking_delta" || evt?.type === "reasoning_delta") &&
      typeof evt.delta === "string"
    ) {
      return {
        events: [
          buildNormalizedEvent(
            "assistant.thinking.delta",
            { text: evt.delta },
            parsed,
            evt.type,
          ),
        ],
      };
    }
    // Non-delta update: keep it as a generic message.update.
    return {
      events: [
        buildNormalizedEvent("message.update", extractPiPayload(parsed), parsed, rawType),
      ],
    };
  }

  // Unknown Pi event types are preserved as opaque stdout.
  return {
    events: [buildNormalizedEvent("stdout", { eventType: rawType }, parsed, rawType)],
  };
}

// Vendor-agnostic fallback normalizer for the shared event vocabulary.
function normalizeSharedStructuredEvent(
  parsed: any,
  rawType: string,
): NormalizedTraceBatch | null {
  const mapped = genericStructuredEventMap[rawType];
  if (mapped) return normalizeMappedEvent(parsed, rawType, mapped);

  if (
    [
      "message_delta",
      "assistant_message.delta",
      "assistant_message_delta",
      "response.output_text.delta",
      "content_block_delta",
    ].includes(rawType)
  ) {
    const text = extractGenericDeltaText(parsed);
    if (typeof text === "string" && text) {
      return {
        events: [
          buildNormalizedEvent("assistant.text.delta", { text }, parsed, rawType),
        ],
      };
    }
  }

  if (
    [
      "thinking_delta",
      "reasoning_delta",
      "response.reasoning.delta",
    ].includes(rawType)
  ) {
    const text = extractGenericDeltaText(parsed);
    if (typeof text === "string" && text) {
      return {
        events: [
          buildNormalizedEvent(
            "assistant.thinking.delta",
            { text },
            parsed,
            rawType,
          ),
        ],
      };
    }
  }

  if (
    [
      "message_end",
      "assistant_message_end",
      "response.completed",
      "message_stop",
    ].includes(rawType)
  ) {
    // Emission order: message.end, optional final text, optional usage.
    const events: NormalizedTraceEvent[] = [
      buildNormalizedEvent(
        "message.end",
        extractGenericMessagePayload(parsed),
        parsed,
        rawType,
      ),
    ];
    const text = extractGenericMessageText(parsed);
    if (text) {
      events.push(
        buildNormalizedEvent("assistant.message.final", { text }, parsed, rawType),
      );
    }
    const usage = normalizeTokenUsage(parsed?.usage);
    if (usage) events.push(buildNormalizedEvent("usage", usage, parsed, rawType));
    return { events };
  }

  return null;
}

// Entry point: family-specific normalizer first, then the shared vocabulary,
// finally an opaque stdout event so nothing in the stream is dropped.
function normalizeStructuredEvent(
  agentFamily: AgentFamily,
  parsed: any,
  rawType: string,
): NormalizedTraceBatch {
  if (agentFamily === "pi") {
    return normalizePiStructuredEvent(parsed, rawType) ??
{
      events: [buildNormalizedEvent("stdout", { eventType: rawType }, parsed, rawType)],
    };
  }

  if (agentFamily === "claude-code") {
    const normalized = normalizeClaudeStructuredEvent(parsed, rawType);
    if (normalized) return normalized;
  }

  if (agentFamily === "gemini") {
    const normalized = normalizeGeminiStructuredEvent(parsed, rawType);
    if (normalized) return normalized;
  }

  if (agentFamily === "codex") {
    const normalized = normalizeCodexStructuredEvent(parsed, rawType);
    if (normalized) return normalized;
  }

  const shared = normalizeSharedStructuredEvent(parsed, rawType);
  if (shared) return shared;

  return {
    events: [buildNormalizedEvent("stdout", { eventType: rawType }, parsed, rawType)],
  };
}

/**
 * Converts a canonical trace event into an OTEL log record shape: a JSON body
 * (payload/raw/redaction/annotations), flat attributes, and a severity.
 *
 * Severity mapping: capture.error → ERROR; capture.warning and stderr → WARN;
 * everything else → INFO. Annotations are namespaced under "custom." unless
 * already prefixed. `Record<string, unknown>` restored — bare `Record` does
 * not compile.
 *
 * @param event canonical trace event to convert
 * @param context optional agent id/model to stamp onto the attributes
 */
export function canonicalTraceEventToOtelLogRecord(
  event: CanonicalAgentTraceEvent,
  context?: { agentId?: string; model?: string },
): {
  body: string;
  attributes: Record<string, unknown>;
  severity: "INFO" | "WARN" | "ERROR";
} {
  const attributes: Record<string, unknown> = {
    "smithers.trace.version": event.traceVersion,
    "run.id": event.runId,
    "workflow.path": event.workflowPath,
    "workflow.hash": event.workflowHash,
    "node.id": event.nodeId,
    "node.iteration": event.iteration,
    "node.attempt": event.attempt,
    "agent.family": event.source.agentFamily,
    "agent.id": context?.agentId,
    "agent.model": context?.model,
    "agent.capture_mode": event.source.captureMode,
    "trace.completeness": event.traceCompleteness,
    "event.kind": event.event.kind,
    "event.phase": event.event.phase,
    "event.sequence": event.event.sequence,
    "source.raw_type": event.source.rawType,
    "source.raw_event_id": event.source.rawEventId,
    "source.observed": event.source.observed,
  };
  for (const [key, value] of Object.entries(event.annotations)) {
    attributes[key.startsWith("custom.") ? key : `custom.${key}`] = value;
  }

  const severity =
    event.event.kind === "capture.error"
      ? "ERROR"
      : event.event.kind === "capture.warning" || event.event.kind === "stderr"
        ? "WARN"
        : "INFO";

  return {
    body: JSON.stringify({
      payload: event.payload,
      raw: event.raw,
      redaction: event.redaction,
      annotations: event.annotations,
    }),
    attributes,
    severity,
  };
}

// Constructor options for AgentTraceCollector. Annotations are raw and are
// filtered by normalizeAnnotations; `Record<string, unknown>` restored.
type TraceCollectorOptions = {
  eventBus: EventBus;
  runId: string;
  workflowPath?: string | null;
  workflowHash?: string | null;
  nodeId: string;
  iteration: number;
  attempt: number;
  agent: any;
  agentId?: string;
  model?: string;
  logDir?: string;
  annotations?: Record<string, unknown>;
};

export class AgentTraceCollector {
  private readonly eventBus: EventBus;
  private readonly runId: string;
  private readonly workflowPath?: string;
  private readonly workflowHash?: string;
  private readonly nodeId: string;
  private readonly iteration: number;
  private readonly attempt: number;
  private readonly agentFamily: AgentFamily;
  private readonly captureMode: AgentCaptureMode;
  private readonly agentId?: string;
  private readonly model?: string;
  // Populated via normalizeAnnotations, which filters to primitives.
  private readonly annotations: Record<string, string | number | boolean>;
  private readonly logDir?: string;
  private readonly capabilities: AgentTraceCapabilityProfile;
  private readonly startedAtMs = nowMs();
  private readonly events: CanonicalAgentTraceEvent[] = [];
  private readonly rawArtifactRefs: string[] = [];
  // Set element types restored; these hold canonical event kinds.
  private readonly seenKinds = new Set<CanonicalAgentTraceEventKind>();
  private readonly directKinds = new Set<CanonicalAgentTraceEventKind>();
  private readonly expectedKinds = new Set<CanonicalAgentTraceEventKind>();
  private readonly failures: string[] = [];
  private readonly warnings: string[] = [];
  private sequence = 0;
  private rawEventSequence = 0;
  private stdoutBuffer = "";
  private stderrBuffer = "";
  private assistantTextBuffer = "";
  private finalText: string | null = null;
  private currentRawEventId?: string;
  private listener?: (event: SmithersEvent) => void;

  constructor(opts: TraceCollectorOptions) {
    this.eventBus = opts.eventBus;
    this.runId = opts.runId;
    this.workflowPath = opts.workflowPath
?? undefined;
    this.workflowHash = opts.workflowHash ?? undefined;
    this.nodeId = opts.nodeId;
    this.iteration = opts.iteration;
    this.attempt = opts.attempt;
    this.agentFamily = detectAgentFamily(opts.agent);
    this.captureMode = detectCaptureMode(opts.agent);
    this.capabilities = resolveAgentTraceCapabilities(
      this.agentFamily,
      this.captureMode,
    );
    this.agentId = opts.agentId;
    this.model = opts.model;
    this.annotations = normalizeAnnotations(opts.annotations);
    this.logDir = opts.logDir;

    // Seed the kinds we expect this attempt to produce; anything expected but
    // never directly observed is reported as missing in the summary.
    const profile = this.capabilities;
    if (profile.sessionMetadata && this.agentFamily === "pi") {
      this.expectedKinds.add("session.start");
      this.expectedKinds.add("session.end");
      this.expectedKinds.add("turn.start");
      this.expectedKinds.add("turn.end");
    }
    if (profile.finalAssistantMessage)
      this.expectedKinds.add("assistant.message.final");
  }

  // Subscribes to the event bus; pair with endListener()/flush().
  begin() {
    this.listener = (event) => this.observeSmithersEvent(event);
    this.eventBus.on("event", this.listener);
  }

  // Idempotent unsubscribe; also called by flush().
  endListener() {
    if (this.listener) this.eventBus.off("event", this.listener);
    this.listener = undefined;
  }

  onStdout(text: string) {
    this.processChunk("stdout", text);
  }

  onStderr(text: string) {
    this.processChunk("stderr", text);
  }

  // Records the attempt's final result: captures the terminal text (unless a
  // streamed final/delta already established it) and any token usage.
  observeResult(result: any) {
    const text = String(result?.text ?? "").trim();
    const rawEventId = this.nextRawEventId("result");
    if (
      text &&
      (!this.finalText ||
        (!this.seenKinds.has("assistant.text.delta") &&
          !this.seenKinds.has("assistant.message.final")))
    ) {
      this.finalText = text;
    }
    // SDK captures have no stream, so the result is the final message.
    if (this.captureMode === "sdk-events" && text) {
      this.pushDerived(
        "assistant.message.final",
        { text },
        text,
        undefined,
        true,
        rawEventId,
      );
    }
    const usage = normalizeTokenUsage(result?.usage ?? result?.totalUsage);
    if (usage) {
      this.pushDerived(
        "usage",
        usage,
        usage,
        "usage",
        true,
        rawEventId,
      );
    }
  }

  // Records a failed attempt; failures force "capture-failed" completeness.
  observeError(error: unknown) {
    this.failures.push(error instanceof Error ? error.message : String(error));
    const rawEventId = this.nextRawEventId("error");
    this.pushDerived(
      "capture.error",
      { error: this.failures.at(-1) },
      { error: this.failures.at(-1) },
      "error",
      true,
      rawEventId,
    );
  }

  // Finalizes the attempt: drains buffers, synthesizes missing terminal
  // events, resolves completeness, persists the NDJSON artifact, then emits
  // every event (bus + OTEL log) followed by the summary. Call exactly once.
  async flush() {
    this.endListener();
    const finishedAtMs = nowMs();
    this.flushStructuredBuffers();
    // Promote the buffered/observed final text into a derived terminal event
    // when the stream never produced one and nothing failed.
    if (
      this.captureMode !== "sdk-events" &&
      !this.seenKinds.has("assistant.message.final") &&
      this.finalText &&
      this.failures.length === 0
    ) {
      this.pushDerived(
        "assistant.message.final",
        { text: this.finalText },
        this.finalText,
        undefined,
        false,
      );
    }
    // A structured stream that produced events but no terminal message is
    // suspicious; downgrade via a warning.
    if (
      (this.captureMode === "cli-json-stream" ||
        this.captureMode === "rpc-events") &&
      this.events.length > 0 &&
      !this.seenKinds.has("assistant.message.final") &&
      this.failures.length === 0
    ) {
      this.warnings.push(
        "structured stream ended without a terminal assistant message",
      );
      this.pushDerived(
        "capture.warning",
        { reason: "missing-terminal-event" },
        { reason: "missing-terminal-event" },
        "capture",
      );
    }

    let traceCompleteness = this.resolveCompleteness();
    let missingExpectedEventKinds = [...this.expectedKinds].filter(
      (kind) => !this.directKinds.has(kind),
    );
    this.applyTraceCompleteness(traceCompleteness);
    let summary: AgentTraceSummary = {
      traceVersion: "1",
      runId: this.runId,
      workflowPath: this.workflowPath,
      workflowHash: this.workflowHash,
      nodeId: this.nodeId,
      iteration: this.iteration,
      attempt: this.attempt,
      traceStartedAtMs: this.startedAtMs,
      traceFinishedAtMs: finishedAtMs,
      agentFamily: this.agentFamily,
      agentId: this.agentId,
      model: this.model,
      captureMode: this.captureMode,
      traceCompleteness,
      unsupportedEventKinds: unsupportedKindsForCapabilities(
        this.capabilities,
      ).filter((kind) => !this.seenKinds.has(kind)),
      missingExpectedEventKinds,
      rawArtifactRefs: this.rawArtifactRefs,
    };

    const persistedArtifact = await this.persistNdjson(summary);
    if (persistedArtifact.ok && persistedArtifact.file) {
      const artifactPath = persistedArtifact.file;
      this.rawArtifactRefs.push(artifactPath);
      this.pushDerived(
        "artifact.created",
        {
          artifactKind: "agent-trace.ndjson",
          artifactPath,
          contentType: "application/x-ndjson",
        },
        {
          artifactKind: "agent-trace.ndjson",
          artifactPath,
          contentType: "application/x-ndjson",
        },
        "artifact",
      );
      // The artifact.created event was pushed after completeness was applied,
      // so re-apply to stamp it too.
      this.applyTraceCompleteness(traceCompleteness);
      summary = { ...summary, rawArtifactRefs: [...this.rawArtifactRefs] };
      try {
        // Rewrite so the persisted artifact contains its own final summary.
        await this.rewriteNdjson(artifactPath, summary);
      } catch (error) {
        const message =
          error instanceof Error ? error.message : String(error);
        this.warnings.push(message);
        this.pushDerived(
          "capture.warning",
          { reason: "artifact-rewrite-failed", error: message },
          { reason: "artifact-rewrite-failed", error: message },
          "artifact",
        );
        // The new warning can change completeness; recompute and re-stamp.
        traceCompleteness = this.resolveCompleteness();
        missingExpectedEventKinds = [...this.expectedKinds].filter(
          (kind) => !this.directKinds.has(kind),
        );
        this.applyTraceCompleteness(traceCompleteness);
        summary = {
          ...summary,
          traceCompleteness,
          missingExpectedEventKinds,
          rawArtifactRefs: [...this.rawArtifactRefs],
        };
      }
    } else if (!persistedArtifact.ok) {
      this.warnings.push(persistedArtifact.error);
      this.pushDerived(
        "capture.warning",
        { reason: "artifact-write-failed", error: persistedArtifact.error },
        { reason: "artifact-write-failed", error: persistedArtifact.error },
        "artifact",
      );
      traceCompleteness = this.resolveCompleteness();
      missingExpectedEventKinds = [...this.expectedKinds].filter(
        (kind) => !this.directKinds.has(kind),
      );
      this.applyTraceCompleteness(traceCompleteness);
      summary = {
        ...summary,
        traceCompleteness,
        missingExpectedEventKinds,
        rawArtifactRefs: [...this.rawArtifactRefs],
      };
    }

    this.applyTraceCompleteness(traceCompleteness);
    // Emit every collected event to the bus and to the OTEL logger, mapping
    // record severity to the matching log level.
    for (const event of this.events) {
      const smithersEvent: AgentTraceSmithersEvent = {
        type: "AgentTraceEvent",
        runId: this.runId,
        nodeId: this.nodeId,
        iteration: this.iteration,
        attempt: this.attempt,
        trace: event,
        timestampMs: event.timestampMs,
      };
      await this.eventBus.emitEventQueued(
        smithersEvent as unknown as SmithersEvent,
      );
      const record = canonicalTraceEventToOtelLogRecord(event, {
        agentId: this.agentId,
        model: this.model,
      });
      if (record.severity === "ERROR") {
        await logErrorAwait(record.body, record.attributes, "agent-trace");
      } else if (record.severity === "WARN") {
        await logWarningAwait(record.body, record.attributes, "agent-trace");
      } else {
        await logInfoAwait(record.body, record.attributes, "agent-trace");
      }
    }

    await this.eventBus.emitEventQueued({
      type: "AgentTraceSummary",
      runId: this.runId,
      nodeId: this.nodeId,
      iteration: this.iteration,
      attempt: this.attempt,
      summary,
      timestampMs: finishedAtMs,
    } as unknown as SmithersEvent);
  }

  // stderr is always recorded verbatim; stdout is recorded verbatim for
  // coarse modes, or line-buffered and parsed as JSON for structured modes.
  private processChunk(stream: "stdout" | "stderr", text: string) {
    if (stream === "stderr") {
      this.stderrBuffer += text;
      this.pushObserved("stderr", { text }, text, stream);
      return;
    }
    this.stdoutBuffer += text;
    if (this.captureMode === "cli-text" || this.captureMode === "sdk-events") {
      this.pushObserved("stdout", { text }, text, stream);
      return;
    }
    const lines = this.stdoutBuffer.split(/\r?\n/);
    // The last split element may be an incomplete line; keep it buffered.
    this.stdoutBuffer = lines.pop() ?? "";
    for (const line of lines) {
      if (!line.trim()) continue;
      this.processStructuredStdoutLine(line);
    }
  }

  // At flush time, any leftover partial line in a structured stream means the
  // stream was truncated mid-record — record that as a capture failure.
  private flushStructuredBuffers() {
    if (this.captureMode === "cli-text" || this.captureMode === "sdk-events") {
      this.stdoutBuffer = "";
      this.stderrBuffer = "";
      return;
    }

    const line = this.stdoutBuffer.trim();
    this.stdoutBuffer = "";
    this.stderrBuffer = "";
    if (!line) return;

    this.failures.push(`truncated structured stream: ${line.slice(0, 200)}`);
    this.pushObserved(
      "capture.error",
      { reason: "truncated-json-stream", linePreview: line.slice(0, 200) },
      line,
      "stdout",
    );
  }

  // Parses one structured stdout line; malformed JSON becomes a capture.error
  // (and hence "capture-failed" completeness).
  private processStructuredStdoutLine(line: string) {
    let parsed: any;
    try {
      parsed = JSON.parse(line);
    } catch {
      this.failures.push(`malformed upstream JSON: ${line.slice(0, 200)}`);
      this.pushObserved(
        "capture.error",
        { linePreview: line.slice(0, 200), reason: "malformed-json" },
        line,
        "stdout",
      );
      return;
    }
    const rawType =
      typeof parsed?.type === "string" ? parsed.type : "structured";
    // Scope a raw-event id to every canonical event derived from this line.
    const previousRawEventId = this.currentRawEventId;
    this.currentRawEventId = this.nextRawEventId(rawType);
    try {
      this.emitObservedBatch(
        normalizeStructuredEvent(this.agentFamily, parsed, rawType),
      );
    } finally {
      this.currentRawEventId = previousRawEventId;
    }
  }

  private appendAssistantText(text: string) {
    this.assistantTextBuffer += text;
    this.finalText = this.assistantTextBuffer;
  }

  private setFinalAssistantText(text: string) {
    this.assistantTextBuffer = text;
    this.finalText = text;
  }

  // Mirrors bus-level tool/usage events into the trace for the same attempt.
  // Pi is excluded: its own structured stream already carries these.
  private observeSmithersEvent(event: SmithersEvent) {
    const sameAttempt =
      (event as any).runId === this.runId &&
      (event as any).nodeId === this.nodeId &&
      (event as any).iteration === this.iteration &&
      (event as any).attempt === this.attempt;
    if (!sameAttempt) return;
    if (this.agentFamily === "pi") return;

    if (event.type === "ToolCallStarted") {
      const rawEventId = this.nextRawEventId(event.type);
      this.pushDerived(
        "tool.execution.start",
        {
          toolCallId: event.toolCallId,
          toolName: event.toolName,
        },
        event,
        event.type,
        true,
        rawEventId,
      );
      this.expectedKinds.add("tool.execution.end");
    }
    if (event.type === "ToolCallFinished") {
      const rawEventId = this.nextRawEventId(event.type);
      this.pushDerived(
        "tool.execution.end",
        {
          toolCallId: event.toolCallId,
          toolName: event.toolName,
          isError: event.status === "error",
        },
        event,
        event.type,
        true,
        rawEventId,
      );
    }
    if (event.type === "TokenUsageReported") {
      const rawEventId = this.nextRawEventId(event.type);
      this.pushDerived(
        "usage",
        {
          model: event.model,
          agent: event.agent,
          inputTokens: event.inputTokens,
          outputTokens: event.outputTokens,
          cacheReadTokens: event.cacheReadTokens,
          cacheWriteTokens: event.cacheWriteTokens,
          reasoningTokens: event.reasoningTokens,
        },
        event,
        event.type,
        true,
        rawEventId,
      );
    }
  }

  // Classifies the attempt's overall fidelity:
  //   capture-failed  — any recorded failure
  //   final-only      — coarse capture with no rich structure and no warnings
  //   partial-observed — expected kinds missing, or warnings present
  //   full-observed   — rich structure with nothing missing
  private resolveCompleteness(): TraceCompleteness {
    if (this.failures.length > 0) return "capture-failed";

    const richKinds = new Set([
      "session.start",
      "session.end",
      "turn.start",
      "turn.end",
      "message.start",
      "message.update",
      "message.end",
      "assistant.text.delta",
      "assistant.thinking.delta",
      "tool.execution.start",
      "tool.execution.update",
      "tool.execution.end",
      "tool.result",
      "retry.start",
      "retry.end",
      "compaction.start",
      "compaction.end",
    ]);
    const sawRichStructure = [...this.directKinds].some((kind) =>
      richKinds.has(kind),
    );

    const coarseCaptureMode =
      this.captureMode === "sdk-events" ||
      this.captureMode === "cli-text" ||
      this.captureMode === "cli-json";

    if (!sawRichStructure && this.warnings.length === 0 && coarseCaptureMode) {
      return "final-only";
    }

    const missing = [...this.expectedKinds].filter(
      (kind) => !this.directKinds.has(kind),
    );
    if (missing.length > 0 || this.warnings.length > 0)
      return "partial-observed";

    // NOTE(review): by this point warnings.length === 0, so the "final-only"
    // arm below appears unreachable for coarse modes (already handled above);
    // confirm whether it guards a case I'm not seeing.
    if (coarseCaptureMode) {
      return sawRichStructure ? "partial-observed" : "final-only";
    }

    if (!sawRichStructure) return "final-only";
    return "full-observed";
  }

  // Appends one canonical event, redacting both payload and raw forms.
  // `observed` = came off the live stream; `direct` = counts toward the
  // directKinds set used by completeness/missing-kind accounting.
  // traceCompleteness is a placeholder here; flush() stamps the final value.
  private push(
    kind: CanonicalAgentTraceEventKind,
    payload: Record | null,
    raw: unknown,
    observed: boolean,
    rawType?: string,
    direct = true,
    rawEventId?: string,
  ) {
    const redactedPayload = redactValue(payload);
    const redactedRaw = redactValue(raw);
    const event: CanonicalAgentTraceEvent = {
      traceVersion: "1",
      runId: this.runId,
      workflowPath: this.workflowPath,
      workflowHash: this.workflowHash,
      nodeId: this.nodeId,
      iteration: this.iteration,
      attempt: this.attempt,
      timestampMs: nowMs(),
      event: {
        sequence: this.sequence++,
        kind,
        phase: kindPhase(kind),
      },
      source: {
        agentFamily: this.agentFamily,
        captureMode: this.captureMode,
        rawType,
        // Observed events inherit the current structured line's id (or mint
        // one); derived events only carry an id when explicitly given.
        rawEventId:
          rawEventId ??
          (observed
            ? (this.currentRawEventId ?? this.nextRawEventId(rawType ?? kind))
            : undefined),
        observed,
      },
      traceCompleteness: "partial-observed",
      // NOTE(review): bare `Record` here and in the signature above looks like
      // stripped type arguments (likely `Record<string, unknown>`) — confirm
      // against the original source.
      payload: redactedPayload.value as Record | null,
      raw: redactedRaw.value,
      redaction: {
        applied: redactedPayload.applied || redactedRaw.applied,
        ruleIds: [
          ...new Set([...redactedPayload.ruleIds, ...redactedRaw.ruleIds]),
        ],
      },
      annotations: this.annotations,
    };
    this.events.push(event);
    this.seenKinds.add(kind);
    if (direct) this.directKinds.add(kind);
  }

  // Convenience wrapper: event seen directly on the captured stream.
  private pushObserved(
    kind: CanonicalAgentTraceEventKind,
    payload: Record | null,
    raw: unknown,
    rawType?: string,
    rawEventId?: string,
  ) {
    this.push(kind, payload, raw, true, rawType, true, rawEventId);
  }

  // Convenience wrapper: event synthesized by the collector (observed=false).
  private pushDerived(
    kind: CanonicalAgentTraceEventKind,
    payload: Record | null,
    raw: unknown,
    rawType?: string,
    direct = true,
    rawEventId?: string,
  ) {
    this.push(kind, payload, raw, false, rawType, direct, rawEventId);
  }

  // Retroactively stamps the resolved completeness onto every event.
  private applyTraceCompleteness(traceCompleteness: TraceCompleteness) {
    for (const event of this.events) {
      event.traceCompleteness = traceCompleteness;
    }
  }

  private emitObservedBatch(
    batch: NormalizedTraceBatch,
    rawEventId = this.currentRawEventId,
  ) {
    for (const kind of batch.expectedKinds ??
[]) { + this.expectedKinds.add(kind); + } + for (const event of batch.events) { + this.observeNormalizedEvent(event); + this.pushObserved( + event.kind, + event.payload, + event.raw, + event.rawType, + rawEventId, + ); + } + } + + private observeNormalizedEvent(event: NormalizedTraceEvent) { + if ( + event.kind === "assistant.text.delta" && + typeof event.payload?.text === "string" + ) { + this.appendAssistantText(event.payload.text); + return; + } + if ( + event.kind === "assistant.message.final" && + typeof event.payload?.text === "string" + ) { + this.setFinalAssistantText(event.payload.text); + } + } + + private nextRawEventId(rawType: string) { + return `${rawType}:${this.rawEventSequence++}`; + } + + private async persistNdjson( + summary: AgentTraceSummary, + ): Promise<{ ok: true; file?: string } | { ok: false; error: string }> { + if (!this.logDir) return { ok: true }; + const dir = join(this.logDir, "agent-trace"); + const file = join( + dir, + `${this.nodeId}-${this.iteration}-${this.attempt}.ndjson`, + ); + const lines = this.events + .map((event) => JSON.stringify(event)) + .concat(JSON.stringify({ summary })); + try { + await mkdir(dir, { recursive: true }); + await appendFile(file, `${lines.join("\n")}\n`, "utf8"); + return { ok: true, file }; + } catch (error) { + return { + ok: false, + error: error instanceof Error ? 
error.message : String(error), + }; + } + } + + private async rewriteNdjson(file: string, summary: AgentTraceSummary) { + const lines = this.events + .map((event) => JSON.stringify(event)) + .concat(JSON.stringify({ summary })); + await writeFile(file, `${lines.join("\n")}\n`, "utf8"); + } +} diff --git a/src/agents/BaseCliAgent.ts b/src/agents/BaseCliAgent.ts index 40f1bd3b..3316c673 100644 --- a/src/agents/BaseCliAgent.ts +++ b/src/agents/BaseCliAgent.ts @@ -13,6 +13,8 @@ import { spawnCaptureEffect } from "../effect/child-process"; import { runPromise } from "../effect/runtime"; import { getToolContext } from "../tools/context"; import { SmithersError } from "../utils/errors"; +import { extractTextFromJsonValue } from "../utils/text-extraction"; +import { normalizeTokenUsage } from "../utils/usage"; import { launchDiagnostics, enrichReportWithErrorAnalysis } from "./diagnostics"; type TimeoutInput = number | { totalMs?: number; idleMs?: number } | undefined; @@ -58,6 +60,7 @@ type RunRpcCommandOptions = { idleTimeoutMs?: number; signal?: AbortSignal; maxOutputBytes?: number; + onStdout?: (chunk: string) => void; onStderr?: (chunk: string) => void; onExtensionUiRequest?: (request: PiExtensionUiRequest) => | Promise @@ -200,30 +203,7 @@ export function tryParseJson(text: string): unknown | undefined { return undefined; } -export function extractTextFromJsonValue(value: any): string | undefined { - if (typeof value === "string") return value; - if (!value || typeof value !== "object") return undefined; - if (typeof value.text === "string") return value.text; - if (typeof value.content === "string") return value.content; - if (Array.isArray(value.content)) { - const parts = value.content - .map((part: any) => { - if (!part) return ""; - if (typeof part === "string") return part; - if (typeof part.text === "string") return part.text; - if (typeof part.content === "string") return part.content; - return ""; - }) - .join(""); - if (parts.trim()) return parts; - } - if 
(value.response) return extractTextFromJsonValue(value.response); - if (value.message) return extractTextFromJsonValue(value.message); - if (value.result) return extractTextFromJsonValue(value.result); - if (value.output) return extractTextFromJsonValue(value.output); - if (value.data) return extractTextFromJsonValue(value.data); - return undefined; -} +export { extractTextFromJsonValue }; function extractTextFromJsonPayload(raw: string): string | undefined { const trimmed = raw.trim(); @@ -314,6 +294,19 @@ export function extractUsageFromOutput(raw: string): CliUsageInfo | undefined { const usage: CliUsageInfo = {}; let found = false; + const addUsage = (next: CliUsageInfo | null | undefined) => { + if (!next) return false; + let changed = false; + for (const [key, value] of Object.entries(next) as Array< + [keyof CliUsageInfo, number | undefined] + >) { + if (typeof value !== "number" || value <= 0) continue; + usage[key] = (usage[key] ?? 0) + value; + changed = true; + } + return changed; + }; + for (const line of lines) { let parsed: any; try { parsed = JSON.parse(line); } catch { continue; } @@ -321,56 +314,34 @@ export function extractUsageFromOutput(raw: string): CliUsageInfo | undefined { // Claude Code stream-json: message_start contains input token counts if (parsed.type === "message_start" && parsed.message?.usage) { - const u = parsed.message.usage; - usage.inputTokens = (usage.inputTokens ?? 0) + (u.input_tokens ?? 0); - if (u.cache_read_input_tokens) { - usage.cacheReadTokens = (usage.cacheReadTokens ?? 0) + u.cache_read_input_tokens; - } - if (u.cache_creation_input_tokens) { - usage.cacheWriteTokens = (usage.cacheWriteTokens ?? 
0) + u.cache_creation_input_tokens; - } - found = true; + const normalized = normalizeTokenUsage(parsed.message.usage); + found = addUsage({ + inputTokens: normalized?.inputTokens, + cacheReadTokens: normalized?.cacheReadTokens, + cacheWriteTokens: normalized?.cacheWriteTokens, + }) || found; continue; } // Claude Code stream-json: message_delta has output token count if (parsed.type === "message_delta" && parsed.usage) { - if (parsed.usage.output_tokens) { - usage.outputTokens = (usage.outputTokens ?? 0) + parsed.usage.output_tokens; - } - found = true; + const normalized = normalizeTokenUsage(parsed.usage); + found = addUsage({ + outputTokens: normalized?.outputTokens, + }) || found; continue; } // Codex --json: turn.completed event if (parsed.type === "turn.completed" && parsed.usage) { - const u = parsed.usage; - if (u.input_tokens) usage.inputTokens = (usage.inputTokens ?? 0) + u.input_tokens; - if (u.output_tokens) usage.outputTokens = (usage.outputTokens ?? 0) + u.output_tokens; - if (u.cached_input_tokens) usage.cacheReadTokens = (usage.cacheReadTokens ?? 0) + u.cached_input_tokens; - found = true; + found = addUsage(normalizeTokenUsage(parsed.usage)) || found; continue; } // Generic: any event with a top-level "usage" containing token fields if (parsed.usage && typeof parsed.usage === "object") { - const u = parsed.usage; - const inTok = u.input_tokens ?? u.inputTokens ?? u.prompt_tokens ?? 0; - const outTok = u.output_tokens ?? u.outputTokens ?? u.completion_tokens ?? 0; - if (inTok > 0 || outTok > 0) { - usage.inputTokens = (usage.inputTokens ?? 0) + inTok; - usage.outputTokens = (usage.outputTokens ?? 0) + outTok; - if (u.cache_read_input_tokens || u.cacheReadTokens || u.cached_input_tokens) { - usage.cacheReadTokens = (usage.cacheReadTokens ?? 0) + - (u.cache_read_input_tokens ?? u.cacheReadTokens ?? u.cached_input_tokens ?? 0); - } - if (u.reasoning_tokens ?? u.reasoningTokens) { - usage.reasoningTokens = (usage.reasoningTokens ?? 
0) + - (u.reasoning_tokens ?? u.reasoningTokens ?? 0); - } - found = true; - continue; - } + found = addUsage(normalizeTokenUsage(parsed.usage)) || found; + continue; } } @@ -381,9 +352,7 @@ export function extractUsageFromOutput(raw: string): CliUsageInfo | undefined { if (parsed?.stats?.models && typeof parsed.stats.models === "object") { for (const data of Object.values(parsed.stats.models as Record)) { if (data?.tokens) { - usage.inputTokens = (usage.inputTokens ?? 0) + (data.tokens.input ?? data.tokens.prompt ?? 0); - usage.outputTokens = (usage.outputTokens ?? 0) + (data.tokens.output ?? 0); - found = true; + found = addUsage(normalizeTokenUsage(data.tokens)) || found; } } } @@ -594,7 +563,19 @@ export function runCommandEffect( agentArgs: args.join(" "), cwd, }), + Effect.annotateSpans({ + agentCommand: command, + agentArgs: args.join(" "), + cwd, + }), Effect.withLogSpan(`agent:${command}`), + Effect.withSpan(`agent:${command}`, { + attributes: { + agentCommand: command, + agentArgs: args.join(" "), + cwd, + }, + }), ); } @@ -613,7 +594,7 @@ export function runRpcCommandEffect(command: string, args: string[], options: Ru exitCode: number | null; usage?: any; }, Error> { - const { cwd, env, prompt, timeoutMs, idleTimeoutMs, signal, maxOutputBytes, onStderr, onExtensionUiRequest } = options; + const { cwd, env, prompt, timeoutMs, idleTimeoutMs, signal, maxOutputBytes, onStdout, onStderr, onExtensionUiRequest } = options; return Effect.async<{ text: string; output: unknown; @@ -823,8 +804,10 @@ export function runRpcCommandEffect(command: string, args: string[], options: Ru }); }); - child.stdout?.on("data", () => { + child.stdout?.on("data", (chunk) => { inactivity.reset(); + const text = chunk.toString("utf8"); + onStdout?.(text); }); child.stderr?.on("data", (chunk) => { @@ -882,7 +865,21 @@ export function runRpcCommandEffect(command: string, args: string[], options: Ru cwd, rpc: true, }), + Effect.annotateSpans({ + agentCommand: command, + agentArgs: 
args.join(" "), + cwd, + rpc: true, + }), Effect.withLogSpan(`agent:${command}:rpc`), + Effect.withSpan(`agent:${command}:rpc`, { + attributes: { + agentCommand: command, + agentArgs: args.join(" "), + cwd, + rpc: true, + }, + }), ); } diff --git a/src/agents/CodexAgent.ts b/src/agents/CodexAgent.ts index 28d93733..b917b64c 100644 --- a/src/agents/CodexAgent.ts +++ b/src/agents/CodexAgent.ts @@ -16,6 +16,7 @@ type CodexAgentOptions = BaseCliAgentOptions & { disable?: string[]; image?: string[]; model?: string; + outputFormat?: "text" | "json" | "stream-json"; oss?: boolean; localProvider?: string; sandbox?: "read-only" | "workspace-write" | "danger-full-access"; @@ -35,6 +36,11 @@ export class CodexAgent extends BaseCliAgent { private readonly opts: CodexAgentOptions; constructor(opts: CodexAgentOptions = {}) { + opts = { + ...opts, + json: opts.json ?? true, + outputFormat: opts.outputFormat ?? "stream-json", + }; super(opts); this.opts = opts; } diff --git a/src/agents/PiAgent.ts b/src/agents/PiAgent.ts index a3f71e2c..9506a5ac 100644 --- a/src/agents/PiAgent.ts +++ b/src/agents/PiAgent.ts @@ -220,6 +220,27 @@ export class PiAgent extends BaseCliAgent { } throw err; } + + // RPC mode + const rpcResult = await runRpcCommand("pi", args, { + cwd, + env, + prompt, + timeoutMs: callTimeouts.totalMs, + idleTimeoutMs: callTimeouts.idleMs, + signal: options?.abortSignal, + maxOutputBytes: this.maxOutputBytes ?? getToolContext()?.maxOutputBytes, + onStdout: options?.onStdout, + onStderr: options?.onStderr, + onExtensionUiRequest: this.opts.onExtensionUiRequest, + }); + + return buildGenerateResult( + rpcResult.text, + rpcResult.output, + this.opts.model ?? 
"pi", + rpcResult.usage, + ); } protected async buildCommand(_params: { diff --git a/src/cli/index.ts b/src/cli/index.ts index 6d830e3c..a1310d7a 100755 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -11,7 +11,7 @@ import { ensureSmithersTables } from "../db/ensure"; import { SmithersDb } from "../db/adapter"; import { buildContext } from "../context"; import { fromPromise } from "../effect/interop"; -import { runPromise } from "../effect/runtime"; +import { runPromise, shutdownRuntime } from "../effect/runtime"; import type { SmithersWorkflow } from "../SmithersWorkflow"; import { Smithers } from "../effect/builder"; import { revertToAttempt } from "../revert"; @@ -90,6 +90,30 @@ function parseJsonInput(raw: string | undefined, label: string, fail: FailFn) { } } +function parseAnnotations(raw: string | undefined, fail: FailFn) { + const parsed = parseJsonInput(raw, "annotations", fail); + if (parsed === undefined) return undefined; + if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) { + return fail({ + code: "INVALID_ANNOTATIONS", + message: "Run annotations must be a flat JSON object of string/number/boolean values", + exitCode: 4, + }); + } + const annotations: Record = {}; + for (const [key, value] of Object.entries(parsed)) { + if (!["string", "number", "boolean"].includes(typeof value)) { + return fail({ + code: "INVALID_ANNOTATIONS", + message: `Run annotation ${key} must be a string, number, or boolean`, + exitCode: 4, + }); + } + annotations[key] = value as string | number | boolean; + } + return annotations; +} + function formatStatusExitCode(status: string | undefined) { if (status === "finished") return 0; if (status === "waiting-approval") return 3; @@ -219,6 +243,7 @@ const commonRunOptions = z.object({ maxOutputBytes: z.number().int().min(1).optional().describe("Max bytes a single tool call can return (default: 200000)"), toolTimeoutMs: z.number().int().min(1).optional().describe("Max wall-clock time per tool call in ms (default: 
60000)"), hot: z.boolean().default(false).describe("Enable hot module replacement for .tsx workflows"), + annotations: z.string().optional().describe("Run annotations as a flat JSON object of string/number/boolean values"), }); const runOptions = commonRunOptions.extend({ @@ -296,6 +321,7 @@ const cli = Cli.create({ const resolvedWorkflowPath = resolve(process.cwd(), workflowPath); const isToon = extname(resolvedWorkflowPath) === ".toon"; const input = parseJsonInput(options.input, "input", fail) ?? {}; + const annotations = parseAnnotations(options.annotations, fail); const runId = options.runId; const resume = Boolean(options.resume); @@ -388,6 +414,7 @@ const cli = Cli.create({ maxOutputBytes: options.maxOutputBytes, toolTimeoutMs: options.toolTimeoutMs, hot: options.hot, + annotations, onProgress, signal: abort.signal, }) @@ -397,6 +424,7 @@ const cli = Cli.create({ process.exitCode = formatStatusExitCode( typeof status === "string" ? status : undefined, ); + await shutdownRuntime(); return ok(result); } @@ -412,11 +440,13 @@ const cli = Cli.create({ maxOutputBytes: options.maxOutputBytes, toolTimeoutMs: options.toolTimeoutMs, hot: options.hot, + annotations, onProgress, signal: abort.signal, }); process.exitCode = formatStatusExitCode(result.status); + await shutdownRuntime(); return ok(result); } catch (err: any) { return fail({ @@ -443,6 +473,7 @@ const cli = Cli.create({ const resolvedWorkflowPath = resolve(process.cwd(), workflowPath); const isToon = extname(resolvedWorkflowPath) === ".toon"; const input = parseJsonInput(options.input, "input", fail) ?? 
{}; + const annotations = parseAnnotations(options.annotations, fail); const runId = options.runId; if (options.hot) { @@ -501,6 +532,7 @@ const cli = Cli.create({ maxOutputBytes: options.maxOutputBytes, toolTimeoutMs: options.toolTimeoutMs, hot: options.hot, + annotations, onProgress, signal: abort.signal, }) @@ -510,6 +542,7 @@ const cli = Cli.create({ process.exitCode = formatStatusExitCode( typeof status === "string" ? status : undefined, ); + await shutdownRuntime(); return ok(result); } @@ -525,11 +558,13 @@ const cli = Cli.create({ maxOutputBytes: options.maxOutputBytes, toolTimeoutMs: options.toolTimeoutMs, hot: options.hot, + annotations, onProgress, signal: abort.signal, }); process.exitCode = formatStatusExitCode(result.status); + await shutdownRuntime(); return ok(result); } catch (err: any) { return fail({ @@ -873,7 +908,7 @@ const cli = Cli.create({ }, }) .command("observability", { - description: "Start the local observability stack (Grafana, Prometheus, Tempo, OTLP Collector) via Docker Compose.", + description: "Start the local observability stack (Grafana, Prometheus, Tempo, Loki, OTLP Collector) via Docker Compose.", options: z.object({ detach: z.boolean().default(false).describe("Run containers in the background"), down: z.boolean().default(false).describe("Stop and remove the observability stack"), @@ -909,7 +944,9 @@ const cli = Cli.create({ : `[smithers] Starting observability stack...\n` + ` Grafana: http://localhost:3001\n` + ` Prometheus: http://localhost:9090\n` + - ` Tempo: http://localhost:3200\n`, + ` Tempo: http://localhost:3200\n` + + ` Loki: http://localhost:3100\n` + + ` OTEL HTTP: http://localhost:4318\n`, ); const child = spawn("docker", composeArgs, { diff --git a/src/effect/child-process.ts b/src/effect/child-process.ts index c00241f3..ffaa892c 100644 --- a/src/effect/child-process.ts +++ b/src/effect/child-process.ts @@ -172,6 +172,22 @@ export function spawnCaptureEffect( timeoutMs: timeoutMs ?? 
null, idleTimeoutMs: idleTimeoutMs ?? null, }), + Effect.annotateSpans({ + command, + args: args.join(" "), + cwd, + timeoutMs: timeoutMs ?? null, + idleTimeoutMs: idleTimeoutMs ?? null, + }), Effect.withLogSpan(`process:${command}`), + Effect.withSpan(`process:${command}`, { + attributes: { + command, + args: args.join(" "), + cwd, + timeoutMs: timeoutMs ?? null, + idleTimeoutMs: idleTimeoutMs ?? null, + }, + }), ); } diff --git a/src/effect/logging.ts b/src/effect/logging.ts index cbdac8a3..79a3bf26 100644 --- a/src/effect/logging.ts +++ b/src/effect/logging.ts @@ -1,9 +1,9 @@ import { Effect } from "effect"; -import { runFork } from "./runtime"; +import { runFork, runPromise } from "./runtime"; type LogAnnotations = Record | undefined; -function emitLog( +function withLogContext( effect: Effect.Effect, annotations?: LogAnnotations, span?: string, @@ -15,7 +15,23 @@ function emitLog( if (span) { program = program.pipe(Effect.withLogSpan(span)); } - void runFork(program); + return program; +} + +function emitLog( + effect: Effect.Effect, + annotations?: LogAnnotations, + span?: string, +) { + void runFork(withLogContext(effect, annotations, span)); +} + +async function emitLogAwait( + effect: Effect.Effect, + annotations?: LogAnnotations, + span?: string, +) { + await runPromise(withLogContext(effect, annotations, span)); } export function logDebug( @@ -26,6 +42,14 @@ export function logDebug( emitLog(Effect.logDebug(message), annotations, span); } +export async function logDebugAwait( + message: string, + annotations?: LogAnnotations, + span?: string, +) { + await emitLogAwait(Effect.logDebug(message), annotations, span); +} + export function logInfo( message: string, annotations?: LogAnnotations, @@ -34,6 +58,14 @@ export function logInfo( emitLog(Effect.logInfo(message), annotations, span); } +export async function logInfoAwait( + message: string, + annotations?: LogAnnotations, + span?: string, +) { + await emitLogAwait(Effect.logInfo(message), annotations, 
span); +} + export function logWarning( message: string, annotations?: LogAnnotations, @@ -42,6 +74,14 @@ export function logWarning( emitLog(Effect.logWarning(message), annotations, span); } +export async function logWarningAwait( + message: string, + annotations?: LogAnnotations, + span?: string, +) { + await emitLogAwait(Effect.logWarning(message), annotations, span); +} + export function logError( message: string, annotations?: LogAnnotations, @@ -49,3 +89,11 @@ export function logError( ) { emitLog(Effect.logError(message), annotations, span); } + +export async function logErrorAwait( + message: string, + annotations?: LogAnnotations, + span?: string, +) { + await emitLogAwait(Effect.logError(message), annotations, span); +} diff --git a/src/effect/runtime.ts b/src/effect/runtime.ts index b4aa8484..8a958506 100644 --- a/src/effect/runtime.ts +++ b/src/effect/runtime.ts @@ -50,4 +50,8 @@ export function runSync(effect: Effect.Effect) { return runtime.runSync(decorate(effect) as Effect.Effect); } +export async function shutdownRuntime() { + await runtime.dispose(); +} + export { SmithersRuntimeLayer }; diff --git a/src/engine/approvals.ts b/src/engine/approvals.ts index dbe84f8c..d1f5be03 100644 --- a/src/engine/approvals.ts +++ b/src/engine/approvals.ts @@ -47,7 +47,9 @@ export function approveNodeEffect( }); }).pipe( Effect.annotateLogs({ runId, nodeId, iteration, approvalStatus: "approved" }), + Effect.annotateSpans({ runId, nodeId, iteration, approvalStatus: "approved" }), Effect.withLogSpan("approval:grant"), + Effect.withSpan("approval:grant", { attributes: { runId, nodeId, iteration, approvalStatus: "approved" } }), ); } @@ -108,7 +110,9 @@ export function denyNodeEffect( }); }).pipe( Effect.annotateLogs({ runId, nodeId, iteration, approvalStatus: "denied" }), + Effect.annotateSpans({ runId, nodeId, iteration, approvalStatus: "denied" }), Effect.withLogSpan("approval:deny"), + Effect.withSpan("approval:deny", { attributes: { runId, nodeId, iteration, 
approvalStatus: "denied" } }), ); } diff --git a/src/engine/index.ts b/src/engine/index.ts index 589aa874..713232e5 100644 --- a/src/engine/index.ts +++ b/src/engine/index.ts @@ -54,6 +54,7 @@ import { spawn as nodeSpawn } from "node:child_process"; import { randomUUID } from "node:crypto"; import { platform } from "node:os"; import { withTaskRuntime } from "../effect/task-runtime"; +import { AgentTraceCollector } from "../agent-trace"; /** * Track which worktree paths have already been created this run so we don't @@ -1291,6 +1292,12 @@ async function executeTask( }, workflowName: string, cacheEnabled: boolean, + traceContext?: { + workflowPath?: string | null; + workflowHash?: string | null; + logDir?: string; + annotations?: Record; + }, signal?: AbortSignal, disabledAgents?: Set, ) { @@ -1495,6 +1502,21 @@ async function executeTask( let agentResult: any; if (effectiveAgent) { // Use fallback agent on retry attempts when available + const traceCollector = new AgentTraceCollector({ + eventBus, + runId, + workflowPath: traceContext?.workflowPath ?? null, + workflowHash: traceContext?.workflowHash ?? 
null, + nodeId: desc.nodeId, + iteration: desc.iteration, + attempt: attemptNo, + agent: effectiveAgent, + agentId: (effectiveAgent as any).id, + model: (effectiveAgent as any).model, + logDir: traceContext?.logDir, + annotations: traceContext?.annotations, + }); + traceCollector.begin(); const result = await runWithToolContext( { db: adapter, @@ -1545,6 +1567,8 @@ async function executeTask( stream, timestampMs: nowMs(), }); + if (stream === "stdout") traceCollector.onStdout(text); + else traceCollector.onStderr(text); }; return (effectiveAgent as any).generate({ options: undefined as any, @@ -1556,7 +1580,15 @@ async function executeTask( outputSchema: desc.outputSchema, }); }, - ); + ).then(async (result) => { + traceCollector.observeResult(result); + await traceCollector.flush(); + return result; + }, async (error) => { + traceCollector.observeError(error); + await traceCollector.flush(); + throw error; + }); agentResult = result; @@ -2299,12 +2331,15 @@ export function renderFrameEffect( ctx: any, opts?: { baseRootDir?: string }, ) { + const spanAttributes = { + runId: ctx?.runId ?? "", + iteration: ctx?.iteration ?? 0, + }; return fromPromise("render frame", () => renderFrameAsync(workflow, ctx, opts)).pipe( - Effect.annotateLogs({ - runId: ctx?.runId ?? "", - iteration: ctx?.iteration ?? 0, - }), + Effect.annotateLogs(spanAttributes), + Effect.annotateSpans(spanAttributes), Effect.withLogSpan("engine:render-frame"), + Effect.withSpan("engine:render-frame", { attributes: spanAttributes }), ); } @@ -3090,6 +3125,12 @@ async function runWorkflowBody( toolConfig, workflowName, cacheEnabled, + { + workflowPath: resolvedWorkflowPath ?? opts.workflowPath ?? null, + workflowHash: runMetadata.workflowHash, + logDir: logDir ?? 
undefined, + annotations: opts.annotations, + }, runAbortController.signal, disabledAgents, ).finally(() => inflight.delete(p)); @@ -3164,14 +3205,17 @@ export function runWorkflowEffect( workflow: SmithersWorkflow, opts: RunOptions, ) { + const spanAttributes = { + runId: opts.runId ?? "", + workflowPath: opts.workflowPath ?? "", + maxConcurrency: opts.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY, + hot: Boolean(opts.hot), + }; return fromPromise("run workflow", () => runWorkflowAsync(workflow, opts)).pipe( - Effect.annotateLogs({ - runId: opts.runId ?? "", - workflowPath: opts.workflowPath ?? "", - maxConcurrency: opts.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY, - hot: Boolean(opts.hot), - }), + Effect.annotateLogs(spanAttributes), + Effect.annotateSpans(spanAttributes), Effect.withLogSpan("engine:run-workflow"), + Effect.withSpan("engine:run-workflow", { attributes: spanAttributes }), ); } diff --git a/src/events.ts b/src/events.ts index 99256de6..b7ff5ccc 100644 --- a/src/events.ts +++ b/src/events.ts @@ -30,7 +30,9 @@ export class EventBus extends EventEmitter { } }).pipe( Effect.annotateLogs({ runId: event.runId, eventType: event.type }), + Effect.annotateSpans({ runId: event.runId, eventType: event.type }), Effect.withLogSpan(`event:${event.type}`), + Effect.withSpan(`event:${event.type}`, { attributes: { runId: event.runId, eventType: event.type } }), ); } @@ -44,7 +46,9 @@ export class EventBus extends EventEmitter { yield* this.persistEffect(event); }).pipe( Effect.annotateLogs({ runId: event.runId, eventType: event.type }), + Effect.annotateSpans({ runId: event.runId, eventType: event.type }), Effect.withLogSpan(`event:${event.type}:persist`), + Effect.withSpan(`event:${event.type}:persist`, { attributes: { runId: event.runId, eventType: event.type } }), ); } @@ -65,7 +69,10 @@ export class EventBus extends EventEmitter { this.persistError = null; throw err; } - }).pipe(Effect.withLogSpan("event:flush")); + }).pipe( + Effect.withLogSpan("event:flush"), + 
Effect.withSpan("event:flush"), + ); } async flush(): Promise { @@ -83,7 +90,9 @@ export class EventBus extends EventEmitter { } }).pipe( Effect.annotateLogs({ runId: event.runId, eventType: event.type }), + Effect.annotateSpans({ runId: event.runId, eventType: event.type }), Effect.withLogSpan("event:persist"), + Effect.withSpan("event:persist", { attributes: { runId: event.runId, eventType: event.type } }), ); } diff --git a/src/index.ts b/src/index.ts index 893f9ace..3e1a48ba 100644 --- a/src/index.ts +++ b/src/index.ts @@ -14,6 +14,15 @@ export type { SmithersCtx } from "./SmithersCtx"; export type { OutputAccessor, InferRow, InferOutputEntry } from "./OutputAccessor"; export type { SmithersEvent } from "./SmithersEvent"; export type { SmithersError } from "./SmithersError"; +export type { + AgentCaptureMode, + AgentFamily, + AgentTraceCapabilityProfile, + AgentTraceSummary, + CanonicalAgentTraceEvent, + CanonicalAgentTraceEventKind, + TraceCompleteness, +} from "./agent-trace"; export { SmithersError as SmithersErrorInstance, isSmithersError, errorToJson } from "./utils/errors"; export type { SmithersErrorCode } from "./utils/errors"; export type { @@ -88,6 +97,7 @@ export type { export { createSmithers } from "./create"; export type { CreateSmithersApi } from "./create"; export { runWorkflow, renderFrame } from "./engine"; +export { agentTraceCapabilities, detectAgentFamily, detectCaptureMode } from "./agent-trace"; // Tools export { tools, read, write, edit, grep, bash } from "./tools/index"; diff --git a/src/observability/index.ts b/src/observability/index.ts index b29835ff..bae08048 100644 --- a/src/observability/index.ts +++ b/src/observability/index.ts @@ -373,11 +373,19 @@ function makeService( return { options, annotate: (attributes) => Effect.void.pipe(Effect.annotateLogs(attributes)), - withSpan: (name, effect, attributes) => - (attributes && Object.keys(attributes).length > 0 - ? 
effect.pipe(Effect.annotateLogs(attributes)) - : effect - ).pipe(Effect.withLogSpan(name)), + withSpan: (name, effect, attributes) => { + let program = effect; + if (attributes && Object.keys(attributes).length > 0) { + program = program.pipe( + Effect.annotateLogs(attributes), + Effect.annotateSpans(attributes), + ); + } + return program.pipe( + Effect.withLogSpan(name), + Effect.withSpan(name, attributes ? { attributes } : undefined), + ); + }, }; } diff --git a/src/tools/bash.ts b/src/tools/bash.ts index 90cb9497..577f92c4 100644 --- a/src/tools/bash.ts +++ b/src/tools/bash.ts @@ -24,8 +24,11 @@ export function bashToolEffect( const allowNetwork = ctx?.allowNetwork ?? false; const started = nowMs(); let seq: number | undefined; + let toolCallId: string | undefined; return Effect.gen(function* () { - seq = yield* logToolCallStartEffect("bash", started); + const startedCall = yield* logToolCallStartEffect("bash", started); + seq = startedCall?.seq; + toolCallId = startedCall?.toolCallId; const cwd = opts?.cwd ? 
yield* fromSync("resolve sandbox path", () => resolveSandboxPath(root, opts.cwd!), @@ -78,6 +81,7 @@ export function bashToolEffect( undefined, started, seq, + toolCallId, ); return output; }).pipe( @@ -98,6 +102,7 @@ export function bashToolEffect( error, started, seq, + toolCallId, ), ), ); diff --git a/src/tools/edit.ts b/src/tools/edit.ts index 9436fdb4..72de673f 100644 --- a/src/tools/edit.ts +++ b/src/tools/edit.ts @@ -20,8 +20,11 @@ export function editToolEffect(path: string, patch: string) { const patchBytes = Buffer.byteLength(patch, "utf8"); const logInput = { path, patchBytes, patchHash: sha256Hex(patch) }; let seq: number | undefined; + let toolCallId: string | undefined; return Effect.gen(function* () { - seq = yield* logToolCallStartEffect("edit", started); + const startedCall = yield* logToolCallStartEffect("edit", started); + seq = startedCall?.seq; + toolCallId = startedCall?.toolCallId; const fs = yield* FileSystem.FileSystem; const resolved = yield* fromSync("resolve sandbox path", () => resolveSandboxPath(root, path), @@ -50,6 +53,7 @@ export function editToolEffect(path: string, patch: string) { undefined, started, seq, + toolCallId, ); return "ok"; }).pipe( @@ -61,7 +65,16 @@ export function editToolEffect(path: string, patch: string) { }), Effect.withLogSpan("tool:edit"), Effect.tapError((error) => - logToolCallEffect("edit", logInput, null, "error", error, started, seq), + logToolCallEffect( + "edit", + logInput, + null, + "error", + error, + started, + seq, + toolCallId, + ), ), ); } diff --git a/src/tools/grep.ts b/src/tools/grep.ts index c9a99957..7b4f3eb9 100644 --- a/src/tools/grep.ts +++ b/src/tools/grep.ts @@ -15,8 +15,11 @@ export function grepToolEffect(pattern: string, path?: string) { const root = ctx?.rootDir ?? 
process.cwd(); const started = nowMs(); let seq: number | undefined; + let toolCallId: string | undefined; return Effect.gen(function* () { - seq = yield* logToolCallStartEffect("grep", started); + const startedCall = yield* logToolCallStartEffect("grep", started); + seq = startedCall?.seq; + toolCallId = startedCall?.toolCallId; const resolvedRoot = yield* fromSync("resolve sandbox path", () => resolveSandboxPath(root, path ?? "."), ); @@ -45,6 +48,7 @@ export function grepToolEffect(pattern: string, path?: string) { undefined, started, seq, + toolCallId, ); return result.stdout; }).pipe( @@ -64,6 +68,7 @@ export function grepToolEffect(pattern: string, path?: string) { error, started, seq, + toolCallId, ), ), ); diff --git a/src/tools/logToolCall.ts b/src/tools/logToolCall.ts index 9e5fe2bb..3110d893 100644 --- a/src/tools/logToolCall.ts +++ b/src/tools/logToolCall.ts @@ -14,11 +14,13 @@ export function logToolCallEffect( error?: unknown, startedAtMs?: number, seqOverride?: number, + toolCallIdOverride?: string, ) { const ctx = getToolContext(); if (!ctx) return Effect.void; const seq = typeof seqOverride === "number" ? seqOverride : nextToolSeq(ctx); + const toolCallId = toolCallIdOverride ?? `${toolName}:${seq}`; const started = startedAtMs ?? 
nowMs(); const finished = nowMs(); const durationMs = finished - started; @@ -32,11 +34,20 @@ export function logToolCallEffect( nodeId: ctx.nodeId, iteration: ctx.iteration, attempt: ctx.attempt, + toolCallId, toolName, seq, status, timestampMs: finished, }); + const spanAttributes = { + runId: ctx.runId, + nodeId: ctx.nodeId, + iteration: ctx.iteration, + attempt: ctx.attempt, + toolName, + toolStatus: status, + }; return Metric.update(toolDuration, durationMs).pipe( Effect.andThen( ctx.db.insertToolCallEffect({ @@ -54,15 +65,10 @@ export function logToolCallEffect( errorJson, }), ), - Effect.annotateLogs({ - runId: ctx.runId, - nodeId: ctx.nodeId, - iteration: ctx.iteration, - attempt: ctx.attempt, - toolName, - toolStatus: status, - }), + Effect.annotateLogs(spanAttributes), + Effect.annotateSpans(spanAttributes), Effect.withLogSpan(`tool:${toolName}:log`), + Effect.withSpan(`tool:${toolName}:log`, { attributes: spanAttributes }), ); } @@ -86,6 +92,7 @@ export function logToolCallStartEffect( const ctx = getToolContext(); if (!ctx) return Effect.succeed(undefined); const seq = nextToolSeq(ctx); + const toolCallId = `${toolName}:${seq}`; const started = startedAtMs ?? nowMs(); void ctx.emitEvent?.({ type: "ToolCallStarted", @@ -93,11 +100,12 @@ export function logToolCallStartEffect( nodeId: ctx.nodeId, iteration: ctx.iteration, attempt: ctx.attempt, + toolCallId, toolName, seq, timestampMs: started, }); - return Effect.succeed(seq); + return Effect.succeed({ seq, toolCallId }); } export function truncateToBytes(text: string, maxBytes: number): string { diff --git a/src/tools/read.ts b/src/tools/read.ts index bcd4cf23..34ddf58c 100644 --- a/src/tools/read.ts +++ b/src/tools/read.ts @@ -19,8 +19,11 @@ export function readToolEffect(path: string) { const root = ctx?.rootDir ?? 
process.cwd(); const started = nowMs(); let seq: number | undefined; + let toolCallId: string | undefined; return Effect.gen(function* () { - seq = yield* logToolCallStartEffect("read", started); + const startedCall = yield* logToolCallStartEffect("read", started); + seq = startedCall?.seq; + toolCallId = startedCall?.toolCallId; const fs = yield* FileSystem.FileSystem; const resolved = yield* fromSync("resolve sandbox path", () => resolveSandboxPath(root, path), @@ -41,6 +44,7 @@ export function readToolEffect(path: string) { undefined, started, seq, + toolCallId, ); return output; }).pipe( @@ -51,7 +55,16 @@ export function readToolEffect(path: string) { }), Effect.withLogSpan("tool:read"), Effect.tapError((error) => - logToolCallEffect("read", { path }, null, "error", error, started, seq), + logToolCallEffect( + "read", + { path }, + null, + "error", + error, + started, + seq, + toolCallId, + ), ), ); } diff --git a/src/tools/write.ts b/src/tools/write.ts index 3783f057..e5c73698 100644 --- a/src/tools/write.ts +++ b/src/tools/write.ts @@ -20,8 +20,11 @@ export function writeToolEffect(path: string, content: string) { const contentBytes = Buffer.byteLength(content, "utf8"); const logInput = { path, contentBytes, contentHash: sha256Hex(content) }; let seq: number | undefined; + let toolCallId: string | undefined; return Effect.gen(function* () { - seq = yield* logToolCallStartEffect("write", started); + const startedCall = yield* logToolCallStartEffect("write", started); + seq = startedCall?.seq; + toolCallId = startedCall?.toolCallId; const fs = yield* FileSystem.FileSystem; const resolved = yield* fromSync("resolve sandbox path", () => resolveSandboxPath(root, path), @@ -40,6 +43,7 @@ export function writeToolEffect(path: string, content: string) { undefined, started, seq, + toolCallId, ); return "ok"; }).pipe( @@ -51,7 +55,16 @@ export function writeToolEffect(path: string, content: string) { }), Effect.withLogSpan("tool:write"), Effect.tapError((error) => - 
logToolCallEffect("write", logInput, null, "error", error, started, seq), + logToolCallEffect( + "write", + logInput, + null, + "error", + error, + started, + seq, + toolCallId, + ), ), ); } diff --git a/src/utils/text-extraction.ts b/src/utils/text-extraction.ts new file mode 100644 index 00000000..14723dd5 --- /dev/null +++ b/src/utils/text-extraction.ts @@ -0,0 +1,25 @@ +export function extractTextFromJsonValue(value: any): string | undefined { + if (typeof value === "string") return value; + if (Array.isArray(value)) { + const text = value.map((item) => extractTextFromJsonValue(item) ?? "").join(""); + return text || undefined; + } + if (!value || typeof value !== "object") return undefined; + if (typeof value.text === "string") return value.text; + if (typeof value.content === "string") return value.content; + if (typeof value.output_text === "string") return value.output_text; + + if (Array.isArray(value.content)) { + const text = value.content + .map((part: any) => extractTextFromJsonValue(part) ?? 
"") + .join(""); + if (text) return text; + } + + for (const field of ["response", "message", "result", "output", "data", "item"]) { + const text = extractTextFromJsonValue(value[field]); + if (text) return text; + } + + return undefined; +} diff --git a/src/utils/usage.ts b/src/utils/usage.ts new file mode 100644 index 00000000..f02e33ef --- /dev/null +++ b/src/utils/usage.ts @@ -0,0 +1,89 @@ +export type NormalizedTokenUsage = { + inputTokens?: number; + outputTokens?: number; + cacheReadTokens?: number; + cacheWriteTokens?: number; + reasoningTokens?: number; + totalTokens?: number; +}; + +const usageFieldAliases: Record< + keyof NormalizedTokenUsage, + ReadonlyArray> +> = { + inputTokens: [ + ["inputTokens"], + ["promptTokens"], + ["prompt_tokens"], + ["input_tokens"], + ["input"], + ["models", "gemini", "tokens", "input"], + ], + outputTokens: [ + ["outputTokens"], + ["completionTokens"], + ["completion_tokens"], + ["output_tokens"], + ["output"], + ["models", "gemini", "tokens", "output"], + ], + cacheReadTokens: [ + ["cacheReadTokens"], + ["cache_read_input_tokens"], + ["cached_input_tokens"], + ["cache_read_tokens"], + ["inputTokenDetails", "cacheReadTokens"], + ], + cacheWriteTokens: [ + ["cacheWriteTokens"], + ["cache_write_input_tokens"], + ["cache_creation_input_tokens"], + ["cache_write_tokens"], + ["inputTokenDetails", "cacheWriteTokens"], + ], + reasoningTokens: [ + ["reasoningTokens"], + ["reasoning_tokens"], + ["outputTokenDetails", "reasoningTokens"], + ], + totalTokens: [ + ["totalTokens"], + ["total_tokens"], + ], +}; + +function readUsagePath(value: unknown, path: ReadonlyArray): unknown { + let current = value; + for (const segment of path) { + if (!current || typeof current !== "object") return undefined; + current = (current as Record)[segment]; + } + return current; +} + +export function hasMeaningfulTokenUsage(usage: NormalizedTokenUsage): boolean { + return Object.values(usage).some( + (value) => typeof value === "number" && 
Number.isFinite(value) && value > 0, + ); +} + +export function normalizeTokenUsage( + usage: unknown, +): NormalizedTokenUsage | null { + if (!usage || typeof usage !== "object") return null; + + const normalized: NormalizedTokenUsage = {}; + for (const [field, aliases] of Object.entries(usageFieldAliases) as Array< + [keyof NormalizedTokenUsage, ReadonlyArray>] + >) { + for (const path of aliases) { + const value = readUsagePath(usage, path); + if (typeof value === "number") { + normalized[field] = value; + break; + } + } + } + + return hasMeaningfulTokenUsage(normalized) ? normalized : null; +} diff --git a/tests/agent-trace.test.tsx b/tests/agent-trace.test.tsx new file mode 100644 index 00000000..1a192952 --- /dev/null +++ b/tests/agent-trace.test.tsx @@ -0,0 +1,1795 @@ +/** @jsxImportSource smithers */ +import { describe, expect, test } from "bun:test"; +import { mkdtempSync, readFileSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { Workflow, Task, runWorkflow } from "../src"; +import { + AgentTraceCollector, + canonicalTraceEventToOtelLogRecord, +} from "../src/agent-trace"; +import { SmithersDb } from "../src/db/adapter"; +import { runPromise } from "../src/effect/runtime"; +import { + logToolCallEffect, + logToolCallStartEffect, +} from "../src/tools/logToolCall"; +import { createTestSmithers } from "./helpers"; +import { z } from "zod"; + +async function listRunEvents(db: any, runId: string) { + const adapter = new SmithersDb(db as any); + return adapter.listEvents(runId, -1, 500); +} + +describe("agent trace capture", () => { + test("captures high-fidelity Pi canonical trace events with ordering and redaction", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-pi"; + + const piLikeAgent: any = { + id: "pi-agent-test", + opts: { mode: "json" }, + generate: async (args: { onStdout?: 
(text: string) => void }) => { + args.onStdout?.( + JSON.stringify({ type: "session", id: "sess-1" }) + "\n", + ); + args.onStdout?.(JSON.stringify({ type: "turn_start" }) + "\n"); + args.onStdout?.( + JSON.stringify({ + type: "message_start", + message: { role: "assistant", content: [] }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message_update", + assistantMessageEvent: { + type: "thinking_delta", + delta: "thinking secret=sk_abc123456789", + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message_update", + assistantMessageEvent: { type: "text_delta", delta: "hello" }, + }) + "\n", + ); + const startedCall = await runPromise(logToolCallStartEffect("bash")); + await runPromise( + logToolCallEffect( + "bash", + { cmd: "echo secret=sk_live_123456789" }, + { ok: true }, + "success", + undefined, + undefined, + startedCall?.seq, + startedCall?.toolCallId, + ), + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_start", + toolExecution: { id: "t1", name: "bash", args: { cmd: "echo hi" } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_end", + toolExecution: { id: "t1", name: "bash", result: { ok: true } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn_end", + message: { + role: "assistant", + content: [{ type: "text", text: "Final answer" }], + stopReason: "stop", + }, + }) + "\n", + ); + args.onStdout?.(JSON.stringify({ type: "agent_end" }) + "\n"); + return { + text: '{"answer":"Final answer"}', + output: { answer: "Final answer" }, + }; + }, + }; + + const workflow = smithers(() => ( + + + do the thing + + + )); + + const result = await runWorkflow(workflow, { + input: {}, + runId, + workflowPath: "/tmp/pi-trace.tsx", + annotations: { "custom.demo": true, ticket: 123 }, + }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const traceEvents = events + .filter((event: any) => event.type === 
"AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const artifactEvent = traceEvents.find( + (event: any) => event.event.kind === "artifact.created", + ); + const persistedRows = readFileSync( + artifactEvent?.payload?.artifactPath, + "utf8", + ) + .trim() + .split("\n") + .map((line) => JSON.parse(line)); + + expect(summary.agentFamily).toBe("pi"); + expect(summary.captureMode).toBe("cli-json-stream"); + expect(summary.traceCompleteness).toBe("full-observed"); + + const kinds = traceEvents.map((event: any) => event.event.kind); + expect(kinds).toContain("session.start"); + expect(kinds).toContain("turn.start"); + expect(kinds).toContain("assistant.thinking.delta"); + expect(kinds).toContain("assistant.text.delta"); + expect(kinds).toContain("tool.execution.start"); + expect(kinds).toContain("tool.execution.end"); + expect(kinds).toContain("assistant.message.final"); + + const sequences = traceEvents.map((event: any) => event.event.sequence); + expect(sequences).toEqual([...sequences].sort((a, b) => a - b)); + expect(new Set(sequences).size).toBe(sequences.length); + expect( + traceEvents.filter( + (event: any) => event.event.kind === "tool.execution.start", + ), + ).toHaveLength(1); + expect( + traceEvents.filter( + (event: any) => event.event.kind === "tool.execution.end", + ), + ).toHaveLength(1); + expect( + persistedRows + .filter((row: any) => "event" in row) + .every((row: any) => row.traceCompleteness === "full-observed"), + ).toBe(true); + expect(persistedRows.at(-1)?.summary?.traceCompleteness).toBe( + "full-observed", + ); + + const redacted = JSON.stringify(traceEvents); + expect(redacted).not.toContain("sk_abc123456789"); + expect(redacted).not.toContain("sk_live_123456789"); + expect(traceEvents.some((event: any) => event.redaction.applied)).toBe( + true, + ); + expect( + traceEvents.every( + 
(event: any) => event.annotations["custom.demo"] === true, + ), + ).toBe(true); + + cleanup(); + }); + + test("classifies malformed Pi JSON as capture-failed", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-pi-fail"; + + const piLikeAgent: any = { + id: "pi-agent-failure", + opts: { mode: "json" }, + generate: async (args: { onStdout?: (text: string) => void }) => { + args.onStdout?.("{not json}\n"); + throw new Error("subprocess exits early"); + }, + }; + + const workflow = smithers(() => ( + + + fail please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("failed"); + + const events = await listRunEvents(db, runId); + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! 
+ .payloadJson, + ).summary; + + expect(summary.traceCompleteness).toBe("capture-failed"); + expect( + traceEvents.some((event: any) => event.event.kind === "capture.error"), + ).toBe(true); + + cleanup(); + }); + + test("preserves Pi message.update fallback and tool updates from one transcript", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-pi-update-fallback"; + + const piLikeAgent: any = { + id: "pi-agent-update-fallback", + opts: { mode: "json" }, + generate: async (args: { onStdout?: (text: string) => void }) => { + args.onStdout?.(JSON.stringify({ type: "session", id: "sess-2" }) + "\n"); + args.onStdout?.( + JSON.stringify({ + type: "message_update", + message: { role: "assistant", content: [{ type: "text", text: "partial state" }] }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_update", + toolExecution: { id: "tool-2", name: "bash", result: { status: "running" } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn_end", + message: { + role: "assistant", + content: [{ type: "text", text: "Pi final update" }], + }, + }) + "\n", + ); + args.onStdout?.(JSON.stringify({ type: "agent_end" }) + "\n"); + return { + text: '{"answer":"Pi final update"}', + output: { answer: "Pi final update" }, + }; + }, + }; + + const workflow = smithers(() => ( + + + pi update fallback please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect( + traceEvents.some( + (event: any) => + event.event.kind === "message.update" && + event.payload.text === "partial state", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + 
event.event.kind === "tool.execution.update" && + event.payload.toolCallId === "tool-2", + ), + ).toBe(true); + + cleanup(); + }); + + test("truthfully classifies sdk agents as final-only", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-sdk"; + + class OpenAIAgentFake { + id = "openai-sdk-fake"; + tools = {}; + async generate() { + return { + text: '{"answer":"sdk final"}', + output: { answer: "sdk final" }, + usage: { inputTokens: 10, outputTokens: 5 }, + }; + } + } + + const workflow = smithers(() => ( + + + sdk final only + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.agentFamily).toBe("openai"); + expect(summary.captureMode).toBe("sdk-events"); + expect(summary.traceCompleteness).toBe("final-only"); + expect(summary.unsupportedEventKinds).toContain("assistant.thinking.delta"); + expect( + traceEvents.some( + (event: any) => event.event.kind === "assistant.message.final", + ), + ).toBe(true); + expect(traceEvents.some((event: any) => event.event.kind === "usage")).toBe( + true, + ); + const finalMessage = traceEvents.find( + (event: any) => event.event.kind === "assistant.message.final", + ); + const usage = traceEvents.find((event: any) => event.event.kind === "usage"); + expect(finalMessage?.source.rawEventId).toBeTruthy(); + expect(finalMessage?.source.rawEventId).toBe(usage?.source.rawEventId); + + cleanup(); + }); + + test("preserves structured Claude stream-json deltas truthfully", async () => { + const { smithers, outputs, db, 
cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-claude-structured"; + + class ClaudeCodeAgentFake { + id = "claude-code-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "message_start", + message: { role: "assistant" }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message_delta", + delta: { text: "claude delta" }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message_end", + message: { role: "assistant", content: "claude final" }, + usage: { input_tokens: 4, output_tokens: 2 }, + }) + "\n", + ); + return { + text: '{"answer":"claude final"}', + output: { answer: "claude final" }, + }; + } + } + + const workflow = smithers(() => ( + + + structured claude please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! 
+ .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.agentFamily).toBe("claude-code"); + expect(summary.captureMode).toBe("cli-json-stream"); + expect(summary.traceCompleteness).toBe("full-observed"); + expect( + traceEvents.some((event: any) => event.event.kind === "message.start"), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "claude delta", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "assistant.message.final", + ), + ).toBe(true); + expect(traceEvents.some((event: any) => event.event.kind === "usage")).toBe( + true, + ); + cleanup(); + }); + + test("normalizes real Claude stream-json assistant/result events", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-claude-real-schema"; + + class ClaudeCodeAgentRealSchemaFake { + id = "claude-code-real-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "system", + subtype: "init", + session_id: "sess-1", + model: "claude-opus-4-6[1m]", + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "assistant", + message: { + role: "assistant", + content: [{ type: "text", text: "claude real final" }], + usage: { + input_tokens: 3, + cache_read_input_tokens: 4, + output_tokens: 2, + }, + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "result", + subtype: "success", + result: "claude real final", + usage: { + input_tokens: 3, + cache_read_input_tokens: 4, + output_tokens: 20, + }, + }) + "\n", + ); + return { + text: '{"answer":"claude real final"}', + output: { answer: "claude real final" }, + }; + } + } + + 
const workflow = smithers(() => ( + + + structured claude real schema please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.traceCompleteness).toBe("full-observed"); + expect(summary.unsupportedEventKinds).not.toContain("artifact.created"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "message.update" && + event.payload.text === "claude real final", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "claude real final", + ), + ).toBe(false); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.message.final" && + event.payload.text === "claude real final", + ), + ).toBe(true); + expect(traceEvents.filter((event: any) => event.event.kind === "usage").length).toBe(2); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "usage" && + event.payload.outputTokens === 20, + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.message.final" && + event.source.observed === true && + event.source.rawType === "result", + ), + ).toBe(true); + const assistantUpdate = traceEvents.find( + (event: any) => + event.event.kind === "message.update" && + event.source.rawType === "assistant", + ); + const assistantUsage = traceEvents.find( + (event: any) => + event.event.kind === "usage" && + event.source.rawType === "assistant", + ); + expect(assistantUpdate?.source.rawEventId).toBeTruthy(); + expect(assistantUpdate?.source.rawEventId).toBe( + 
assistantUsage?.source.rawEventId, + ); + + cleanup(); + }); + + test("preserves Claude structured tool lifecycle with shared raw event ids", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-claude-tool-lifecycle"; + + class ClaudeToolLifecycleFake { + id = "claude-tool-lifecycle-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "tool_call.started", + toolCall: { id: "tool-claude-1", name: "read", args: { path: "README.md" } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_call.completed", + toolCall: { id: "tool-claude-1", name: "read", result: { ok: true } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message_end", + message: { role: "assistant", content: "claude tool final" }, + }) + "\n", + ); + return { + text: '{"answer":"claude tool final"}', + output: { answer: "claude tool final" }, + }; + } + } + + const workflow = smithers(() => ( + + + structured claude tool lifecycle please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + const toolStart = traceEvents.find( + (event: any) => event.event.kind === "tool.execution.start", + ); + const toolEnd = traceEvents.find( + (event: any) => event.event.kind === "tool.execution.end", + ); + + expect(toolStart?.payload.toolCallId).toBe("tool-claude-1"); + expect(toolEnd?.payload.toolCallId).toBe("tool-claude-1"); + expect(toolStart?.source.rawEventId).not.toBe(toolEnd?.source.rawEventId); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.message.final" 
&& + event.payload.text === "claude tool final", + ), + ).toBe(true); + + cleanup(); + }); + + test("preserves structured Codex completion usage and final message truthfully", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-codex-structured"; + + class CodexAgentFake { + id = "codex-fake"; + opts = { outputFormat: "stream-json", json: true }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "assistant_message.delta", + delta: { text: "codex delta" }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn.completed", + usage: { input_tokens: 8, output_tokens: 3 }, + message: { role: "assistant", content: "codex final" }, + }) + "\n", + ); + return { + text: '{"answer":"codex final"}', + output: { answer: "codex final" }, + }; + } + } + + const workflow = smithers(() => ( + + + structured codex please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! 
+ .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.agentFamily).toBe("codex"); + expect(summary.captureMode).toBe("cli-json-stream"); + expect(summary.traceCompleteness).toBe("full-observed"); + expect(summary.unsupportedEventKinds).not.toContain("artifact.created"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "codex delta", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "assistant.message.final", + ), + ).toBe(true); + expect(traceEvents.some((event: any) => event.event.kind === "usage")).toBe( + true, + ); + expect(summary.unsupportedEventKinds).not.toContain("assistant.text.delta"); + const turnUsage = traceEvents.find( + (event: any) => event.event.kind === "usage", + ); + const turnEnd = traceEvents.find( + (event: any) => event.event.kind === "turn.end", + ); + const finalMessage = traceEvents.find( + (event: any) => event.event.kind === "assistant.message.final", + ); + expect(turnUsage?.source.rawEventId).toBeTruthy(); + expect(turnUsage?.source.rawEventId).toBe(turnEnd?.source.rawEventId); + expect(turnUsage?.source.rawEventId).toBe(finalMessage?.source.rawEventId); + cleanup(); + }); + + test("treats Codex turn.completed without usage as a terminal structured event", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-codex-turn-completed-no-usage"; + + class CodexNoUsageAgentFake { + id = "codex-no-usage-fake"; + opts = { outputFormat: "stream-json", json: true }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "turn.started", + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn.completed", + message: { role: 
"assistant", content: "codex final without usage" }, + }) + "\n", + ); + return { + text: '{"answer":"codex final without usage"}', + output: { answer: "codex final without usage" }, + }; + } + } + + const workflow = smithers(() => ( + + + codex without usage please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.traceCompleteness).toBe("full-observed"); + expect( + traceEvents.some((event: any) => event.event.kind === "turn.end"), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.message.final" && + event.payload.text === "codex final without usage", + ), + ).toBe(true); + + cleanup(); + }); + + test("preserves Codex structured tool lifecycle from one transcript", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-codex-tool-lifecycle"; + + class CodexToolLifecycleFake { + id = "codex-tool-lifecycle-fake"; + opts = { outputFormat: "stream-json", json: true }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "tool_call.started", + toolCall: { id: "codex-tool-1", name: "grep", args: { pattern: "trace" } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_call.delta", + toolCall: { id: "codex-tool-1", name: "grep", result: { status: "running" } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_call.completed", + toolCall: { id: "codex-tool-1", name: "grep", result: { ok: true } }, + }) + "\n", + ); 
+ args.onStdout?.( + JSON.stringify({ + type: "turn.completed", + message: { role: "assistant", content: "codex tool final" }, + }) + "\n", + ); + return { + text: '{"answer":"codex tool final"}', + output: { answer: "codex tool final" }, + }; + } + } + + const workflow = smithers(() => ( + + + codex tool lifecycle please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect( + traceEvents.some( + (event: any) => + event.event.kind === "tool.execution.start" && + event.payload.toolCallId === "codex-tool-1", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "tool.execution.update" && + event.payload.toolCallId === "codex-tool-1", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "tool.execution.end" && + event.payload.toolCallId === "codex-tool-1", + ), + ).toBe(true); + + cleanup(); + }); + + test("captures real Codex dotted jsonl events as structured trace instead of cli-text fallback", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-codex-dotted-jsonl"; + + class CodexJsonlAgentFake { + readonly id = "codex-jsonl-fake"; + readonly opts = { outputFormat: "stream-json" as const, json: true }; + readonly family = "codex"; + + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "thread.started", + thread_id: "thread-1", + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn.started", + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "item.completed", + item: { + id: "item-1", + type: "agent_message", + text: "Codex 
dotted final", + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn.completed", + usage: { + input_tokens: 10, + cached_input_tokens: 2, + output_tokens: 4, + }, + }) + "\n", + ); + return { + text: '{"answer":"Codex dotted final"}', + output: { answer: "Codex dotted final" }, + }; + } + } + + const workflow = smithers(() => ( + + + structured codex dotted jsonl please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.captureMode).toBe("cli-json-stream"); + expect(summary.traceCompleteness).toBe("full-observed"); + expect( + traceEvents.some((event: any) => event.event.kind === "turn.start"), + ).toBe(true); + expect( + traceEvents.some((event: any) => event.event.kind === "turn.end"), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.message.final" && + event.source.observed === true && + event.payload.text === "Codex dotted final", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "usage" && + event.payload.inputTokens === 10 && + event.payload.cacheReadTokens === 2 && + event.payload.outputTokens === 4, + ), + ).toBe(true); + + cleanup(); + }); + + test("classifies structured Gemini stream-json traces truthfully", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-gemini-structured"; + + class GeminiAgentFake { + id = "gemini-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { 
+ args.onStdout?.( + JSON.stringify({ + type: "response.output_text.delta", + delta: { text: "gemini delta" }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "response.completed", + response: { + role: "assistant", + content: [{ text: "gemini final" }], + }, + usage: { input_tokens: 6, output_tokens: 4 }, + }) + "\n", + ); + return { + text: '{"answer":"gemini final"}', + output: { answer: "gemini final" }, + }; + } + } + + const workflow = smithers(() => ( + + + structured gemini please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.agentFamily).toBe("gemini"); + expect(summary.captureMode).toBe("cli-json-stream"); + expect(summary.traceCompleteness).toBe("full-observed"); + expect(summary.unsupportedEventKinds).not.toContain("assistant.text.delta"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "gemini delta", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "assistant.message.final", + ), + ).toBe(true); + + cleanup(); + }); + + test("normalizes real Gemini stream-json message/result events", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-gemini-real-schema"; + + class GeminiAgentRealSchemaFake { + id = "gemini-real-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "init", + session_id: "sess-1", + 
model: "gemini-3-flash-preview", + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message", + role: "assistant", + content: "gemini ", + delta: true, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message", + role: "assistant", + content: "real final", + delta: true, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "result", + status: "success", + stats: { + input_tokens: 10, + output_tokens: 5, + total_tokens: 15, + }, + }) + "\n", + ); + return { + text: '{"answer":"gemini real final"}', + output: { answer: "gemini real final" }, + }; + } + } + + const workflow = smithers(() => ( + + + structured gemini real schema please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.traceCompleteness).toBe("partial-observed"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "gemini ", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "real final", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.message.final" && + event.payload.text === "gemini real final", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.message.final" && + event.source.observed === false, + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "usage" && + event.payload.inputTokens === 10 && + event.payload.outputTokens === 5, + ), + 
).toBe(true); + + cleanup(); + }); + + test("classifies Gemini cli-json coarse output as final-only", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-gemini-final-only"; + + class GeminiAgentJsonFake { + id = "gemini-json-fake"; + opts = { outputFormat: "json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + text: "gemini coarse final", + stats: { models: { gemini: { tokens: { input: 3, output: 2 } } } }, + }) + "\n", + ); + return { + text: '{"answer":"gemini coarse final"}', + output: { answer: "gemini coarse final" }, + }; + } + } + + const workflow = smithers(() => ( + + + coarse gemini please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! 
+ .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.agentFamily).toBe("gemini"); + expect(summary.captureMode).toBe("cli-json"); + expect(summary.traceCompleteness).toBe("final-only"); + expect(summary.unsupportedEventKinds).toContain("assistant.text.delta"); + expect( + traceEvents.some((event: any) => event.event.kind === "usage"), + ).toBe(false); + + cleanup(); + }); + + test("preserves structured Kimi stream-json tool lifecycle truthfully", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-kimi-structured"; + + class KimiAgentFake { + id = "kimi-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "assistant_message.delta", + delta: { text: "kimi delta" }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_start", + toolExecution: { + id: "tool-1", + name: "bash", + args: { cmd: "echo hi" }, + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_update", + toolExecution: { + id: "tool-1", + name: "bash", + result: { status: "running" }, + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_end", + toolExecution: { id: "tool-1", name: "bash", result: { ok: true } }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "response.completed", + response: { role: "assistant", content: [{ text: "kimi final" }] }, + }) + "\n", + ); + return { + text: '{"answer":"kimi final"}', + output: { answer: "kimi final" }, + }; + } + } + + const workflow = smithers(() => ( + + + structured kimi please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + 
const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.agentFamily).toBe("kimi"); + expect(summary.captureMode).toBe("cli-json-stream"); + expect(summary.traceCompleteness).toBe("full-observed"); + expect(summary.unsupportedEventKinds).not.toContain("tool.execution.start"); + expect(summary.unsupportedEventKinds).not.toContain("tool.execution.update"); + expect(summary.unsupportedEventKinds).not.toContain("tool.execution.end"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "assistant.text.delta" && + event.payload.text === "kimi delta", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "tool.execution.start" && + event.payload.toolCallId === "tool-1", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "tool.execution.update", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "tool.execution.end", + ), + ).toBe(true); + + cleanup(); + }); + + test("sdk traces with smithers-observed tool lifecycle degrade to partial-observed instead of final-only", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-sdk-tools"; + + class OpenAIAgentWithToolsFake { + id = "openai-sdk-tools-fake"; + tools = {}; + async generate() { + const startedCall = await runPromise(logToolCallStartEffect("bash")); + await runPromise( + logToolCallEffect( + "bash", + { cmd: "echo hi" }, + { ok: true }, + "success", + undefined, + undefined, + startedCall?.seq, + startedCall?.toolCallId, + ), + ); + return { + text: '{"answer":"sdk tool final"}', + output: { answer: 
"sdk tool final" }, + }; + } + } + + const workflow = smithers(() => ( + + + sdk tools please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.traceCompleteness).toBe("partial-observed"); + expect( + traceEvents.some( + (event: any) => event.event.kind === "tool.execution.start", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "tool.execution.end", + ), + ).toBe(true); + const toolStart = traceEvents.find( + (event: any) => event.event.kind === "tool.execution.start", + ); + const toolEnd = traceEvents.find( + (event: any) => event.event.kind === "tool.execution.end", + ); + expect(toolStart?.payload.toolCallId).toBe("bash:1"); + expect(toolEnd?.payload.toolCallId).toBe("bash:1"); + expect(toolStart?.source.rawEventId).toBeTruthy(); + expect(toolEnd?.source.rawEventId).toBeTruthy(); + expect(toolStart?.source.rawEventId).not.toBe(toolEnd?.source.rawEventId); + + cleanup(); + }); + + test("classifies truncated structured streams as capture-failed", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-truncated-stream"; + + class CodexAgentTruncatedFake { + id = "codex-truncated-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "assistant_message.delta", + delta: { text: "partial" }, + }) + "\n", + ); + args.onStdout?.( + '{"type":"assistant_message.delta","delta":{"text":"unterminated"}', + ); + throw 
new Error("subprocess exits early"); + } + } + + const workflow = smithers(() => ( + + + truncated codex please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("failed"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.traceCompleteness).toBe("capture-failed"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "capture.error" && + event.payload.reason === "truncated-json-stream", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "assistant.message.final", + ), + ).toBe(false); + + cleanup(); + }); + + test("records artifact write failures as capture warnings while durable DB truth remains", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-artifact-write-fail"; + const tmp = mkdtempSync(join(tmpdir(), "smithers-agent-trace-")); + const badLogPath = join(tmp, "not-a-directory"); + writeFileSync(badLogPath, "occupied"); + + class GeminiAgentArtifactFake { + id = "gemini-artifact-fake"; + opts = { outputFormat: "stream-json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.( + JSON.stringify({ + type: "response.output_text.delta", + delta: { text: "artifact delta" }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "response.completed", + response: { + role: "assistant", + content: [{ text: "artifact final" }], + }, + }) + "\n", + ); + return { + text: '{"answer":"artifact final"}', + output: { answer: "artifact final" }, + }; + } + } + + const workflow = smithers(() => ( + + + artifact write failure 
please + + + )); + + const result = await runWorkflow(workflow, { + input: {}, + runId, + logDir: badLogPath, + }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.traceCompleteness).toBe("partial-observed"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "capture.warning" && + event.payload.reason === "artifact-write-failed", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "assistant.message.final", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "artifact.created", + ), + ).toBe(false); + + cleanup(); + }); + + test("records artifact rewrite failures as capture warnings without failing the run", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-artifact-rewrite-fail"; + const originalRewriteNdjson = (AgentTraceCollector.prototype as any).rewriteNdjson; + (AgentTraceCollector.prototype as any).rewriteNdjson = async function () { + throw new Error("rewrite exploded"); + }; + + class PiAgentRewriteFake { + id = "pi-rewrite-fake"; + opts = { mode: "json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.(JSON.stringify({ type: "session", id: "sess-1" }) + "\n"); + args.onStdout?.(JSON.stringify({ type: "turn_start" }) + "\n"); + args.onStdout?.( + JSON.stringify({ + type: "message_end", + message: { + role: "assistant", + content: [{ type: "text", text: "rewrite final" }], + }, + }) + "\n", + ); + return { + text: '{"answer":"rewrite final"}', + output: { answer: "rewrite final" }, + }; 
+ } + } + + try { + const workflow = smithers(() => ( + + + rewrite fail please + + + )); + + const result = await runWorkflow(workflow, { input: {}, runId }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.traceCompleteness).toBe("partial-observed"); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "capture.warning" && + event.payload.reason === "artifact-rewrite-failed", + ), + ).toBe(true); + expect( + traceEvents.some( + (event: any) => event.event.kind === "artifact.created", + ), + ).toBe(true); + } finally { + (AgentTraceCollector.prototype as any).rewriteNdjson = originalRewriteNdjson; + cleanup(); + } + }); + + test("records persisted trace artifacts with a canonical artifact.created event", async () => { + const { smithers, outputs, db, cleanup } = createTestSmithers({ + result: z.object({ answer: z.string() }), + }); + const runId = "agent-trace-artifact-created"; + const logDir = mkdtempSync(join(tmpdir(), "smithers-agent-trace-artifact-")); + + class PiArtifactAgentFake { + id = "pi-artifact-fake"; + opts = { mode: "json" }; + async generate(args: { onStdout?: (text: string) => void }) { + args.onStdout?.(JSON.stringify({ type: "session", id: "sess-1" }) + "\n"); + args.onStdout?.( + JSON.stringify({ + type: "turn_end", + message: { + role: "assistant", + content: [{ type: "text", text: "artifact ok" }], + }, + }) + "\n", + ); + args.onStdout?.(JSON.stringify({ type: "agent_end" }) + "\n"); + return { + text: '{"answer":"artifact ok"}', + output: { answer: "artifact ok" }, + }; + } + } + + const workflow = smithers(() => ( + + + artifact created please + + + )); + + const result = await 
runWorkflow(workflow, { + input: {}, + runId, + logDir, + }); + expect(result.status).toBe("finished"); + + const events = await listRunEvents(db, runId); + const summary = JSON.parse( + events.find((event: any) => event.type === "AgentTraceSummary")! + .payloadJson, + ).summary; + const traceEvents = events + .filter((event: any) => event.type === "AgentTraceEvent") + .map((event: any) => JSON.parse(event.payloadJson).trace); + + expect(summary.rawArtifactRefs).toHaveLength(1); + expect( + traceEvents.some( + (event: any) => + event.event.kind === "artifact.created" && + event.payload.artifactPath === summary.rawArtifactRefs[0], + ), + ).toBe(true); + const artifactText = readFileSync(summary.rawArtifactRefs[0], "utf8"); + expect(artifactText).toContain('"assistant.message.final"'); + expect(artifactText).toContain('"artifact.created"'); + expect(artifactText).toContain(summary.rawArtifactRefs[0]); + + cleanup(); + }); + + test("shapes OTEL log records with canonical query attributes and redacted body", () => { + const record = canonicalTraceEventToOtelLogRecord( + { + traceVersion: "1", + runId: "run-123", + workflowPath: "workflows/demo.tsx", + workflowHash: "abc123", + nodeId: "pi-rich-trace", + iteration: 0, + attempt: 1, + timestampMs: Date.now(), + event: { + sequence: 7, + kind: "assistant.thinking.delta", + phase: "message", + }, + source: { + agentFamily: "pi", + captureMode: "cli-json-stream", + rawType: "thinking_delta", + rawEventId: "thinking_delta:0", + observed: true, + }, + traceCompleteness: "full-observed", + payload: { text: "thought [REDACTED_SECRET]" }, + raw: { text: "thought [REDACTED_SECRET]" }, + redaction: { applied: true, ruleIds: ["secret-ish"] }, + annotations: { "custom.demo": true, ticket: "OBS-1" }, + }, + { + agentId: "pi-agent", + model: "gpt-5.4", + }, + ); + + expect(record.severity).toBe("INFO"); + expect(record.attributes["run.id"]).toBe("run-123"); + expect(record.attributes["node.id"]).toBe("pi-rich-trace"); + 
expect(record.attributes["node.attempt"]).toBe(1); + expect(record.attributes["event.kind"]).toBe("assistant.thinking.delta"); + expect(record.attributes["source.raw_event_id"]).toBe("thinking_delta:0"); + expect(record.attributes["custom.ticket"]).toBe("OBS-1"); + expect(record.body).toContain("REDACTED_SECRET"); + expect(record.body).not.toContain("sk_live_"); + }); +}); diff --git a/workflows/agent-trace-otel-demo.tsx b/workflows/agent-trace-otel-demo.tsx new file mode 100644 index 00000000..92d23e1e --- /dev/null +++ b/workflows/agent-trace-otel-demo.tsx @@ -0,0 +1,315 @@ +/** @jsxImportSource smithers */ +import { runPromise } from "../src/effect/runtime"; +import { logToolCallEffect, logToolCallStartEffect } from "../src/tools/logToolCall"; +import { createSmithers, Task, Workflow } from "../src"; +import { z } from "zod"; + +const PiOutput = z.object({ + answer: z.string(), +}); + +const ClaudeOutput = z.object({ + answer: z.string(), +}); + +const CodexOutput = z.object({ + answer: z.string(), +}); + +const GeminiOutput = z.object({ + answer: z.string(), +}); + +const SdkOutput = z.object({ + answer: z.string(), +}); + +const { smithers, outputs } = createSmithers( + { + piResult: PiOutput, + claudeResult: ClaudeOutput, + codexResult: CodexOutput, + geminiResult: GeminiOutput, + sdkResult: SdkOutput, + }, + { + dbPath: "./workflows/agent-trace-otel-demo.db", + journalMode: "DELETE", + }, +); + +function createPiDemoAgent(failureMode?: string) { + return { + id: failureMode ? 
`pi-observability-demo-${failureMode}` : "pi-observability-demo", + model: "demo-pi", + opts: { mode: "json" }, + generate: async (args: { onStdout?: (text: string) => void }) => { + if (failureMode === "malformed-json") { + args.onStdout?.("{not valid json}\n"); + throw new Error("demo malformed-json failure"); + } + args.onStdout?.(JSON.stringify({ type: "session", id: "demo-session" }) + "\n"); + args.onStdout?.(JSON.stringify({ type: "turn_start", id: "turn-1" }) + "\n"); + args.onStdout?.( + JSON.stringify({ + type: "message_start", + message: { role: "assistant", content: [] }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message_update", + assistantMessageEvent: { + type: "thinking_delta", + delta: "Planning demo work with token=sk_demo_secret_123456789", + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message_update", + assistantMessageEvent: { + type: "text_delta", + delta: "Working through the repository. ", + }, + }) + "\n", + ); + const seq = await runPromise(logToolCallStartEffect("bash")); + await runPromise( + logToolCallEffect( + "bash", + { command: "echo sk_live_secret_123456789" }, + { ok: true, stdout: "done" }, + "success", + undefined, + undefined, + typeof seq === "number" ? 
seq : undefined, + ), + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_start", + toolExecution: { + id: "tool-1", + name: "bash", + args: { command: "echo hi" }, + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_update", + toolExecution: { + id: "tool-1", + name: "bash", + result: { status: "running" }, + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "tool_execution_end", + toolExecution: { + id: "tool-1", + name: "bash", + result: { ok: true }, + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn_end", + message: { + role: "assistant", + content: [{ type: "text", text: "Demo PI answer" }], + usage: { inputTokens: 11, outputTokens: 7 }, + }, + }) + "\n", + ); + args.onStdout?.(JSON.stringify({ type: "agent_end", id: "demo-session" }) + "\n"); + return { + text: '{"answer":"Demo PI answer"}', + output: { answer: "Demo PI answer" }, + usage: { inputTokens: 11, outputTokens: 7 }, + }; + }, + } as any; +} + +function createClaudeDemoAgent() { + return { + id: "claude-observability-demo", + model: "demo-claude", + opts: { outputFormat: "stream-json" }, + generate: async (args: { onStdout?: (text: string) => void }) => { + args.onStdout?.( + JSON.stringify({ + type: "system", + subtype: "init", + session_id: "claude-demo-session", + model: "claude-opus-demo", + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "assistant", + message: { + role: "assistant", + content: [{ type: "text", text: "Claude demo answer" }], + usage: { + input_tokens: 13, + cache_read_input_tokens: 2, + output_tokens: 5, + }, + }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "result", + subtype: "success", + result: "Claude demo answer", + usage: { + input_tokens: 13, + cache_read_input_tokens: 2, + output_tokens: 5, + }, + }) + "\n", + ); + return { + text: '{"answer":"Claude demo answer"}', + output: { answer: "Claude demo answer" }, + }; + }, + } as any; +} + +function 
createGeminiDemoAgent() { + return { + id: "gemini-observability-demo", + model: "demo-gemini", + opts: { outputFormat: "stream-json" }, + generate: async (args: { onStdout?: (text: string) => void }) => { + args.onStdout?.( + JSON.stringify({ + type: "init", + session_id: "gemini-demo-session", + model: "gemini-demo", + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message", + role: "assistant", + content: "Gemini ", + delta: true, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "message", + role: "assistant", + content: "demo answer", + delta: true, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "result", + status: "success", + stats: { + input_tokens: 9, + output_tokens: 4, + total_tokens: 13, + }, + }) + "\n", + ); + return { + text: '{"answer":"Gemini demo answer"}', + output: { answer: "Gemini demo answer" }, + }; + }, + } as any; +} + +function createCodexDemoAgent() { + return { + id: "codex-observability-demo", + model: "demo-codex", + opts: { outputFormat: "stream-json", json: true }, + generate: async (args: { onStdout?: (text: string) => void }) => { + args.onStdout?.( + JSON.stringify({ + type: "assistant_message.delta", + delta: { text: "Codex demo answer" }, + }) + "\n", + ); + args.onStdout?.( + JSON.stringify({ + type: "turn.completed", + usage: { input_tokens: 12, output_tokens: 6 }, + message: { role: "assistant", content: "Codex demo answer" }, + }) + "\n", + ); + return { + text: '{"answer":"Codex demo answer"}', + output: { answer: "Codex demo answer" }, + }; + }, + } as any; +} + +class OpenAIAgentDemo { + id = "openai-demo-agent"; + model = "demo-sdk"; + tools = {}; + + async generate() { + return { + text: '{"answer":"Demo SDK answer"}', + output: { answer: "Demo SDK answer" }, + usage: { inputTokens: 5, outputTokens: 3 }, + }; + } +} + +export default smithers((ctx) => { + const failureMode = typeof ctx.input.failureMode === "string" ? 
ctx.input.failureMode : undefined; + const piAgent = createPiDemoAgent(failureMode); + + return ( + + + {`Emit a PI-like trace for observability verification. failureMode=${failureMode ?? "none"}`} + + {!failureMode && ( + <> + + {`Emit a Claude-like structured trace after the PI demo task completes.`} + + + {`Emit a Gemini-like structured trace after the Claude demo task completes.`} + + + {`Emit a Codex-like structured trace after the Gemini demo task completes.`} + + + {`Return a final-only SDK response after the structured PI, Claude, Gemini, and Codex demo tasks complete.`} + + + )} + + ); +}); diff --git a/workflows/agent-trace-otel-loop.tsx b/workflows/agent-trace-otel-loop.tsx new file mode 100644 index 00000000..3182f85d --- /dev/null +++ b/workflows/agent-trace-otel-loop.tsx @@ -0,0 +1,451 @@ +/** @jsxImportSource smithers */ +import { spawnSync } from "node:child_process"; +import { + createSmithers, + PiAgent, + Workflow, + Task, + Sequence, + Ralph, +} from "smithers"; +import { z } from "zod"; + +const DEFAULT_VALIDATION_COMMANDS = [ + "bun run typecheck", + "bun run test", +]; +const DEFAULT_PI_PROVIDER = "openai-codex"; +const DEFAULT_PI_MODEL = "gpt-5.4"; + +const ImplementOutput = z.object({ + summary: z.string(), + filesChanged: z.array(z.string()).default([]), + notableDecisions: z.array(z.string()).default([]), + remainingRisks: z.array(z.string()).default([]), +}); + +const ValidateOutput = z.object({ + allPassed: z.boolean(), + summary: z.string(), + gitStatus: z.string(), + gitDiffStat: z.string(), + results: z.array( + z.object({ + command: z.string(), + exitCode: z.number().int(), + passed: z.boolean(), + stdout: z.string(), + stderr: z.string(), + }), + ), +}); + +const ReviewOutput = z.object({ + approved: z.boolean(), + summary: z.string(), + blockingFeedback: z.array(z.string()).default([]), + strengths: z.array(z.string()).default([]), + nextActions: z.array(z.string()).default([]), +}); + +const FinalOutput = z.object({ + approved: 
z.boolean(), + reviewRounds: z.number().int().min(1), + implementationSummary: z.string(), + validationSummary: z.string(), + reviewSummary: z.string(), + blockingFeedback: z.array(z.string()).default([]), +}); + +const { smithers, outputs } = createSmithers( + { + implement: ImplementOutput, + validate: ValidateOutput, + review: ReviewOutput, + finalReport: FinalOutput, + }, + { + dbPath: "./workflows/agent-trace-otel-loop.db", + journalMode: "DELETE", + }, +); + +function createImplementerAgent(provider: string, model: string) { + return new PiAgent({ + provider, + model, + mode: "rpc", + yolo: true, + timeoutMs: 45 * 60 * 1000, + idleTimeoutMs: 5 * 60 * 1000, + thinking: "high", + appendSystemPrompt: [ + "You are the implementation agent for Smithers.", + "Work directly in the current repository.", + "Read the specification before changing code.", + "You must make the smallest coherent set of changes that satisfies the specification.", + "Do not claim verification you did not actually obtain.", + "Return exact changed file paths and a concise risk summary.", + ].join("\n"), + }); +} + +function createReviewerAgent(provider: string, model: string) { + return new PiAgent({ + provider, + model, + mode: "rpc", + yolo: true, + timeoutMs: 30 * 60 * 1000, + idleTimeoutMs: 5 * 60 * 1000, + thinking: "high", + appendSystemPrompt: [ + "You are an extremely strict implementation reviewer.", + "Your job is to reject weak, incomplete, under-verified, or cosmetically-correct changes.", + "Be direct and unsentimental.", + "Approve only if the implementation clearly satisfies the specification and the deterministic validation evidence is strong.", + "If validation is weak, incomplete, or missing, that is blocking feedback.", + "If logs/traces/verification claims are hand-wavy, that is blocking feedback.", + "Prefer a short list of high-severity blocking issues over a long list of minor nits.", + ].join("\n"), + }); +} + +function asString(value: unknown, fallback = ""): string 
{ + return typeof value === "string" ? value : fallback; +} + +function asStringArray(value: unknown): string[] { + return Array.isArray(value) ? value.filter((v): v is string => typeof v === "string") : []; +} + +function resolveValidationCommands(input: Record): string[] { + const commands = input.validationCommands; + const parsed = asStringArray(commands); + return parsed.length > 0 ? parsed : DEFAULT_VALIDATION_COMMANDS; +} + +function runShell(command: string) { + const result = spawnSync(command, { + shell: true, + cwd: process.cwd(), + env: process.env, + encoding: "utf8", + maxBuffer: 10 * 1024 * 1024, + }); + return { + command, + exitCode: result.status ?? (result.error ? 1 : 0), + passed: (result.status ?? 0) === 0 && !result.error, + stdout: (result.stdout ?? "").trim(), + stderr: [result.stderr ?? "", result.error?.message ?? ""].filter(Boolean).join("\n").trim(), + }; +} + +export default smithers((ctx) => { + const latestImplement = ctx.latest("implement", "implement"); + const latestValidate = ctx.latest("validate", "validate"); + const latestReview = ctx.latest("review", "review"); + + const approved = latestReview?.approved === true; + const piProvider = asString( + (ctx.input as Record).piProvider, + DEFAULT_PI_PROVIDER, + ); + const piModel = asString( + (ctx.input as Record).piModel, + DEFAULT_PI_MODEL, + ); + const implementer = createImplementerAgent(piProvider, piModel); + const reviewer = createReviewerAgent(piProvider, piModel); + const validationCommands = resolveValidationCommands(ctx.input as Record); + const branchName = asString((ctx.input as Record).branchName, "feat/agent-trace-otel-logs"); + const specPath = asString( + (ctx.input as Record).specPath, + "docs/concepts/agent-trace-otel-logs-spec.mdx", + ); + const composePath = asString( + (ctx.input as Record).composePath, + "observability/docker-compose.otel.yml", + ); + + return ( + + + + + + {`Read the spec first: ${specPath}. Treat it as the source of truth. 
The work is not “add some logs.” The work is: + +1. define and implement a canonical agent trace model +2. capture the fullest observable trace per agent family +3. export those canonical trace events as OTEL logs +4. verify end-to-end in Docker with Loki +5. prove correctness using the verification section in the spec + +Also note the current local stack in ${composePath} does not include Loki yet. It only starts collector, Prometheus, Tempo, and Grafana. Adding Loki and a collector logs pipeline is part of the assignment. + +What He Should Understand Before Coding +Read these first: + +- ${specPath} +- ${composePath} +- observability/otel-collector-config.yml +- docs/guides/monitoring-logs.mdx +- docs/runtime/events.mdx +- docs/integrations/pi-integration.mdx +- docs/integrations/cli-agents.mdx + +Then inspect the code paths that matter: + +- src/agents/PiAgent.ts +- src/agents/BaseCliAgent.ts +- src/events.ts +- src/effect/logging.ts +- src/observability/index.ts + +Expected Deliverable +He should not stop at implementation. He should hand back: + +- code +- updated Docker observability stack with Loki +- tests +- a reproducible verification procedure +- sample queries proving the system works + +Required Verification Environment +He should use Docker for the local stack: + +smithers observability --detach + +That CLI currently starts the stack from src/cli/index.ts:876, but he will need to extend it so the stack includes Loki too. + +Target local endpoints after his changes should be something like: + +- Grafana +- Prometheus +- Tempo +- Loki +- OTEL collector + +He should document exact ports if he changes them. + +How He Should Verify +Use the spec’s verification classes as the acceptance gate. In practice, he should prove all of this: + +1. Canonical trace capture works + Run a workflow with at least one agent that has a rich trace surface, preferably PiAgent. 
+ He must show: + - assistant text deltas + - visible thinking deltas if Pi emits them + - tool execution start/update/end + - final assistant message + - run/node/attempt correlation fields +2. OTEL logs are really exported + Not just local files. He must prove the canonical trace events land in Loki or another OTEL log backend through the collector. +3. Logs are queryable + In Grafana/Loki he must show queries that answer: + - all events for one run.id + - one node.id + attempt + - only assistant.thinking.delta + - only tool.execution.* + - only capture.error +4. Current metrics/traces still work + He must ensure adding logs does not break existing Prometheus/Tempo behavior. +5. Failure cases are classified + He should simulate at least: + - malformed upstream JSON + - collector unavailable + - subprocess exits early + - redaction case + and show the system reports capture.error, partial-observed, or capture-failed correctly. +6. Redaction works + He must include at least one fixture with a secret-like token and show it is absent from: + - canonical persisted payload + - OTEL log body + - artifacts if artifacts are enabled + +Recommended Test Matrix +Tell him to verify at least this matrix: + +- PiAgent: + - rich trace success case + - failure case + - redaction case +- one structured CLI agent: + - CodexAgent or ClaudeCodeAgent + - prove structured events are preserved if available + - otherwise prove truthful partial classification +- one SDK agent: + - OpenAIAgent or AnthropicAgent + - prove partial/final-only capture is explicit, not faked + +Suggested Demo Workflow +He should create or reuse one simple workflow specifically for observability verification: + +- one task using PiAgent +- one task using a second agent family +- at least one tool invocation +- stable run annotations +- easy-to-query workflow path + +Then run it with OTEL enabled, for example: + +export SMITHERS_OTEL_ENABLED=1 +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 +export 
OTEL_SERVICE_NAME=smithers-dev + +What “Done” Means +Use this language with him: + +Done means: + +- Loki is part of the local Docker stack +- the collector has a real logs pipeline +- canonical agent trace events exist +- at least Pi is high-fidelity per spec +- other agent families are truthfully classified +- there are automated tests for schema, ordering, fidelity, completeness, export, redaction, and failure handling +- there is a written verification section with commands and Grafana/Loki queries +- a reviewer can reproduce the verification from a clean checkout + +What He Should Return +Ask him to return: + +- the branch +- a short “verification report” with: + - commands run + - workflow used + - exact Grafana/Loki queries + - screenshots or copied query results + - which verification classes from the spec passed + - any explicitly deferred items + +Execution context: +- branch: ${branchName} +- PI provider/model: ${piProvider}/${piModel} +${latestReview ? `- Previous review summary: ${latestReview.summary}` : "- This is the first implementation pass."} +${latestReview?.blockingFeedback?.length ? `- Blocking feedback to address:\n${latestReview.blockingFeedback.map((item: string) => ` - ${item}`).join("\n")}` : ""} +${latestValidate ? `- Latest deterministic validation summary: ${latestValidate.summary}` : "- No deterministic validation results exist yet."} + +Return JSON only, with: +- summary +- filesChanged +- notableDecisions +- remainingRisks`} + + + + {() => { + const results = validationCommands.map(runShell); + const allPassed = results.every((result) => result.passed); + const gitStatus = runShell("git status --short").stdout; + const gitDiffStat = runShell("git diff --stat").stdout; + const failed = results.filter((result) => !result.passed); + + return { + allPassed, + summary: allPassed + ? 
`All deterministic validation commands passed (${results.length} commands).` + : `${failed.length} of ${results.length} deterministic validation commands failed.`, + gitStatus, + gitDiffStat, + results, + }; + }} + + + + {`Review the implementation brutally. + +This is a Smithers implementation-review loop for the OTEL agent trace logging work. + +Primary brief files: +- ${specPath} +- ${composePath} +- PI provider/model under review: ${piProvider}/${piModel} + +You are reviewing the current repository state, not an abstract plan. + +Implementation agent report: +${latestImplement ? JSON.stringify(latestImplement, null, 2) : "No implementation report was produced."} + +Deterministic validation report: +${latestValidate ? JSON.stringify(latestValidate, null, 2) : "No validation report was produced."} + +Your role: +- You are the backpressure mechanism. +- You are not a collaborator helping the implementer feel good. +- You are the reviewer responsible for preventing weak, premature, under-verified changes from passing. +- You must assume the implementer will take shortcuts unless the evidence proves otherwise. + +Verification duties: +- Audit whether the implementation actually satisfies the verification section of the spec. +- Require evidence, not intent. +- Treat missing end-to-end verification as a blocking failure. +- Treat unverified Docker + Loki + collector integration as a blocking failure. +- Treat absent or weak query examples as a blocking failure. +- Treat missing failure-mode validation as a blocking failure. +- Treat redaction claims without proof as a blocking failure. +- Treat “it should work” or “it is wired” language as evidence of incompleteness. + +Smithers coding-style duties: +- Enforce repository-local conventions over generic agent habits. +- Reject unnecessary abstraction, speculative generalization, and architecture without pressure. +- Reject code that hides truthfulness about agent fidelity. 
+- Reject code that collapses raw event boundaries into summaries when the spec requires preservation. +- Reject code that uses high-cardinality labels for large bodies. +- Reject implementation that weakens durable local truth in favor of OTEL-only export. +- Reject code that does not fit the existing Smithers runtime, observability, and event model cleanly. +- Reject hand-wavy docs that are not reproducible from a clean checkout. + +Specific rejection criteria: +- Reject any vague or partial implementation. +- Reject if the OTEL logs path is not actually verifiable with Docker + Loki + Grafana. +- Reject if Pi trace capture is not clearly higher fidelity than the other agents. +- Reject if unsupported agent capabilities are hand-waved instead of truthfully classified. +- Reject if deterministic validation is weak, missing, or ignored. +- Reject if the work appears to optimize appearance over correctness. +- Reject if tests cover only the happy path. +- Reject if the coworker handoff is missing commands, queries, or exact verification steps. + +Approval standard: +- Approve only if the implementation appears specification-complete, verification-complete, reproducible, and operationally believable. +- If there is any meaningful unresolved risk, set approved=false. +- Default to rejection unless the evidence is boringly strong. + +Feedback style: +- Be harsh, concise, and specific. +- Prioritize the highest-severity blockers. +- Do not soften critique. +- Do not pad with praise unless it is genuinely earned and still useful. + +Return JSON only, with: +- approved +- summary +- blockingFeedback +- strengths +- nextActions`} + + + + + + {() => ({ + approved: latestReview?.approved === true, + reviewRounds: ctx.iterationCount("review", "review"), + implementationSummary: latestImplement?.summary ?? "No implementation summary available.", + validationSummary: latestValidate?.summary ?? "No validation summary available.", + reviewSummary: latestReview?.summary ?? 
"No review summary available.", + blockingFeedback: latestReview?.blockingFeedback ?? [], + })} + + + + ); +}); diff --git a/workflows/real-agent-trace-check.tsx b/workflows/real-agent-trace-check.tsx new file mode 100644 index 00000000..686b9baa --- /dev/null +++ b/workflows/real-agent-trace-check.tsx @@ -0,0 +1,63 @@ +/** @jsxImportSource smithers */ +import { createSmithers, Task, Workflow } from "../src"; +import { ClaudeCodeAgent } from "../src/agents/ClaudeCodeAgent"; +import { AmpAgent } from "../src/agents/AmpAgent"; +import { GeminiAgent } from "../src/agents/GeminiAgent"; +import { z } from "zod"; + +const Output = z.object({ + answer: z.string(), +}); + +const { smithers, outputs } = createSmithers( + { + result: Output, + }, + { + dbPath: "./workflows/real-agent-trace-check.db", + journalMode: "DELETE", + }, +); + +function buildAgent(kind: string) { + switch (kind) { + case "claude": + return new ClaudeCodeAgent({ + yolo: true, + outputFormat: "stream-json", + maxOutputBytes: 512 * 1024, + }); + case "amp": + return new AmpAgent({ + yolo: true, + maxOutputBytes: 512 * 1024, + }); + case "gemini": + return new GeminiAgent({ + approvalMode: "yolo", + outputFormat: "stream-json", + maxOutputBytes: 512 * 1024, + }); + default: + throw new Error(`Unsupported agent kind: ${kind}`); + } +} + +export default smithers((ctx) => { + const agentKind = + typeof ctx.input.agent === "string" ? ctx.input.agent : "claude"; + + return ( + + + {`Respond briefly. State which agent you are, then end with JSON matching the required schema.`} + + + ); +});