fullsend-ai · ralphbean · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/.github/workflows/functional-evals.yml b/.github/workflows/functional-evals.yml
@@ -0,0 +1,176 @@
+# Functional agent evals: runs for pushes and pull requests targeting main
+# that touch the eval definitions or the scaffold, and on manual dispatch.
+name: Functional Evals
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "eval/**"
+      - "internal/scaffold/**"
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "eval/**"
+      - "internal/scaffold/**"
+  workflow_dispatch: {}
+
+# id-token: write lets the job request an OIDC token (used by the GCP
+# authentication step); repository contents are read-only.
+permissions:
+  contents: read
+  id-token: write
+
+# One active run per ref: a newer run on the same ref cancels the older one.
+concurrency:
+  group: "functional-evals-${{ github.ref }}"
+  cancel-in-progress: true
+
+jobs:
+  # Single job: installs toolchains, brings up a local OpenShell gateway backed
+  # by rootless Podman, authenticates to GCP, then runs `make functional-evals`.
+  functional-evals:
+    runs-on: ubuntu-latest
+    timeout-minutes: 45
+    steps:
+      # Check out the repository together with its git submodules.
+      - uses: actions/checkout@v6.0.2
+        with:
+          submodules: true
+
+      # The Go version is read from go.mod rather than hard-coded here.
+      - uses: actions/setup-go@v5
+        with:
+          go-version-file: go.mod
+
+      - uses: actions/setup-python@v6.2.0
+        with:
+          python-version: "3.12"
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7.6.0
+
+      # NOTE(review): the harness is installed from the upstream repository with
+      # no tag or commit in the URL, so the installed version can change between
+      # runs. The same repository is also registered as the
+      # eval/.agent-eval-harness submodule — confirm which copy is authoritative
+      # and consider pinning this install to a ref.
+      - name: Install agent-eval-harness
+        run: uv pip install --system 'agent-eval-harness[anthropic] @ git+https://github.com/opendatahub-io/agent-eval-harness.git'
+
+      # Downloads the yq v4.47.1 linux/amd64 binary directly into /usr/local/bin.
+      # NOTE(review): the architecture is hard-coded to amd64 here, whereas the
+      # openshell-gateway download step detects x86_64 vs aarch64, and the
+      # download is not checksum-verified — confirm this is intentional.
+      - name: Install yq
+        run: |
+          curl -sSfL "https://github.com/mikefarah/yq/releases/download/v4.47.1/yq_linux_amd64" -o /usr/local/bin/yq
+          chmod +x /usr/local/bin/yq
+
+      # Sets the global git author identity used for the rest of the job.
+      - name: Configure git identity
+        run: |
+          git config --global user.name "fullsend-eval[bot]"
+          git config --global user.email "fullsend-eval[bot]@users.noreply.github.com"
+
+      - name: Build fullsend
+        run: make go-build
+
+      # Prepends <workspace>/bin to PATH for all later steps (presumably where
+      # `make go-build` places the fullsend binary — confirm against the Makefile).
+      - name: Add bin to PATH
+        run: echo "${{ github.workspace }}/bin" >> "$GITHUB_PATH"
+
+      # TODO: The openshell setup below (version, CLI, gateway, Podman,
+      # gateway start) is duplicated from action.yml. Extract into a
+      # shared script (e.g. .github/scripts/setup-openshell.sh) so the
+      # version and config stay in sync across both places.
+      #
+      # OPENSHELL_VERSION is written to GITHUB_ENV so that the CLI install and
+      # the gateway download in the following steps use the same version.
+      - name: Set OpenShell version
+        run: echo "OPENSHELL_VERSION=0.0.38" >> "${GITHUB_ENV}"
+
+      # Installs the CLI as a uv tool at the pinned version and prints the
+      # installed version as a sanity check.
+      - name: Install OpenShell CLI
+        run: |
+          uv tool install "openshell==${OPENSHELL_VERSION}"
+          openshell --version
+
+      # Fetches the gateway release tarball for OPENSHELL_VERSION and the runner
+      # architecture (x86_64, or aarch64/arm64 normalised to aarch64; anything
+      # else fails the step), unpacks it into runner.temp, and removes the
+      # downloaded archive.
+      - name: Download openshell-gateway
+        run: |
+          set -euo pipefail
+          arch="$(uname -m)"
+          case "${arch}" in
+            x86_64) ;;
+            aarch64|arm64) arch=aarch64 ;;
+            *) echo "::error::Unsupported architecture: ${arch}"; exit 1 ;;
+          esac
+          GATEWAY_ASSET="openshell-gateway-${arch}-unknown-linux-gnu.tar.gz"
+          GATEWAY_URL="https://github.com/NVIDIA/OpenShell/releases/download/v${OPENSHELL_VERSION}/${GATEWAY_ASSET}"
+          curl -fsSL "${GATEWAY_URL}" -o "/tmp/${GATEWAY_ASSET}"
+          tar xzf "/tmp/${GATEWAY_ASSET}" -C "${{ runner.temp }}"
+          rm -f "/tmp/${GATEWAY_ASSET}"
+
+      # Podman is installed from the distribution's package repositories.
+      - name: Install Podman
+        run: |
+          sudo apt-get update
+          sudo apt-get install --yes podman
+
+      # Rootless Podman needs a subordinate UID/GID range for the current user;
+      # one is allocated only when /etc/subuid has no entry for that user yet.
+      - name: Configure rootless Podman
+        run: |
+          current_user="$(id -un)"
+          if ! grep -q "^${current_user}:" /etc/subuid; then
+            sudo usermod --add-subuids 100000-165535 --add-subgids 100000-165535 "${current_user}"
+          fi
+          podman system migrate
+
+      # Starts the Podman API service on the user's socket unless a socket is
+      # already present, then polls once per second (up to 30 attempts) until
+      # the API answers `podman info`.
+      - name: Start Podman API service
+        run: |
+          SOCKET_PATH="${XDG_RUNTIME_DIR:-/run/user/$(id -u)}/podman/podman.sock"
+          # Ready means: the socket file exists AND the API responds on it.
+          podman_api_ready() {
+            [ -S "${SOCKET_PATH}" ] && podman --url "unix://${SOCKET_PATH}" info >/dev/null 2>&1
+          }
+          if [ ! -S "${SOCKET_PATH}" ]; then
+            mkdir -p "$(dirname "${SOCKET_PATH}")"
+            podman system service --time=0 "unix://${SOCKET_PATH}" &
+            for _i in $(seq 1 30); do
+              podman_api_ready && break
+              sleep 1
+            done
+            # The socket file can exist before the service answers requests, so
+            # the final gate uses the same probe as the wait loop instead of
+            # only checking that the file is there.
+            podman_api_ready || { echo "::error::Podman socket not ready"; exit 1; }
+          fi
+
+      # Launches the gateway binary from runner.temp in the background with the
+      # Podman driver, TLS disabled, and a SQLite database under /tmp, logging
+      # to /tmp/gateway.log. A random per-run SSH handshake secret is generated
+      # and masked in the job log. The step then polls /healthz on port 8081
+      # (30 attempts, 2s apart); if the gateway never becomes healthy the log
+      # is printed and the step fails. Finally the gateway is registered with
+      # the CLI under the name `local` and selected.
+      # NOTE(review): the CLI is pointed at port 8080 while only the health port
+      # (8081) is set explicitly — presumably 8080 is the gateway's default
+      # listen port; confirm.
+      - name: Start openshell-gateway
+        run: |
+          set -euo pipefail
+          OPENSHELL_SSH_HANDSHAKE_SECRET="ci-$(openssl rand -hex 16)"
+          export OPENSHELL_SSH_HANDSHAKE_SECRET
+          echo "::add-mask::${OPENSHELL_SSH_HANDSHAKE_SECRET}"
+          export OPENSHELL_SUPERVISOR_IMAGE="ghcr.io/nvidia/openshell/supervisor:dfd47683e7da4f1a4a8fa5d77f92d3696e6a41f9"
+          "${{ runner.temp }}/openshell-gateway" \
+            --bind-address 0.0.0.0 \
+            --health-port 8081 \
+            --drivers podman \
+            --disable-tls \
+            --db-url "sqlite:/tmp/gateway.db?mode=rwc" \
+            >/tmp/gateway.log 2>&1 &
+          for _i in $(seq 1 30); do
+            curl -sf http://127.0.0.1:8081/healthz >/dev/null 2>&1 && break
+            sleep 2
+          done
+          curl -sf http://127.0.0.1:8081/healthz >/dev/null 2>&1 || {
+            echo "::error::Gateway health check failed"
+            cat /tmp/gateway.log 2>/dev/null || true
+            exit 1
+          }
+          openshell gateway add http://127.0.0.1:8080 --local --name local
+          openshell gateway select local
+
+      # NOTE(review): this uses plain `pip`, while the harness install earlier
+      # in the job uses `uv pip install --system` — confirm both target the
+      # same Python environment.
+      - name: Install validation dependencies
+        run: pip install --quiet "jsonschema>=4.18.0"
+
+      # Authenticates using a workload identity provider and service account
+      # taken from repository secrets; this relies on the workflow-level
+      # `id-token: write` permission.
+      - name: Authenticate to GCP
+        uses: google-github-actions/auth@v2
+        with:
+          workload_identity_provider: ${{ secrets.E2E_GCP_WIF_PROVIDER }}
+          service_account: ${{ secrets.E2E_GCP_SERVICE_ACCOUNT }}
+
+      # Saves the current GOOGLE_APPLICATION_CREDENTIALS value (presumably set
+      # by the auth step above — confirm) under HOST_GOOGLE_APPLICATION_CREDENTIALS
+      # for later steps, then runs the scaffold's credential preparation script.
+      - name: Prepare sandbox credentials
+        run: |
+          echo "HOST_GOOGLE_APPLICATION_CREDENTIALS=$GOOGLE_APPLICATION_CREDENTIALS" >> "$GITHUB_ENV"
+          bash internal/scaffold/fullsend-repo/scripts/prepare-sandbox-credentials.sh
+
+      # Runs the eval suite through the Makefile target. The org, GitHub token,
+      # Vertex project and region come from repository variables and secrets;
+      # EVALS_HOST_CREDENTIALS carries the credentials path saved by the
+      # "Prepare sandbox credentials" step.
+      - name: Run functional evals
+        env:
+          EVAL_ORG: ${{ vars.EVAL_ORG }}
+          GH_TOKEN: ${{ secrets.EVAL_GH_TOKEN }}
+          ANTHROPIC_VERTEX_PROJECT_ID: ${{ vars.EVALS_VERTEX_PROJECT_ID }}
+          GOOGLE_CLOUD_PROJECT: ${{ secrets.E2E_GCP_PROJECT_ID }}
+          CLOUD_ML_REGION: ${{ vars.EVALS_GCP_REGION }}
+          EVALS_HOST_CREDENTIALS: ${{ env.HOST_GOOGLE_APPLICATION_CREDENTIALS }}
+        run: make functional-evals
+
+      # Runs regardless of the eval outcome. Deletes every `.eval-env` file
+      # under eval/runs/ before the upload (presumably these files hold
+      # credentials — confirm against the runner script). Errors are ignored.
+      - name: Scrub secrets from eval results
+        if: always()
+        run: find eval/runs/ -name '.eval-env' -delete 2>/dev/null || true
+
+      # Uploads eval/runs/ regardless of the eval outcome and keeps it for 30
+      # days. `.eval-env` is additionally excluded by pattern, independent of
+      # the scrub step.
+      - name: Upload eval results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-results
+          path: |
+            eval/runs/
+            !eval/runs/**/.eval-env
+          retention-days: 30
diff --git a/.gitignore b/.gitignore
@@ -20,3 +20,4 @@ bin/
 .env.*
 !.env.example
 .transcripts/
+eval/runs/
diff --git a/.gitmodules b/.gitmodules
@@ -2,3 +2,6 @@
 	path = experiments
 	url = git@github.com:fullsend-ai/experiments.git
 	branch = main
+[submodule "eval/.agent-eval-harness"]
+	path = eval/.agent-eval-harness
+	url = https://github.com/opendatahub-io/agent-eval-harness.git
diff --git a/Makefile b/Makefile
@@ -2,7 +2,8 @@
 .PHONY: help bootstrap lint lint-all check fmt \
        mindmap go-build go-test go-lint go-fmt go-vet go-tidy \
        lint-md-links script-test test \
-       e2e-test e2e-playwright e2e-export-session e2e-upload-session
+       e2e-test e2e-playwright e2e-export-session e2e-upload-session \
+       functional-evals
 
 # Let Go automatically download the toolchain version required by go.mod.
 # This ensures local builds use the right version without manual intervention.
@@ -30,6 +31,7 @@ help:
 	@echo "  e2e-test             - Run admin e2e tests (requires E2E_GITHUB_SESSION_FILE or E2E_GITHUB_USERNAME + E2E_GITHUB_PASSWORD)"
 	@echo "  e2e-export-session   - Login to GitHub and export a Playwright session file"
 	@echo "  e2e-upload-session   - Export session and upload it as a GitHub repo secret"
+	@echo "  functional-evals     - Run functional agent evals (requires EVAL_ORG and GCP creds; GH_TOKEN and FULLSEND_DIR are optional overrides)"
 
 # Install all development tools needed for linting, formatting, and pre-commit hooks.
 # Prerequisites: uv (https://docs.astral.sh/uv/) and go (https://go.dev/)
@@ -147,3 +149,14 @@ e2e-playwright:
 		echo "==> Installing Playwright Chromium..."; \
 		go run github.com/playwright-community/playwright-go/cmd/playwright install chromium; \
 	fi
+
+# Functional agent evals — run agents against ephemeral GitHub repos and judge results.
+# Required env: EVAL_ORG (GitHub org for ephemeral repos), plus GCP creds for Vertex AI.
+# GH_TOKEN defaults to `gh auth token` if not set.
+#
+# FULLSEND_DIR - scaffold directory exported to the runner script (overridable).
+# EVAL_AGENTS  - space-separated list of agents to evaluate (overridable).
+FULLSEND_DIR ?= $(CURDIR)/internal/scaffold/fullsend-repo
+EVAL_AGENTS  ?= triage
+
+# Every agent in EVAL_AGENTS is run even if an earlier one fails, and the
+# target exits non-zero if any run failed. A bare `for` loop would report only
+# the last agent's exit status and hide earlier failures.
+functional-evals:
+	@rc=0; for agent in $(EVAL_AGENTS); do \
+		FULLSEND_DIR="$(FULLSEND_DIR)" ./eval/run-functional.sh "$$agent" \
+			|| { echo "functional-evals: agent '$$agent' failed" >&2; rc=1; }; \
+	done; exit $$rc
diff --git a/docs/ADRs/0044-functional-evals-for-agent-pipelines.md b/docs/ADRs/0044-functional-evals-for-agent-pipelines.md
@@ -0,0 +1,122 @@
+---
+title: "44. Functional evals for agent pipelines"
+status: Accepted
+relates_to:
+  - testing-agents
+topics:
+  - testing
+  - evals
+---
+
+# 44. Functional evals for agent pipelines
+
+Date: 2026-05-29
+
+## Status
+
+Accepted
+
+<!-- Once this ADR is Accepted, its content is frozen. Do not edit the Context,
+     Decision, or Consequences sections. If circumstances change, write a new
+     ADR that supersedes this one. Only status changes and links to superseding
+     ADRs should be added after acceptance. -->
+
+## Context
+
+The [testing-agents](../problems/testing-agents.md) problem doc identifies a
+gap: we have CI for code but no CI for prompts. It surveys prompt-level eval
+frameworks (promptfoo, deepeval) and agent-level runners (Inspect AI), but
+notes that most eval frameworks test prompts, not agents — they send a single
+prompt to a model API and score the response, without exercising the full
+agent loop (tool calls, multi-turn reasoning, environment interaction).
+
+Prior attempts to run agent evals were cut short because agents misbehaved
+during eval runs — misusing credentials and producing side effects outside
+the test boundary. The sandboxed execution model introduced in
+[ADR 0036](0036-agent-execution-sandbox.md) changed this: agents now run in
+containers with controlled network access and scoped credentials, limiting
+blast radius enough to make eval suites practical.
+
+PR [#1682](https://github.com/fullsend-ai/fullsend/pull/1682) introduces a
+functional eval framework that tests at a different level: the complete agent
+pipeline (pre-script, agent execution, post-script) running against ephemeral
+GitHub fixtures and scored by an LLM judge. A key property of functional
+evals is that they verify post-scripts and credential use actually work
+against real external services — not just that the agent produces plausible
+output, but that the full pipeline's interaction with GitHub (labeling,
+commenting, state transitions) succeeds end-to-end.
+
+This creates a new test category that needs a name and a place in the testing
+taxonomy. The emerging test pyramid for this project has four layers:
+
+1. **Unit tests** — deterministic Go tests (`make go-test`). Cheap, fast,
+   plentiful.
+2. **Prompt evals** — test agent prompts and skills in isolation, with mocked
+   external dependencies (not yet implemented). Cheaper than functional evals
+   because they avoid real service interactions, so they can be more numerous
+   and provide broader coverage. Custom network policies could enforce the
+   mocking boundary.
+3. **Functional evals** — exercise the full agent pipeline against real
+   GitHub fixtures. More expensive because they interact with live services,
+   so their number should be kept deliberately small — enough to cover the
+   critical integration paths, not exhaustive.
+4. **E2e tests** — browser-driven install/uninstall flows (`make e2e-test`).
+   The most expensive layer; limited to a narrow happy-path verification of
+   the admin install/uninstall flow.
+
+Each layer up the pyramid costs more per case and should therefore have fewer
+cases. This ADR addresses layer 3. Layer 2 remains an open opportunity.
+
+## Decision
+
+We adopt **functional evals** as a distinct test category for agent pipelines.
+
+A functional eval exercises the full `fullsend run` pipeline — dispatch,
+sandbox setup, agent execution, and post-processing — against a controlled
+GitHub fixture (ephemeral repo + issue/PR), then scores the agent's observable
+side effects (labels applied, comments posted, PR state) using both
+deterministic checks and LLM-graded rubrics.
+
+The eval infrastructure lives in `eval/` at the repo root, organized per
+agent skill:
+
+```
+eval/
+  fullsend-runner.sh          # CLI runner: fixture setup -> fullsend run -> capture state
+  run-functional.sh           # Orchestrator: iterate cases, score
+  <skill>/
+    eval.yaml                 # Eval config: judges, thresholds, models
+    cases/
+      001-<name>/
+        input.yaml            # Fixture definition
+        annotations.yaml      # Expected state and rubric hints
+        repo/                 # Source tree the agent sees
+    repos/                    # Shared repo content, symlinked by cases
+```
+
+Functional evals run in CI when files under `eval/` or `internal/scaffold/`
+change, and the suite itself is invoked through `make functional-evals`.
+Runs are gated on score thresholds
+(e.g., `min_mean: 2.5` for LLM quality, `min_pass_rate: 0.9` for
+deterministic checks) rather than binary pass/fail, acknowledging the
+non-determinism inherent in agent behavior.
+
+## Consequences
+
+- The test pyramid now has three implemented layers (unit, functional eval,
+  e2e) with a fourth (prompt eval) identified but not yet built. Each layer
+  has a distinct scope, cost profile, and trigger.
+- Functional evals require cloud credentials (GCP for Vertex AI, GitHub token
+  for fixture repos), so they cannot run in unprivileged CI contexts — for
+  example, pull requests from forks, which do not receive repository secrets.
+- Adding a new agent skill's eval requires only a new directory under `eval/`
+  with the standard case layout — no framework code changes.
+- LLM-as-judge introduces a second layer of non-determinism: both the agent
+  under test and the judge are probabilistic. Threshold-based gating mitigates
+  this but does not eliminate flakiness.
+- The `eval/` directory is a new top-level concern that contributors need to
+  know about. Documentation belongs in `docs/testing/evals.md`.
+- Functional eval count should be monitored to prevent bloat. Because each
+  case interacts with live services, the suite's cost and runtime scale
+  directly with case count.
+- This decision does not preclude a lighter-weight prompt eval layer that
+  tests agent prompts and skills without the full pipeline. Such a layer
+  would complement functional evals by covering more cases at lower cost.