Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/verifier-trajectory-events.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

Capture verifier trajectory evidence from v3 agent events for offline scoring.
75 changes: 75 additions & 0 deletions packages/core/lib/v3/agent/utils/captureAriaTreeProbe.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/**
* captureAriaTreeProbe — capture a truncated accessibility tree of the active
* page for use as tier-2 evidence in the trajectory recorder.
*
* Shared by v3AgentHandler and v3CuaAgentHandler. Listener-gated by the
* callers so ordinary agent runs (no TrajectoryRecorder attached) don't pay
* the cost.
*
* The a11y tree is the same payload the agent's `ariaTree` tool sees, but
* captured by the harness (not the agent) so the verifier has independent
* textual ground truth for grounding non-visual claims — prices, names,
* dates, list contents — without OCR'ing screenshots.
*
* Budget: defaults to ~8000 tokens (32k chars). Per-step a11y captures
* across a ~30-step trajectory at that cap sum to ~240k tokens total,
* which the verifier handles via per-criterion top-K selection. The cap
* is configurable via VERIFIER_ARIATREE_TOKEN_BUDGET so consumers can
* trade RAM/disk for fidelity. Truncated content is marked explicitly so
* the verifier knows it was clipped.
*/
import type { V3 } from "../../v3.js";

/** Rough chars-per-token ratio used to turn the token budget into a character cap. */
const APPROX_CHARS_PER_TOKEN = 4;
/** Token budget used when neither the caller nor the environment supplies a usable one. */
const DEFAULT_TOKEN_BUDGET = 8_000;
/** Timeout (ms) forwarded to the capture when the caller does not supply one. */
const DEFAULT_TIMEOUT_MS = 5_000;

interface CaptureAriaTreeOptions {
  /**
   * Soft cap on token count (chars/4 approximation). Default 8000.
   * Zero, negative and NaN values are ignored (the environment variable or
   * the default is used instead).
   */
  tokenBudget?: number;
  /** Hard timeout on the capture. Default 5s. */
  timeoutMs?: number;
}

/** True for numbers strictly greater than zero; NaN and undefined are rejected. */
function isPositiveNumber(value: number | undefined): value is number {
  return typeof value === "number" && value > 0;
}

/**
 * Picks the effective token budget: a usable caller value first, then
 * VERIFIER_ARIATREE_TOKEN_BUDGET, then DEFAULT_TOKEN_BUDGET.
 */
function resolveTokenBudget(requested: number | undefined): number {
  if (isPositiveNumber(requested)) return requested;
  const envBudget = parseInt(
    process.env.VERIFIER_ARIATREE_TOKEN_BUDGET ?? "",
    10,
  );
  return isPositiveNumber(envBudget) ? envBudget : DEFAULT_TOKEN_BUDGET;
}

/**
 * Cuts `text` to at most `maxChars` UTF-16 code units without splitting a
 * surrogate pair, so the clipped string never ends in a lone high surrogate.
 */
function clipWithoutSplittingSurrogates(text: string, maxChars: number): string {
  let end = Math.min(text.length, Math.max(0, Math.floor(maxChars)));
  if (end > 0 && end < text.length) {
    const last = text.charCodeAt(end - 1);
    const next = text.charCodeAt(end);
    const lastIsHighSurrogate = last >= 0xd800 && last <= 0xdbff;
    const nextIsLowSurrogate = next >= 0xdc00 && next <= 0xdfff;
    // Back off one unit rather than keep half of an astral character.
    if (lastIsHighSurrogate && nextIsLowSurrogate) end -= 1;
  }
  return text.slice(0, end);
}

/**
 * Returns the truncated a11y tree as a plain string, or undefined when
 * capture fails or yields no text. Never throws — a11y capture is
 * best-effort tier-2 evidence, not a hard requirement, so failures are
 * silently absorbed (the verifier surfaces this via evidence_insufficient).
 *
 * @param v3 - Instance whose `extract()` is called to obtain `pageText`.
 * @param opts - Optional token budget and timeout overrides.
 * @returns The page text, clipped to the budget with an explicit truncation
 *   marker appended when it was too long; undefined on failure or empty text.
 */
export async function captureAriaTreeProbe(
  v3: V3,
  opts: CaptureAriaTreeOptions = {},
): Promise<string | undefined> {
  const tokenBudget = resolveTokenBudget(opts.tokenBudget);
  const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const maxChars = tokenBudget * APPROX_CHARS_PER_TOKEN;

  try {
    // v3.extract() without a schema returns { pageText } where pageText is the
    // rendered accessibility tree — same path the agent's ariaTree tool uses.
    const result = (await v3.extract({ timeout: timeoutMs })) as {
      pageText?: string;
    };
    const pageText = result?.pageText;
    if (typeof pageText !== "string" || pageText.length === 0) return undefined;

    if (pageText.length <= maxChars) return pageText;

    return (
      clipWithoutSplittingSurrogates(pageText, maxChars) +
      `\n\n[CONTENT TRUNCATED at ~${tokenBudget} tokens — set VERIFIER_ARIATREE_TOKEN_BUDGET to raise]`
    );
  } catch {
    // Deliberate best-effort: any capture error becomes "no evidence".
    return undefined;
  }
}
109 changes: 109 additions & 0 deletions packages/core/lib/v3/handlers/v3AgentHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import {
AgentAbortError,
} from "../types/public/sdkErrors.js";
import { handleDoneToolCall } from "../agent/utils/handleDoneToolCall.js";
import { captureAriaTreeProbe } from "../agent/utils/captureAriaTreeProbe.js";
import {
CaptchaSolver,
CAPTCHA_SOLVED_MSG,
Expand Down Expand Up @@ -248,13 +249,22 @@ export class V3AgentHandler {
| GenerateTextOnStepFinishCallback<ToolSet>
| StreamTextOnStepFinishCallback<ToolSet>,
) {
// Monotonic step counter scoped to this execute() call. Each tool call in
// the agent loop becomes one trajectory step. The counter feeds stepIndex
// on the bus events the TrajectoryRecorder subscribes to.
let stepCounter = 0;
return async (event: StepResult<ToolSet>) => {
this.logger({
category: "agent",
message: `Step finished: ${event.finishReason}`,
level: 2,
});

const stepIndicesInTurn: number[] = [];
let lastFinalAnswer:
| { message: string; output?: Record<string, unknown> }
| undefined;

if (event.toolCalls && event.toolCalls.length > 0) {
for (let i = 0; i < event.toolCalls.length; i++) {
const toolCall = event.toolCalls[i];
Expand All @@ -279,6 +289,13 @@ export class V3AgentHandler {
? `${allReasoning} ${doneReasoning}`.trim()
: allReasoning || "Task completed successfully";
}
lastFinalAnswer = {
message: state.finalMessage,
output:
typeof args?.output === "object" && args?.output !== null
? (args.output as Record<string, unknown>)
: undefined,
};
}
const mappedActions = mapToolResultToActions({
toolCallName: toolCall.toolName,
Expand All @@ -292,8 +309,100 @@ export class V3AgentHandler {
action.timestamp = Date.now();
state.actions.push(action);
}

// Emit step_finished_event per tool call. The TrajectoryRecorder
// builds one Trajectory.Step per emission. tier-1 evidence (the
// bytes the LLM consumed) is captured separately via an
// onStepFinish wrapper in the harness.
const stepIndex = stepCounter++;
stepIndicesInTurn.push(stepIndex);
const toolOk =
!toolResult ||
(typeof toolResult === "object" &&
!("error" in toolResult) &&
!("isError" in toolResult && toolResult.isError));
this.v3.bus.emit("agent_step_finished_event", {
stepIndex,
actionName: toolCall.toolName,
actionArgs:
typeof args === "object" && args !== null
? (args as Record<string, unknown>)
: {},
reasoning: event.text ?? "",
toolOutput: {
ok: toolOk,
result: toolResult,
error:
toolResult &&
typeof toolResult === "object" &&
"error" in toolResult &&
typeof (toolResult as { error?: unknown }).error === "string"
? (toolResult as { error: string }).error
: undefined,
},
finishedAt: new Date().toISOString(),
});
}
state.currentPageUrl = (await this.v3.context.awaitActivePage()).url();

// Harness probe — take a single screenshot / a11y snapshot per AI SDK
// step and attach it to every tool call in that turn. The observation
// reflects the settled page state after the batch of tool calls; this
// is more faithful than dropping probe evidence for all but the last
// tool call, while still avoiding per-tool screenshot overhead.
const wantsScreenshotProbe =
this.v3.bus.listenerCount("agent_screenshot_taken_event") > 0;
const wantsStepObservation =
this.v3.bus.listenerCount("agent_step_observed_event") > 0;
if (
stepIndicesInTurn.length > 0 &&
(wantsScreenshotProbe || wantsStepObservation)
) {
try {
const page = await this.v3.context.awaitActivePage();
let screenshot: Buffer | undefined;
if (wantsScreenshotProbe) {
screenshot = await page.screenshot({ fullPage: false });
}
let ariaTree: string | undefined;
if (wantsStepObservation) {
// Capture the a11y tree alongside the URL probe so the verifier
// can ground textual claims (prices, names, dates) without OCR.
// Best-effort: returns undefined on failure/timeout.
ariaTree = await captureAriaTreeProbe(this.v3);
}
for (const stepIndex of stepIndicesInTurn) {
if (screenshot) {
// DOM/hybrid: this post-step screenshot is a harness probe
// only. The agent's tier-1 evidence is the tool's return value
// captured separately in agent_step_finished_event.
this.v3.bus.emit("agent_screenshot_taken_event", {
stepIndex,
screenshot,
url: state.currentPageUrl,
evidenceRole: "probe",
});
}
if (wantsStepObservation) {
this.v3.bus.emit("agent_step_observed_event", {
stepIndex,
url: state.currentPageUrl,
ariaTree,
});
}
}
} catch (e) {
this.logger({
category: "agent",
message: `Warning: harness probe failed: ${getErrorMessage(e)}`,
level: 1,
});
}
}
}

if (lastFinalAnswer) {
this.v3.bus.emit("agent_final_answer_event", lastFinalAnswer);
}

if (userCallback) {
Expand Down
Loading
Loading