From af07ca1debab06439c0d7a6c263c9a8ad85a789a Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 13:45:58 -0700 Subject: [PATCH 1/5] feat(verifier): normalize canonical evidence --- packages/core/lib/v3/verifier/evidence.ts | 533 ++++++++++++++++++++++ packages/core/package.json | 3 +- pnpm-lock.yaml | 3 + 3 files changed, 538 insertions(+), 1 deletion(-) create mode 100644 packages/core/lib/v3/verifier/evidence.ts diff --git a/packages/core/lib/v3/verifier/evidence.ts b/packages/core/lib/v3/verifier/evidence.ts new file mode 100644 index 000000000..1bd67ec06 --- /dev/null +++ b/packages/core/lib/v3/verifier/evidence.ts @@ -0,0 +1,533 @@ +/** + * Evidence — Step 1 of fara's MMRubricAgent pipeline (port). + * + * Loads probe screenshots from a Trajectory (file path or in-memory Buffer), + * deduplicates near-identical frames using a quick MSE + SSIM dissimilarity + * check (mirrors `packages/evals/utils/ScreenshotCollector`), and downsizes + * each kept frame by `VERIFIER_IMAGE_RESIZE` (default 0.7) so the + * Step 2 relevance scoring LLM call sees smaller images. + * + * Always-keep policy: the first and last screenshots are kept regardless of + * similarity, so the verifier can always cite the trajectory's bookends. + * + * Environment knobs: + * - VERIFIER_SSIM_THRESHOLD (default 0.75) — frames with SSIM >= threshold + * are considered duplicates and dropped. + * - VERIFIER_MSE_THRESHOLD (default 30) — frames with MSE < threshold + * short-circuit to "duplicate" without running SSIM. + * - VERIFIER_IMAGE_RESIZE (default 0.7) — scale factor applied before + * relevance scoring (matches fara's pre-resize for token economy). + * + * Architectural notes: + * - This module never touches a live browser. It reads screenshots from + * `Trajectory.steps[i].probeEvidence.{screenshot,screenshotPath}` only. + * - `sharp` is loaded via dynamic import so core stays portable for + * consumers that don't install image deps; if sharp is unavailable, the + * dedup/resize steps no-op and every screenshot is kept at its native + * size. The verifier still runs end-to-end, just with more tokens spent + * on near-duplicate frames. + * - `originalStepIndex → canonicalScreenshotIndex` mapping is exposed so + * downstream prompts can keep citing the trajectory step (e.g., + * "Screenshot N — step=K, action=..."), preserving the rubric's link + * between visual evidence and the action history. + */ +import type { Trajectory } from "./trajectory.js"; + +// Lazy-loaded `sharp` namespace. When `sharp` is not installed, we fall back +// to keep-everything-at-native-size. Stored as `unknown` because the real +// types live in an optional dep — see loadSharp(). +type Sharp = typeof import("sharp"); +let sharpPromise: Promise | null = null; + +async function loadSharp(): Promise { + if (sharpPromise) return sharpPromise; + sharpPromise = (async (): Promise => { + try { + const mod = (await import("sharp")) as unknown as { + default?: Sharp; + } & Sharp; + // Some bundlers wrap ESM CJS deps differently — handle both. + return (mod.default ?? mod) as Sharp; + } catch { + return null; + } + })(); + return sharpPromise; +} + +const DEFAULT_SSIM_THRESHOLD = 0.75; +const DEFAULT_MSE_THRESHOLD = 30; +const DEFAULT_IMAGE_RESIZE = 0.7; + +/** A single screenshot kept by Step 1, ready for downstream relevance scoring. */ +export interface CanonicalScreenshot { + /** 0-based position in the kept-screenshots array. Stable across the pipeline. */ + canonicalIndex: number; + /** + * Trajectory step index this screenshot came from. Matches + * `Trajectory.steps[i].index`. Lets downstream prompts cross-reference the + * action history. + */ + originalStepIndex: number; + /** Position of the step in `Trajectory.steps` (0..steps.length-1). */ + trajectoryStepPosition: number; + /** The resized PNG/JPEG buffer (or native bytes if sharp unavailable). */ + bytes: Buffer; + /** MIME media type. Always "image/png" after the (optional) resize. */ + mediaType: string; + /** Reason this frame was kept: "first" / "last" / "diverges". */ + keptReason: "first" | "last" | "diverges" | "no-dedup"; +} + +/** + * A text evidence point sourced from tier-2 probes or tier-1 tool outputs. + * These feed the same relevance + scoring path as screenshots, letting DOM + * and hybrid agents preserve extract/aria/tool-return evidence without a + * separate verifier architecture. + */ +export interface CanonicalTextEvidence { + /** 0-based position in the combined evidence-point array. */ + canonicalIndex: number; + originalStepIndex: number; + trajectoryStepPosition: number; + /** Where the text came from. */ + source: "probe-aria" | "agent-text" | "agent-json" | "tool-output"; + /** The text payload, already truncated. */ + content: string; +} + +export type CanonicalEvidence = CanonicalScreenshot | CanonicalTextEvidence; + +/** Discriminator helpers — kind === "image" for screenshots. */ +export function isImageEvidence( + e: CanonicalEvidence, +): e is CanonicalScreenshot { + return "bytes" in e && (e as CanonicalScreenshot).bytes instanceof Buffer; +} + +export function isTextEvidence( + e: CanonicalEvidence, +): e is CanonicalTextEvidence { + return ( + "content" in e && typeof (e as CanonicalTextEvidence).content === "string" + ); +} + +/** Result of Step 1. */ +export interface EvidenceLoadResult { + /** Kept frames, in chronological order. */ + screenshots: CanonicalScreenshot[]; + /** + * Maps `Trajectory.steps[i].index` → canonical index in `screenshots`. Step + * indices that were deduplicated point to the surviving canonical frame + * (typically the prior kept frame). Useful for "find me the screenshot for + * step K" lookups in downstream prompts. + */ + stepIndexToCanonical: Map; + /** Number of original frames considered. */ + originalCount: number; + /** Number of frames kept post-dedup (== screenshots.length). */ + keptCount: number; + /** Effective thresholds used (resolved from env). */ + thresholds: { + ssim: number; + mse: number; + resize: number; + }; +} + +/** Options for {@link loadAndReduceScreenshots}. Mainly env override hooks for tests. */ +export interface EvidenceLoadOptions { + /** Override VERIFIER_SSIM_THRESHOLD. */ + ssimThreshold?: number; + /** Override VERIFIER_MSE_THRESHOLD. */ + mseThreshold?: number; + /** Override VERIFIER_IMAGE_RESIZE. */ + imageResize?: number; +} + +/** + * Step 1 — load trajectory screenshots from disk (or memory), deduplicate, + * and downsize. + * + * Returns an array of canonical screenshots ready to feed into Step 2. + * Steps without a captured probe screenshot are skipped silently — they + * never reach the canonical array, but their action context still appears + * in the prompt's action history. + */ +export async function loadAndReduceScreenshots( + trajectory: Trajectory, + opts: EvidenceLoadOptions = {}, +): Promise { + const ssimThreshold = + opts.ssimThreshold ?? + readPositiveFloatEnv("VERIFIER_SSIM_THRESHOLD", DEFAULT_SSIM_THRESHOLD); + const mseThreshold = + opts.mseThreshold ?? + readPositiveFloatEnv("VERIFIER_MSE_THRESHOLD", DEFAULT_MSE_THRESHOLD); + const imageResize = + opts.imageResize ?? + readPositiveFloatEnv("VERIFIER_IMAGE_RESIZE", DEFAULT_IMAGE_RESIZE); + + // Collect raw frames in chronological order. probeEvidence.screenshot is + // populated either live (Buffer) or after loadTrajectoryFromDisk(). When + // both are absent we skip — there's no image to score. + const rawFrames: Array<{ + bytes: Buffer; + originalStepIndex: number; + trajectoryStepPosition: number; + }> = []; + + for (let i = 0; i < trajectory.steps.length; i++) { + const step = trajectory.steps[i]; + const buf = step.probeEvidence?.screenshot; + if (!buf || buf.length === 0) continue; + rawFrames.push({ + bytes: buf, + originalStepIndex: step.index, + trajectoryStepPosition: i, + }); + } + + const sharp = await loadSharp(); + const stepIndexToCanonical = new Map(); + + if (rawFrames.length === 0) { + return { + screenshots: [], + stepIndexToCanonical, + originalCount: 0, + keptCount: 0, + thresholds: { + ssim: ssimThreshold, + mse: mseThreshold, + resize: imageResize, + }, + }; + } + + // ── Dedup ────────────────────────────────────────────────────────────── + // First + last always kept. Middle frames kept iff they're sufficiently + // dissimilar to the previously kept frame. Dissimilarity check mirrors + // ScreenshotCollector: quick MSE pass, escalate to SSIM only if MSE + // suggests the frames differ. If sharp is unavailable, keep everything. + const keptRaw: Array<{ + bytes: Buffer; + originalStepIndex: number; + trajectoryStepPosition: number; + keptReason: CanonicalScreenshot["keptReason"]; + }> = []; + + // Track which raw-frame index each step maps to. Pre-fill with "the last + // kept frame so far" so dropped frames fall back to their surviving peer. + const rawIdxByStep = new Map(); + + let lastKeptIdx = 0; + + for (let i = 0; i < rawFrames.length; i++) { + const frame = rawFrames[i]; + const isFirst = i === 0; + const isLast = i === rawFrames.length - 1; + + let keep = true; + let reason: CanonicalScreenshot["keptReason"] = sharp + ? "diverges" + : "no-dedup"; + + if (sharp && !isFirst && !isLast) { + const prev = keptRaw[keptRaw.length - 1]; + try { + const mse = await calculateMSE(sharp, prev.bytes, frame.bytes); + if (mse < mseThreshold) { + keep = false; + } else { + const ssim = await calculateSSIM(sharp, prev.bytes, frame.bytes); + // Drop when "too similar" (SSIM at or above threshold). Mirrors + // ScreenshotCollector.shouldKeep = ssim < threshold. + if (ssim >= ssimThreshold) keep = false; + } + } catch { + // Comparison error → keep the frame (safer to err on inclusion). + keep = true; + } + } else if (isFirst) { + reason = "first"; + } else if (isLast) { + reason = "last"; + } + + if (keep) { + keptRaw.push({ + bytes: frame.bytes, + originalStepIndex: frame.originalStepIndex, + trajectoryStepPosition: frame.trajectoryStepPosition, + keptReason: reason, + }); + lastKeptIdx = keptRaw.length - 1; + } + rawIdxByStep.set(frame.originalStepIndex, lastKeptIdx); + } + + // ── Resize ───────────────────────────────────────────────────────────── + const screenshots: CanonicalScreenshot[] = await Promise.all( + keptRaw.map(async (raw, canonicalIndex): Promise => { + const resized = sharp + ? await resizePng(sharp, raw.bytes, imageResize) + : raw.bytes; + return { + canonicalIndex, + originalStepIndex: raw.originalStepIndex, + trajectoryStepPosition: raw.trajectoryStepPosition, + bytes: resized, + mediaType: "image/png", + keptReason: raw.keptReason, + }; + }), + ); + + // Build step → canonical index lookup from the dropped-fallback table. + for (const [stepIdx, rawIdx] of rawIdxByStep.entries()) { + stepIndexToCanonical.set(stepIdx, rawIdx); + } + + return { + screenshots, + stepIndexToCanonical, + originalCount: rawFrames.length, + keptCount: screenshots.length, + thresholds: { + ssim: ssimThreshold, + mse: mseThreshold, + resize: imageResize, + }, + }; +} + +/** + * Collect a combined evidence-point list (images + ariaTree text snippets). + * + * Images go through {@link loadAndReduceScreenshots} (dedup + downscale). + * Text evidence is sourced from: + * - tier-2 `probeEvidence.ariaTree` + * - tier-1 text/json modalities in `agentEvidence` + * - native `toolOutput.result` + * + * Text snippets are deduplicated by content hash so a "stuck on the same + * page" agent doesn't produce a flood of identical snippets. + * + * `canonicalIndex` is unified across both kinds: the first image gets 0, + * the next image or text snippet gets 1, etc. Downstream Step-2 relevance + * scoring sees all evidence points in one numbering scheme. + */ +export async function collectCanonicalEvidence( + trajectory: Trajectory, + opts: EvidenceLoadOptions = {}, +): Promise<{ + evidence: CanonicalEvidence[]; + loaded: EvidenceLoadResult; +}> { + const loaded = await loadAndReduceScreenshots(trajectory, opts); + const evidence: CanonicalEvidence[] = []; + + // Interleave images and text by step position so the resulting array is + // (roughly) chronological. We collect texts per step, then merge by + // step position with images. + type Pending = + | { kind: "image"; shot: CanonicalScreenshot } + | { + kind: "text"; + stepPos: number; + stepIdx: number; + source: CanonicalTextEvidence["source"]; + content: string; + }; + const pending: Pending[] = []; + + for (const shot of loaded.screenshots) { + pending.push({ kind: "image", shot }); + } + + const seenText = new Set(); + const addTextEvidence = ( + stepPos: number, + stepIdx: number, + source: CanonicalTextEvidence["source"], + raw: unknown, + ) => { + const text = typeof raw === "string" ? raw : safeStringifyEvidence(raw); + if (!text || text.length === 0) return; + const trimmed = text.length > 4000 ? text.slice(0, 4000) : text; + const normalized = trimmed.replace(/\s+/g, " ").trim(); + if (normalized.length === 0) return; + const dedupKey = `${normalized.length}:${normalized.slice(0, 200)}`; + if (seenText.has(dedupKey)) return; + seenText.add(dedupKey); + pending.push({ + kind: "text", + stepPos, + stepIdx, + source, + content: trimmed, + }); + }; + + for (let i = 0; i < trajectory.steps.length; i++) { + const step = trajectory.steps[i]; + addTextEvidence(i, step.index, "probe-aria", step.probeEvidence?.ariaTree); + + for (const modality of step.agentEvidence?.modalities ?? []) { + if (modality.type === "text") { + addTextEvidence(i, step.index, "agent-text", modality.content); + } else if (modality.type === "json") { + addTextEvidence(i, step.index, "agent-json", modality.content); + } + } + + // Defensive: agentEvidence is derived from toolOutput today, but keeping + // the native result as a fallback preserves evidence if that mapping + // changes or an adapter omits modalities. + addTextEvidence(i, step.index, "tool-output", step.toolOutput?.result); + } + + // Sort by trajectoryStepPosition asc; ties → image before text so the + // "page state before the harness probed text" reads naturally. + pending.sort((a, b) => { + const pa = a.kind === "image" ? a.shot.trajectoryStepPosition : a.stepPos; + const pb = b.kind === "image" ? b.shot.trajectoryStepPosition : b.stepPos; + if (pa !== pb) return pa - pb; + return a.kind === "image" ? -1 : 1; + }); + + for (const p of pending) { + if (p.kind === "image") { + // Re-stamp canonical index in the combined ordering. + const shot: CanonicalScreenshot = { + ...p.shot, + canonicalIndex: evidence.length, + }; + evidence.push(shot); + } else { + evidence.push({ + canonicalIndex: evidence.length, + originalStepIndex: p.stepIdx, + trajectoryStepPosition: p.stepPos, + source: p.source, + content: p.content, + }); + } + } + + return { evidence, loaded }; +} + +// ─── Internals ──────────────────────────────────────────────────────────── + +function safeStringifyEvidence(value: unknown): string | undefined { + if (value === undefined || value === null) return undefined; + try { + return JSON.stringify(value); + } catch { + return String(value); + } +} + +/** + * Port of `imageResize` from packages/evals/utils. Re-encodes as PNG with + * palette + max compression so the downstream LLM call sees smaller bytes. + * No-op when sharp can't read the buffer dimensions. + */ +async function resizePng( + sharp: Sharp, + bytes: Buffer, + scaleFactor: number, +): Promise { + if (scaleFactor >= 0.999 && scaleFactor <= 1.001) return bytes; + try { + const metadata = await sharp(bytes).metadata(); + if (!metadata.width || !metadata.height) return bytes; + const width = Math.max(1, Math.round(metadata.width * scaleFactor)); + const height = Math.max(1, Math.round(metadata.height * scaleFactor)); + return await sharp(bytes) + .resize(width, height, { + fit: "inside", + kernel: sharp.kernel.lanczos3, + }) + .png({ + compressionLevel: 9, + adaptiveFiltering: true, + palette: true, + }) + .toBuffer(); + } catch { + return bytes; + } +} + +/** + * Port of `ScreenshotCollector.calculateMSE`. Resamples both images to a + * fixed small size and computes the per-byte squared error mean. Lower + * means more similar. + */ +async function calculateMSE( + sharp: Sharp, + img1: Buffer, + img2: Buffer, +): Promise { + const size = { width: 400, height: 300 }; + const data1 = await sharp(img1).resize(size).raw().toBuffer(); + const data2 = await sharp(img2).resize(size).raw().toBuffer(); + if (data1.length !== data2.length) return Number.MAX_SAFE_INTEGER; + let sum = 0; + for (let i = 0; i < data1.length; i++) { + const diff = data1[i] - data2[i]; + sum += diff * diff; + } + return sum / data1.length; +} + +/** + * Port of `ScreenshotCollector.calculateSSIM`. Simplified single-window + * SSIM on grayscale at 400×300. Returns a value in [0, 1] where 1 == + * identical (after grayscale downsample). + */ +async function calculateSSIM( + sharp: Sharp, + img1: Buffer, + img2: Buffer, +): Promise { + const size = { width: 400, height: 300 }; + const gray1 = await sharp(img1).resize(size).grayscale().raw().toBuffer(); + const gray2 = await sharp(img2).resize(size).grayscale().raw().toBuffer(); + if (gray1.length !== gray2.length) return 0; + + const c1 = 0.01 * 0.01; + const c2 = 0.03 * 0.03; + let sum1 = 0; + let sum2 = 0; + let sum1Sq = 0; + let sum2Sq = 0; + let sum12 = 0; + const N = gray1.length; + for (let i = 0; i < N; i++) { + sum1 += gray1[i]; + sum2 += gray2[i]; + sum1Sq += gray1[i] * gray1[i]; + sum2Sq += gray2[i] * gray2[i]; + sum12 += gray1[i] * gray2[i]; + } + const mean1 = sum1 / N; + const mean2 = sum2 / N; + const var1 = sum1Sq / N - mean1 * mean1; + const var2 = sum2Sq / N - mean2 * mean2; + const cov12 = sum12 / N - mean1 * mean2; + const numerator = (2 * mean1 * mean2 + c1) * (2 * cov12 + c2); + const denominator = (mean1 * mean1 + mean2 * mean2 + c1) * (var1 + var2 + c2); + return numerator / denominator; +} + +function readPositiveFloatEnv(name: string, fallback: number): number { + const raw = process.env[name]; + if (!raw) return fallback; + const parsed = Number.parseFloat(raw); + return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback; +} diff --git a/packages/core/package.json b/packages/core/package.json index d16c07465..ff6c043e9 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -109,7 +109,8 @@ "@ai-sdk/xai": "^2.0.26", "bufferutil": "^4.0.9", "chrome-launcher": "^1.2.0", - "ollama-ai-provider-v2": "^1.5.0" + "ollama-ai-provider-v2": "^1.5.0", + "sharp": "^0.34.5" }, "devDependencies": { "@playwright/test": "^1.55.1", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b82d41e5f..febdcd30a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -216,6 +216,9 @@ importers: ollama-ai-provider-v2: specifier: ^1.5.0 version: 1.5.0(zod@4.2.1) + sharp: + specifier: ^0.34.5 + version: 0.34.5 devDependencies: '@playwright/test': specifier: ^1.55.1 From 591e873482d61051b578bcb675eae9b024095f9a Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:16:33 -0700 Subject: [PATCH 2/5] chore: add evidence normalization changeset --- .changeset/verifier-evidence-normalization.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/verifier-evidence-normalization.md diff --git a/.changeset/verifier-evidence-normalization.md b/.changeset/verifier-evidence-normalization.md new file mode 100644 index 000000000..7f6986a66 --- /dev/null +++ b/.changeset/verifier-evidence-normalization.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +Add canonical verifier evidence normalization for screenshots and text signals. From 8d387fd34b3fd53135199110234373f56d59f40f Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 14:36:04 -0700 Subject: [PATCH 3/5] fix(core): keep sharp out of verifier install path --- .changeset/verifier-evidence-normalization.md | 2 +- packages/core/lib/v3/verifier/evidence.ts | 30 +++++++++++++++---- packages/core/package.json | 3 +- pnpm-lock.yaml | 3 -- 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/.changeset/verifier-evidence-normalization.md b/.changeset/verifier-evidence-normalization.md index 7f6986a66..b7e98643c 100644 --- a/.changeset/verifier-evidence-normalization.md +++ b/.changeset/verifier-evidence-normalization.md @@ -2,4 +2,4 @@ "@browserbasehq/stagehand": patch --- -Add canonical verifier evidence normalization for screenshots and text signals. +Add canonical verifier evidence normalization for screenshots and text signals without requiring image dependencies in core installs. diff --git a/packages/core/lib/v3/verifier/evidence.ts b/packages/core/lib/v3/verifier/evidence.ts index 1bd67ec06..f949f220f 100644 --- a/packages/core/lib/v3/verifier/evidence.ts +++ b/packages/core/lib/v3/verifier/evidence.ts @@ -1,5 +1,5 @@ /** - * Evidence — Step 1 of fara's MMRubricAgent pipeline (port). + * Evidence — Step 1 of the rubric verifier pipeline. * * Loads probe screenshots from a Trajectory (file path or in-memory Buffer), * deduplicates near-identical frames using a quick MSE + SSIM dissimilarity @@ -16,7 +16,7 @@ * - VERIFIER_MSE_THRESHOLD (default 30) — frames with MSE < threshold * short-circuit to "duplicate" without running SSIM. * - VERIFIER_IMAGE_RESIZE (default 0.7) — scale factor applied before - * relevance scoring (matches fara's pre-resize for token economy). + * relevance scoring. * * Architectural notes: * - This module never touches a live browser. It reads screenshots from @@ -34,9 +34,29 @@ import type { Trajectory } from "./trajectory.js"; // Lazy-loaded `sharp` namespace. When `sharp` is not installed, we fall back -// to keep-everything-at-native-size. Stored as `unknown` because the real -// types live in an optional dep — see loadSharp(). -type Sharp = typeof import("sharp"); +// to keep-everything-at-native-size. Keep this structural so core does not +// need to publish sharp as a runtime dependency. +interface SharpImage { + metadata(): Promise<{ width?: number; height?: number }>; + resize( + width: number, + height?: number, + options?: { fit?: string; kernel?: unknown }, + ): SharpImage; + resize(options: { width: number; height: number }): SharpImage; + raw(): SharpImage; + grayscale(): SharpImage; + png(options?: { + compressionLevel?: number; + adaptiveFiltering?: boolean; + palette?: boolean; + }): SharpImage; + toBuffer(): Promise; +} + +type Sharp = ((input: Buffer) => SharpImage) & { + kernel: { lanczos3: unknown }; +}; let sharpPromise: Promise | null = null; async function loadSharp(): Promise { diff --git a/packages/core/package.json b/packages/core/package.json index ff6c043e9..d16c07465 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -109,8 +109,7 @@ "@ai-sdk/xai": "^2.0.26", "bufferutil": "^4.0.9", "chrome-launcher": "^1.2.0", - "ollama-ai-provider-v2": "^1.5.0", - "sharp": "^0.34.5" + "ollama-ai-provider-v2": "^1.5.0" }, "devDependencies": { "@playwright/test": "^1.55.1", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index febdcd30a..b82d41e5f 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -216,9 +216,6 @@ importers: ollama-ai-provider-v2: specifier: ^1.5.0 version: 1.5.0(zod@4.2.1) - sharp: - specifier: ^0.34.5 - version: 0.34.5 devDependencies: '@playwright/test': specifier: ^1.55.1 From 6dae7da0bbbb9c188f5c9755553c3e4c1ce875cd Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 15:30:01 -0700 Subject: [PATCH 4/5] refactor(verifier): consolidate evidence types --- packages/core/lib/v3/index.ts | 5 ++ packages/core/lib/v3/verifier/evidence.ts | 89 ++++------------------- packages/core/lib/v3/verifier/index.ts | 5 ++ packages/core/lib/v3/verifier/types.ts | 72 ++++++++++++++++++ 4 files changed, 98 insertions(+), 73 deletions(-) diff --git a/packages/core/lib/v3/index.ts b/packages/core/lib/v3/index.ts index 8e21fb030..262827c39 100644 --- a/packages/core/lib/v3/index.ts +++ b/packages/core/lib/v3/index.ts @@ -79,6 +79,11 @@ export type { RubricCriterion, AgentEvidence, AgentEvidenceModality, + CanonicalEvidence, + CanonicalScreenshot, + CanonicalTextEvidence, + EvidenceLoadOptions, + EvidenceLoadResult, ProbeEvidence, ToolOutput, Verifier, diff --git a/packages/core/lib/v3/verifier/evidence.ts b/packages/core/lib/v3/verifier/evidence.ts index f949f220f..85168d48d 100644 --- a/packages/core/lib/v3/verifier/evidence.ts +++ b/packages/core/lib/v3/verifier/evidence.ts @@ -31,7 +31,22 @@ * "Screenshot N — step=K, action=..."), preserving the rubric's link * between visual evidence and the action history. */ -import type { Trajectory } from "./trajectory.js"; +import type { + CanonicalEvidence, + CanonicalScreenshot, + CanonicalTextEvidence, + EvidenceLoadOptions, + EvidenceLoadResult, + Trajectory, +} from "./types.js"; + +export type { + CanonicalEvidence, + CanonicalScreenshot, + CanonicalTextEvidence, + EvidenceLoadOptions, + EvidenceLoadResult, +} from "./types.js"; // Lazy-loaded `sharp` namespace. When `sharp` is not installed, we fall back // to keep-everything-at-native-size. Keep this structural so core does not @@ -79,45 +94,6 @@ const DEFAULT_SSIM_THRESHOLD = 0.75; const DEFAULT_MSE_THRESHOLD = 30; const DEFAULT_IMAGE_RESIZE = 0.7; -/** A single screenshot kept by Step 1, ready for downstream relevance scoring. */ -export interface CanonicalScreenshot { - /** 0-based position in the kept-screenshots array. Stable across the pipeline. */ - canonicalIndex: number; - /** - * Trajectory step index this screenshot came from. Matches - * `Trajectory.steps[i].index`. Lets downstream prompts cross-reference the - * action history. - */ - originalStepIndex: number; - /** Position of the step in `Trajectory.steps` (0..steps.length-1). */ - trajectoryStepPosition: number; - /** The resized PNG/JPEG buffer (or native bytes if sharp unavailable). */ - bytes: Buffer; - /** MIME media type. Always "image/png" after the (optional) resize. */ - mediaType: string; - /** Reason this frame was kept: "first" / "last" / "diverges". */ - keptReason: "first" | "last" | "diverges" | "no-dedup"; -} - -/** - * A text evidence point sourced from tier-2 probes or tier-1 tool outputs. - * These feed the same relevance + scoring path as screenshots, letting DOM - * and hybrid agents preserve extract/aria/tool-return evidence without a - * separate verifier architecture. - */ -export interface CanonicalTextEvidence { - /** 0-based position in the combined evidence-point array. */ - canonicalIndex: number; - originalStepIndex: number; - trajectoryStepPosition: number; - /** Where the text came from. */ - source: "probe-aria" | "agent-text" | "agent-json" | "tool-output"; - /** The text payload, already truncated. */ - content: string; -} - -export type CanonicalEvidence = CanonicalScreenshot | CanonicalTextEvidence; - /** Discriminator helpers — kind === "image" for screenshots. */ export function isImageEvidence( e: CanonicalEvidence, @@ -133,39 +109,6 @@ export function isTextEvidence( ); } -/** Result of Step 1. */ -export interface EvidenceLoadResult { - /** Kept frames, in chronological order. */ - screenshots: CanonicalScreenshot[]; - /** - * Maps `Trajectory.steps[i].index` → canonical index in `screenshots`. Step - * indices that were deduplicated point to the surviving canonical frame - * (typically the prior kept frame). Useful for "find me the screenshot for - * step K" lookups in downstream prompts. - */ - stepIndexToCanonical: Map; - /** Number of original frames considered. */ - originalCount: number; - /** Number of frames kept post-dedup (== screenshots.length). */ - keptCount: number; - /** Effective thresholds used (resolved from env). */ - thresholds: { - ssim: number; - mse: number; - resize: number; - }; -} - -/** Options for {@link loadAndReduceScreenshots}. Mainly env override hooks for tests. */ -export interface EvidenceLoadOptions { - /** Override VERIFIER_SSIM_THRESHOLD. */ - ssimThreshold?: number; - /** Override VERIFIER_MSE_THRESHOLD. */ - mseThreshold?: number; - /** Override VERIFIER_IMAGE_RESIZE. */ - imageResize?: number; -} - /** * Step 1 — load trajectory screenshots from disk (or memory), deduplicate, * and downsize. diff --git a/packages/core/lib/v3/verifier/index.ts b/packages/core/lib/v3/verifier/index.ts index 4061533ab..6179adc7a 100644 --- a/packages/core/lib/v3/verifier/index.ts +++ b/packages/core/lib/v3/verifier/index.ts @@ -4,8 +4,13 @@ export type { AgentEvidence, AgentEvidenceModality, + CanonicalEvidence, + CanonicalScreenshot, + CanonicalTextEvidence, CriterionScore, EvaluationResult, + EvidenceLoadOptions, + EvidenceLoadResult, FirstPointOfFailure, ProbeEvidence, Rubric, diff --git a/packages/core/lib/v3/verifier/types.ts b/packages/core/lib/v3/verifier/types.ts index 88b7e275e..e4c3b977e 100644 --- a/packages/core/lib/v3/verifier/types.ts +++ b/packages/core/lib/v3/verifier/types.ts @@ -151,6 +151,78 @@ export interface Trajectory { timing: { startedAt: string; endedAt: string }; } +/** A single screenshot kept by Step 1, ready for downstream relevance scoring. */ +export interface CanonicalScreenshot { + /** 0-based position in the kept-screenshots array. Stable across the pipeline. */ + canonicalIndex: number; + /** + * Trajectory step index this screenshot came from. Matches + * `Trajectory.steps[i].index`. Lets downstream prompts cross-reference the + * action history. + */ + originalStepIndex: number; + /** Position of the step in `Trajectory.steps` (0..steps.length-1). */ + trajectoryStepPosition: number; + /** The resized PNG/JPEG buffer (or native bytes if sharp unavailable). */ + bytes: Buffer; + /** MIME media type. Always "image/png" after the optional resize. */ + mediaType: string; + /** Reason this frame was kept: "first" / "last" / "diverges". */ + keptReason: "first" | "last" | "diverges" | "no-dedup"; +} + +/** + * A text evidence point sourced from tier-2 probes or tier-1 tool outputs. + * These feed the same relevance + scoring path as screenshots, letting DOM + * and hybrid agents preserve extract/aria/tool-return evidence without a + * separate verifier architecture. + */ +export interface CanonicalTextEvidence { + /** 0-based position in the combined evidence-point array. */ + canonicalIndex: number; + originalStepIndex: number; + trajectoryStepPosition: number; + /** Where the text came from. */ + source: "probe-aria" | "agent-text" | "agent-json" | "tool-output"; + /** The text payload, already truncated. */ + content: string; +} + +export type CanonicalEvidence = CanonicalScreenshot | CanonicalTextEvidence; + +/** Result of Step 1 evidence loading. */ +export interface EvidenceLoadResult { + /** Kept frames, in chronological order. */ + screenshots: CanonicalScreenshot[]; + /** + * Maps `Trajectory.steps[i].index` → canonical index in `screenshots`. Step + * indices that were deduplicated point to the surviving canonical frame + * (typically the prior kept frame). Useful for "find me the screenshot for + * step K" lookups in downstream prompts. + */ + stepIndexToCanonical: Map; + /** Number of original frames considered. */ + originalCount: number; + /** Number of frames kept post-dedup (== screenshots.length). */ + keptCount: number; + /** Effective thresholds used (resolved from env). */ + thresholds: { + ssim: number; + mse: number; + resize: number; + }; +} + +/** Options for evidence loading. Mainly env override hooks for tests. */ +export interface EvidenceLoadOptions { + /** Override VERIFIER_SSIM_THRESHOLD. */ + ssimThreshold?: number; + /** Override VERIFIER_MSE_THRESHOLD. */ + mseThreshold?: number; + /** Override VERIFIER_IMAGE_RESIZE. */ + imageResize?: number; +} + /** Score for a single rubric criterion after evidence analysis + rescoring. */ export interface CriterionScore { /** Matches RubricCriterion.criterion (the criterion's short name). */ From 4e9c26eaef05ce9720a6b6a0e914647b8e82c451 Mon Sep 17 00:00:00 2001 From: miguel Date: Fri, 15 May 2026 16:23:43 -0700 Subject: [PATCH 5/5] refactor(verifier): keep evidence types in types module --- packages/core/lib/v3/verifier/evidence.ts | 8 -------- 1 file changed, 8 deletions(-) diff --git a/packages/core/lib/v3/verifier/evidence.ts b/packages/core/lib/v3/verifier/evidence.ts index 85168d48d..fcb412d80 100644 --- a/packages/core/lib/v3/verifier/evidence.ts +++ b/packages/core/lib/v3/verifier/evidence.ts @@ -40,14 +40,6 @@ import type { Trajectory, } from "./types.js"; -export type { - CanonicalEvidence, - CanonicalScreenshot, - CanonicalTextEvidence, - EvidenceLoadOptions, - EvidenceLoadResult, -} from "./types.js"; - // Lazy-loaded `sharp` namespace. When `sharp` is not installed, we fall back // to keep-everything-at-native-size. Keep this structural so core does not // need to publish sharp as a runtime dependency.