Merge pull request #146 from SentienceAPI/bench_gaps

rcholic · web-flow · commit 47b5bac95729 · 2026-01-22T07:40:56.000-08:00
vision executor + enrich form actions
diff --git a/src/actions.ts b/src/actions.ts
diff --git a/src/agent-runtime.ts b/src/agent-runtime.ts
@@ -38,6 +38,8 @@
  * ```
  */
 
+import * as fs from 'fs';
+import * as path from 'path';
 import { Page } from 'playwright';
 import { Snapshot } from './types';
 import { AssertContext, Predicate } from './verification';
@@ -336,6 +338,8 @@ export class AgentRuntime {
   stepIndex: number = 0;
   /** Most recent snapshot (for assertion context) */
   lastSnapshot: Snapshot | null = null;
+  /** Best-effort download records (Playwright downloads) */
+  private downloads: Array<Record<string, any>> = [];
 
   /** Failure artifact buffer (Phase 1) */
   private artifactBuffer: FailureArtifactBuffer | null = null;
@@ -438,6 +442,15 @@ export class AgentRuntime {
     this.browser = browser;
     this.page = page;
     this.tracer = tracer;
+
+    // Best-effort download tracking (does not change behavior unless a download occurs).
+    try {
+      this.page.on('download', download => {
+        void this.trackDownload(download);
+      });
+    } catch {
+      // ignore
+    }
   }
 
   /**
@@ -466,9 +479,48 @@ export class AgentRuntime {
       snapshot: this.lastSnapshot,
       url,
       stepId: this.stepId,
+      downloads: this.downloads,
     };
   }
 
+  /**
+   * Record a browser download and keep the record up to date as it settles.
+   *
+   * A record is appended to `this.downloads` immediately with status
+   * `'started'` and then mutated in place, so anything already holding the
+   * array (e.g. an assertion context) observes the later status change.
+   *
+   * Record fields written here: `status` (`'started'` | `'completed'` |
+   * `'failed'`), `suggested_filename`, `url`, and, when available, `path`,
+   * `size_bytes`, `mime_type`, `error`.
+   *
+   * @param download - Download-like object. Playwright's method-style API
+   *   (`suggestedFilename()`, `url()`, `path()`) is tried first, with plain
+   *   `suggested_filename` / `url` properties as a fallback.
+   * @returns Resolves once the record has reached a final status; never rejects
+   *   for a failed download (the failure is stored on the record instead).
+   */
+  private async trackDownload(download: any): Promise<void> {
+    const rec: Record<string, any> = {
+      status: 'started',
+      suggested_filename: download?.suggestedFilename?.() ?? download?.suggested_filename,
+      url: download?.url?.() ?? download?.url,
+    };
+    this.downloads.push(rec);
+    try {
+      // A rejection from path() lands in the outer catch and marks the record as failed.
+      const p = (await download.path?.()) as string | null;
+      // NOTE(review): the record is also marked completed when no path is returned.
+      rec.status = 'completed';
+      if (p) {
+        rec.path = p;
+        try {
+          // Best-effort size (no new deps).
+          rec.size_bytes = Number(fs.statSync(p).size);
+        } catch {
+          // ignore: size is optional metadata
+        }
+        // Playwright saves downloads under a generated (GUID) file name that has
+        // no extension, so take the extension from the suggested filename first
+        // and only fall back to the on-disk path.
+        const suggestedExt =
+          typeof rec.suggested_filename === 'string' ? path.extname(rec.suggested_filename) : '';
+        const ext = (suggestedExt || path.extname(p)).toLowerCase();
+        const mimeByExt: Record<string, string> = {
+          '.pdf': 'application/pdf',
+          '.txt': 'text/plain',
+          '.csv': 'text/csv',
+          '.json': 'application/json',
+          '.zip': 'application/zip',
+          '.png': 'image/png',
+          '.jpg': 'image/jpeg',
+          '.jpeg': 'image/jpeg',
+          '.webp': 'image/webp',
+        };
+        const mime = mimeByExt[ext];
+        if (mime) rec.mime_type = mime;
+      }
+    } catch (e: any) {
+      rec.status = 'failed';
+      rec.error = String(e?.message ?? e);
+    }
+  }
+
   /**
    * Take a snapshot of the current page state.
    *
diff --git a/src/index.ts b/src/index.ts
@@ -5,7 +5,21 @@
 export { SentienceBrowser } from './browser';
 export { snapshot, SnapshotOptions } from './snapshot';
 export { query, find, parseSelector } from './query';
-export { click, typeText, press, scrollTo, clickRect, ClickRect } from './actions';
+export {
+  back,
+  check,
+  clear,
+  click,
+  clickRect,
+  ClickRect,
+  press,
+  scrollTo,
+  selectOption,
+  submit,
+  typeText,
+  uncheck,
+  uploadFile,
+} from './actions';
 export { CursorPolicy, CursorMode, CursorMovementMetadata, CursorPathPoint } from './cursor-policy';
 export { waitFor } from './wait';
 export { expect, Expectation } from './expect';
@@ -52,6 +66,7 @@ export {
   AssertOutcome,
   AssertContext,
   Predicate,
+  downloadCompleted,
   urlMatches,
   urlContains,
   exists,
@@ -72,6 +87,7 @@ export {
 export { AgentRuntime, AssertionHandle, AssertionRecord, EventuallyOptions } from './agent-runtime';
 export { RuntimeAgent } from './runtime-agent';
 export type { RuntimeStep, StepVerification } from './runtime-agent';
+export { parseVisionExecutorAction, executeVisionExecutorAction } from './vision-executor';
 export * from './captcha/types';
 export * from './captcha/strategies';
 
diff --git a/src/snapshot.ts b/src/snapshot.ts
@@ -2,7 +2,6 @@
  * Snapshot functionality - calls window.sentience.snapshot() or server-side API
  */
 
-import { SentienceBrowser } from './browser';
 import { IBrowser } from './protocols/browser-protocol';
 import { Snapshot } from './types';
 import * as fs from 'fs';
@@ -212,7 +211,7 @@ async function snapshotViaApi(
       () => typeof (window as any).sentience !== 'undefined',
       5000
     );
-  } catch (e) {
+  } catch (_e) {
     throw new Error(
       'Sentience extension failed to inject. Cannot collect raw data for API processing.'
     );
@@ -239,9 +238,15 @@ async function snapshotViaApi(
   // Use raw_elements (raw data) instead of elements (processed data)
   // Server validates API key and applies proprietary ranking logic
   const clientMetrics = rawResult?.diagnostics?.metrics;
-  const clientDiagnostics = rawResult?.diagnostics?.captcha
-    ? { captcha: rawResult.diagnostics.captcha }
-    : undefined;
+  const diag = rawResult?.diagnostics;
+  const clientDiagnostics =
+    diag?.captcha || diag?.requires_vision || diag?.requires_vision_reason
+      ? {
+          captcha: diag?.captcha,
+          requires_vision: diag?.requires_vision,
+          requires_vision_reason: diag?.requires_vision_reason,
+        }
+      : undefined;
   const payload = {
     raw_elements: rawResult.raw_elements || [], // Raw data needed for server processing
     url: rawResult.url || '',
diff --git a/src/types.ts b/src/types.ts
@@ -176,6 +176,9 @@ export interface SnapshotDiagnostics {
   reasons?: string[];
   metrics?: SnapshotDiagnosticsMetrics;
   captcha?: CaptchaDiagnostics;
+  /** P1-01: forward-compatible vision recommendation signal (optional) */
+  requires_vision?: boolean | null;
+  requires_vision_reason?: string | null;
 }
 
 /**
diff --git a/src/verification.ts b/src/verification.ts
@@ -60,6 +60,8 @@ export interface AssertContext {
   url: string | null;
   /** Current step identifier (for trace correlation) */
   stepId: string | null;
+  /** Optional: non-snapshot state signals for verification (e.g., downloads). */
+  downloads?: Array<Record<string, any>> | null;
 }
 
 /**
@@ -68,6 +70,32 @@ export interface AssertContext {
  */
 export type Predicate = (ctx: AssertContext) => AssertOutcome;
 
+/**
+ * Build a predicate that succeeds once at least one download is marked completed.
+ *
+ * The predicate reads `AssertContext.downloads` (treated as empty when missing),
+ * so it can only pass if the runtime/browser populates that list.
+ *
+ * @param filenameSubstring - Optional text that must occur in the download's
+ *   `filename` (or, failing that, `suggested_filename`). When omitted or empty,
+ *   any completed download satisfies the predicate.
+ * @returns A predicate whose passing outcome carries the matching record in
+ *   `details.download`; the failing outcome carries the filter and the full list.
+ */
+export function downloadCompleted(filenameSubstring?: string): Predicate {
+  return (ctx: AssertContext): AssertOutcome => {
+    const downloads = ctx.downloads ?? [];
+
+    // First completed record whose name satisfies the optional filter.
+    const match = downloads.find(entry => {
+      const isCompleted = String(entry?.status ?? '') === 'completed';
+      if (!isCompleted) return false;
+      const name = String(entry?.filename ?? entry?.suggested_filename ?? '');
+      return !filenameSubstring || name.includes(filenameSubstring);
+    });
+
+    if (match !== undefined) {
+      return { passed: true, reason: '', details: { download: match } };
+    }
+
+    const reason = filenameSubstring
+      ? `no completed download matched: ${filenameSubstring}`
+      : 'no completed downloads';
+    return { passed: false, reason, details: { filenameSubstring, downloads } };
+  };
+}
+
 /**
  * Create a predicate that checks if current URL matches a regex pattern.
  *
diff --git a/src/vision-executor.ts b/src/vision-executor.ts
@@ -0,0 +1,76 @@
+/**
+ * Vision executor primitives (shared parsing/execution helpers).
+ *
+ * This is used by higher-level agents when falling back to a vision model to propose
+ * coordinate-based actions.
+ */
+
+/**
+ * Discriminator for the actions a vision model may propose. Each value maps to
+ * one textual form accepted by `parseVisionExecutorAction`:
+ * `CLICK_XY(...)`, `CLICK_RECT(...)`, `PRESS(...)`, `TYPE(...)`, `FINISH()`.
+ */
+export type VisionExecutorActionKind = 'click_xy' | 'click_rect' | 'press' | 'type' | 'finish';
+
+/**
+ * A parsed vision action: its kind plus the arguments extracted for it.
+ *
+ * As produced by `parseVisionExecutorAction`, `args` holds:
+ * - `click_xy`: `{ x, y }` (numbers)
+ * - `click_rect`: `{ x, y, w, h }` (numbers)
+ * - `press`: `{ key }` (string)
+ * - `type`: `{ text }` (string)
+ * - `finish`: `{}`
+ */
+export interface VisionExecutorAction {
+  kind: VisionExecutorActionKind;
+  args: Record<string, any>;
+}
+
+/**
+ * Turn a vision model's textual reply into a structured action.
+ *
+ * Markdown code-fence markers are removed and the text is trimmed before
+ * matching. The whole remaining text must be exactly one of (case-insensitive):
+ * `FINISH()`, `PRESS("key")`, `TYPE("text")`, `CLICK_XY(x, y)`,
+ * `CLICK_RECT(x, y, w, h)`. String arguments accept single or double quotes;
+ * numeric arguments accept an optional sign and decimal part.
+ *
+ * @param text - Raw model output; falsy values are treated as an empty string.
+ * @returns The parsed action with its arguments.
+ * @throws Error when the text matches none of the supported forms (the message
+ *   includes at most the first 200 characters of the cleaned text).
+ */
+export function parseVisionExecutorAction(text: string): VisionExecutorAction {
+  const cleaned = String(text || '')
+    .replace(/```[\w]*\n?/g, '')
+    .trim();
+
+  // Ordered matcher table: the first pattern that matches wins.
+  const rules: Array<{
+    pattern: RegExp;
+    build: (groups: RegExpMatchArray) => VisionExecutorAction;
+  }> = [
+    {
+      pattern: /^FINISH\s*\(\s*\)\s*$/i,
+      build: () => ({ kind: 'finish', args: {} }),
+    },
+    {
+      pattern: /^PRESS\s*\(\s*["']([^"']+)["']\s*\)\s*$/i,
+      build: groups => ({ kind: 'press', args: { key: groups[1] } }),
+    },
+    {
+      pattern: /^TYPE\s*\(\s*["']([\s\S]*?)["']\s*\)\s*$/i,
+      build: groups => ({ kind: 'type', args: { text: groups[1] } }),
+    },
+    {
+      pattern: /^CLICK_XY\s*\(\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\)\s*$/i,
+      build: groups => ({
+        kind: 'click_xy',
+        args: { x: Number(groups[1]), y: Number(groups[2]) },
+      }),
+    },
+    {
+      pattern:
+        /^CLICK_RECT\s*\(\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\)\s*$/i,
+      build: groups => ({
+        kind: 'click_rect',
+        args: {
+          x: Number(groups[1]),
+          y: Number(groups[2]),
+          w: Number(groups[3]),
+          h: Number(groups[4]),
+        },
+      }),
+    },
+  ];
+
+  for (const { pattern, build } of rules) {
+    const groups = cleaned.match(pattern);
+    if (groups) return build(groups);
+  }
+
+  throw new Error(`unrecognized vision action: ${cleaned.slice(0, 200)}`);
+}
+
+/**
+ * Carry out a parsed vision action.
+ *
+ * - `click_xy` calls `backend.mouse_click(x, y)`.
+ * - `click_rect` calls `backend.mouse_click` at the rectangle's center
+ *   (`x + w / 2`, `y + h / 2`).
+ * - `press` calls `page.keyboard.press(key)` and therefore needs `page`.
+ * - `type` calls `backend.type_text(text)`.
+ * - `finish` does nothing.
+ *
+ * Arguments are coerced with `Number(...)` / `String(...)` before use.
+ *
+ * @param params - `backend` (must provide `mouse_click` and `type_text` for the
+ *   kinds that use them), optional `page`, and the `action` to run.
+ * @throws Error if a `press` action is given without `page`, or if the action
+ *   kind is not one of the supported kinds.
+ */
+export async function executeVisionExecutorAction(params: {
+  backend: any;
+  page?: any;
+  action: VisionExecutorAction;
+}): Promise<void> {
+  const { action } = params;
+  const args = action.args;
+
+  switch (action.kind) {
+    case 'click_xy':
+      await params.backend.mouse_click(Number(args.x), Number(args.y));
+      return;
+
+    case 'click_rect': {
+      // Click the geometric center of the rectangle.
+      const centerX = Number(args.x) + Number(args.w) / 2;
+      const centerY = Number(args.y) + Number(args.h) / 2;
+      await params.backend.mouse_click(centerX, centerY);
+      return;
+    }
+
+    case 'press':
+      if (!params.page) throw new Error('PRESS requires a Playwright page');
+      await params.page.keyboard.press(String(args.key));
+      return;
+
+    case 'type':
+      await params.backend.type_text(String(args.text));
+      return;
+
+    case 'finish':
+      return;
+
+    default:
+      throw new Error(`unknown vision action kind: ${(action as any).kind}`);
+  }
+}
diff --git a/tests/actions.test.ts b/tests/actions.test.ts
diff --git a/tests/verification.test.ts b/tests/verification.test.ts

Original file line number	Diff line number	Diff line change
`@@ -176,6 +176,9 @@ export interface SnapshotDiagnostics {`
`176`	`176`	`reasons?: string[];`
`177`	`177`	`metrics?: SnapshotDiagnosticsMetrics;`
`178`	`178`	`captcha?: CaptchaDiagnostics;`
	`179`	`+ /** P1-01: forward-compatible vision recommendation signal (optional) */`
	`180`	`+ requires_vision?: boolean \| null;`
	`181`	`+ requires_vision_reason?: string \| null;`
`179`	`182`	`}`
`180`	`183`
`181`	`184`	`/**`