Skip to content

Commit 47b5bac

Browse files
authored
Merge pull request #146 from SentienceAPI/bench_gaps
vision executor + enrich form actions
2 parents 2ddb702 + c351f60 commit 47b5bac

File tree

9 files changed

+841
-6
lines changed

9 files changed

+841
-6
lines changed

src/actions.ts

Lines changed: 522 additions & 0 deletions
Large diffs are not rendered by default.

src/agent-runtime.ts

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@
3838
* ```
3939
*/
4040

41+
import * as fs from 'fs';
42+
import * as path from 'path';
4143
import { Page } from 'playwright';
4244
import { Snapshot } from './types';
4345
import { AssertContext, Predicate } from './verification';
@@ -336,6 +338,8 @@ export class AgentRuntime {
336338
stepIndex: number = 0;
337339
/** Most recent snapshot (for assertion context) */
338340
lastSnapshot: Snapshot | null = null;
341+
/** Best-effort download records (Playwright downloads) */
342+
private downloads: Array<Record<string, any>> = [];
339343

340344
/** Failure artifact buffer (Phase 1) */
341345
private artifactBuffer: FailureArtifactBuffer | null = null;
@@ -438,6 +442,15 @@ export class AgentRuntime {
438442
this.browser = browser;
439443
this.page = page;
440444
this.tracer = tracer;
445+
446+
// Best-effort download tracking (does not change behavior unless a download occurs).
447+
try {
448+
this.page.on('download', download => {
449+
void this.trackDownload(download);
450+
});
451+
} catch {
452+
// ignore
453+
}
441454
}
442455

443456
/**
@@ -466,9 +479,48 @@ export class AgentRuntime {
466479
snapshot: this.lastSnapshot,
467480
url,
468481
stepId: this.stepId,
482+
downloads: this.downloads,
469483
};
470484
}
471485

486+
/**
 * Record a download in `this.downloads` and update that record in place once
 * the download's file path resolves.
 *
 * The record starts with `status: 'started'` and becomes either `'completed'`
 * (with `path`, plus best-effort `size_bytes` / `mime_type`) or `'failed'`
 * (with `error`). Tracking is best-effort: the page `'download'` listener
 * calls this without awaiting it, so a malformed download object must not turn
 * into an unhandled promise rejection.
 *
 * @param download - Download-like object. Metadata is read from accessor
 *   methods (`suggestedFilename()`, `url()`) when they are functions, or from
 *   plain properties (`suggested_filename`, `url`) otherwise.
 */
private async trackDownload(download: any): Promise<void> {
  // `obj.member?.()` only guards against null/undefined: it throws when the
  // member is a plain (non-function) value. Dispatch on the runtime type instead,
  // and never let a metadata read abort tracking.
  const readMember = (name: string): unknown => {
    try {
      const member = download?.[name];
      return typeof member === 'function' ? member.call(download) : member;
    } catch {
      return undefined;
    }
  };

  const rec: Record<string, any> = {
    status: 'started',
    suggested_filename: readMember('suggestedFilename') ?? readMember('suggested_filename'),
    url: readMember('url'),
  };
  this.downloads.push(rec);

  try {
    const p = (await download.path?.()) as string | null;
    rec.status = 'completed';
    if (p) {
      rec.path = p;
      try {
        // Best-effort size and mime type (no new deps).
        rec.size_bytes = Number(fs.statSync(p).size);
        const ext = String(path.extname(p) || '').toLowerCase();
        const mimeByExt: Record<string, string> = {
          '.pdf': 'application/pdf',
          '.txt': 'text/plain',
          '.csv': 'text/csv',
          '.json': 'application/json',
          '.zip': 'application/zip',
          '.png': 'image/png',
          '.jpg': 'image/jpeg',
          '.jpeg': 'image/jpeg',
          '.webp': 'image/webp',
        };
        const mime = mimeByExt[ext];
        if (mime) rec.mime_type = mime;
      } catch {
        // Size/mime enrichment is optional; the download itself still completed.
      }
    }
  } catch (e: any) {
    rec.status = 'failed';
    rec.error = String(e?.message ?? e);
  }
}
523+
472524
/**
473525
* Take a snapshot of the current page state.
474526
*

src/index.ts

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,21 @@
55
export { SentienceBrowser } from './browser';
66
export { snapshot, SnapshotOptions } from './snapshot';
77
export { query, find, parseSelector } from './query';
8-
export { click, typeText, press, scrollTo, clickRect, ClickRect } from './actions';
8+
export {
9+
back,
10+
check,
11+
clear,
12+
click,
13+
clickRect,
14+
ClickRect,
15+
press,
16+
scrollTo,
17+
selectOption,
18+
submit,
19+
typeText,
20+
uncheck,
21+
uploadFile,
22+
} from './actions';
923
export { CursorPolicy, CursorMode, CursorMovementMetadata, CursorPathPoint } from './cursor-policy';
1024
export { waitFor } from './wait';
1125
export { expect, Expectation } from './expect';
@@ -52,6 +66,7 @@ export {
5266
AssertOutcome,
5367
AssertContext,
5468
Predicate,
69+
downloadCompleted,
5570
urlMatches,
5671
urlContains,
5772
exists,
@@ -72,6 +87,7 @@ export {
7287
export { AgentRuntime, AssertionHandle, AssertionRecord, EventuallyOptions } from './agent-runtime';
7388
export { RuntimeAgent } from './runtime-agent';
7489
export type { RuntimeStep, StepVerification } from './runtime-agent';
90+
export { parseVisionExecutorAction, executeVisionExecutorAction } from './vision-executor';
7591
export * from './captcha/types';
7692
export * from './captcha/strategies';
7793

src/snapshot.ts

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
* Snapshot functionality - calls window.sentience.snapshot() or server-side API
33
*/
44

5-
import { SentienceBrowser } from './browser';
65
import { IBrowser } from './protocols/browser-protocol';
76
import { Snapshot } from './types';
87
import * as fs from 'fs';
@@ -212,7 +211,7 @@ async function snapshotViaApi(
212211
() => typeof (window as any).sentience !== 'undefined',
213212
5000
214213
);
215-
} catch (e) {
214+
} catch (_e) {
216215
throw new Error(
217216
'Sentience extension failed to inject. Cannot collect raw data for API processing.'
218217
);
@@ -239,9 +238,15 @@ async function snapshotViaApi(
239238
// Use raw_elements (raw data) instead of elements (processed data)
240239
// Server validates API key and applies proprietary ranking logic
241240
const clientMetrics = rawResult?.diagnostics?.metrics;
242-
const clientDiagnostics = rawResult?.diagnostics?.captcha
243-
? { captcha: rawResult.diagnostics.captcha }
244-
: undefined;
241+
const diag = rawResult?.diagnostics;
242+
const clientDiagnostics =
243+
diag?.captcha || diag?.requires_vision || diag?.requires_vision_reason
244+
? {
245+
captcha: diag?.captcha,
246+
requires_vision: diag?.requires_vision,
247+
requires_vision_reason: diag?.requires_vision_reason,
248+
}
249+
: undefined;
245250
const payload = {
246251
raw_elements: rawResult.raw_elements || [], // Raw data needed for server processing
247252
url: rawResult.url || '',

src/types.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,9 @@ export interface SnapshotDiagnostics {
176176
reasons?: string[];
177177
metrics?: SnapshotDiagnosticsMetrics;
178178
captcha?: CaptchaDiagnostics;
179+
/** P1-01: forward-compatible vision recommendation signal (optional) */
180+
requires_vision?: boolean | null;
181+
requires_vision_reason?: string | null;
179182
}
180183

181184
/**

src/verification.ts

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ export interface AssertContext {
6060
url: string | null;
6161
/** Current step identifier (for trace correlation) */
6262
stepId: string | null;
63+
/** Optional: non-snapshot state signals for verification (e.g., downloads). */
64+
downloads?: Array<Record<string, any>> | null;
6365
}
6466

6567
/**
@@ -68,6 +70,32 @@ export interface AssertContext {
6870
*/
6971
export type Predicate = (ctx: AssertContext) => AssertOutcome;
7072

73+
/**
 * Build a predicate that passes once at least one tracked download has
 * `status === 'completed'`.
 *
 * The predicate reads `AssertContext.downloads`; a missing or null list is
 * treated as empty. The first matching record (in list order) is returned in
 * `details.download`.
 *
 * @param filenameSubstring - Optional filter. When non-empty, the record's
 *   `filename` (or, if absent, `suggested_filename`) must contain it.
 * @returns A predicate reporting whether a matching completed download exists.
 */
export function downloadCompleted(filenameSubstring?: string): Predicate {
  const isMatch = (entry: Record<string, any>): boolean => {
    const state = String(entry?.status ?? '');
    if (state !== 'completed') {
      return false;
    }
    const name = String(entry?.filename ?? entry?.suggested_filename ?? '');
    return !filenameSubstring || name.includes(filenameSubstring);
  };

  return (ctx: AssertContext): AssertOutcome => {
    const downloads = ctx.downloads ?? [];
    const hit = downloads.find(isMatch);

    if (hit !== undefined) {
      return { passed: true, reason: '', details: { download: hit } };
    }

    let reason = 'no completed downloads';
    if (filenameSubstring) {
      reason = `no completed download matched: ${filenameSubstring}`;
    }
    return { passed: false, reason, details: { filenameSubstring, downloads } };
  };
}
98+
7199
/**
72100
* Create a predicate that checks if current URL matches a regex pattern.
73101
*

src/vision-executor.ts

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
/**
2+
* Vision executor primitives (shared parsing/execution helpers).
3+
*
4+
* This is used by higher-level agents when falling back to a vision model to propose
5+
* coordinate-based actions.
6+
*/
7+
8+
/** Names of the actions a vision model may propose. */
export type VisionExecutorActionKind =
  | 'click_xy'
  | 'click_rect'
  | 'press'
  | 'type'
  | 'finish';

/** One parsed vision action: its kind plus the kind-specific arguments. */
export interface VisionExecutorAction {
  /** Which action to perform. */
  kind: VisionExecutorActionKind;
  /** Arguments for the action; the keys present depend on `kind`. */
  args: Record<string, any>;
}
14+
15+
/**
 * Parse a single textual action into a {@link VisionExecutorAction}.
 *
 * Markdown code-fence markers are removed and the remainder is trimmed before
 * matching. Recognised forms (case-insensitive, whitespace-tolerant):
 * `FINISH()`, `PRESS("key")`, `TYPE("text")`, `CLICK_XY(x, y)` and
 * `CLICK_RECT(x, y, w, h)`. Numeric arguments may be negative and/or decimal.
 *
 * @param text - Raw action text.
 * @returns The parsed action.
 * @throws Error if the text matches none of the recognised forms; the message
 *   includes at most the first 200 characters of the cleaned input.
 */
export function parseVisionExecutorAction(text: string): VisionExecutorAction {
  const cleaned = String(text || '')
    .replace(/```[\w]*\n?/g, '')
    .trim();

  // Ordered grammar: each pattern is paired with the builder for its action.
  const grammar: Array<[RegExp, (groups: RegExpMatchArray) => VisionExecutorAction]> = [
    [/^FINISH\s*\(\s*\)\s*$/i, () => ({ kind: 'finish', args: {} })],
    [
      /^PRESS\s*\(\s*["']([^"']+)["']\s*\)\s*$/i,
      groups => ({ kind: 'press', args: { key: groups[1] } }),
    ],
    [
      /^TYPE\s*\(\s*["']([\s\S]*?)["']\s*\)\s*$/i,
      groups => ({ kind: 'type', args: { text: groups[1] } }),
    ],
    [
      /^CLICK_XY\s*\(\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\)\s*$/i,
      groups => ({ kind: 'click_xy', args: { x: Number(groups[1]), y: Number(groups[2]) } }),
    ],
    [
      /^CLICK_RECT\s*\(\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*,\s*(-?\d+(?:\.\d+)?)\s*\)\s*$/i,
      groups => ({
        kind: 'click_rect',
        args: {
          x: Number(groups[1]),
          y: Number(groups[2]),
          w: Number(groups[3]),
          h: Number(groups[4]),
        },
      }),
    ],
  ];

  for (const [pattern, build] of grammar) {
    const groups = cleaned.match(pattern);
    if (groups) {
      return build(groups);
    }
  }

  throw new Error(`unrecognized vision action: ${cleaned.slice(0, 200)}`);
}
42+
43+
/**
 * Execute one parsed vision action against the given backend / page.
 *
 * - `click_xy`   calls `backend.mouse_click(x, y)`.
 * - `click_rect` calls `backend.mouse_click` at the centre of the `x, y, w, h` rectangle.
 * - `press`      calls `page.keyboard.press(key)`; `page` is required.
 * - `type`       calls `backend.type_text(text)`.
 * - `finish`     is a no-op.
 *
 * Arguments are validated before any backend/page call so that a malformed
 * action fails loudly instead of being coerced (a missing `text` would
 * otherwise be typed as the literal string "undefined", and missing
 * coordinates would reach the backend as `NaN`).
 *
 * @param params.backend - Object providing `mouse_click(x, y)` and `type_text(text)`.
 * @param params.page - Object providing `keyboard.press(key)`; only needed for `press`.
 * @param params.action - The action to execute.
 * @throws Error if `press` is requested without a page, if a required argument
 *   is missing or not usable, or if `action.kind` is not a known kind.
 */
export async function executeVisionExecutorAction(params: {
  backend: any;
  page?: any;
  action: VisionExecutorAction;
}): Promise<void> {
  const { backend, page, action } = params;
  const args: Record<string, any> = action.args ?? {};

  // Number(null) and Number('') are both 0, so reject those explicitly rather
  // than silently clicking at the origin.
  const coordinate = (name: string): number => {
    const raw = args[name];
    const value = raw == null || raw === '' ? NaN : Number(raw);
    if (!Number.isFinite(value)) {
      throw new Error(`${action.kind} requires a finite numeric "${name}"`);
    }
    return value;
  };

  switch (action.kind) {
    case 'click_xy': {
      const x = coordinate('x');
      const y = coordinate('y');
      await backend.mouse_click(x, y);
      return;
    }

    case 'click_rect': {
      const x = coordinate('x');
      const y = coordinate('y');
      const w = coordinate('w');
      const h = coordinate('h');
      await backend.mouse_click(x + w / 2, y + h / 2);
      return;
    }

    case 'press': {
      if (!page) throw new Error('PRESS requires a Playwright page');
      if (args.key == null || String(args.key) === '') {
        throw new Error('PRESS requires a non-empty "key"');
      }
      await page.keyboard.press(String(args.key));
      return;
    }

    case 'type': {
      // An empty string is a legitimate value; only a missing one is an error.
      if (args.text == null) throw new Error('TYPE requires "text"');
      await backend.type_text(String(args.text));
      return;
    }

    case 'finish':
      return;

    default:
      throw new Error(`unknown vision action kind: ${(action as any).kind}`);
  }
}

0 commit comments

Comments
 (0)