-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsentience_context.py
More file actions
480 lines (392 loc) · 16.8 KB
/
sentience_context.py
File metadata and controls
480 lines (392 loc) · 16.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
"""
SentienceContext: Token-Slasher Context Middleware for browser-use.
This module provides a compact, ranked DOM context block for browser-use agents,
reducing tokens and improving reliability by using Sentience snapshots.
Example usage:
from browser_use import Agent
from predicate.backends import PredicateContext
ctx = PredicateContext(show_overlay=True)
state = await ctx.build(agent.browser_session, goal="Click the first Show HN post")
if state:
agent.add_context(state.prompt_block) # or however browser-use injects state
"""
from __future__ import annotations
import asyncio
import logging
import re
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any
from urllib.parse import urlparse
from ..constants import PREDICATE_API_URL
if TYPE_CHECKING:
from ..models import Element, Snapshot
logger = logging.getLogger(__name__)
@dataclass
class SentienceContextState:
"""Sentience context state with snapshot and formatted prompt block."""
url: str
snapshot: Snapshot
prompt_block: str
@dataclass
class TopElementSelector:
"""
Configuration for element selection strategy.
The selector uses a 3-way merge to pick elements for the LLM context:
1. Top N by importance score (most actionable elements)
2. Top N from dominant group (for ordinal tasks like "click 3rd item")
3. Top N by position (elements at top of page, lowest doc_y)
Elements are deduplicated across all three sources.
"""
by_importance: int = 60
"""Number of top elements to select by importance score (descending)."""
from_dominant_group: int = 15
"""Number of top elements to select from the dominant group (for ordinal tasks)."""
by_position: int = 10
"""Number of top elements to select by position (lowest doc_y = top of page)."""
class SentienceContext:
"""
Token-Slasher Context Middleware for browser-use.
Creates a compact, ranked DOM context block using Sentience snapshots,
reducing tokens and improving reliability for LLM-based browser agents.
Example:
from browser_use import Agent
from predicate.backends import SentienceContext
ctx = SentienceContext(show_overlay=True)
state = await ctx.build(agent.browser_session, goal="Click the first Show HN post")
if state:
agent.add_context(state.prompt_block)
"""
# Sentience API endpoint
API_URL = PREDICATE_API_URL
def __init__(
self,
*,
predicate_api_key: str | None = None,
sentience_api_key: str | None = None,
use_api: bool | None = None,
max_elements: int = 60,
show_overlay: bool = False,
top_element_selector: TopElementSelector | None = None,
) -> None:
"""
Initialize SentienceContext.
Args:
predicate_api_key: Canonical API key parameter for gateway mode.
sentience_api_key: Backward-compatible API key alias (legacy name).
use_api: Force API vs extension mode (auto-detected if None)
max_elements: Maximum elements to fetch from snapshot
show_overlay: Show visual overlay highlighting elements in browser
top_element_selector: Configuration for element selection strategy
"""
self._api_key = predicate_api_key or sentience_api_key
self._use_api = use_api
self._max_elements = max_elements
self._show_overlay = show_overlay
self._selector = top_element_selector or TopElementSelector()
async def build(
self,
browser_session: Any,
*,
goal: str | None = None,
wait_for_extension_ms: int = 5000,
retries: int = 2,
retry_delay_s: float = 1.0,
) -> SentienceContextState | None:
"""
Build context state from browser session.
Takes a snapshot using the Sentience extension and formats it for LLM consumption.
Returns None if snapshot fails (extension not loaded, timeout, etc.).
Args:
browser_session: Browser-use BrowserSession instance
goal: Optional goal/task description (passed to gateway for reranking)
wait_for_extension_ms: Maximum time to wait for extension injection
retries: Number of retry attempts on snapshot failure
retry_delay_s: Delay between retries in seconds
Returns:
SentienceContextState with snapshot and formatted prompt, or None if failed
"""
try:
# Import here to avoid requiring sentience as a hard dependency
from ..models import SnapshotOptions
from .browser_use_adapter import BrowserUseAdapter
from .snapshot import snapshot
# Create adapter and backend
adapter = BrowserUseAdapter(browser_session)
backend = await adapter.create_backend()
# Wait for extension to inject (poll until ready or timeout)
await self._wait_for_extension(backend, timeout_ms=wait_for_extension_ms)
# Build snapshot options
options = SnapshotOptions(
limit=self._max_elements,
show_overlay=self._show_overlay,
goal=goal,
)
# Set API options
if self._api_key:
options.predicate_api_key = self._api_key
if self._use_api is not None:
options.use_api = self._use_api
elif self._api_key:
options.use_api = True
# Take snapshot with retry logic
snap = None
last_error: Exception | None = None
for attempt in range(retries):
try:
snap = await snapshot(backend, options=options)
break # Success
except Exception as e:
last_error = e
if attempt < retries - 1:
logger.debug(
"Sentience snapshot attempt %d failed: %s, retrying...",
attempt + 1,
e,
)
await asyncio.sleep(retry_delay_s)
else:
logger.warning(
"Sentience snapshot failed after %d attempts: %s",
retries,
last_error,
)
return None
if snap is None:
logger.warning("Sentience snapshot returned None")
return None
# Get URL from snapshot
url = snap.url or ""
# Format for LLM
formatted = self._format_snapshot_for_llm(snap)
# Build prompt block
prompt = (
"Elements: ID|role|text|imp|is_primary|docYq|ord|DG|href\n"
"Rules: ordinal→DG=1 then ord asc; otherwise imp desc. "
"Use click(ID)/input_text(ID,...).\n"
f"{formatted}"
)
logger.info(
"SentienceContext snapshot: %d elements URL=%s",
len(snap.elements),
url,
)
return SentienceContextState(url=url, snapshot=snap, prompt_block=prompt)
except ImportError as e:
logger.warning("Sentience SDK not available: %s", e)
return None
except Exception as e:
logger.warning("Sentience snapshot skipped: %s", e)
return None
def _format_snapshot_for_llm(self, snapshot: Snapshot) -> str:
"""
Format Sentience snapshot for LLM consumption.
Creates an ultra-compact inventory of interactive elements optimized
for minimal token usage. Uses 3-way selection: by importance,
from dominant group, and by position.
Args:
snapshot: Sentience Snapshot object
Returns:
Formatted string with format: ID|role|text|imp|is_primary|docYq|ord|DG|href
"""
# Filter to interactive elements only
interactive_roles = {
"button",
"link",
"textbox",
"searchbox",
"combobox",
"checkbox",
"radio",
"slider",
"tab",
"menuitem",
"option",
"switch",
"cell",
"a",
"input",
"select",
"textarea",
}
interactive_elements: list[Element] = []
for el in snapshot.elements:
role = (el.role or "").lower()
if role in interactive_roles:
interactive_elements.append(el)
# Sort by importance (descending) for importance-based selection
interactive_elements.sort(key=lambda el: el.importance or 0, reverse=True)
# Get top N by importance (track by ID for deduplication)
selected_ids: set[int] = set()
selected_elements: list[Element] = []
for el in interactive_elements[: self._selector.by_importance]:
if el.id not in selected_ids:
selected_ids.add(el.id)
selected_elements.append(el)
# Get top elements from dominant group (for ordinal tasks)
# Prefer in_dominant_group field (uses fuzzy matching from gateway)
dominant_group_elements = [
el for el in interactive_elements if el.in_dominant_group is True
]
# Fallback to exact group_key match if in_dominant_group not populated
if not dominant_group_elements and snapshot.dominant_group_key:
dominant_group_elements = [
el for el in interactive_elements if el.group_key == snapshot.dominant_group_key
]
# Sort by group_index for ordinal ordering
dominant_group_elements.sort(key=lambda el: el.group_index or 999)
for el in dominant_group_elements[: self._selector.from_dominant_group]:
if el.id not in selected_ids:
selected_ids.add(el.id)
selected_elements.append(el)
# Get top elements by position (lowest doc_y = top of page)
def get_y_position(el: Element) -> float:
if el.doc_y is not None:
return el.doc_y
if el.bbox is not None:
return el.bbox.y
return float("inf")
elements_by_position = sorted(
interactive_elements, key=lambda el: (get_y_position(el), -(el.importance or 0))
)
for el in elements_by_position[: self._selector.by_position]:
if el.id not in selected_ids:
selected_ids.add(el.id)
selected_elements.append(el)
# Compute local rank_in_group for dominant group elements
rank_in_group_map: dict[int, int] = {}
if True: # Always compute rank_in_group
# Sort dominant group elements by position for rank computation
dg_elements_for_rank = [
el for el in interactive_elements if el.in_dominant_group is True
]
if not dg_elements_for_rank and snapshot.dominant_group_key:
dg_elements_for_rank = [
el for el in interactive_elements if el.group_key == snapshot.dominant_group_key
]
# Sort by (doc_y, bbox.y, bbox.x, -importance)
def rank_sort_key(el: Element) -> tuple[float, float, float, float]:
doc_y = el.doc_y if el.doc_y is not None else float("inf")
bbox_y = el.bbox.y if el.bbox else float("inf")
bbox_x = el.bbox.x if el.bbox else float("inf")
neg_importance = -(el.importance or 0)
return (doc_y, bbox_y, bbox_x, neg_importance)
dg_elements_for_rank.sort(key=rank_sort_key)
for rank, el in enumerate(dg_elements_for_rank):
rank_in_group_map[el.id] = rank
# Format lines
lines: list[str] = []
for el in selected_elements:
# Get role (override to "link" if element has href)
role = el.role or ""
if el.href:
role = "link"
elif not role:
# Generic fallback for interactive elements without explicit role
role = "element"
# Get name/text (truncate aggressively, normalize whitespace)
name = el.text or ""
# Remove newlines and normalize whitespace
name = re.sub(r"\s+", " ", name.strip())
if len(name) > 30:
name = name[:27] + "..."
# Extract fields
importance = el.importance or 0
doc_y = el.doc_y or 0
# is_primary: from visual_cues.is_primary (boolean)
is_primary = False
if el.visual_cues:
is_primary = el.visual_cues.is_primary or False
is_primary_flag = "1" if is_primary else "0"
# Pre-encode fields for compactness
# docYq: bucketed doc_y (round to nearest 200 for smaller numbers)
doc_yq = int(round(doc_y / 200)) if doc_y else 0
# Determine if in dominant group
in_dg = el.in_dominant_group
if in_dg is None and snapshot.dominant_group_key:
# Fallback for older gateway versions
in_dg = el.group_key == snapshot.dominant_group_key
# ord_val: rank_in_group if in dominant group
if in_dg and el.id in rank_in_group_map:
ord_val: str | int = rank_in_group_map[el.id]
else:
ord_val = "-"
# DG: 1 if dominant group, else 0
dg_flag = "1" if in_dg else "0"
# href: short token (domain or last path segment, or blank)
href = self._compress_href(el.href)
# Ultra-compact format: ID|role|text|imp|is_primary|docYq|ord|DG|href
line = f"{el.id}|{role}|{name}|{importance}|{is_primary_flag}|{doc_yq}|{ord_val}|{dg_flag}|{href}"
lines.append(line)
logger.debug(
"Formatted %d elements (top %d by importance + top %d from dominant group + top %d by position)",
len(lines),
self._selector.by_importance,
self._selector.from_dominant_group,
self._selector.by_position,
)
return "\n".join(lines)
async def _wait_for_extension(
self,
backend: Any,
*,
timeout_ms: int = 5000,
poll_interval_ms: int = 100,
) -> bool:
"""
Wait for Sentience extension to be ready in the browser.
Polls window.sentience until it's defined or timeout is reached.
Args:
backend: Browser backend with evaluate() method
timeout_ms: Maximum time to wait in milliseconds
poll_interval_ms: Interval between polls in milliseconds
Returns:
True if extension is ready, False if timeout
"""
elapsed_ms = 0
poll_interval_s = poll_interval_ms / 1000
while elapsed_ms < timeout_ms:
try:
result = await backend.evaluate("typeof window.sentience !== 'undefined'")
if result is True:
logger.debug("Sentience extension ready after %dms", elapsed_ms)
return True
except Exception:
# Extension not ready yet, continue polling
pass
await asyncio.sleep(poll_interval_s)
elapsed_ms += poll_interval_ms
logger.warning("Sentience extension not ready after %dms timeout", timeout_ms)
return False
def _compress_href(self, href: str | None) -> str:
"""
Compress href into a short token for minimal tokens.
Args:
href: Full URL or None
Returns:
Short token (domain second-level or last path segment)
"""
if not href:
return ""
try:
parsed = urlparse(href)
if parsed.netloc:
# Extract second-level domain (e.g., "github" from "github.com")
parts = parsed.netloc.split(".")
if len(parts) >= 2:
return parts[-2][:10]
return parsed.netloc[:10]
elif parsed.path:
# Use last path segment
segments = [s for s in parsed.path.split("/") if s]
if segments:
return segments[-1][:10]
return "item"
except Exception:
pass
return "item"
# ---------------------------------------------------------------------------
# Predicate-named counterparts (canonical moving forward).
# Keep Sentience* names for backward compatibility.
# ---------------------------------------------------------------------------
PredicateContext = SentienceContext
PredicateContextState = SentienceContextState