Skip to content

Commit a8ea645

Browse files
authored
Merge pull request #96 from SentienceAPI/optimize_llm
optimize LLM agent efficiency
2 parents e70fbfb + 2a65eb5 commit a8ea645

File tree

4 files changed

+195
-25
lines changed

4 files changed

+195
-25
lines changed

src/utils/element-filter.ts

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ export class ElementFilter {
3232
* ```
3333
*/
3434
static filterByImportance(snapshot: Snapshot, maxElements: number = 50): Element[] {
35-
const elements = [...snapshot.elements];
35+
// Filter out REMOVED elements - they're not actionable and shouldn't be in LLM context
36+
const elements = snapshot.elements.filter(el => el.diff_status !== 'REMOVED');
3637

3738
// Sort by importance (descending)
3839
elements.sort((a, b) => b.importance - a.importance);
@@ -60,13 +61,16 @@ export class ElementFilter {
6061
return this.filterByImportance(snapshot, maxElements);
6162
}
6263

64+
// Filter out REMOVED elements - they're not actionable and shouldn't be in LLM context
65+
const elements = snapshot.elements.filter(el => el.diff_status !== 'REMOVED');
66+
6367
const goalLower = goal.toLowerCase();
6468
const keywords = this.extractKeywords(goalLower);
6569

6670
// Score elements based on keyword matches
6771
const scoredElements: Array<[number, Element]> = [];
6872

69-
for (const element of snapshot.elements) {
73+
for (const element of elements) {
7074
let score = element.importance; // Start with base importance
7175

7276
// Boost score for keyword matches in text
@@ -115,7 +119,8 @@ export class ElementFilter {
115119
* ```
116120
*/
117121
static filter(snapshot: Snapshot, options: FilterOptions = {}): Element[] {
118-
let elements = [...snapshot.elements];
122+
// Filter out REMOVED elements - they're not actionable and shouldn't be in LLM context
123+
let elements = snapshot.elements.filter(el => el.diff_status !== 'REMOVED');
119124

120125
// Apply filters
121126
if (options.minImportance !== undefined) {

src/utils/llm-interaction-handler.ts

Lines changed: 92 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,20 @@ export class LLMInteractionHandler {
2020
/**
2121
* Build context string from snapshot for LLM prompt
2222
*
23+
* Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
24+
*
2325
* @param snap - Snapshot containing elements
24-
* @param goal - Goal/task description
26+
* @param _goal - Goal/task description (unused but kept for API consistency)
2527
* @returns Formatted context string
2628
*/
2729
buildContext(snap: Snapshot, _goal: string): string {
2830
const lines: string[] = [];
2931

3032
for (const el of snap.elements) {
33+
// Skip REMOVED elements - they're not actionable and shouldn't be in LLM context
34+
if (el.diff_status === 'REMOVED') {
35+
continue;
36+
}
3137
// Extract visual cues
3238
const cues: string[] = [];
3339
if (el.visual_cues.is_primary) cues.push('PRIMARY');
@@ -36,14 +42,44 @@ export class LLMInteractionHandler {
3642
cues.push(`color:${el.visual_cues.background_color_name}`);
3743
}
3844

39-
// Format element line
45+
// Format element line with improved readability
4046
const cuesStr = cues.length > 0 ? ` {${cues.join(',')}}` : '';
41-
const text = el.text || '';
42-
const textPreview = text.length > 50 ? text.substring(0, 50) + '...' : text;
4347

48+
// Better text handling - show truncation indicator
49+
let textPreview = '';
50+
if (el.text) {
51+
if (el.text.length > 50) {
52+
textPreview = `"${el.text.substring(0, 50)}..."`;
53+
} else {
54+
textPreview = `"${el.text}"`;
55+
}
56+
}
57+
58+
// Build position and size info
59+
const x = Math.floor(el.bbox.x);
60+
const y = Math.floor(el.bbox.y);
61+
const width = Math.floor(el.bbox.width);
62+
const height = Math.floor(el.bbox.height);
63+
const positionStr = `@ (${x},${y})`;
64+
const sizeStr = `size:${width}x${height}`;
65+
66+
// Build status indicators (only include if relevant)
67+
const statusParts: string[] = [];
68+
if (!el.in_viewport) {
69+
statusParts.push('not_in_viewport');
70+
}
71+
if (el.is_occluded) {
72+
statusParts.push('occluded');
73+
}
74+
if (el.diff_status) {
75+
statusParts.push(`diff:${el.diff_status}`);
76+
}
77+
const statusStr = statusParts.length > 0 ? ` [${statusParts.join(',')}]` : '';
78+
79+
// Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
4480
lines.push(
45-
`[${el.id}] <${el.role}> "${textPreview}"${cuesStr} ` +
46-
`@ (${Math.floor(el.bbox.x)},${Math.floor(el.bbox.y)}) (Imp:${el.importance})`
81+
`[${el.id}] <${el.role}> ${textPreview}${cuesStr} ` +
82+
`${positionStr} ${sizeStr} importance:${el.importance}${statusStr}`
4783
);
4884
}
4985

@@ -59,23 +95,60 @@ export class LLMInteractionHandler {
5995
*/
6096
async queryLLM(domContext: string, goal: string): Promise<LLMResponse> {
6197
const systemPrompt = `You are an AI web automation agent.
62-
Your job is to analyze the current page state and decide the next action to take.
63-
64-
Available actions:
65-
- CLICK(id) - Click element with ID
66-
- TYPE(id, "text") - Type text into element with ID
67-
- PRESS("key") - Press keyboard key (e.g., "Enter", "Escape", "Tab")
68-
- FINISH() - Task is complete
69-
70-
Format your response as a single action command on one line.
71-
Example: CLICK(42) or TYPE(5, "search query") or PRESS("Enter")`;
7298
73-
const userPrompt = `Goal: ${goal}
99+
GOAL: ${goal}
74100
75-
Current page elements:
101+
VISIBLE ELEMENTS (sorted by importance):
76102
${domContext}
77103
78-
What action should I take next? Respond with only the action command (e.g., CLICK(42)).`;
104+
VISUAL CUES EXPLAINED:
105+
After the text, you may see visual cues in curly braces like {CLICKABLE} or {PRIMARY,CLICKABLE,color:white}:
106+
- PRIMARY: Main call-to-action element on the page
107+
- CLICKABLE: Element is clickable/interactive
108+
- color:X: Background color name (e.g., color:white, color:blue)
109+
Multiple cues are comma-separated inside the braces: {CLICKABLE,color:white}
110+
111+
ELEMENT FORMAT EXPLAINED:
112+
Each element line follows this format:
113+
[ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
114+
115+
Example: [346] <button> "Computer Accessories" {CLICKABLE,color:white} @ (664,100) size:150x40 importance:811
116+
117+
Breaking down each part:
118+
- [ID]: The number in brackets is the element ID - use this EXACT number in CLICK/TYPE commands
119+
Example: If you see [346], use CLICK(346) or TYPE(346, "text")
120+
- <role>: Element type (button, link, textbox, etc.)
121+
- "text": Visible text content (truncated with "..." if long)
122+
- {cues}: Optional visual cues in curly braces (e.g., {CLICKABLE}, {PRIMARY,CLICKABLE}, {CLICKABLE,color:white})
123+
If no cues, this part is omitted entirely
124+
- @ (x,y): Element position in pixels from top-left corner
125+
- size:WxH: Element dimensions (width x height in pixels)
126+
- importance: Score indicating element relevance (higher = more important)
127+
- [status]: Optional status flags in brackets (not_in_viewport, occluded, diff:ADDED/MODIFIED/etc)
128+
129+
CRITICAL RESPONSE FORMAT:
130+
You MUST respond with ONLY ONE of these exact action formats:
131+
- CLICK(id) - Click element by ID (use the number from [ID] brackets)
132+
- TYPE(id, "text") - Type text into element (use the number from [ID] brackets)
133+
- PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
134+
- FINISH() - Task complete
135+
136+
DO NOT include any explanation, reasoning, or natural language.
137+
DO NOT use markdown formatting or code blocks.
138+
DO NOT say "The next step is..." or anything similar.
139+
140+
CORRECT Examples (matching element IDs from the list above):
141+
If element is [346] <button> "Click me" → respond: CLICK(346)
142+
If element is [15] <textbox> "Search" → respond: TYPE(15, "magic mouse")
143+
PRESS("Enter")
144+
FINISH()
145+
146+
INCORRECT Examples (DO NOT DO THIS):
147+
"The next step is to click..."
148+
"I will type..."
149+
\`\`\`CLICK(42)\`\`\``;
150+
151+
const userPrompt = 'Return the single action command:';
79152

80153
try {
81154
const response = await this.llm.generate(systemPrompt, userPrompt, {

tests/agent.test.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -201,8 +201,8 @@ describe('SentienceAgent', () => {
201201
const agent = new SentienceAgent(browser, llm, 50, false);
202202

203203
const snap = createMockSnapshot();
204-
// Access private method through any cast for testing
205-
const context = (agent as any).buildContext(snap, 'test goal');
204+
// Access LLMInteractionHandler through agent for testing
205+
const context = (agent as any).llmHandler.buildContext(snap, 'test goal');
206206

207207
expect(context).toContain('[1]');
208208
expect(context).toContain('[2]');
@@ -212,7 +212,8 @@ describe('SentienceAgent', () => {
212212
expect(context).toContain('PRIMARY');
213213
expect(context).toContain('CLICKABLE');
214214
expect(context).toContain('color:blue');
215-
expect(context).toContain('(Imp:900)');
215+
expect(context).toContain('importance:900');
216+
expect(context).toContain('size:80x30');
216217
});
217218
});
218219

tests/utils/llm-interaction-handler.test.ts

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,9 @@ describe('LLMInteractionHandler', () => {
8181
expect(context).toContain('PRIMARY');
8282
expect(context).toContain('CLICKABLE');
8383
expect(context).toContain('color:blue');
84+
expect(context).toContain('@ (10,20)');
85+
expect(context).toContain('size:100x30');
86+
expect(context).toContain('importance:0.9');
8487
});
8588

8689
it('should truncate long text', () => {
@@ -114,6 +117,94 @@ describe('LLMInteractionHandler', () => {
114117
expect(match).toBeTruthy();
115118
expect(match![1].length).toBeLessThanOrEqual(53); // 50 chars + "..."
116119
});
120+
121+
it('should include status indicators when present', () => {
122+
const elements: Element[] = [
123+
{
124+
id: 1,
125+
role: 'button',
126+
text: 'Test',
127+
importance: 0.9,
128+
bbox: { x: 10, y: 20, width: 100, height: 30 },
129+
visual_cues: {
130+
is_primary: false,
131+
background_color_name: null,
132+
is_clickable: true,
133+
},
134+
in_viewport: false, // Not in viewport
135+
is_occluded: true, // Occluded
136+
z_index: 1,
137+
diff_status: 'ADDED', // Has diff status
138+
},
139+
];
140+
141+
const snap: Snapshot = {
142+
status: 'success',
143+
url: 'https://example.com',
144+
elements,
145+
};
146+
147+
const context = handler.buildContext(snap, 'test');
148+
expect(context).toContain('not_in_viewport');
149+
expect(context).toContain('occluded');
150+
expect(context).toContain('diff:ADDED');
151+
expect(context).toContain('size:100x30');
152+
expect(context).toContain('importance:0.9');
153+
});
154+
155+
it('should exclude REMOVED elements from context', () => {
156+
const elements: Element[] = [
157+
{
158+
id: 1,
159+
role: 'button',
160+
text: 'Click me',
161+
importance: 100,
162+
bbox: { x: 10, y: 20, width: 100, height: 30 },
163+
visual_cues: {
164+
is_primary: true,
165+
background_color_name: 'blue',
166+
is_clickable: true,
167+
},
168+
in_viewport: true,
169+
is_occluded: false,
170+
z_index: 1,
171+
diff_status: undefined,
172+
},
173+
{
174+
id: 6344,
175+
role: 'button',
176+
text: '5.0 out of 5 stars Excellent product',
177+
importance: 0,
178+
bbox: { x: 429, y: 9175, width: 204, height: 17 },
179+
visual_cues: {
180+
is_primary: true,
181+
background_color_name: 'black',
182+
is_clickable: true,
183+
},
184+
in_viewport: false,
185+
is_occluded: false,
186+
z_index: 0,
187+
diff_status: 'REMOVED', // This should be excluded
188+
},
189+
];
190+
191+
const snap: Snapshot = {
192+
status: 'success',
193+
url: 'https://example.com',
194+
elements,
195+
};
196+
197+
const context = handler.buildContext(snap, 'test goal');
198+
199+
// Should include normal element
200+
expect(context).toContain('[1]');
201+
expect(context).toContain('Click me');
202+
203+
// Should NOT include REMOVED element
204+
expect(context).not.toContain('[6344]');
205+
expect(context).not.toContain('5.0 out of 5 stars');
206+
expect(context).not.toContain('diff:REMOVED');
207+
});
117208
});
118209

119210
describe('queryLLM', () => {

0 commit comments

Comments
 (0)