@@ -20,14 +20,20 @@ export class LLMInteractionHandler {
2020 /**
2121 * Build context string from snapshot for LLM prompt
2222 *
23+ * Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
24+ *
2325 * @param snap - Snapshot containing elements
24- * @param goal - Goal/task description
26+ * @param _goal - Goal/task description (unused but kept for API consistency)
2527 * @returns Formatted context string
2628 */
2729 buildContext ( snap : Snapshot , _goal : string ) : string {
2830 const lines : string [ ] = [ ] ;
2931
3032 for ( const el of snap . elements ) {
33+ // Skip REMOVED elements - they're not actionable and shouldn't be in LLM context
34+ if ( el . diff_status === 'REMOVED' ) {
35+ continue ;
36+ }
3137 // Extract visual cues
3238 const cues : string [ ] = [ ] ;
3339 if ( el . visual_cues . is_primary ) cues . push ( 'PRIMARY' ) ;
@@ -36,14 +42,44 @@ export class LLMInteractionHandler {
3642 cues . push ( `color:${ el . visual_cues . background_color_name } ` ) ;
3743 }
3844
39- // Format element line
45+ // Format element line with improved readability
4046 const cuesStr = cues . length > 0 ? ` {${ cues . join ( ',' ) } }` : '' ;
41- const text = el . text || '' ;
42- const textPreview = text . length > 50 ? text . substring ( 0 , 50 ) + '...' : text ;
4347
48+ // Better text handling - show truncation indicator
49+ let textPreview = '' ;
50+ if ( el . text ) {
51+ if ( el . text . length > 50 ) {
52+ textPreview = `"${ el . text . substring ( 0 , 50 ) } ..."` ;
53+ } else {
54+ textPreview = `"${ el . text } "` ;
55+ }
56+ }
57+
58+ // Build position and size info
59+ const x = Math . floor ( el . bbox . x ) ;
60+ const y = Math . floor ( el . bbox . y ) ;
61+ const width = Math . floor ( el . bbox . width ) ;
62+ const height = Math . floor ( el . bbox . height ) ;
63+ const positionStr = `@ (${ x } ,${ y } )` ;
64+ const sizeStr = `size:${ width } x${ height } ` ;
65+
66+ // Build status indicators (only include if relevant)
67+ const statusParts : string [ ] = [ ] ;
68+ if ( ! el . in_viewport ) {
69+ statusParts . push ( 'not_in_viewport' ) ;
70+ }
71+ if ( el . is_occluded ) {
72+ statusParts . push ( 'occluded' ) ;
73+ }
74+ if ( el . diff_status ) {
75+ statusParts . push ( `diff:${ el . diff_status } ` ) ;
76+ }
77+ const statusStr = statusParts . length > 0 ? ` [${ statusParts . join ( ',' ) } ]` : '' ;
78+
79+ // Format: [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
4480 lines . push (
45- `[${ el . id } ] <${ el . role } > " ${ textPreview } " ${ cuesStr } ` +
46- `@ ( ${ Math . floor ( el . bbox . x ) } , ${ Math . floor ( el . bbox . y ) } ) (Imp :${ el . importance } ) `
81+ `[${ el . id } ] <${ el . role } > ${ textPreview } ${ cuesStr } ` +
82+ `${ positionStr } ${ sizeStr } importance :${ el . importance } ${ statusStr } `
4783 ) ;
4884 }
4985
@@ -59,23 +95,60 @@ export class LLMInteractionHandler {
5995 */
6096 async queryLLM ( domContext : string , goal : string ) : Promise < LLMResponse > {
6197 const systemPrompt = `You are an AI web automation agent.
62- Your job is to analyze the current page state and decide the next action to take.
63-
64- Available actions:
65- - CLICK(id) - Click element with ID
66- - TYPE(id, "text") - Type text into element with ID
67- - PRESS("key") - Press keyboard key (e.g., "Enter", "Escape", "Tab")
68- - FINISH() - Task is complete
69-
70- Format your response as a single action command on one line.
71- Example: CLICK(42) or TYPE(5, "search query") or PRESS("Enter")` ;
7298
73- const userPrompt = `Goal : ${ goal }
99+ GOAL : ${ goal }
74100
75- Current page elements :
101+ VISIBLE ELEMENTS (sorted by importance) :
76102${ domContext }
77103
78- What action should I take next? Respond with only the action command (e.g., CLICK(42)).` ;
104+ VISUAL CUES EXPLAINED:
105+ After the text, you may see visual cues in curly braces like {CLICKABLE} or {PRIMARY,CLICKABLE,color:white}:
106+ - PRIMARY: Main call-to-action element on the page
107+ - CLICKABLE: Element is clickable/interactive
108+ - color:X: Background color name (e.g., color:white, color:blue)
109+ Multiple cues are comma-separated inside the braces: {CLICKABLE,color:white}
110+
111+ ELEMENT FORMAT EXPLAINED:
112+ Each element line follows this format:
113+ [ID] <role> "text" {cues} @ (x,y) size:WxH importance:score [status]
114+
115+ Example: [346] <button> "Computer Accessories" {CLICKABLE,color:white} @ (664,100) size:150x40 importance:811
116+
117+ Breaking down each part:
118+ - [ID]: The number in brackets is the element ID - use this EXACT number in CLICK/TYPE commands
119+ Example: If you see [346], use CLICK(346) or TYPE(346, "text")
120+ - <role>: Element type (button, link, textbox, etc.)
121+ - "text": Visible text content (truncated with "..." if long)
122+ - {cues}: Optional visual cues in curly braces (e.g., {CLICKABLE}, {PRIMARY,CLICKABLE}, {CLICKABLE,color:white})
123+ If no cues, this part is omitted entirely
124+ - @ (x,y): Element position in pixels from top-left corner
125+ - size:WxH: Element dimensions (width x height in pixels)
126+ - importance: Score indicating element relevance (higher = more important)
127+ - [status]: Optional status flags in brackets (not_in_viewport, occluded, diff:ADDED/MODIFIED/etc)
128+
129+ CRITICAL RESPONSE FORMAT:
130+ You MUST respond with ONLY ONE of these exact action formats:
131+ - CLICK(id) - Click element by ID (use the number from [ID] brackets)
132+ - TYPE(id, "text") - Type text into element (use the number from [ID] brackets)
133+ - PRESS("key") - Press keyboard key (Enter, Escape, Tab, ArrowDown, etc)
134+ - FINISH() - Task complete
135+
136+ DO NOT include any explanation, reasoning, or natural language.
137+ DO NOT use markdown formatting or code blocks.
138+ DO NOT say "The next step is..." or anything similar.
139+
140+ CORRECT Examples (matching element IDs from the list above):
141+ If element is [346] <button> "Click me" → respond: CLICK(346)
142+ If element is [15] <textbox> "Search" → respond: TYPE(15, "magic mouse")
143+ PRESS("Enter")
144+ FINISH()
145+
146+ INCORRECT Examples (DO NOT DO THIS):
147+ "The next step is to click..."
148+ "I will type..."
149+ \`\`\`CLICK(42)\`\`\`` ;
150+
151+ const userPrompt = 'Return the single action command:' ;
79152
80153 try {
81154 const response = await this . llm . generate ( systemPrompt , userPrompt , {
0 commit comments