Skip to content

Commit b2e05d9

Browse files
committed
update the project
1 parent d16b317 commit b2e05d9

File tree

17 files changed

+1207
-981
lines changed

17 files changed

+1207
-981
lines changed

Readme.md

Lines changed: 440 additions & 137 deletions
Large diffs are not rendered by default.

agent/agent.py

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,14 @@
1313
from langgraph.prebuilt import ToolNode, tools_condition
1414

1515
class AgentState(TypedDict):
16-
"""State for the browser agent."""
1716
messages: Annotated[list, add_messages]
1817

1918
async def create_agent(api_key: str):
20-
"""Create and return a LangGraph agent with specified tools."""
21-
22-
# Initialize Groq model
23-
# llm = ChatGroq(
24-
# model="qwen-qwq-32b",
25-
# temperature=0,
26-
# max_tokens=2048,
27-
# api_key=api_key
28-
# )
29-
30-
# For standard OpenAI API
19+
# Initialize Azure OpenAI
3120
llm = AzureChatOpenAI(
3221
model_name="gpt-4o",
3322
openai_api_key=api_key,
34-
temperature=1.0,
23+
temperature=0,
3524
api_version="2024-12-01-preview",
3625
azure_endpoint= os.getenv("AZURE_ENDPOINT"),
3726
)
@@ -42,7 +31,7 @@ async def create_agent(api_key: str):
4231

4332
# Create the system prompt
4433
system_prompt = """
45-
You are an expert AI agent that controls a web browser with precision, robust navigation awareness, and human-like interaction. Your goal is to reliably complete user tasks by analyzing, navigating, and interacting with web pages.
34+
You are an expert AI agent that controls a web browser with precision and robust navigation awareness. Your goal is to reliably complete user tasks by analyzing, navigating, and interacting with web pages.
4635
4736
## CORE TOOLS & CAPABILITIES
4837
- analyze_page: Extracts all visible elements and text. Use IMMEDIATELY after any navigation, click, or popup event. This tool builds an internal map of all interactive elements with numbered IDs.
@@ -53,6 +42,7 @@ async def create_agent(api_key: str):
5342
- navigate: Opens a URL. Ensures protocol (https://) is added if missing. ALWAYS follow with analyze_page().
5443
- go_back: Goes to previous page in browser history. ALWAYS follow with analyze_page().
5544
- scroll: Scrolls viewport in specified direction ("down", "up", "top", "bottom"). Use when elements are off-screen, for infinite scrolling pages, or when loading more content.
45+
- ask_user: Requests information directly from the user. Use JSON format: ask_user('{"prompt":"What is your username?","type":"text"}') or ask_user('{"prompt":"Choose payment method","type":"choice","choices":["Credit","PayPal"]}') or ask_user('{"prompt":"Enter password","type":"password"}').
5646
5747
## DYNAMIC CONTENT INTERACTION
5848
- For infinite scroll pages: Use scroll("down") repeatedly, running analyze_page() after each scroll to capture newly loaded content.
@@ -98,11 +88,26 @@ async def create_agent(api_key: str):
9888
- Filters and sorting: Use these controls to narrow down large sets of results to find specific items.
9989
- Pagination: Look for pagination controls when dealing with multi-page content and navigate between pages as needed.
10090
- Browser history: Use go_back() strategically to return to previously visited pages rather than re-navigating from the start.
91+
92+
## USER INTERACTION WITH ASK_USER TOOL
93+
Use the ask_user tool to collect information or decisions from users during tasks:
94+
95+
- **Syntax**: `ask_user('{"prompt":"Question?","type":"text|password|choice","choices":["Option1","Option2"],"default":"Default"}')`
96+
- **Common Uses**:
97+
1. Authentication: `ask_user('{"prompt":"Enter password","type":"password"}')`
98+
2. Choices: `ask_user('{"prompt":"Select payment method","type":"choice","choices":["Credit","PayPal"]}')`
99+
3. Confirmations: `ask_user('{"prompt":"Proceed with purchase?","type":"choice","choices":["Yes","No"]}')`
100+
4. Form data: `ask_user('{"prompt":"Enter shipping address","type":"text"}')`
101+
5. CAPTCHA assistance: `ask_user('{"prompt":"Please help with CAPTCHA verification"}')`
102+
103+
Always ask for ONE piece of information at a time, use clear prompts, and choose appropriate input types.
104+
105+
Response : [Provide the specific information requested by the user, including any data, facts, or details discovered. State "Goal completed successfully" when done.]
106+
101107
"""
102108

103109
# Create async node for chatbot
104110
async def chatbot(state: AgentState):
105-
"""Process messages and generate a response."""
106111
# If no message exists, return no change to state
107112
if not state.get("messages", []):
108113
return {"messages": []}
@@ -132,13 +137,12 @@ async def chatbot(state: AgentState):
132137
memory = MemorySaver()
133138
graph = graph_builder.compile(checkpointer=memory)
134139

135-
# Wrap the graph with an interface similar to the original AgentExecutor
140+
# Wrap the compiled graph in a class exposing ainvoke/invoke/astream/stream
136141
class LangGraphAgent:
137142
def __init__(self, graph):
138143
self.graph = graph
139144

140145
async def ainvoke(self, input_text, thread_id="main"):
141-
"""Invoke the agent with the input text asynchronously."""
142146
config = {"configurable": {"thread_id": thread_id}}
143147

144148
# Start with system message and user input
@@ -152,18 +156,17 @@ async def ainvoke(self, input_text, thread_id="main"):
152156
# Run the graph asynchronously
153157
result = await self.graph.ainvoke(state, config)
154158

155-
# Format the result to match the expected format
159+
# Use the content of the last message as the agent's output
156160
output = result["messages"][-1].content
157161

158-
# Create a result similar to AgentExecutor's format
162+
# Return the input, the final output, and the full message list
159163
return {
160164
"input": input_text,
161165
"output": output,
162166
"messages": result["messages"]
163167
}
164168

165169
def invoke(self, input_text, thread_id="main"):
166-
"""Synchronous wrapper for backwards compatibility."""
167170
# Get the current event loop or create a new one
168171
try:
169172
loop = asyncio.get_event_loop()
@@ -175,7 +178,6 @@ def invoke(self, input_text, thread_id="main"):
175178
return loop.run_until_complete(self.ainvoke(input_text, thread_id))
176179

177180
async def astream(self, input_text, thread_id="main"):
178-
"""Stream the agent's thinking process asynchronously."""
179181
config = {"configurable": {"thread_id": thread_id}}
180182

181183
# Start with system message and user input
@@ -192,7 +194,6 @@ async def astream(self, input_text, thread_id="main"):
192194
event["messages"][-1].pretty_print()
193195

194196
def stream(self, input_text, thread_id="main"):
195-
"""Synchronous streaming wrapper for backwards compatibility."""
196197
# Get the current event loop or create a new one
197198
try:
198199
loop = asyncio.get_event_loop()

browser/analyzers/page_analyzer.py

Lines changed: 88 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -175,31 +175,91 @@ async def analyze_page():
175175
176176
// Find all modal/dialog/popup elements
177177
function findPopups() {
178-
// Common selectors for modals/dialogs/popups
178+
// Expanded selectors for modals/dialogs/popups
179179
const selectors = [
180-
'[role=dialog]',
181-
'[aria-modal="true"]',
182-
'[data-modal="true"]',
183-
'.modal',
184-
'.dialog',
185-
'.popup',
186-
'.overlay',
187-
'.pop-up',
188-
'.ant-modal', // Ant Design
189-
'.MuiDialog-root', // Material UI
180+
// ARIA roles and attributes
181+
'[role=dialog]', '[role=alertdialog]', '[role=drawer]', '[role=tooltip]', '[role=menu]',
182+
'[aria-modal="true"]', '[aria-haspopup="dialog"]', '[aria-haspopup="menu"]',
183+
184+
// Data attributes
185+
'[data-modal="true"]', '[data-popup="true"]', '[data-dialog="true"]', '[data-overlay="true"]',
186+
187+
// Common class patterns
188+
'.modal', '.dialog', '.popup', '.overlay', '.pop-up', '.popover', '.tooltip', '.drawer',
189+
'.toast', '.notification', '.alert-box',
190+
191+
// Framework-specific selectors
192+
'.ant-modal', '.ant-drawer', '.ant-popover', // Ant Design
193+
'.MuiDialog-root', '.MuiDrawer-root', '.MuiPopover-root', // Material UI
190194
'.ReactModal__Content', // React Modal
191-
'[class*="modal"]',
192-
'[class*="dialog"]',
193-
'[class*="popup"]',
194-
'[class*="overlay"]'
195+
'.modal-dialog', '.modal-content', '.popover', // Bootstrap
196+
'.chakra-modal', '.chakra-dialog', // Chakra UI
197+
'.ui.modal', '.ui.popup', // Semantic UI
198+
'.v-dialog', '.v-menu', // Vuetify
199+
200+
// Generic patterns
201+
'[class*="modal"]', '[class*="dialog"]', '[class*="popup"]', '[class*="overlay"]',
202+
'[class*="drawer"]', '[class*="toast"]', '[class*="tooltip"]', '[class*="popover"]'
195203
];
196-
// Select visible popups only
204+
205+
// Find all visible popups
197206
const popups = [];
207+
208+
// Check for elements matching our selectors
198209
for (const sel of selectors) {
199210
for (const el of document.querySelectorAll(sel)) {
200211
if (isVisible(el)) popups.push(el);
201212
}
202213
}
214+
215+
// Check for fixed/absolute positioned elements with high z-index
216+
document.querySelectorAll('div, section, aside').forEach(el => {
217+
if (!popups.includes(el) && isVisible(el)) {
218+
const style = window.getComputedStyle(el);
219+
const position = style.position;
220+
const zIndex = parseInt(style.zIndex) || 0;
221+
222+
// Fixed/absolute with high z-index are often modals/popups
223+
if ((position === 'fixed' || position === 'absolute') && zIndex > 10) {
224+
const rect = el.getBoundingClientRect();
225+
if (rect.width > 50 && rect.height > 50) { // Reasonable size check
226+
popups.push(el);
227+
}
228+
}
229+
}
230+
});
231+
232+
// If a visible backdrop exists, treat viewport-centered elements as likely modals
233+
const backdrops = document.querySelectorAll('.modal-backdrop, .overlay, .backdrop, .dimmer, [class*="backdrop"], [class*="overlay"]');
234+
for (const backdrop of backdrops) {
235+
if (isVisible(backdrop)) {
236+
const backdropRect = backdrop.getBoundingClientRect();
237+
const viewportCenter = {
238+
x: window.innerWidth / 2,
239+
y: window.innerHeight / 2
240+
};
241+
242+
// Look for visible centered elements - often these are modals related to backdrops
243+
document.querySelectorAll('div, section, aside').forEach(el => {
244+
if (!popups.includes(el) && isVisible(el)) {
245+
const rect = el.getBoundingClientRect();
246+
const elementCenter = {
247+
x: rect.left + rect.width / 2,
248+
y: rect.top + rect.height / 2
249+
};
250+
251+
// Is it roughly centered in the viewport and of reasonable size? (Containment within the backdrop is not checked; backdropRect is currently unused.)
252+
const isCentered = Math.abs(elementCenter.x - viewportCenter.x) < viewportCenter.x / 3 &&
253+
Math.abs(elementCenter.y - viewportCenter.y) < viewportCenter.y / 3;
254+
255+
if (isCentered && rect.width > 50 && rect.height > 50) {
256+
popups.push(el);
257+
}
258+
}
259+
});
260+
}
261+
}
262+
203263
// Remove duplicates
204264
return Array.from(new Set(popups));
205265
}
@@ -335,6 +395,18 @@ async def analyze_page():
335395

336396
# Add each item, grouping related content on the same line
337397
for item in page_content['content']:
398+
# Skip elements where type matches display text (e.g., "[49][button]button")
399+
if item.startswith('['):
400+
# Parse the element format: [id][type]text
401+
parts = item.split(']', 2)
402+
if len(parts) >= 3:
403+
element_type = parts[1][1:] # Get the type without '['
404+
display_text = parts[2] # Get the display text
405+
406+
# Skip if the element type exactly matches its display text
407+
if display_text.strip() == element_type:
408+
continue
409+
338410
# Start a new line for interactive elements or if current line is empty
339411
if item.startswith('[') or not current_line:
340412
if (current_line): # Add the previous line if it exists

browser/browser_setup.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from playwright.async_api import async_playwright
22

33
def inject_cursor_script():
4-
"""Returns the script to inject for cursor visualization"""
54
return """
65
// Create a custom cursor element
76
const cursor = document.createElement('div');
@@ -17,7 +16,7 @@ def inject_cursor_script():
1716
cursor.style.zIndex = '999999';
1817
cursor.style.transition = 'left 0.1s, top 0.1s';
1918
20-
// Add cursor to the page when it loads
19+
// Add cursor to the page
2120
document.addEventListener('DOMContentLoaded', function() {
2221
document.body.appendChild(cursor);
2322
});
@@ -41,7 +40,6 @@ def inject_cursor_script():
4140
"""
4241

4342
async def initialize_browser(options, connection_options=None):
44-
"""Initialize the browser by connecting to existing instance or launching a new one."""
4543
playwright = await async_playwright().start()
4644

4745
# Default connection options if none provided
@@ -80,15 +78,14 @@ async def initialize_browser(options, connection_options=None):
8078
raise e
8179

8280
print("Falling back to launching a new browser instance...")
83-
browser = None # Reset for fallback path
81+
browser = None
8482

8583
# Launch a new browser if needed
8684
if browser is None:
8785
print(f"Launching new browser with options: {options}")
8886
browser = await playwright.chromium.launch(**options)
8987
page = await browser.new_page(viewport=None)
9088

91-
# Shared initialization regardless of connection method
9289
# Inject cursor visualization CSS and JavaScript
9390
await page.add_init_script(inject_cursor_script())
9491

@@ -119,7 +116,6 @@ async def initialize_browser(options, connection_options=None):
119116
# Ensure cursor is created and function is available
120117
await page.evaluate("""
121118
() => {
122-
// Create a custom cursor element if it doesn't exist
123119
if (!document.getElementById('ai-agent-cursor')) {
124120
const cursor = document.createElement('div');
125121
cursor.id = 'ai-agent-cursor';
@@ -136,7 +132,6 @@ async def initialize_browser(options, connection_options=None):
136132
document.body.appendChild(cursor);
137133
}
138134
139-
// Define the updateAICursor function if it doesn't exist
140135
if (typeof window.updateAICursor !== 'function') {
141136
window.updateAICursor = function(x, y) {
142137
const cursor = document.getElementById('ai-agent-cursor');
@@ -157,7 +152,6 @@ async def initialize_browser(options, connection_options=None):
157152
return playwright, browser, page
158153

159154
async def close_browser(playwright, browser, is_connected=False):
160-
"""Close the browser cleanly."""
161155
try:
162156
if is_connected:
163157
# If connected to existing browser, just disconnect
Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""
22
Main browser controller that serves as the interface to all browser functionality.
3-
This controller delegates to specialized controllers for different aspects of browser control.
43
"""
54

65
import asyncio
@@ -9,18 +8,20 @@
98
from browser.analyzers.page_analyzer import analyze_page
109
from browser.navigation.navigator import navigate, go_back
1110
from browser.navigation.scroll_manager import scroll
11+
from browser.utils.user_interaction import ask_user
1212

1313
# Global page reference
1414
page = None
1515

1616
async def initialize(browser_page):
17-
"""Initialize the browser controller with a browser page."""
1817
# Import locally to avoid circular imports
1918
from browser.controllers.element_controller import initialize as init_element
2019
from browser.controllers.keyboard_controller import initialize as init_keyboard
2120
from browser.analyzers.page_analyzer import initialize as init_analyzer
2221
from browser.navigation.navigator import initialize as init_navigator
2322
from browser.navigation.scroll_manager import initialize as init_scroll
23+
from browser.utils.dom_helpers import initialize as init_dom_helpers
24+
from browser.utils.user_interaction import initialize as init_user_interaction
2425

2526
global page
2627
page = browser_page
@@ -31,28 +32,19 @@ async def initialize(browser_page):
3132
await init_analyzer(page)
3233
await init_navigator(page)
3334
await init_scroll(page)
35+
await init_dom_helpers(page)
36+
init_user_interaction()
3437

3538
print("Browser controller initialized successfully")
3639

3740
async def close():
38-
"""Close the browser cleanly."""
3941
try:
4042
await page.context.browser.close()
4143
return "Browser closed successfully"
4244
except Exception as e:
4345
return f"Error closing browser: {str(e)}"
4446

4547
def get_browser_tools():
46-
"""
47-
Return all the tools from the browser controller.
48-
49-
This function returns a list of all public browser control functions
50-
that are decorated with @tool. These functions can be used to control
51-
the browser programmatically.
52-
53-
Returns:
54-
list: A list of browser control tool functions
55-
"""
5648
browser_tools = [
5749
analyze_page,
5850
click,
@@ -62,6 +54,7 @@ def get_browser_tools():
6254
go_back,
6355
navigate,
6456
scroll,
57+
ask_user,
6558
]
6659

6760
return browser_tools

0 commit comments

Comments
 (0)