-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbrowser.py
More file actions
310 lines (273 loc) · 13.2 KB
/
browser.py
File metadata and controls
310 lines (273 loc) · 13.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
"""
Playwright browser harness with extension loading
"""
import os
import tempfile
import shutil
from pathlib import Path
from typing import Optional
from playwright.sync_api import sync_playwright, BrowserContext, Page, Playwright
# Import stealth for bot evasion (optional - graceful fallback if not available)
try:
from playwright_stealth import stealth_sync
STEALTH_AVAILABLE = True
except ImportError:
STEALTH_AVAILABLE = False
class SentienceBrowser:
    """Main browser session with Sentience extension loaded.

    Use it as a context manager (``with SentienceBrowser() as browser:``) or
    call :meth:`start` and :meth:`close` explicitly.  A session owns a
    Playwright driver, a persistent browser context, and two temporary
    directories (the staged extension bundle and the browser profile);
    :meth:`close` releases all of them.
    """

    # Extension files copied into the temporary bundle when they exist.
    _EXTENSION_FILES = (
        "manifest.json",
        "content.js",
        "background.js",
        "injected_api.js",
    )

    def __init__(
        self,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        headless: bool = False,
    ):
        """
        Initialize Sentience browser

        Args:
            api_key: Optional API key for server-side processing (Pro/Enterprise tiers)
                If None, uses free tier (local extension only)
            api_url: Server URL for API calls (defaults to https://api.sentienceapi.com if api_key provided)
                If None and api_key is provided, uses default URL
                If None and no api_key, uses free tier (local extension only)
                If 'local' or Docker sidecar URL, uses Enterprise tier
            headless: Whether to run in headless mode
        """
        self.api_key = api_key
        # api_url is only meaningful together with an api_key; without a key
        # it is forced to None (free tier) even if the caller passed one.
        if api_key:
            self.api_url = api_url or "https://api.sentienceapi.com"
        else:
            self.api_url = None
        self.headless = headless
        self.playwright: Optional[Playwright] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None
        # Temporary directories created by start() and removed by close().
        self._extension_path: Optional[str] = None
        self._user_data_dir: Optional[str] = None

    def start(self) -> None:
        """Launch browser with extension loaded.

        Stages the extension into a temporary directory, launches a
        persistent Chromium context with it, navigates to https://example.com
        so content scripts can inject, and waits for ``window.sentience``.

        Raises:
            FileNotFoundError: the extension (or its ``pkg`` WASM directory)
                could not be found.
            RuntimeError: the browser failed to launch, or the extension API
                never became available after navigation.

        If this method raises, resources acquired so far stay attached to the
        instance; call :meth:`close` to release them (the context-manager
        form does this automatically).
        """
        extension_source = self._find_extension_source()
        self._stage_extension(extension_source)
        self._launch_context()

        # Get first page or create new one
        pages = self.context.pages
        self.page = pages[0] if pages else self.context.new_page()

        # Apply stealth patches for bot evasion (if available)
        if STEALTH_AVAILABLE:
            try:
                stealth_sync(self.page)
            except Exception:
                # Deliberate best effort: stealth is not critical, and
                # playwright-stealth may have compatibility issues.
                pass

        # If no background page is visible yet, give the extension a moment
        # to initialize before navigating.
        try:
            if not self.context.background_pages:
                self.page.wait_for_timeout(1000)
        except Exception:
            # Background pages might not be accessible, continue anyway
            pass

        # Navigate to a real page so extension can inject: content scripts
        # only run on actual pages (not about:blank).
        self.page.goto("https://example.com", wait_until="domcontentloaded", timeout=15000)

        # Give extension time to initialize (WASM loading is async; content
        # scripts run at document_idle and content.js sets the extension ID).
        self.page.wait_for_timeout(3000)

        if self._wait_for_extension(timeout=25000):
            return
        # Extension might need more time, try waiting a bit longer
        self.page.wait_for_timeout(3000)
        if self._wait_for_extension(timeout=15000):
            return

        raise RuntimeError(
            "Extension failed to load after navigation. Make sure:\n"
            "1. Extension is built (cd sentience-chrome && ./build.sh)\n"
            "2. All files are present (manifest.json, content.js, injected_api.js, pkg/)\n"
            "3. Check browser console for errors (run with headless=False to see console)\n"
            f"4. Extension path: {self._extension_path}"
            + self._collect_diagnostics()
        )

    @staticmethod
    def _find_extension_source() -> Path:
        """Return the directory holding the extension's manifest.json.

        Looks in two places, in order:
          1. Embedded extension (sentience/extension/) - for production/CI
          2. Development mode (../sentience-chrome/) - for local development

        Raises:
            FileNotFoundError: neither location contains a manifest.json.
        """
        # __file__ is sdk-python/sentience/browser.py, so:
        #   parent        = sdk-python/sentience/
        #   parent.parent = sdk-python/
        sdk_root = Path(__file__).parent.parent
        embedded_extension = sdk_root / "sentience" / "extension"
        dev_extension = sdk_root.parent / "sentience-chrome"

        # Prefer embedded extension, fall back to dev extension
        for candidate in (embedded_extension, dev_extension):
            if (candidate / "manifest.json").exists():
                return candidate

        raise FileNotFoundError(
            f"Extension not found. Checked:\n"
            f" 1. {embedded_extension}\n"
            f" 2. {dev_extension}\n"
            "Make sure extension files are available. "
            "For development: cd ../sentience-chrome && ./build.sh"
        )

    def _stage_extension(self, extension_source: Path) -> str:
        """Copy the extension into a fresh temp directory and return its path.

        The path is recorded in ``self._extension_path`` as soon as the
        directory exists so that close() can remove it even if copying fails.

        Raises:
            FileNotFoundError: ``extension_source`` has no ``pkg`` directory.
        """
        temp_dir = tempfile.mkdtemp(prefix="sentience-ext-")
        self._extension_path = temp_dir

        # Copy extension files (missing optional files are skipped)
        for file_name in self._EXTENSION_FILES:
            src = extension_source / file_name
            if src.exists():
                shutil.copy2(src, os.path.join(temp_dir, file_name))

        # Copy pkg directory (WASM)
        pkg_source = extension_source / "pkg"
        if not pkg_source.exists():
            raise FileNotFoundError(
                f"WASM files not found at {pkg_source}. "
                "Build the extension first: cd sentience-chrome && ./build.sh"
            )
        shutil.copytree(pkg_source, os.path.join(temp_dir, "pkg"), dirs_exist_ok=True)
        return temp_dir

    def _launch_context(self) -> None:
        """Start Playwright and launch the persistent context with the extension.

        Sets ``self.playwright``, ``self._user_data_dir`` and ``self.context``.

        Raises:
            RuntimeError: the browser could not be launched.
        """
        extension_dir = self._extension_path
        self.playwright = sync_playwright().start()

        # Stealth arguments for bot evasion
        stealth_args = [
            f"--load-extension={extension_dir}",
            f"--disable-extensions-except={extension_dir}",
            "--disable-blink-features=AutomationControlled",  # Hide automation indicators
            "--no-sandbox",  # Required for some environments
            "--disable-infobars",  # Hide "Chrome is being controlled" message
        ]

        # The profile directory is tracked on the instance so close() can
        # delete it once the session ends.
        user_data_dir = tempfile.mkdtemp(prefix="sentience-profile-")
        self._user_data_dir = user_data_dir

        # Realistic viewport and user-agent for better evasion
        launch_options = {
            "user_data_dir": user_data_dir,
            "headless": self.headless,
            "args": stealth_args,
            "viewport": {"width": 1920, "height": 1080},
            "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            "timeout": 30000,
        }

        # Note: channel="chrome" (system Chrome) has known issues with
        # extension loading, so bundled Chromium is used; the stealth
        # features above still apply.
        use_chrome_channel = False  # Disable for now due to extension loading issues
        if use_chrome_channel:
            launch_options["channel"] = "chrome"  # Use system Chrome (better evasion)

        try:
            self.context = self.playwright.chromium.launch_persistent_context(**launch_options)
        except Exception as launch_error:
            # The profile is useless without a browser; drop it right away.
            shutil.rmtree(user_data_dir, ignore_errors=True)
            self._user_data_dir = None
            raise RuntimeError(
                f"Failed to launch browser: {launch_error}\n"
                "Make sure Playwright browsers are installed: playwright install chromium"
            ) from launch_error

    def _collect_diagnostics(self) -> str:
        """Return item 5 of the load-failure message describing page state."""
        try:
            diagnostic_info = self.page.evaluate("""
                () => {
                    const info = {
                        sentience_defined: typeof window.sentience !== 'undefined',
                        registry_defined: typeof window.sentience_registry !== 'undefined',
                        snapshot_defined: typeof window.sentience?.snapshot === 'function',
                        extension_id: document.documentElement.dataset.sentienceExtensionId || 'not set',
                        url: window.location.href
                    };
                    if (window.sentience) {
                        info.sentience_keys = Object.keys(window.sentience);
                    }
                    return info;
                }
            """)
        except Exception:
            return "\n5. Could not get diagnostic info"
        return f"\n5. Diagnostic info: {diagnostic_info}"

    def _wait_for_extension(self, timeout: int = 20000) -> bool:
        """Wait for window.sentience API to be available.

        Polls the page roughly every 0.3 seconds.

        Args:
            timeout: Maximum time to wait, in milliseconds.

        Returns:
            True as soon as the page reports the API ready, False on timeout.
            On timeout a warning with the last observed status is emitted.
        """
        import time
        import warnings

        # A monotonic clock keeps the timeout immune to wall-clock adjustments.
        deadline = time.monotonic() + timeout / 1000
        last_error = None

        while time.monotonic() < deadline:
            try:
                result = self.page.evaluate("""
                    () => {
                        // Check if sentience API exists
                        if (typeof window.sentience === 'undefined') {
                            return { ready: false, reason: 'window.sentience not defined' };
                        }
                        // Check if snapshot function exists
                        if (typeof window.sentience.snapshot !== 'function') {
                            return { ready: false, reason: 'snapshot function not available' };
                        }
                        // Check if registry is initialized
                        if (window.sentience_registry === undefined) {
                            return { ready: false, reason: 'registry not initialized' };
                        }
                        // Check if WASM module is loaded (check internal _wasmModule if available)
                        const sentience = window.sentience;
                        if (sentience._wasmModule && !sentience._wasmModule.analyze_page) {
                            return { ready: false, reason: 'WASM module not fully loaded' };
                        }
                        // If _wasmModule is not exposed, that's okay - it might be internal
                        // Just verify the API structure is correct
                        return { ready: true };
                    }
                """)
                if isinstance(result, dict):
                    if result.get("ready"):
                        return True
                    last_error = result.get("reason", "Unknown error")
            except Exception as e:
                # Continue waiting on errors (e.g. page still navigating)
                last_error = f"Evaluation error: {str(e)}"
            time.sleep(0.3)

        # Log the last error for debugging
        if last_error:
            warnings.warn(f"Extension wait timeout. Last status: {last_error}")
        return False

    def close(self) -> None:
        """Close browser and cleanup.

        Closes the browser context, stops the Playwright driver, and removes
        the temporary extension bundle and browser profile directories.  Safe
        to call on a never-started, partially started, or already closed
        instance.  Each step runs even if an earlier one raises; the first
        error still propagates afterwards.
        """
        # Detach everything first so a second close() is a no-op and a
        # failure half-way through cannot leave stale handles behind.
        context, self.context = self.context, None
        playwright, self.playwright = self.playwright, None
        self.page = None
        temp_dirs = (self._extension_path, self._user_data_dir)
        self._extension_path = None
        self._user_data_dir = None

        try:
            if context:
                context.close()
        finally:
            try:
                if playwright:
                    playwright.stop()
            finally:
                for path in temp_dirs:
                    if path:
                        # Best effort: the browser may still hold files briefly.
                        shutil.rmtree(path, ignore_errors=True)

    def __enter__(self) -> "SentienceBrowser":
        """Context manager entry: start the browser and return the session.

        If start() fails, everything acquired so far is released before the
        error propagates, because __exit__ is never invoked when __enter__
        raises and the caller has no handle to close.
        """
        try:
            self.start()
        except BaseException:
            try:
                self.close()
            except Exception:
                # Keep the original start() failure as the reported error.
                pass
            raise
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: always close; exceptions are not suppressed."""
        self.close()