forked from addy999/onequery
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
255 lines (219 loc) · 8.25 KB
/
utils.py
File metadata and controls
255 lines (219 loc) · 8.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
import re
import time
from bs4 import BeautifulSoup, Comment, Tag
from bs4.element import NavigableString
from playwright.async_api import ElementHandle, Locator, Page, expect
def _inline_style_hides(style):
    """Return True if an inline ``style`` string sets display:none or visibility:hidden."""
    if not style:
        return False
    declared = {}
    for declaration in str(style).split(";"):
        prop, sep, value = declaration.partition(":")
        if not sep:
            continue
        # CSS property names and keywords are case-insensitive; drop any
        # trailing "!important" so "none !important" still reads as "none".
        # A later declaration of the same property overrides an earlier one.
        declared[prop.strip().lower()] = value.split("!", 1)[0].strip().lower()
    return declared.get("display") == "none" or declared.get("visibility") == "hidden"


def is_visible_bs(element):
    """
    Checks if an element is likely visible based on common HTML attributes and inline styles.

    The element and each of its ancestors (up to, but not including, the
    ``[document]`` root) is inspected for a ``hidden`` attribute or an inline
    ``display: none`` / ``visibility: hidden`` declaration.

    This is not a perfect solution as it does not parse external CSS files.

    Args:
        element: A BeautifulSoup element (anything exposing ``get``, ``parent``
            and ``name``), or ``None``.

    Returns:
        ``False`` if ``element`` is ``None`` or if it or any ancestor is hidden
        by one of the checks above, otherwise ``True``.
    """
    if element is None:
        return False

    node = element
    while True:
        # A bare `hidden` attribute has an empty-string value, so test presence.
        if node.get("hidden") is not None:
            return False
        # Inspect the actual declarations: a substring test would misread
        # e.g. "display: flex; pointer-events: none" as hidden.
        if _inline_style_hides(node.get("style", "")):
            return False

        parent = node.parent
        if parent is None or parent.name == "[document]":
            return True
        node = parent
class DOMSimplifier:
    """Render a Playwright page as a compact, indented text outline.

    The page HTML is parsed with BeautifulSoup and walked in parallel with a
    chain of Playwright locators. Interactive elements that can be resolved
    in the browser are given a numeric ID in the outline, and the matching
    element handle is stored in ``element_map`` under that ID.
    """

    def __init__(self):
        # Tags that receive a numeric ID when a browser handle is found.
        self.interactive_tags = {
            "a",
            "button",
            "input",
            "select",
            "textarea",
            "details",
            "summary",
            "iframe",
        }
        # Tags that define structure/grouping (keep these even if not interactive)
        self.semantic_tags = {
            "body",
            "h1",
            "h2",
            "h3",
            "h4",
            "h5",
            "h6",
            "ul",
            "ol",
            "li",
            "form",
            "label",
            "table",
            "thead",
            "tbody",
            "tr",
            "td",
            "th",
            "main",
            "header",
            "footer",
            "nav",
            "section",
            "article",
        }
        # Tags removed from the soup, together with their subtree, by clean_dom().
        self.ignore_tags = {
            "script",
            "style",
            "meta",
            "noscript",
            "link",
            "svg",
            "path",
            "br",
        }
        # node ID (int) -> element handle; reset on every parse().
        self.element_map = {}
        # Next node ID to hand out; reset on every parse().
        self.counter = 1
        # Viewport size dict ("width"/"height"); populated by parse().
        self.viewport = None

    def clean_dom(self, soup):
        """Strip ignored tags, comments and hidden inputs from ``soup`` in place.

        Returns:
            The same ``soup`` object, for chaining.
        """
        for tag in soup.find_all(list(self.ignore_tags)):
            tag.decompose()
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        for tag in soup.find_all("input", type="hidden"):
            tag.decompose()
        return soup

    def get_own_text(self, soup_element: "Tag"):
        """Gets text actually inside this tag, NOT inside its children.

        When the tag has no direct text, falls back to its ``aria-label``
        followed by its comma-joined class names.
        """
        fragments = []
        for child in soup_element.children:
            if isinstance(child, NavigableString) and not isinstance(child, Comment):
                trimmed = child.strip()
                if trimmed:
                    fragments.append(trimmed)
        text = " ".join(fragments)
        if not text:
            aria_label = soup_element.get("aria-label")
            aria_str = str(aria_label) if aria_label else ""
            class_names = ",".join(soup_element.get("class") or [])
            text = (aria_str + " " + class_names).strip()
        return text

    def get_xpath(self, elem):
        """Generates an XPath for a BeautifulSoup element.

        The path is built bottom-up and stops early at the nearest element
        (the starting element or an ancestor) that has an ``id``, which is
        emitted as ``//*[@id='...']``. Otherwise it runs up to the topmost
        tag below ``[document]``.

        Returns:
            The path segments joined with ``/``.
        """
        segments = []
        while elem is not None:
            # Use ID if available for shorter XPath
            element_id = elem.attrs.get("id")
            if element_id:
                segments.append(f"//*[@id='{element_id}']")
                break

            parent = elem.parent
            # Determine the position among siblings of the same tag type
            siblings = (
                parent.find_all(elem.name, recursive=False)
                if parent is not None
                else []
            )
            if len(siblings) > 1:
                # Compare by identity: BeautifulSoup tags compare equal when
                # their markup is equal, so list.index() would return the first
                # look-alike sibling rather than this element.
                position = next(
                    i for i, sibling in enumerate(siblings, start=1) if sibling is elem
                )
                segments.append(f"{elem.name}[{position}]")
            else:
                segments.append(elem.name)

            if parent is None or parent.name == "[document]":
                break
            elem = parent
        # Segments were collected leaf-first; reverse and join with /
        return "/".join(reversed(segments))

    def _in_viewport(self, box):
        """Return True if ``box`` overlaps the stored viewport (or none is set)."""
        viewport = self.viewport
        if not viewport:
            # No viewport known (process_node called outside parse()): don't filter.
            return True
        return (
            box["y"] < viewport["height"]
            and box["y"] + box["height"] > 0
            and box["x"] < viewport["width"]
            and box["x"] + box["width"] > 0
        )

    async def process_node(self, locator: "Locator", soup_element: "Tag", depth=0):
        """Render ``soup_element`` and its descendants as outline text.

        Args:
            locator: Playwright locator expected to resolve to the same element
                as ``soup_element`` in the live page.
            soup_element: The BeautifulSoup node to render.
            depth: Current indentation level of the outline.

        Returns:
            The outline text for this subtree, or ``""`` when the node is a
            text node, is not visible per ``is_visible_bs``, or is an
            interactive element whose bounding box lies outside the viewport.
        """
        if isinstance(soup_element, NavigableString) or not is_visible_bs(soup_element):
            return ""

        tag_name = soup_element.name
        own_text = self.get_own_text(soup_element)

        is_interactive = (
            tag_name in self.interactive_tags
            or soup_element.get("aria-label") is not None
        )
        should_print = (
            is_interactive or tag_name in self.semantic_tags or len(own_text) > 0
        )

        # Only interactive nodes are resolved in the browser.
        element_handle = None
        if is_interactive:
            handles = await locator.element_handles()
            element_handle = handles[-1] if handles else None
            if element_handle is not None:
                box = await element_handle.bounding_box()
                # Off-screen interactive elements are dropped with their subtree.
                if box and not self._in_viewport(box):
                    return ""

        # ID generation - only interactive nodes with a live handle get an ID.
        node_id_str = ""
        if element_handle is not None:
            node_id = self.counter
            self.element_map[node_id] = element_handle
            node_id_str = f"[{node_id}] "
            self.counter += 1

        # Collect useful attributes using BeautifulSoup
        attrs = []
        for attr_name, label in (
            ("aria-label", "aria"),
            ("placeholder", "plh"),
            ("alt", "alt"),
            ("name", "name"),
        ):
            value = soup_element.get(attr_name)
            if value:
                attrs.append(f'{label}="{value}"')
        if tag_name == "input":
            input_type = soup_element.get("type") or "text"
            attrs.append(f'type="{input_type}"')

        pieces = []
        if should_print:
            indent = " " * depth
            attr_str = " " + " ".join(attrs) if attrs else ""
            text_str = f' "{own_text}"' if own_text else ""
            pieces.append(f"{indent}{node_id_str}{tag_name}{attr_str}{text_str}\n")
            next_depth = depth + 1
        else:
            # PASS-THROUGH: This is a generic div/span.
            # Don't print it, but process its children at the SAME depth.
            next_depth = depth

        # Recurse into child tags, tracking each child's ordinal among
        # same-named siblings so its locator resolves to exactly one element.
        seen = {}
        for child in soup_element.children:
            if not isinstance(child, Tag):
                continue
            child_name = child.name
            selector = f"> {child_name}"
            if child_name == "input":
                # Hidden inputs are never rendered (clean_dom removes them from
                # the soup); exclude them on the browser side too so the
                # ordinals computed here line up with the live DOM.
                if str(child.get("type") or "").lower() == "hidden":
                    continue
                selector += ':not([type="hidden"])'
            ordinal = seen.get(child_name, 0) + 1
            seen[child_name] = ordinal
            # Always pin the match with nth(): a bare "> tag" would match every
            # same-named sibling and the handle lookup would pick the wrong one.
            child_locator = locator.locator(selector).nth(ordinal - 1)
            pieces.append(await self.process_node(child_locator, child, next_depth))
        return "".join(pieces)

    async def parse(self, page: "Page") -> "tuple[str, dict[int, ElementHandle]]":
        """Build the outline for ``page``.

        Resets the ID counter and element map, so IDs from a previous call are
        no longer valid afterwards.

        Returns:
            A ``(outline_text, element_map)`` tuple, where ``element_map`` maps
            each integer ID shown in the outline to its element handle.
        """
        self.counter = 1
        self.element_map = {}

        self.viewport = page.viewport_size
        if self.viewport is None:
            # viewport_size is None when the context has no fixed viewport;
            # fall back to the window's inner size instead of failing.
            self.viewport = await page.evaluate(
                "() => ({ width: window.innerWidth, height: window.innerHeight })"
            )

        # Get the DOM content and parse with BeautifulSoup
        dom_content = await page.content()
        soup = self.clean_dom(BeautifulSoup(dom_content, "html.parser"))

        # Get body from both Playwright and BeautifulSoup
        body_locator = page.locator("body")
        body_soup = soup.body if soup.body is not None else soup

        print("> Starting DOM parsing...")
        result = await self.process_node(body_locator, body_soup)
        return (result, self.element_map)