Skip to content

Commit b246564

Browse files
authored
Merge pull request #535 from ccprocessor/dev
v4.0.0-released
2 parents fe1a2ec + 60136b0 commit b246564

8 files changed

Lines changed: 1746 additions & 69 deletions

File tree

llm_web_kit/extractor/html/pre_extractor.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22

33
from overrides import override
4+
from selectolax.parser import HTMLParser
45

56
from llm_web_kit.extractor.config import INVISIBLE_TAGS
67
from llm_web_kit.extractor.pre_extractor import \
@@ -154,7 +155,9 @@ def _do_pre_extract(self, data_json: DataJson) -> DataJson:
154155
def __clean_interactive_elements(self, data_json: DataJson) -> str:
155156
"""清除main_html中交互式元素."""
156157
html_content = data_json['main_html']
157-
tree = html_to_element(html_content)
158+
selectolax_tree = HTMLParser(html_content)
159+
fixed_html = selectolax_tree.html
160+
tree = html_to_element(fixed_html)
158161
# 删除main_html中的script和style标签
159162
for script_element in tree.xpath('//script'):
160163
remove_element(script_element)

llm_web_kit/extractor/html/recognizer/table.py

Lines changed: 72 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from itertools import chain
21
from typing import Any, List, Tuple
32

43
from lxml.html import HtmlElement
@@ -9,9 +8,30 @@
98
from llm_web_kit.extractor.html.recognizer.recognizer import (
109
BaseHTMLElementRecognizer, CCTag)
1110
from llm_web_kit.libs.doc_element_type import DocElementType
12-
from llm_web_kit.libs.html_utils import process_sub_sup_tags, remove_element
11+
from llm_web_kit.libs.html_utils import (html_normalize_space,
12+
process_sub_sup_tags)
1313
from llm_web_kit.libs.text_utils import normalize_text_segment
1414

15+
from .text import inline_tags
16+
17+
# HTML void (empty) elements.
VOID_ELEMENTS = {
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
}
23+
24+
# Attribute names that survive table_clean_attributes().
KEEP_ATTRS = {'colspan', 'rowspan'}


def table_clean_attributes(element):
    """Delete every attribute not listed in KEEP_ATTRS, in place.

    The cleanup covers ``element`` itself and all of its descendants
    (reached through ``iterchildren()``).

    Args:
        element: Root of the subtree to clean; must expose ``attrib`` and
            ``iterchildren()``.
    """
    pending = [element]
    while pending:
        node = pending.pop()
        # Snapshot the names first: the mapping is mutated while deleting.
        doomed = [name for name in node.attrib.keys() if name not in KEEP_ATTRS]
        for name in doomed:
            del node.attrib[name]
        pending.extend(node.iterchildren())
34+
1535

1636
class TableRecognizer(BaseHTMLElementRecognizer):
1737
"""解析table元素."""
@@ -175,6 +195,9 @@ def __check_table_include_math_code(self, raw_html: HtmlElement):
175195
raw_html=math_raw_html
176196
)
177197
result = []
198+
if not math_res_parts:
199+
if raw_html.tag == 'br' or raw_html.xpath('.//br'):
200+
result.append("\n\n")
178201
for math_item in math_res_parts:
179202
ele_item = math_item[0]
180203

@@ -211,79 +234,76 @@ def process_node(node):
211234
if node.tail and node.tail.strip():
212235
result.append(node.tail.strip())
213236
else:
237+
if node.tag == 'br' or node.tag not in inline_tags:
238+
result.append('\n\n')
239+
214240
# 提取当前节点的文本
215241
if node.text and node.text.strip():
216242
cleaned_text = node.text.strip()
217-
result.append(cleaned_text)
218-
# 处理节点的tail(元素闭合后的文本)
219-
if node.tail and node.tail.strip():
220-
cleaned_tail = node.tail.strip()
221-
result.append(cleaned_tail)
243+
result.append(html_normalize_space(cleaned_text))
244+
222245
# 递归处理子节点
223246
for child in node:
224247
process_node(child)
248+
# 处理节点的tail(元素闭合后的文本)
249+
if node.tail and node.tail.strip():
250+
if node.tag not in inline_tags:
251+
result.append('\n\n')
252+
cleaned_tail = node.tail.strip()
253+
result.append(html_normalize_space(cleaned_tail))
225254
# 从根节点开始处理
226255
process_node(ele_item)
227256
return result
228257

229258
def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None:
    """Simplify <td>/<th> content in place, keeping nested table structure.

    An element that is a ``table``/``td``/``th``, or that contains one
    anywhere below it, is treated as structural: its children are visited
    recursively. A structural element with no children gets its text
    replaced by the flattened result of
    ``__check_table_include_math_code``.

    Any other element is flattened: its content is extracted, the element
    is cleared, and the joined text is stored on the element. A void
    element (see ``VOID_ELEMENTS``) cannot carry text, so its text is
    attached to the previous sibling's tail, or to the parent's text when
    there is no previous sibling.

    Args:
        table_nest_level: Only forwarded unchanged to the recursive calls.
        elem: Element to simplify; modified in place.
    """
    structural_tags = ('table', 'td', 'th')
    # One descendant query also covers direct children, so a separate scan
    # over ``iterchildren()`` is unnecessary.
    is_structural = (
        elem.tag in structural_tags
        or bool(elem.xpath('.//table | .//td | .//th'))
    )

    if is_structural:
        if len(elem) > 0:
            # Keep the structure and descend into the children.
            for child in elem.iterchildren():
                self.__simplify_td_th_content(table_nest_level, child)
        else:
            math_res = self.__check_table_include_math_code(elem)
            elem.text = ' '.join(normalize_text_segment(item) for item in math_res)
        return

    # Extract before clear(): clear() drops children, text, tail and attributes.
    # NOTE(review): this relies on the extracted parts already containing the
    # element's own tail text - confirm against __check_table_include_math_code.
    math_res = self.__check_table_include_math_code(elem)
    elem.clear()
    math_res_text = ' '.join(normalize_text_segment(item) for item in math_res)

    if elem.tag not in VOID_ELEMENTS:
        elem.text = math_res_text
        return

    elem_pre = elem.getprevious()
    if elem_pre is not None:
        # Append instead of overwrite: a structural previous sibling (e.g. a
        # nested table) keeps its tail, and overwriting it would lose text.
        # A blank tail is still replaced, as before.
        pre_tail = elem_pre.tail
        kept_tail = pre_tail + ' ' if pre_tail and pre_tail.strip() else ''
        elem_pre.tail = kept_tail + math_res_text
        return

    elem_parent = elem.getparent()
    if elem_parent is not None:
        parent_text = elem_parent.text + ' ' if elem_parent.text is not None else ''
        elem_parent.text = parent_text + math_res_text
263286

264287
def __get_table_body(self, table_type, table_nest_level, table_root):
    """Process the table body and return its serialized form.

    For ``table_type == 'empty'`` the plain ``text_content()`` of
    ``table_root`` is returned. Otherwise the tree is modified in place:
    cell content is simplified, attributes are cleaned, leading whitespace
    is trimmed, and the result of ``_element_to_html_entity`` is returned.

    Args:
        table_type: Table kind; only the value ``'empty'`` is checked here.
        table_nest_level: Forwarded to ``__simplify_td_th_content``.
        table_root: Root element of the table.
    """
    if table_type == 'empty':
        return table_root.text_content()

    self.__simplify_td_th_content(table_nest_level, table_root)
    # Drop every attribute except colspan and rowspan.
    table_clean_attributes(table_root)

    leading_ws = '\n\t '
    for node in table_root.iter():
        # Only leading whitespace is trimmed from text and tail.
        text = node.text
        if text is not None:
            node.text = text.lstrip(leading_ws)
        tail = node.tail
        if tail is not None:
            # A tail containing a blank-line marker gets it back as a prefix.
            prefix = "\n\n" if "\n\n" in tail else ''
            node.tail = prefix + tail.lstrip(leading_ws)

    return self._element_to_html_entity(table_root)
288308

289309
def __do_extract_tables(self, root: HtmlElement) -> None:

0 commit comments

Comments
 (0)