|
1 | | -from itertools import chain |
2 | 1 | from typing import Any, List, Tuple |
3 | 2 |
|
4 | 3 | from lxml.html import HtmlElement |
|
9 | 8 | from llm_web_kit.extractor.html.recognizer.recognizer import ( |
10 | 9 | BaseHTMLElementRecognizer, CCTag) |
11 | 10 | from llm_web_kit.libs.doc_element_type import DocElementType |
12 | | -from llm_web_kit.libs.html_utils import process_sub_sup_tags, remove_element |
| 11 | +from llm_web_kit.libs.html_utils import (html_normalize_space, |
| 12 | + process_sub_sup_tags) |
13 | 13 | from llm_web_kit.libs.text_utils import normalize_text_segment |
14 | 14 |
|
| 15 | +from .text import inline_tags |
| 16 | + |
# Void (self-closing) HTML elements: they cannot hold text or children.
VOID_ELEMENTS = {
    'area', 'base', 'br', 'col', 'embed', 'hr',
    'img', 'input', 'link', 'meta', 'param',
    'source', 'track', 'wbr'
}

# The only attributes that survive table clean-up.
KEEP_ATTRS = {'colspan', 'rowspan'}


def table_clean_attributes(element):
    """Strip every attribute except those in ``KEEP_ATTRS`` from a subtree.

    The element itself and all of its descendants (as reached through
    ``iterchildren()``) are modified in place.

    Args:
        element: Root of the subtree; must expose a mutable ``attrib``
            mapping and an ``iterchildren()`` method.

    Returns:
        None.
    """
    # An explicit work-list is used instead of recursion so that very deeply
    # nested markup cannot exhaust the Python call stack.
    pending = [element]
    while pending:
        node = pending.pop()
        attrib = node.attrib
        # Snapshot the keys first: the mapping is mutated while we walk it.
        for name in list(attrib.keys()):
            if name not in KEEP_ATTRS:
                del attrib[name]
        pending.extend(node.iterchildren())
| 34 | + |
15 | 35 |
|
16 | 36 | class TableRecognizer(BaseHTMLElementRecognizer): |
17 | 37 | """解析table元素.""" |
@@ -175,6 +195,9 @@ def __check_table_include_math_code(self, raw_html: HtmlElement): |
175 | 195 | raw_html=math_raw_html |
176 | 196 | ) |
177 | 197 | result = [] |
| 198 | + if not math_res_parts: |
| 199 | + if raw_html.tag == 'br' or raw_html.xpath('.//br'): |
| 200 | + result.append("\n\n") |
178 | 201 | for math_item in math_res_parts: |
179 | 202 | ele_item = math_item[0] |
180 | 203 |
|
@@ -211,79 +234,76 @@ def process_node(node): |
211 | 234 | if node.tail and node.tail.strip(): |
212 | 235 | result.append(node.tail.strip()) |
213 | 236 | else: |
| 237 | + if node.tag == 'br' or node.tag not in inline_tags: |
| 238 | + result.append('\n\n') |
| 239 | + |
214 | 240 | # 提取当前节点的文本 |
215 | 241 | if node.text and node.text.strip(): |
216 | 242 | cleaned_text = node.text.strip() |
217 | | - result.append(cleaned_text) |
218 | | - # 处理节点的tail(元素闭合后的文本) |
219 | | - if node.tail and node.tail.strip(): |
220 | | - cleaned_tail = node.tail.strip() |
221 | | - result.append(cleaned_tail) |
| 243 | + result.append(html_normalize_space(cleaned_text)) |
| 244 | + |
222 | 245 | # 递归处理子节点 |
223 | 246 | for child in node: |
224 | 247 | process_node(child) |
| 248 | + # 处理节点的tail(元素闭合后的文本) |
| 249 | + if node.tail and node.tail.strip(): |
| 250 | + if node.tag not in inline_tags: |
| 251 | + result.append('\n\n') |
| 252 | + cleaned_tail = node.tail.strip() |
| 253 | + result.append(html_normalize_space(cleaned_tail)) |
225 | 254 | # 从根节点开始处理 |
226 | 255 | process_node(ele_item) |
227 | 256 | return result |
228 | 257 |
|
def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None:
    """Flatten cell content to plain text while keeping table structure.

    Elements that are, or contain, ``table``/``td``/``th`` are structural:
    their children are visited recursively, and a childless structural
    element gets its extracted text assigned to ``elem.text``.

    Any other element is collapsed: its content is extracted through
    ``__check_table_include_math_code``, the element is cleared, and the
    space-joined text is stored back. Void elements cannot carry text, so
    their text is appended to the previous sibling's tail or, when there is
    no previous sibling, to the parent's text.

    Args:
        table_nest_level: Nesting level of the table; only forwarded to the
            recursive calls.
        elem: Element to simplify in place.
    """
    structural_tags = ('td', 'th', 'table')
    # One union query covers direct children as well as deeper descendants.
    if elem.tag in structural_tags or elem.xpath('.//table | .//td | .//th'):
        if len(elem) > 0:
            # Structural container: keep walking down the tree.
            for child in elem.iterchildren():
                self.__simplify_td_th_content(table_nest_level, child)
        else:
            math_res = self.__check_table_include_math_code(elem)
            elem.text = ' '.join(normalize_text_segment(item) for item in math_res)
        return

    # Extract before clearing: clear() drops children, attributes, text and
    # (by default in lxml) the tail as well.
    math_res = self.__check_table_include_math_code(elem)
    elem.clear()
    math_res_text = ' '.join(normalize_text_segment(item) for item in math_res)

    if elem.tag not in VOID_ELEMENTS:
        elem.text = math_res_text
        return

    elem_pre = elem.getprevious()
    if elem_pre is not None:
        # Append instead of overwrite so text already sitting in the
        # sibling's tail (e.g. after a nested table) is not lost.
        existing_tail = elem_pre.tail + ' ' if elem_pre.tail is not None else ''
        elem_pre.tail = existing_tail + math_res_text
        return

    elem_parent = elem.getparent()
    if elem_parent is not None:
        existing_text = elem_parent.text + ' ' if elem_parent.text is not None else ''
        elem_parent.text = existing_text + math_res_text
263 | 286 |
|
def __get_table_body(self, table_type, table_nest_level, table_root):
    """Process a table element and return its serialized body.

    Args:
        table_type: Table classification; for ``'empty'`` the plain text
            content of ``table_root`` is returned without further processing.
        table_nest_level: Nesting level, forwarded to the cell simplifier.
        table_root: Root element of the table; modified in place.

    Returns:
        ``table_root.text_content()`` for an ``'empty'`` table, otherwise the
        result of ``self._element_to_html_entity(table_root)``.
    """
    if table_type == 'empty':
        return table_root.text_content()

    # Flatten cell content first, then drop every attribute other than
    # colspan / rowspan.
    self.__simplify_td_th_content(table_nest_level, table_root)
    table_clean_attributes(table_root)

    leading_ws = '\n\t '
    for element in table_root.iter():
        # Strip leading whitespace from text and tail (trailing whitespace
        # is left untouched).
        if element.text is not None:
            element.text = element.text.lstrip(leading_ws)
        if element.tail is not None:
            stripped_tail = element.tail.lstrip(leading_ws)
            # A tail containing a blank-line marker keeps one at its start.
            if "\n\n" in element.tail:
                element.tail = "\n\n" + stripped_tail
            else:
                element.tail = stripped_tail

    return self._element_to_html_entity(table_root)
288 | 308 |
|
289 | 309 | def __do_extract_tables(self, root: HtmlElement) -> None: |
|
0 commit comments