Skip to content

Commit ce41728

Browse files
feat: 将获取段落文本时,br换成双换行 (#325)
1 parent 4a3928b commit ce41728

13 files changed

Lines changed: 15156 additions & 8656 deletions

File tree

llm_web_kit/extractor/html/recognizer/text.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,8 @@
37 37
'☁' # 云符号
38 38
]
39 39

40+
PARAGRAPH_SEPARATOR = '\n\n'
41+
40 42

41 43
class TextParagraphRecognizer(BaseHTMLElementRecognizer):
42 44
"""解析文本段落元素."""
@@ -114,11 +116,11 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str:
114 116
text2 = text2.strip(' ') if text2 else ''
115 117
if lang == 'zh':
116 118
txt = text1 + text2
117-
return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')
119+
return txt.strip()
118 120
else:
119 121
words_sep = '' if text2[0] in string.punctuation or text2[0] in special_symbols else ' '
120 122
txt = text1 + words_sep + text2
121-
return txt.strip().replace('\\r\\n', '\n').replace('\\n', '\n')
123+
return txt.strip()
122 124

123 125
def __get_paragraph_text(self, root: HtmlElement) -> List[dict]:
124 126
"""
@@ -147,7 +149,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
147 149
text = ''
148 150
para_text.append({'c': el.text, 't': ParagraphTextType.CODE_INLINE})
149 151
elif el.tag in ['br']:
150-
text += '\n'
152+
text += PARAGRAPH_SEPARATOR
151 153
else:
152 154
if el.text and el.text.strip():
153 155
text = self.__combine_text(text, el.text.strip())

tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/para_is_short.html

Lines changed: 1092 additions & 1 deletion
Large diffs are not rendered by default.

tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text10.html

Lines changed: 2310 additions & 1 deletion
Large diffs are not rendered by default.

tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text2.html

Lines changed: 1846 additions & 1 deletion
Large diffs are not rendered by default.

tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text3.html

Lines changed: 2667 additions & 1246 deletions
Large diffs are not rendered by default.

tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text4.html

Lines changed: 2503 additions & 1 deletion
Large diffs are not rendered by default.

tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text5.html

Lines changed: 2621 additions & 1 deletion
Large diffs are not rendered by default.

tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text6.html

Lines changed: 1305 additions & 1 deletion
Large diffs are not rendered by default.

tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text7.html

Lines changed: 272 additions & 1 deletion
Large diffs are not rendered by default.

tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/text8.html

Lines changed: 351 additions & 1 deletion
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)