|
43 | 43 | IMG_SRC_PATTERN = re.compile(r'<img\s+[^>]*src="([^"]+)"', re.IGNORECASE) |
44 | 44 | HTML_IMG_TAG_PATTERN = re.compile(r"<img\s+[^>]*>", re.IGNORECASE) |
45 | 45 | MARKDOWN_IMG_PATTERN = re.compile(r"!\[[^\]]*\]\(([^)]+)\)") |
46 | | -CODE_BLOCK_PATTERN = re.compile(r"```[\s\S]*?```") |
47 | | -INLINE_CODE_PATTERN = re.compile(r"`[^`\n]*`") |
| 46 | +CODE_BLOCK_PATTERN = re.compile(r"```[^\n]*\n([\s\S]*?)```") |
| 47 | +INLINE_CODE_PATTERN = re.compile(r"`([^`\n]*)`") |
48 | 48 | LINK_PATTERN = re.compile(r"\[([^\]]+)\]\(([^)]+)\)") |
49 | 49 | HTML_TAG_PATTERN = re.compile(r"<[^>]+>") |
50 | 50 | MATH_BLOCK_PATTERN = re.compile(r"\$\$[\s\S]*?\$\$") |
@@ -521,16 +521,16 @@ def _to_plain_text(self, markdown_body: str) -> str: |
521 | 521 | if not base: |
522 | 522 | return "" |
523 | 523 |
|
524 | | - text = CODE_BLOCK_PATTERN.sub(" ", base) |
| 524 | + text = CODE_BLOCK_PATTERN.sub(lambda match: f" {match.group(1)} ", base) |
525 | 525 | text = MATH_BLOCK_PATTERN.sub(" ", text) |
526 | 526 | text = INLINE_MATH_PATTERN.sub(" ", text) |
527 | | - text = INLINE_CODE_PATTERN.sub(" ", text) |
| 527 | + text = INLINE_CODE_PATTERN.sub(lambda match: f" {match.group(1)} ", text) |
528 | 528 | text = MARKDOWN_IMG_PATTERN.sub(" ", text) |
529 | 529 | text = HTML_IMG_TAG_PATTERN.sub(" ", text) |
530 | 530 | text = LINK_PATTERN.sub(lambda match: f" {match.group(1)} ", text) |
531 | 531 | text = HTML_TAG_PATTERN.sub(" ", text) |
532 | 532 | text = re.sub(r"(?mi)^\s*#{1,6}\s*images\s*$", " ", text) |
533 | | - text = re.sub(r"[#>*_`~\-]+", " ", text) |
| 533 | + text = re.sub(r"[#>*~]+", " ", text) |
534 | 534 | text = re.sub(r"\s+", " ", text) |
535 | 535 | return text.strip() |
536 | 536 |
|
|
0 commit comments