From bf0a1e29c991bacb40438138a4d86338b7d9acd3 Mon Sep 17 00:00:00 2001
From: Crozzers
Date: Sun, 30 Nov 2025 09:49:59 +0000
Subject: [PATCH 1/9] Re-write the IAB processor to implement GFM rules
---
lib/markdown2.py | 160 +++++++++++++++++-
.../tm-cases/middle_word_em_escaped_char.html | 1 +
.../tm-cases/middle_word_em_escaped_char.opts | 1 +
.../tm-cases/middle_word_em_escaped_char.text | 1 +
4 files changed, 157 insertions(+), 6 deletions(-)
create mode 100644 test/tm-cases/middle_word_em_escaped_char.html
create mode 100644 test/tm-cases/middle_word_em_escaped_char.opts
create mode 100644 test/tm-cases/middle_word_em_escaped_char.text
diff --git a/lib/markdown2.py b/lib/markdown2.py
index 71b19f67..3dd19541 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -1123,8 +1123,14 @@ def _strict_tag_block_sub(
return result
def _tag_is_closed(self, tag_name: str, text: str) -> bool:
- # super basic check if number of open tags == number of closing tags
- return len(re.findall('<%s(?:.*?)>' % tag_name, text)) == len(re.findall('</%s>' % tag_name, text))
+ # check if number of open tags == number of close tags
+ if len(re.findall('<%s(?:.*?)>' % tag_name, text)) != len(re.findall('</%s>' % tag_name, text)):
+ return False
+
+ # check that close tag position is AFTER open tag
+ close_index = text.find(f'</{tag_name}')
+ open_index = text.find(f'<{tag_name}')
+ return open_index != -1 and close_index != -1 and open_index < close_index
@mark_stage(Stage.LINK_DEFS)
def _strip_link_definitions(self, text: str) -> str:
@@ -2066,8 +2072,11 @@ def sub(match: re.Match):
return f'{prefix}<{syntax}>{contents}</{syntax}>'
# must go first:
- text = self._strong_re.sub(sub, text)
- text = self._em_re.sub(sub, text)
+ # text = self._strong_re.sub(sub, text)
+ # text = self._em_re.sub(sub, text)
+ iab = ItalicAndBoldProcessor2(self, None)
+ if iab.test(text):
+ text = iab.run(text)
return text
_block_quote_base = r'''
@@ -2581,6 +2590,138 @@ def test(self, text):
return self.hash_table and re.search(r'md5-[0-9a-z]{32}', text)
+class ItalicAndBoldProcessor2(Extra):
+ name = 'iabp-2'
+ order = (Stage.ITALIC_AND_BOLD,), tuple()
+
+ def run(self, text):
+ for em_type in '*_':
+ opens = []
+ unused_opens = {}
+ tokens = []
+ index = 0
+
+ delim_runs = tuple(re.finditer(r'([%s]+)' % em_type, text))
+ for delim_run in delim_runs:
+ # first check if it is opening (left flanking)
+ # or closing (right flanking) run
+ run = delim_run.string[max(0, delim_run.start() - 1): delim_run.end() + 1]
+ syntax = delim_run.group(1)
+ syntax_re = syntax.replace('*', r'\*')
+
+ left = (
+ # not followed by whitespace
+ re.match(r'.*%s\S' % syntax_re, run, re.S)
+ and (
+ # either not followed by punctuation
+ re.match(r'.*%s[\s\w]' % syntax_re, run, re.S)
+ # or followed by punct and preceded by punct/whitespace
+ or re.match(r'(^|[\s\W])%s([^\s\w]|$)' % syntax_re, run, re.S | re.M)
+ )
+ )
+
+ right = (
+ # not preceded by whitespace
+ re.match(r'\S%s.*' % syntax_re, run, re.S)
+ and (
+ # either not preceded by punct
+ re.match(r'[\s\w]%s.*' % syntax_re, run, re.S)
+ # or preceded by punct and followed by whitespace or punct
+ or re.match(r'[^\s\w]%s(\s|[^\s\w]|$)' % syntax_re, run, re.S | re.M)
+ )
+ )
+
+ if not (left or right):
+ continue
+
+ if left and right:
+ if opens:
+ # if we have open tags prioritize closing them
+ left = False
+ else:
+ # if we don't, let's open a new one
+ right = False
+
+ if left:
+ opens.append(delim_run)
+ continue
+
+ # close. figure out how
+ if not opens:
+ tokens.append(delim_run.string[index: delim_run.end()])
+ index = delim_run.end()
+ continue
+
+ # get the opening run
+ open = opens.pop(-1)
+ # if the opening run was joined to a previous closing run (eg: **strong***em*)
+ # then re-use that previous closing run, but ignore the part that was used to
+ # close the previous emphasis
+ open_offset = unused_opens.pop(open, 0)
+ open_syntax = open.group(1)[open_offset:]
+ open_start = open.start() + open_offset
+
+ # add everything between last emphasis and this one
+ tokens.append(delim_run.string[index: open_start])
+ body = delim_run.string[open.end(): delim_run.start()]
+ if not all(
+ self.md._tag_is_closed(tag, body)
+ for tag in re.findall(rf'</?({self.md._span_tags})', body)
+ ):
+ tokens.append(delim_run.string[open_start: delim_run.end()])
+ index = delim_run.end()
+ continue
+
+ if len(open_syntax) > len(syntax):
+ opens.append(open)
+ unused_opens[open] = open_offset
+ opens.append(delim_run)
+ unused_opens[delim_run] = 0
+ continue
+
+ # calc what type of emphasis based on the lowest common
+ # length of the delimiter run
+ length = min(3, min(len(open_syntax), len(syntax)))
+ if length == 3:
+ tokens.append('<em><strong>')
+ tokens.append(body)
+ tokens.append('</strong></em>')
+ else:
+ tag = 'strong' if length == 2 else 'em'
+ # add any part of the open that we don't consume
+ # eg: **one*
+ tokens.append(open_syntax[:-length])
+ tokens.append(f'<{tag}>')
+ tokens.append(body)
+ tokens.append(f'</{tag}>')
+
+ # if our closing syntax is longer than our opening that
+ # means it's joined onto a previous emphasis
+ # eg: **strong***em*
+ # This means the current delim_run is not completely "spent".
+ # Mark this closing run as an opening run for the next em but
+ # record in `unused_opens` how many chars from the run we've
+ # already used
+ if len(syntax) > len(open_syntax):
+ opens.append(delim_run)
+ unused_opens[delim_run] = length
+ index = delim_run.start() + length
+ else:
+ tokens.append(delim_run.group(1)[length:])
+ index = delim_run.end()
+
+ if index < len(text):
+ tokens.append(text[index:])
+
+ text = ''.join(tokens)
+
+ return text
+
+
+ def test(self, text):
+ return text.count('*') > 1 or text.count('_') > 1
+
+
class _LinkProcessorExtraOpts(TypedDict, total=False):
'''Options for the `LinkProcessor` extra'''
tags: List[str]
@@ -3420,14 +3561,21 @@ def __init__(self, md: Markdown, options: Union[dict, bool, None]):
options.setdefault('allowed', True)
super().__init__(md, options)
+ escaped_hashes = '|'.join(md._escape_table.values())
+
self.middle_word_em_re = re.compile(
r'''
+ …
+ ''' % escaped_hashes, re.X
+ )
diff --git a/test/tm-cases/middle_word_em_escaped_char.html b/test/tm-cases/middle_word_em_escaped_char.html
new file mode 100644
--- /dev/null
+++ b/test/tm-cases/middle_word_em_escaped_char.html
@@ -0,0 +1 @@
+<p><em>x</em>/<em>y</em> and <em>x</em>\<em>y</em></p>
diff --git a/test/tm-cases/middle_word_em_escaped_char.opts b/test/tm-cases/middle_word_em_escaped_char.opts
new file mode 100644
index 00000000..f540dcd6
--- /dev/null
+++ b/test/tm-cases/middle_word_em_escaped_char.opts
@@ -0,0 +1 @@
+{'extras': {'middle-word-em': {'allowed': False}}}
diff --git a/test/tm-cases/middle_word_em_escaped_char.text b/test/tm-cases/middle_word_em_escaped_char.text
new file mode 100644
index 00000000..3548642d
--- /dev/null
+++ b/test/tm-cases/middle_word_em_escaped_char.text
@@ -0,0 +1 @@
+*x*/*y* and *x*\\*y*
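As a quick orientation for what this first pass of the processor is aiming at, here are a couple of GFM emphasis cases, as a sketch against markdown2's public API. The expected outputs quoted below are the GFM spec's, not asserted markdown2 output at this point in the series:

    import markdown2

    # nested emphasis: the inner strong is paired first, then the outer em
    print(markdown2.markdown('*foo **bar** baz*'))
    # per GFM: <p><em>foo <strong>bar</strong> baz</em></p>

    # a shared delimiter run: the `***` closes the strong and opens the em
    print(markdown2.markdown('**strong***em*'))
    # per GFM: <p><strong>strong</strong><em>em</em></p>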
From 6ade9ab62114e433b4d04fe771553b05e7825044 Mon Sep 17 00:00:00 2001
From: Crozzers
Date: Sun, 30 Nov 2025 22:04:22 +0000
Subject: [PATCH 2/9] Get closer to GFM compliance
---
lib/markdown2.py | 257 ++++++++++++++++++++++++++++-------------------
1 file changed, 154 insertions(+), 103 deletions(-)
diff --git a/lib/markdown2.py b/lib/markdown2.py
index 3dd19541..6cf5132c 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -2596,124 +2596,175 @@ class ItalicAndBoldProcessor2(Extra):
def run(self, text):
for em_type in '*_':
- opens = []
- unused_opens = {}
- tokens = []
- index = 0
-
- delim_runs = tuple(re.finditer(r'([%s]+)' % em_type, text))
- for delim_run in delim_runs:
- # first check if it is opening (left flanking)
- # or closing (right flanking) run
- run = delim_run.string[max(0, delim_run.start() - 1): delim_run.end() + 1]
- syntax = delim_run.group(1)
- syntax_re = syntax.replace('*', r'\*')
-
- left = (
- # not followed by whitespace
- re.match(r'.*%s\S' % syntax_re, run, re.S)
- and (
- # either not followed by punctuation
- re.match(r'.*%s[\s\w]' % syntax_re, run, re.S)
- # or followed by punct and preceded by punct/whitespace
- or re.match(r'(^|[\s\W])%s([^\s\w]|$)' % syntax_re, run, re.S | re.M)
+ nesting = True
+ while nesting:
+ nesting = False
+
+ opens = []
+ buffer = []
+ unused_opens = {}
+ tokens = []
+ index = 0
+
+ for delim_run in re.finditer(r'([%s]+)' % em_type, text):
+ # first check if it is opening (left flanking)
+ # or closing (right flanking) run
+ run = delim_run.string[max(0, delim_run.start() - 1): delim_run.end() + 1]
+ syntax = delim_run.group(1)
+ syntax_re = syntax.replace('*', r'\*')
+
+ left = (
+ # not followed by whitespace
+ re.match(r'.*%s\S' % syntax_re, run, re.S)
+ and (
+ # either not followed by punctuation
+ re.match(r'.*%s[\s\w]' % syntax_re, run, re.S)
+ # or followed by punct and preceded by punct/whitespace
+ or re.match(r'(^|[\s\W])%s([^\s\w]|$)' % syntax_re, run, re.S | re.M)
+ )
)
- )
- right = (
- # not preceded by whitespace
- re.match(r'\S%s.*' % syntax_re, run, re.S)
- and (
- # either not preceded by punct
- re.match(r'[\s\w]%s.*' % syntax_re, run, re.S)
- # or preceded by punct and followed by whitespace or punct
- or re.match(r'[^\s\w]%s(\s|[^\s\w]|$)' % syntax_re, run, re.S | re.M)
+ right = (
+ # not preceded by whitespace
+ re.match(r'\S%s.*' % syntax_re, run, re.S)
+ and (
+ # either not preceded by punct
+ re.match(r'[\s\w]%s.*' % syntax_re, run, re.S)
+ # or preceded by punct and followed by whitespace or punct
+ or re.match(r'[^\s\w]%s(\s|[^\s\w]|$)' % syntax_re, run, re.S | re.M)
+ )
)
- )
- if not (left or right):
- continue
+ if not (left or right):
+ continue
- if left and right:
- if opens:
- # if we have open tags prioritize closing them
- left = False
- else:
- # if we don't, let's open a new one
- right = False
+ if not right or not opens:
+ if left:
+ opens.append(delim_run)
+ continue
- if left:
- opens.append(delim_run)
- continue
+ syntax = delim_run.group(1)
+
+ open = opens.pop(-1)
+ # if the opening run was joined to a previous closing run (eg: **strong***em*)
+ # then re-use that previous closing run, but ignore the part that was used to
+ # close the previous emphasis
+ open_offset = unused_opens.pop(open, 0)
+ open_start = open.start() + open_offset
+ open_syntax = open.group(1)[open_offset:]
+
+ if open.start() < index:
+ # this happens with things like `*(**foo**)*`. We process LTR so the strong gets
+ # processed first (since that has the first closing delimiter). We now have
+ # `*(<strong>foo</strong>)*` and now we get round to processing the em.
+ # It's hard to compare the match (against the original text var) to the processed text
+ # so it's easier to just note down that nesting is detected and re-run the loop
+ nesting = True
+ continue
- # close. figure out how
- if not opens:
- tokens.append(delim_run.string[index: delim_run.end()])
- index = delim_run.end()
- continue
+ prev_open = None
+
+ if len(open_syntax) < len(syntax):
+ # if closing syntax is longer then maybe we can close multiple openers that are queued up
+ if opens:
+ prev_open = opens.pop(-1)
+ prev_open_offset = unused_opens.pop(open, 0)
+ prev_open_start = prev_open.start() + prev_open_offset
+ prev_open_syntax = prev_open.group(1)[prev_open_offset:]
+
+ # check the new expanded body doesn't cross span borders
+ if not all(
+ self.md._tag_is_closed(tag, delim_run.string[prev_open.end(): open.start()])
+ for tag in re.findall(
+ rf'</?({self.md._span_tags})',
+ delim_run.string[prev_open.end(): open.start()]
+ )
+ ):
+ opens.append(prev_open)
+ prev_open = None
+ else:
+ unused_opens[open] = open_offset
+ opens.append(open)
+ unused_opens[delim_run] = 0
+ opens.append(delim_run)
+ continue
+ elif len(open_syntax) > len(syntax):
+ # if the opening syntax is bigger, then this close won't close all of it.
+ # Queue both up for later processing
+ opens.append(open)
+ unused_opens[open] = open_offset
+ if left:
+ opens.append(delim_run)
+ unused_opens[delim_run] = 0
+ continue
- # get the opening run
- open = opens.pop(-1)
- # if the opening run was joined to a previous closing run (eg: **strong***em*)
- # then re-use that previous closing run, but ignore the part that was used to
- # close the previous emphasis
- open_offset = unused_opens.pop(open, 0)
- open_syntax = open.group(1)[open_offset:]
- open_start = open.start() + open_offset
-
- # add everything between last emphasis and this one
- tokens.append(delim_run.string[index: open_start])
- body = delim_run.string[open.end(): delim_run.start()]
- if not all(
- self.md._tag_is_closed(tag, body)
- for tag in re.findall(rf'</?({self.md._span_tags})', body)
- ):
- tokens.append(delim_run.string[open_start: delim_run.end()])
- index = delim_run.end()
- continue
+ body = delim_run.string[open.end(): delim_run.start()]
- if len(open_syntax) > len(syntax):
- opens.append(open)
- unused_opens[open] = open_offset
- opens.append(delim_run)
- unused_opens[delim_run] = 0
- continue
+ # ensure the body does not cross span borders
+ if not all(
+ self.md._tag_is_closed(tag, body)
+ for tag in re.findall(rf'</?({self.md._span_tags})', body)
+ ):
+ continue
- # calc what type of emphasis based on the lowest common
- # length of the delimiter run
- length = min(3, min(len(open_syntax), len(syntax)))
- if length == 3:
- tokens.append('<em><strong>')
- tokens.append(body)
- tokens.append('</strong></em>')
- else:
- tag = 'strong' if length == 2 else 'em'
+ # put all the new processing in a buffer array that gets added to `tokens` anyway.
+ # Not the most efficient but it's convenient having a separate list of everything
+ # processed and added in the previous iteration
+ buffer = []
+
+ # add all the text leading up to the opening delimiter
+ buffer.append(delim_run.string[index: prev_open_start if prev_open else open_start])
+
+ # calc what type of emphasis based on the lowest common
+ # length of the delimiter run
+ length = min(3, min(len(open_syntax), len(syntax)))
# add any part of the open that we don't consume
# eg: **one*
- tokens.append(open_syntax[:-length])
- tokens.append(f'<{tag}>')
- tokens.append(body)
- tokens.append(f'</{tag}>')
-
- # if our closing syntax is longer than our opening that
- # means it's joined onto a previous emphasis
- # eg: **strong***em*
- # This means the current delim_run is not completely "spent".
- # Mark this closing run as an opening run for the next em but
- # record in `unused_opens` how many chars from the run we've
- # already used
- if len(syntax) > len(open_syntax):
- opens.append(delim_run)
- unused_opens[delim_run] = length
- index = delim_run.start() + length
- else:
- tokens.append(delim_run.group(1)[length:])
+ buffer.append(open_syntax[:-length])
+ if length == 3:
+ buffer.append('<em><strong>')
+ buffer.append(body)
+ buffer.append('</strong></em>')
+ else:
+ tag = 'strong' if length == 2 else 'em'
+ # prev_open is defined if this closing syntax is closing multiple openers at once
+ if prev_open:
+ if len(prev_open_syntax) == 3:
+ prev_tag = 'strong' if tag == 'em' else 'em'
+ else:
+ prev_tag = 'strong' if len(prev_open_syntax) == 2 else 'em'
+ buffer.append(f'<{prev_tag}>')
+
+ if len(prev_open_syntax) == 3:
+ buffer.append(f'<{tag}>')
+
+ buffer.append(delim_run.string[prev_open.end(): open.start()])
+
+ if len(prev_open_syntax) == 3:
+ buffer.append(f'</{tag}>')
+ else:
+ buffer.append(f'<{tag}>')
+
+ buffer.append(body)
+
+ if len(prev_open_syntax) != 3:
+ buffer.append(f'</{tag}>')
+ buffer.append(f'</{prev_tag}>')
+ else:
+ buffer.append(f'<{tag}>')
+ buffer.append(body)
+ buffer.append(f'</{tag}>')
+
+ # If both syntaxes are equal length then that's easy. Remove the open run as it's fully
+ # processed and consumed, and move on
index = delim_run.end()
- if index < len(text):
- tokens.append(text[index:])
+ tokens.extend(buffer)
+
+ if index < len(text):
+ tokens.append(text[index:])
- text = ''.join(tokens)
+ text = ''.join(tokens)
return text
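The `nesting` re-run added here exists for spans that fully enclose an already-processed span, like the `*(**foo**)*` case from the comments. A sketch of the intended two-pass result (expected output per the GFM spec, illustrative only):

    import markdown2

    # pass 1 pairs the inner strong (its closing delimiter comes first),
    # pass 2 re-scans and pairs the now-outermost em delimiters
    print(markdown2.markdown('*(**foo**)*'))
    # per GFM: <p><em>(<strong>foo</strong>)</em></p>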
From b3e512de1925ab5f53597072d82b289a0e050e87 Mon Sep 17 00:00:00 2001
From: Crozzers
Date: Fri, 5 Dec 2025 21:58:12 +0000
Subject: [PATCH 3/9] Iron out some GFM edge cases
---
lib/markdown2.py | 291 ++++++++++++++++++++++++++++-------------------
1 file changed, 174 insertions(+), 117 deletions(-)
diff --git a/lib/markdown2.py b/lib/markdown2.py
index 6cf5132c..22e51961 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -2601,39 +2601,18 @@ def run(self, text):
nesting = False
opens = []
- buffer = []
unused_opens = {}
+ unused_closes = []
tokens = []
index = 0
- for delim_run in re.finditer(r'([%s]+)' % em_type, text):
- # first check if it is opening (left flanking)
- # or closing (right flanking) run
- run = delim_run.string[max(0, delim_run.start() - 1): delim_run.end() + 1]
- syntax = delim_run.group(1)
- syntax_re = syntax.replace('*', r'\*')
-
- left = (
- # not followed by whitespace
- re.match(r'.*%s\S' % syntax_re, run, re.S)
- and (
- # either not followed by punctuation
- re.match(r'.*%s[\s\w]' % syntax_re, run, re.S)
- # or followed by punct and preceded by punct/whitespace
- or re.match(r'(^|[\s\W])%s([^\s\w]|$)' % syntax_re, run, re.S | re.M)
- )
- )
+ delim_runs = {
+ delim_run: self.delimiter_left_or_right(delim_run)
+ for delim_run in re.finditer(r'([%s]+)' % em_type, text)
+ }
- right = (
- # not preceded by whitespace
- re.match(r'\S%s.*' % syntax_re, run, re.S)
- and (
- # either not preceded by punct
- re.match(r'[\s\w]%s.*' % syntax_re, run, re.S)
- # or preceded by punct and followed by whitespace or punct
- or re.match(r'[^\s\w]%s(\s|[^\s\w]|$)' % syntax_re, run, re.S | re.M)
- )
- )
+ for delim_run, (left, right) in delim_runs.items():
+ syntax = delim_run.group(1)
if not (left or right):
continue
@@ -2662,112 +2641,190 @@ def run(self, text):
nesting = True
continue
- prev_open = None
-
- if len(open_syntax) < len(syntax):
- # if closing syntax is longer then maybe we can close multiple openers that are queued up
- if opens:
- prev_open = opens.pop(-1)
- prev_open_offset = unused_opens.pop(open, 0)
- prev_open_start = prev_open.start() + prev_open_offset
- prev_open_syntax = prev_open.group(1)[prev_open_offset:]
-
- # check the new expanded body doesn't cross span borders
- if not all(
- self.md._tag_is_closed(tag, delim_run.string[prev_open.end(): open.start()])
- for tag in re.findall(
- rf'</?({self.md._span_tags})',
- delim_run.string[prev_open.end(): open.start()]
- )
- ):
- opens.append(prev_open)
- prev_open = None
+ middle = None
+
+ if len(open_syntax) != len(syntax):
+ if len(open_syntax) < len(syntax) and opens:
+ # since we are detecting a previous open, we are expanding the em span to the left
+ # so we should check if we're covering additional chars that we don't cross an
+ # existing span border
+ if not self.body_crosses_span_borders(opens[-1], open):
+ middle = open
+
+ open = opens.pop(-1)
+ open_offset = unused_opens.pop(open, 0)
+ open_start = open.start() + open_offset
+ elif len(open_syntax) > len(syntax) and unused_closes:
+ # check if there is a previous closing delim run in the current body
+ # since this is already within the body we don't need to do a cross-span border check
+ # as we're not expanding into new ground and that is covered later
+ middle = next((i for i in unused_closes if open.end() < i.start() < delim_run.start()), None)
else:
- unused_opens[open] = open_offset
- opens.append(open)
- unused_opens[delim_run] = 0
- opens.append(delim_run)
- continue
- elif len(open_syntax) > len(syntax):
- # if the opening syntax is bigger than this close won't close all of it.
- # Queue both up for later processing
- opens.append(open)
- unused_opens[open] = open_offset
- if left:
- opens.append(delim_run)
- unused_opens[delim_run] = 0
- continue
-
- body = delim_run.string[open.end(): delim_run.start()]
+ try:
+ next_delim_run = tuple(delim_runs.keys())[tuple(delim_runs.keys()).index(delim_run) + 1]
+ except IndexError:
+ next_delim_run = None
+
+ if next_delim_run is None:
+ # if there is no follow up delimiter run then no point leaving this unused. Process now
+ pass
+ elif len(open_syntax) < len(syntax) and (
+ # if this run can be an opener, but the next run won't close both of them
+ (left and not delim_runs[next_delim_run][1])
+ # if the next run is not an opener and won't consume this run
+ and not delim_runs[next_delim_run][0]
+ ):
+ pass
+ elif len(open_syntax) > len(syntax) and (
+ # if this run can be a closer, but the next run is not a fresh opener
+ (right and not delim_runs[next_delim_run][0])
+ # if the next run is not a closer
+ and not delim_runs[next_delim_run][1]
+ ):
+ pass
+ elif delim_runs[next_delim_run][1] and len(open_syntax) == len(next_delim_run.group(1)):
+ # if the next run is a closer and matches the length of the opener then that is probably
+ # a better closer than this run - eg: **foo*bar** or *foo**bar*
+ opens.append(open)
+ continue
+ else:
+ # if there are no unused opens or closes to use up then this is just imbalanced
+ # mark as unused and leave for later processing
+ unused_opens[open] = open_offset
+ opens.append(open)
+ if left:
+ unused_opens[delim_run] = 0
+ opens.append(delim_run)
+ else:
+ unused_closes.append(delim_run)
+ continue
# ensure the body does not cross span borders
- if not all(
- self.md._tag_is_closed(tag, body)
- for tag in re.findall(rf'</?({self.md._span_tags})', body)
- ):
+ if self.body_crosses_span_borders(open, delim_run):
continue
- # put all the new processing in a buffer array that gets added to `tokens` anyway.
- # Not the most efficient but it's convenient having a separate list of everything
- # processed and added in the previous iteration
- buffer = []
-
# add all the text leading up to the opening delimiter
- buffer.append(delim_run.string[index: prev_open_start if prev_open else open_start])
-
- # calc what type of emphasis based on the lowest common
- # length of the delimiter run
- length = min(3, min(len(open_syntax), len(syntax)))
- # add any part of the open that we don't consume
- # eg: **one*
- buffer.append(open_syntax[:-length])
- if length == 3:
- buffer.append('<em><strong>')
- buffer.append(body)
- buffer.append('</strong></em>')
- else:
- tag = 'strong' if length == 2 else 'em'
- # prev_open is defined if this closing syntax is closing multiple openers at once
- if prev_open:
- if len(prev_open_syntax) == 3:
- prev_tag = 'strong' if tag == 'em' else 'em'
- else:
- prev_tag = 'strong' if len(prev_open_syntax) == 2 else 'em'
- buffer.append(f'<{prev_tag}>')
+ tokens.append(delim_run.string[index: open_start])
- if len(prev_open_syntax) == 3:
- buffer.append(f'<{tag}>')
+ span, close_syntax_used_chars = self.process_span(open, delim_run, open_offset, middle)
+ tokens.extend(span)
+ if close_syntax_used_chars < len(syntax):
+ # if we didn't use up the entire closing delimiter mark it as unused
+ unused_opens[delim_run] = close_syntax_used_chars
+ opens.append(delim_run)
- buffer.append(delim_run.string[prev_open.end(): open.start()])
+ # Move index to end of the used delim run
+ index = delim_run.start() + close_syntax_used_chars
- if len(prev_open_syntax) == 3:
- buffer.append(f'</{tag}>')
- else:
- buffer.append(f'<{tag}>')
+ if index < len(text):
+ tokens.append(text[index:])
+
+ text = ''.join(tokens)
- buffer.append(body)
+ return text
- if len(prev_open_syntax) != 3:
- buffer.append(f'</{tag}>')
- buffer.append(f'</{prev_tag}>')
- else:
- buffer.append(f'<{tag}>')
- buffer.append(body)
- buffer.append(f'</{tag}>')
+ def process_span(
+ self, open: re.Match, close: re.Match,
+ offset: int, middle: Optional[re.Match] = None
+ ):
+ '''
+ Args:
+ open: the match against the opening delimiter run
+ close: the match against the closing delimiter run
+ offset: the number of chars from the opening delimiter that should be skipped when processing
+ middle: an optional delimiter run in the middle of the span
+ '''
+ tokens = []
- # If both syntaxes are equal length then that's easy. Remove the open run as it's fully
- # processed and consumed, and move on
- index = delim_run.end()
+ open_syntax = open.group(1)[offset:]
+ middle_syntax = middle.group(1) if middle else ''
+ close_syntax = close.group(1)
- tokens.extend(buffer)
+ # calculate what em type the inner and outer emphasis is
+ outer_syntax_length = min(3, min(len(open_syntax), len(close_syntax)))
+ inner_syntax_length = min(max(len(open_syntax), len(close_syntax)), len(middle_syntax)) if middle else 0
+ # add anything from the opening syntax that will not be consumed
+ # eg: **one*
+ tokens.append(open_syntax[:-(outer_syntax_length + inner_syntax_length)])
- if index < len(text):
- tokens.append(text[index:])
+ if outer_syntax_length == 3:
+ tokens.append('<em><strong>')
+ else:
+ tokens.append(f'<{"strong" if outer_syntax_length == 2 else "em"}>')
- text = ''.join(tokens)
+ if middle:
+ # outer_tag = 'strong' if outer_syntax_length == 2 else 'em'
- return text
+ # if there is a middle em (eg: ***abc*def**) then do some wrangling to figure
+ # out where to put the opening/closing inner tags depending on the size of the
+ # opening delim run
+ inner_tag = 'strong' if len(middle_syntax) == 2 else 'em'
+ if len(open_syntax) > len(close_syntax):
+ tokens.append(f'<{inner_tag}>')
+
+ tokens.append(close.string[open.end(): middle.start()])
+
+ if len(open_syntax) > len(close_syntax):
+ tokens.append(f'</{inner_tag}>')
+ else:
+ tokens.append(f'<{inner_tag}>')
+
+ tokens.append(close.string[middle.end(): close.start()])
+ if len(open_syntax) < len(close_syntax):
+ tokens.append(f'</{inner_tag}>')
+ else:
+ # if no middle em then it's easy. Just add the whole text body
+ tokens.append(close.string[open.end(): close.start()])
+
+ if outer_syntax_length == 3:
+ tokens.append('</strong></em>')
+ else:
+ tokens.append(f'{"strong" if outer_syntax_length == 2 else "em"}>')
+
+ # figure out how many chars from the closing delimiter we've actually used
+ close_delim_chars_used = outer_syntax_length
+ if middle and len(open_syntax) < len(close_syntax):
+ # if there's a middle part and it's right-aligned then add that on
+ close_delim_chars_used += inner_syntax_length
+
+ return tokens, close_delim_chars_used
+
+ def delimiter_left_or_right(self, delim_run: re.Match):
+ run = delim_run.string[max(0, delim_run.start() - 1): delim_run.end() + 1]
+ syntax = delim_run.group(1)
+ syntax_re = syntax.replace('*', r'\*')
+
+ left = (
+ # not followed by whitespace
+ re.match(r'.*%s\S' % syntax_re, run, re.S)
+ and (
+ # either not followed by punctuation
+ re.match(r'.*%s[\s\w]' % syntax_re, run, re.S)
+ # or followed by punct and preceded by punct/whitespace
+ or re.match(r'(^|[\s\W])%s([^\s\w]|$)' % syntax_re, run, re.S | re.M)
+ )
+ )
+
+ right = (
+ # not preceded by whitespace
+ re.match(r'\S%s.*' % syntax_re, run, re.S)
+ and (
+ # either not preceded by punct
+ re.match(r'[\s\w]%s.*' % syntax_re, run, re.S)
+ # or preceded by punct and followed by whitespace or punct
+ or re.match(r'[^\s\w]%s(\s|[^\s\w]|$)' % syntax_re, run, re.S | re.M)
+ )
+ )
+
+ return left, right
+
+ def body_crosses_span_borders(self, open: re.Match, close: re.Match):
+ for tag in re.findall(rf'</?({self.md._span_tags})', open.string[open.end(): close.start()]):
+ if not self.md._tag_is_closed(tag, open.string[open.end(): close.start()]):
+ return True
+
+ return False
def test(self, text):
return text.count('*') > 1 or text.count('_') > 1
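The `delimiter_left_or_right` helper extracted in this patch encodes the CommonMark flanking rules via regexes. For clarity, the same rules as a standalone sketch, using `\w` as a stand-in for "not punctuation" just as the helper does (the spec proper uses Unicode categories):

    import re

    def flanking(text: str, start: int, end: int):
        # chars around the delimiter run; start/end of text count as whitespace
        before = text[start - 1] if start > 0 else ' '
        after = text[end] if end < len(text) else ' '

        def is_punct(ch: str) -> bool:
            return not ch.isspace() and not re.match(r'\w', ch)

        # left-flanking: not followed by whitespace, and either not followed
        # by punctuation, or preceded by whitespace/punctuation
        left = not after.isspace() and (
            not is_punct(after) or before.isspace() or is_punct(before)
        )
        # right-flanking is the mirror image
        right = not before.isspace() and (
            not is_punct(before) or after.isspace() or is_punct(after)
        )
        return left, right

    print(flanking('a*"foo"*', 1, 2))  # (False, True): can close, cannot open
    print(flanking('foo*bar*', 3, 4))  # (True, True): both-flanking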
From 366ad8cb32687c95d87aaac19364e27b6f92c243 Mon Sep 17 00:00:00 2001
From: Crozzers
Date: Fri, 5 Dec 2025 22:29:18 +0000
Subject: [PATCH 4/9] Achieve near full GFM compliance on iab
---
lib/markdown2.py | 228 ++++++++++++++++++++++++-----------------------
1 file changed, 115 insertions(+), 113 deletions(-)
diff --git a/lib/markdown2.py b/lib/markdown2.py
index 22e51961..9099a693 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -2595,131 +2595,133 @@ class ItalicAndBoldProcessor2(Extra):
order = (Stage.ITALIC_AND_BOLD,), tuple()
def run(self, text):
- for em_type in '*_':
- nesting = True
- while nesting:
- nesting = False
-
- opens = []
- unused_opens = {}
- unused_closes = []
- tokens = []
- index = 0
-
- delim_runs = {
- delim_run: self.delimiter_left_or_right(delim_run)
- for delim_run in re.finditer(r'([%s]+)' % em_type, text)
- }
+ nesting = True
+ while nesting:
+ nesting = False
+
+ opens = {'*': [], '_': []}
+ unused_opens = {'*': {}, '_': {}}
+ unused_closes = {'*': [], '_': []}
+ tokens = []
+ index = 0
+
+ delim_runs = {
+ delim_run: self.delimiter_left_or_right(delim_run)
+ for delim_run in re.finditer(r'(\*+|_+)', text)
+ }
+
+ for delim_run, (left, right) in delim_runs.items():
+ syntax = delim_run.group(1)
+ em_type = syntax[0]
+
+ if not (left or right):
+ continue
- for delim_run, (left, right) in delim_runs.items():
- syntax = delim_run.group(1)
+ if not right or not opens[em_type]:
+ if left:
+ opens[em_type].append(delim_run)
+ continue
- if not (left or right):
- continue
+ syntax = delim_run.group(1)
- if not right or not opens:
- if left:
- opens.append(delim_run)
- continue
+ # grab the open run. If it crosses a span, keep looking backwards
+ while opens[em_type] and self.body_crosses_span_borders(opens[em_type][-1], delim_run):
+ opens[em_type].pop(-1)
+ if not opens[em_type]:
+ continue
+ open = opens[em_type].pop(-1)
+
+ # if the opening run was joined to a previous closing run (eg: **strong***em*)
+ # then re-use that previous closing run, but ignore the part that was used to
+ # close the previous emphasis
+ open_offset = unused_opens[em_type].pop(open, 0)
+ open_start = open.start() + open_offset
+ open_syntax = open.group(1)[open_offset:]
+
+ if open.start() < index:
+ # this happens with things like `*(**foo**)*`. We process LTR so the strong gets
+ # processed first (since that has the first closing delimiter). We now have
+ # `*(<strong>foo</strong>)*` and now we get round to processing the em.
+ # It's hard to compare the match (against the original text var) to the processed text
+ # so it's easier to just note down that nesting is detected and re-run the loop
+ nesting = True
+ continue
- syntax = delim_run.group(1)
-
- open = opens.pop(-1)
- # if the opening run was joined to a previous closing run (eg: **strong***em*)
- # then re-use that previous closing run, but ignore the part that was used to
- # close the previous emphasis
- open_offset = unused_opens.pop(open, 0)
- open_start = open.start() + open_offset
- open_syntax = open.group(1)[open_offset:]
-
- if open.start() < index:
- # this happens with things like `*(**foo**)*`. We process LTR so the strong gets
- # processed first (since that has the first closing delimiter). We now have
- # `*(<strong>foo</strong>)*` and now we get round to processing the em.
- # It's hard to compare the match (against the original text var) to the processed text
- # so it's easier to just note down that nesting is detected and re-run the loop
- nesting = True
- continue
+ middle = None
+
+ if len(open_syntax) != len(syntax):
+ if len(open_syntax) < len(syntax) and opens[em_type]:
+ # since we are detecting a previous open, we are expanding the em span to the left
+ # so we should check if we're covering additional chars that we don't cross an
+ # existing span border
+ if not self.body_crosses_span_borders(opens[em_type][-1], open):
+ middle = open
+
+ open = opens[em_type].pop(-1)
+ open_offset = unused_opens[em_type].pop(open, 0)
+ open_start = open.start() + open_offset
+ elif len(open_syntax) > len(syntax) and unused_closes[em_type]:
+ # check if there is a previous closing delim run in the current body
+ # since this is already within the body we don't need to do a cross-span border check
+ # as we're not expanding into new ground and that is covered later
+ middle = next((i for i in unused_closes[em_type] if open.end() < i.start() < delim_run.start()), None)
+ else:
+ try:
+ next_delim_run = tuple(delim_runs.keys())[tuple(delim_runs.keys()).index(delim_run) + 1]
+ except IndexError:
+ next_delim_run = None
- middle = None
-
- if len(open_syntax) != len(syntax):
- if len(open_syntax) < len(syntax) and opens:
- # since we are detecting a previous open, we are expanding the em span to the left
- # so we should check if we're covering additional chars that we don't cross an
- # existing span border
- if not self.body_crosses_span_borders(opens[-1], open):
- middle = open
-
- open = opens.pop(-1)
- open_offset = unused_opens.pop(open, 0)
- open_start = open.start() + open_offset
- elif len(open_syntax) > len(syntax) and unused_closes:
- # check if there is a previous closing delim run in the current body
- # since this is already within the body we don't need to do a cross-span border check
- # as we're not expanding into new ground and that is covered later
- middle = next((i for i in unused_closes if open.end() < i.start() < delim_run.start()), None)
+ if next_delim_run is None:
+ # if there is no follow up delimiter run then no point leaving this unused. Process now
+ pass
+ elif len(open_syntax) < len(syntax) and (
+ # if this run can be an opener, but the next run won't close both of them
+ (left and not delim_runs[next_delim_run][1])
+ # if the next run is not an opener and won't consume this run
+ and not delim_runs[next_delim_run][0]
+ ):
+ pass
+ elif len(open_syntax) > len(syntax) and (
+ # if this run can be a closer, but the next run is not a fresh opener
+ (right and not delim_runs[next_delim_run][0])
+ # if the next run is not a closer
+ and not delim_runs[next_delim_run][1]
+ ):
+ pass
+ elif delim_runs[next_delim_run][1] and len(open_syntax) == len(next_delim_run.group(1)):
+ # if the next run is a closer and matches the length of the opener then that is probably
+ # a better closer than this run - eg: **foo*bar** or *foo**bar*
+ opens[em_type].append(open)
+ continue
else:
- try:
- next_delim_run = tuple(delim_runs.keys())[tuple(delim_runs.keys()).index(delim_run) + 1]
- except IndexError:
- next_delim_run = None
-
- if next_delim_run is None:
- # if there is no follow up delimiter run then no point leaving this unused. Process now
- pass
- elif len(open_syntax) < len(syntax) and (
- # if this run can be an opener, but the next run won't close both of them
- (left and not delim_runs[next_delim_run][1])
- # if the next run is not an opener and won't consume this run
- and not delim_runs[next_delim_run][0]
- ):
- pass
- elif len(open_syntax) > len(syntax) and (
- # if this run can be a closer, but the next run is not a fresh opener
- (right and not delim_runs[next_delim_run][0])
- # if the next run is not a closer
- and not delim_runs[next_delim_run][1]
- ):
- pass
- elif delim_runs[next_delim_run][1] and len(open_syntax) == len(next_delim_run.group(1)):
- # if the next run is a closer and matches the length of the opener then that is probably
- # a better closer than this run - eg: **foo*bar** or *foo**bar*
- opens.append(open)
- continue
+ # if there are no unused opens or closes to use up then this is just imbalanced
+ # mark as unused and leave for later processing
+ unused_opens[em_type][open] = open_offset
+ opens[em_type].append(open)
+ if left:
+ unused_opens[em_type][delim_run] = 0
+ opens[em_type].append(delim_run)
else:
- # if there are no unused opens or closes to use up then this is just imbalanced
- # mark as unused and leave for later processing
- unused_opens[open] = open_offset
- opens.append(open)
- if left:
- unused_opens[delim_run] = 0
- opens.append(delim_run)
- else:
- unused_closes.append(delim_run)
- continue
-
- # ensure the body does not cross span borders
- if self.body_crosses_span_borders(open, delim_run):
- continue
+ unused_closes[em_type].append(delim_run)
+ continue
- # add all the text leading up to the opening delimiter
- tokens.append(delim_run.string[index: open_start])
+ # add all the text leading up to the opening delimiter
+ tokens.append(delim_run.string[index: open_start])
- span, close_syntax_used_chars = self.process_span(open, delim_run, open_offset, middle)
- tokens.extend(span)
- if close_syntax_used_chars < len(syntax):
- # if we didn't use up the entire closing delimiter mark it as unused
- unused_opens[delim_run] = close_syntax_used_chars
- opens.append(delim_run)
+ span, close_syntax_used_chars = self.process_span(open, delim_run, open_offset, middle)
+ tokens.extend(span)
+ if close_syntax_used_chars < len(syntax):
+ # if we didn't use up the entire closing delimiter mark it as unused
+ unused_opens[em_type][delim_run] = close_syntax_used_chars
+ opens[em_type].append(delim_run)
- # Move index to end of the used delim run
- index = delim_run.start() + close_syntax_used_chars
+ # Move index to end of the used delim run
+ index = delim_run.start() + close_syntax_used_chars
- if index < len(text):
- tokens.append(text[index:])
+ if index < len(text):
+ tokens.append(text[index:])
- text = ''.join(tokens)
+ text = ''.join(tokens)
return text
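Keying the `opens`/`unused_opens`/`unused_closes` state by em type is what stops `*` and `_` runs from pairing with each other. An illustrative case (expected output per the GFM spec; the same input appears in the fixture added by the next patch):

    import markdown2

    # the `_` opener cannot be closed by the `*` run, so only the `*` pair matches
    print(markdown2.markdown('*foo _bar* baz_'))
    # per GFM: <p><em>foo _bar</em> baz_</p>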
From e8e7ced7feea5d2cea53f4ffc19aa7d0dda6c6b6 Mon Sep 17 00:00:00 2001
From: Crozzers
Date: Sun, 7 Dec 2025 16:01:27 +0000
Subject: [PATCH 5/9] Achieve near complete GFM compliance
---
lib/markdown2.py | 69 +++-----
test/tm-cases/gfm_emphasis.html | 261 ++++++++++++++++++++++++++++
test/tm-cases/gfm_emphasis.text | 260 +++++++++++++++++++++++++++
test/tm-cases/hash_html_blocks.html | 3 -
4 files changed, 544 insertions(+), 49 deletions(-)
create mode 100644 test/tm-cases/gfm_emphasis.html
create mode 100644 test/tm-cases/gfm_emphasis.text
diff --git a/lib/markdown2.py b/lib/markdown2.py
index 9099a693..95d5a405 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -2046,35 +2046,7 @@ def _encode_code(self, text: str) -> str:
@mark_stage(Stage.ITALIC_AND_BOLD)
def _do_italics_and_bold(self, text: str) -> str:
- def sub(match: re.Match):
- '''
- regex sub function that checks that the match isn't matching across spans.
- The span shouldn't be across a closing or opening HTML tag, although spans within
- the span is acceptable.
- '''
- contents: str = match.group(2)
- # the strong re also checks for leading em chars, so the match may cover some additional text
- prefix = match.string[match.start(): match.regs[1][0]]
- # look for all possible span HTML tags
- for tag in re.findall(rf'</?({self._span_tags})', contents):
- # if it's unbalanced then that violates the rules
- if not self._tag_is_closed(tag, contents):
- return prefix + match.group(1) + contents + match.group(1)
-
- # if it is balanced, but the closing tag is before the opening then
- # the text probably looks like `_abc</span>def<span>_`, which is across 2 spans
- close_index = contents.find(f'</{tag}')
- open_index = contents.find(f'<{tag}')
- if close_index != -1 and close_index < open_index:
- return prefix + match.group(1) + contents + match.group(1)
-
- syntax = 'strong' if len(match.group(1)) == 2 else 'em'
- return f'{prefix}<{syntax}>{contents}</{syntax}>'
-
- # must go first:
- # text = self._strong_re.sub(sub, text)
- # text = self._em_re.sub(sub, text)
- iab = ItalicAndBoldProcessor2(self, None)
+ iab = GFMItalicAndBoldProcessor(self, None)
if iab.test(text):
text = iab.run(text)
return text
@@ -2590,8 +2562,8 @@ def test(self, text):
return self.hash_table and re.search(r'md5-[0-9a-z]{32}', text)
-class ItalicAndBoldProcessor2(Extra):
- name = 'iabp-2'
+class GFMItalicAndBoldProcessor(Extra):
+ name = 'gfm-italic-and-bold-processor'
order = (Stage.ITALIC_AND_BOLD,), tuple()
def run(self, text):
@@ -2649,6 +2621,7 @@ def run(self, text):
middle = None
+ # if the delimiter runs don't match then we need to figure out how to resolve this
if len(open_syntax) != len(syntax):
if len(open_syntax) < len(syntax) and opens[em_type]:
# since we are detecting a previous open, we are expanding the em span to the left
@@ -2659,7 +2632,12 @@ def run(self, text):
open = opens[em_type].pop(-1)
open_offset = unused_opens[em_type].pop(open, 0)
+ open_syntax = open.group(1)[open_offset:]
open_start = open.start() + open_offset
+
+ if len(open_syntax) == len(syntax):
+ # if it turns out the previous open is a perfect match then ignore the middle part
+ middle = None
elif len(open_syntax) > len(syntax) and unused_closes[em_type]:
# check if there is a previous closing delim run in the current body
# since this is already within the body we don't need to do a cross-span border check
@@ -2676,7 +2654,10 @@ def run(self, text):
pass
elif len(open_syntax) < len(syntax) and (
# if this run can be an opener, but the next run won't close both of them
- (left and not delim_runs[next_delim_run][1])
+ (left and (
+ not delim_runs[next_delim_run][1]
+ or len(next_delim_run.group(1)) < len(open_syntax) + len(syntax)
+ ))
# if the next run is not an opener and won't consume this run
and not delim_runs[next_delim_run][0]
):
@@ -2688,11 +2669,10 @@ def run(self, text):
and not delim_runs[next_delim_run][1]
):
pass
- elif delim_runs[next_delim_run][1] and len(open_syntax) == len(next_delim_run.group(1)):
- # if the next run is a closer and matches the length of the opener then that is probably
- # a better closer than this run - eg: **foo*bar** or *foo**bar*
- opens[em_type].append(open)
- continue
+ elif len(open_syntax) < len(syntax) and len(syntax) >= 3:
+ # if closing syntax is bigger and it's >= three long then focus on closing any
+ # open em spans
+ pass
else:
# if there are no unused opens or closes to use up then this is just imbalanced
# mark as unused and leave for later processing
@@ -2743,16 +2723,16 @@ def process_span(
close_syntax = close.group(1)
# calculate what em type the inner and outer emphasis is
- outer_syntax_length = min(3, min(len(open_syntax), len(close_syntax)))
+ outer_syntax_length = min(len(open_syntax), len(close_syntax))
inner_syntax_length = min(max(len(open_syntax), len(close_syntax)), len(middle_syntax)) if middle else 0
# add anything from the opening syntax that will not be consumed
# eg: **one*
tokens.append(open_syntax[:-(outer_syntax_length + inner_syntax_length)])
- if outer_syntax_length == 3:
- tokens.append('<em><strong>')
- else:
- tokens.append(f'<{"strong" if outer_syntax_length == 2 else "em"}>')
+ tags = []
+ tags += ['<em>'] * (outer_syntax_length % 2)
+ tags += ['<strong>'] * (outer_syntax_length // 2)
+ tokens.append(''.join(tags))
if middle:
# outer_tag = 'strong' if outer_syntax_length == 2 else 'em'
@@ -2779,10 +2759,7 @@ def process_span(
# if no middle em then it's easy. Just add the whole text body
tokens.append(close.string[open.end(): close.start()])
- if outer_syntax_length == 3:
- tokens.append('</strong></em>')
- else:
- tokens.append(f'{"strong" if outer_syntax_length == 2 else "em"}>')
+ tokens.append(''.join(reversed(tags)).replace('<', '</'))
# figure out how many chars from the closing delimiter we've actually used
close_delim_chars_used = outer_syntax_length
diff --git a/test/tm-cases/gfm_emphasis.html b/test/tm-cases/gfm_emphasis.html
new file mode 100644
index 00000000..9078f0f9
--- /dev/null
+++ b/test/tm-cases/gfm_emphasis.html
@@ -0,0 +1,261 @@
+foo bar
+
+a * foo bar*
+
+a*"foo"*
+
+
+
+foobar
+
+5678
+
+пристанямстремятся
+
+aa_"bb"_cc
+
+foo-(bar)
+
+_foo*
+
+*foo bar *
+
+*foo bar
+*
+
+*(*foo)
+
+(foo)
+
+foobar
+
+_foo bar _
+
+_(_foo)
+
+(foo)
+
+foobar
+
+пристанямстремятся
+
+foobarbaz
+
+(bar).
+
+foo bar
+
+** foo bar**
+
+a**"foo"**
+
+foobar
+
+foo bar
+
+__ foo bar__
+
+__
+foo bar__
+
+a__"foo"__
+
+foobar
+
+5678
+
+пристанямстремятся
+
+foo, bar, baz
+
+foo-(bar)
+
+**foo bar **
+
+**(**foo)
+
+(foo)
+
+Gomphocarpus (Gomphocarpus physocarpus, syn.
+Asclepias physocarpa)
+
+foo "bar" foo
+
+foobar
+
+__foo bar __
+
+__(__foo)
+
+(foo)
+
+foobar
+
+пристанямстремятся
+
+foobarbaz
+
+(bar).
+
+foo bar
+
+foo
+bar
+
+foo bar baz
+
+foo bar baz
+
+foo bar
+
+foo bar
+
+foo bar baz
+
+foobarbaz
+
+foobar
+
+foo bar
+
+foo bar
+
+foobar
+
+foobarbaz
+
+foobar***baz
+
+foo bar baz bim bop
+
+foo bar
+
+** is not an empty emphasis
+
+**** is not an empty strong emphasis
+
+foo bar
+
+foo
+bar
+
+foo bar baz
+
+foo bar baz
+
+foo bar
+
+foo bar
+
+foo bar baz
+
+foobarbaz
+
+foo bar
+
+foo bar
+
+foo bar baz
+bim bop
+
+foo bar
+
+__ is not an empty emphasis
+
+____ is not an empty strong emphasis
+
+foo ***
+
+foo *
+
+foo _
+
+foo *****
+
+foo *
+
+foo _
+
+*foo
+
+foo*
+
+*foo
+
+***foo
+
+foo*
+
+foo***
+
+foo ___
+
+foo _
+
+foo *
+
+foo _____
+
+foo _
+
+foo *
+
+_foo
+
+foo_
+
+_foo
+
+___foo
+
+foo_
+
+foo___
+
+foo
+
+foo
+
+foo
+
+foo
+
+foo
+
+foo
+
+foo
+
+foo
+
+foo
+
+foo _bar baz_
+
+foo bar *baz bim bam
+
+**foo bar baz
+
+*foo bar baz
+
+*bar*
+
+_foo bar_
+
+*
+
+**
+
+__
+
+a *
+
+a _
+
+**ahttp://foo.bar/?q=**
+
+__ahttp://foo.bar/?q=__
+
+foo*bar
diff --git a/test/tm-cases/gfm_emphasis.text b/test/tm-cases/gfm_emphasis.text
new file mode 100644
index 00000000..7b88c80c
--- /dev/null
+++ b/test/tm-cases/gfm_emphasis.text
@@ -0,0 +1,260 @@
+*foo bar*
+
+a * foo bar*
+
+a*"foo"*
+
+* a *
+
+foo*bar*
+
+5*6*78
+
+пристаням_стремятся_
+
+aa_"bb"_cc
+
+foo-_(bar)_
+
+_foo*
+
+*foo bar *
+
+*foo bar
+*
+
+*(*foo)
+
+*(*foo*)*
+
+*foo*bar
+
+_foo bar _
+
+_(_foo)
+
+_(_foo_)_
+
+_foo_bar
+
+_пристаням_стремятся
+
+_foo_bar_baz_
+
+_(bar)_.
+
+**foo bar**
+
+** foo bar**
+
+a**"foo"**
+
+foo**bar**
+
+__foo bar__
+
+__ foo bar__
+
+__
+foo bar__
+
+a__"foo"__
+
+foo__bar__
+
+5__6__78
+
+пристаням__стремятся__
+
+__foo, __bar__, baz__
+
+foo-__(bar)__
+
+**foo bar **
+
+**(**foo)
+
+*(**foo**)*
+
+**Gomphocarpus (*Gomphocarpus physocarpus*, syn.
+*Asclepias physocarpa*)**
+
+**foo "*bar*" foo**
+
+**foo**bar
+
+__foo bar __
+
+__(__foo)
+
+_(__foo__)_
+
+__foo__bar
+
+__пристаням__стремятся
+
+__foo__bar__baz__
+
+__(bar)__.
+
+*foo [bar](/url)*
+
+*foo
+bar*
+
+_foo __bar__ baz_
+
+_foo _bar_ baz_
+
+__foo_ bar_
+
+*foo *bar**
+
+*foo **bar** baz*
+
+*foo**bar**baz*
+
+*foo**bar*
+
+***foo** bar*
+
+*foo **bar***
+
+*foo**bar***
+
+foo***bar***baz
+
+foo******bar*********baz
+
+*foo **bar *baz* bim** bop*
+
+*foo [*bar*](/url)*
+
+
+** is not an empty emphasis
+
+**** is not an empty strong emphasis
+
+**foo [bar](/url)**
+
+**foo
+bar**
+
+__foo _bar_ baz__
+
+__foo __bar__ baz__
+
+____foo__ bar__
+
+**foo **bar****
+
+**foo *bar* baz**
+
+**foo*bar*baz**
+
+***foo* bar**
+
+**foo *bar***
+
+**foo *bar **baz**
+bim* bop**
+
+**foo [*bar*](/url)**
+
+__ is not an empty emphasis
+
+____ is not an empty strong emphasis
+
+foo ***
+
+foo *\**
+
+foo *_*
+
+foo *****
+
+foo **\***
+
+foo **_**
+
+**foo*
+
+*foo**
+
+***foo**
+
+****foo*
+
+**foo***
+
+*foo****
+
+foo ___
+
+foo _\__
+
+foo _*_
+
+foo _____
+
+foo __\___
+
+foo __*__
+
+__foo_
+
+_foo__
+
+___foo__
+
+____foo_
+
+__foo___
+
+_foo____
+
+**foo**
+
+*_foo_*
+
+__foo__
+
+_*foo*_
+
+****foo****
+
+____foo____
+
+******foo******
+
+***foo***
+
+_____foo_____
+
+*foo _bar* baz_
+
+*foo __bar *baz bim__ bam*
+
+**foo **bar baz**
+
+*foo *bar baz*
+
+*[bar*](/url)
+
+_foo [bar_](/url)
+
+*<img src="foo" title="*"/>
+
+**<a href="**">
+
+__<a href="__">
+
+*a `*`*
+
+_a `_`_
+
+**a<http://foo.bar/?q=**>
+
+__a<http://foo.bar/?q=__>
+
+**foo*bar**
\ No newline at end of file
diff --git a/test/tm-cases/hash_html_blocks.html b/test/tm-cases/hash_html_blocks.html
index 310fe3da..f4a20b0f 100644
--- a/test/tm-cases/hash_html_blocks.html
+++ b/test/tm-cases/hash_html_blocks.html
@@ -1,9 +1,6 @@
Archons of the Colophon
-
-
by Paco Xander Nathan
-
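The `tags` list construction in `process_span` above replaces the hardcoded `length == 3` branch and generalises to any delimiter length: one `<em>` for an odd leftover character plus one `<strong>` per pair, with the closing sequence derived by reversing the open tags and rewriting `<` to `</`. A quick sketch of what it produces:

    for n in (1, 2, 3):
        tags = ['<em>'] * (n % 2) + ['<strong>'] * (n // 2)
        opening = ''.join(tags)
        closing = ''.join(reversed(tags)).replace('<', '</')
        print(n, opening, closing)
    # 1 <em> </em>
    # 2 <strong> </strong>
    # 3 <em><strong> </strong></em>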
From 5988b0970137715d6f66cab766b74577ab742c43 Mon Sep 17 00:00:00 2001
From: Crozzers
Date: Sun, 7 Dec 2025 17:14:43 +0000
Subject: [PATCH 6/9] Refactor inheritors of original IABP to use new GFM
variant.
Also refactor the GFM class to be more readable
---
lib/markdown2.py | 290 +++++++++++++++++++++++++++++++++--------------
1 file changed, 206 insertions(+), 84 deletions(-)
diff --git a/lib/markdown2.py b/lib/markdown2.py
index 95d5a405..c4c2813f 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -2563,6 +2563,10 @@ def test(self, text):
class GFMItalicAndBoldProcessor(Extra):
+ '''
+ An upgraded version of the `ItalicAndBoldProcessor` that covers far more edge cases and gets close
+ to GitHub Flavored Markdown compliance.
+ '''
name = 'gfm-italic-and-bold-processor'
order = (Stage.ITALIC_AND_BOLD,), tuple()
@@ -2572,24 +2576,39 @@ def run(self, text):
nesting = False
opens = {'*': [], '_': []}
+ '''Mapping of em type to a list of opening runs of that em type'''
unused_opens = {'*': {}, '_': {}}
+ '''
+ Mapping of em type to another mapping of unused opening runs of that em type.
+ An unused run is one that has been skipped, or only partially consumed (eg: **foo*) and
+ could be consumed by another closing run. The inner mapping is a mapping of the
+ delimiter run to an offset number, which is the number of characters from that run that
+ have been consumed so far
+ '''
unused_closes = {'*': [], '_': []}
+ '''
+ Mapping of em type to a list of closing delimiter runs that have not been fully consumed.
+ EG: *foo*bar*
+ '''
tokens = []
+ '''List of processed spans of text that will be joined to form the new `text`'''
index = 0
+ '''Number of chars of `text` that has been processed so far'''
- delim_runs = {
- delim_run: self.delimiter_left_or_right(delim_run)
- for delim_run in re.finditer(r'(\*+|_+)', text)
- }
+ # do a quick scan for all delimiter runs, filtering for those that can open/close emphasis
+ delim_runs = OrderedDict()
+ for delim_run in re.finditer(r'(\*+|_+)', text):
+ left, right = self.delimiter_left_or_right(delim_run)
+ if left or right:
+ delim_runs[delim_run] = (left, right)
for delim_run, (left, right) in delim_runs.items():
syntax = delim_run.group(1)
em_type = syntax[0]
- if not (left or right):
- continue
-
+ # if not a closing run, or there are no opens to consume
if not right or not opens[em_type]:
+ # if it can also be an opening run
if left:
opens[em_type].append(delim_run)
continue
@@ -2623,75 +2642,42 @@ def run(self, text):
# if the delimiter runs don't match then we need to figure out how to resolve this
if len(open_syntax) != len(syntax):
- if len(open_syntax) < len(syntax) and opens[em_type]:
- # since we are detecting a previous open, we are expanding the em span to the left
- # so we should check if we're covering additional chars that we don't cross an
- # existing span border
- if not self.body_crosses_span_borders(opens[em_type][-1], open):
- middle = open
-
- open = opens[em_type].pop(-1)
+ has_middle = self.has_middle(
+ open, delim_run, opens[em_type],
+ unused_opens[em_type], unused_closes[em_type]
+ )
+
+ if has_middle is not False:
+ middle = has_middle[1]
+ if has_middle[0] != open:
+ # only re-assign and re-calc opening offsets if that run HAS changed
+ open = has_middle[0]
open_offset = unused_opens[em_type].pop(open, 0)
open_syntax = open.group(1)[open_offset:]
open_start = open.start() + open_offset
-
- if len(open_syntax) == len(syntax):
- # if it turns out the previous open is a perfect match then ignore the middle part
- middle = None
- elif len(open_syntax) > len(syntax) and unused_closes[em_type]:
- # check if there is a previous closing delim run in the current body
- # since this is already within the body we don't need to do a cross-span border check
- # as we're not expanding into new ground and that is covered later
- middle = next((i for i in unused_closes[em_type] if open.end() < i.start() < delim_run.start()), None)
- else:
- try:
- next_delim_run = tuple(delim_runs.keys())[tuple(delim_runs.keys()).index(delim_run) + 1]
- except IndexError:
- next_delim_run = None
-
- if next_delim_run is None:
- # if there is no follow up delimiter run then no point leaving this unused. Process now
- pass
- elif len(open_syntax) < len(syntax) and (
- # if this run can be an opener, but the next run won't close both of them
- (left and (
- not delim_runs[next_delim_run][1]
- or len(next_delim_run.group(1)) < len(open_syntax) + len(syntax)
- ))
- # if the next run is not an opener and won't consume this run
- and not delim_runs[next_delim_run][0]
- ):
- pass
- elif len(open_syntax) > len(syntax) and (
- # if this run can be a closer, but the next run is not a fresh opener
- (right and not delim_runs[next_delim_run][0])
- # if the next run is not a closer
- and not delim_runs[next_delim_run][1]
- ):
- pass
- elif len(open_syntax) < len(syntax) and len(syntax) >= 3:
- # if closing syntax is bigger and it's >= three long then focus on closing any
- # open em spans
- pass
+ elif not self.should_process_imbalanced_delimiter_runs(
+ open, delim_run, delim_runs, unused_opens[em_type]
+ ):
+ # if we shouldn't process them now, save these opens for a future pass
+ unused_opens[em_type][open] = open_offset
+ opens[em_type].append(open)
+ if left:
+ unused_opens[em_type][delim_run] = 0
+ opens[em_type].append(delim_run)
else:
- # if there are no unused opens or closes to use up then this is just imbalanced
- # mark as unused and leave for later processing
- unused_opens[em_type][open] = open_offset
- opens[em_type].append(open)
- if left:
- unused_opens[em_type][delim_run] = 0
- opens[em_type].append(delim_run)
- else:
- unused_closes[em_type].append(delim_run)
- continue
+ unused_closes[em_type].append(delim_run)
+ continue
# add all the text leading up to the opening delimiter
tokens.append(delim_run.string[index: open_start])
span, close_syntax_used_chars = self.process_span(open, delim_run, open_offset, middle)
tokens.extend(span)
- if close_syntax_used_chars < len(syntax):
- # if we didn't use up the entire closing delimiter mark it as unused
+
+ if close_syntax_used_chars is None:
+ close_syntax_used_chars = len(syntax)
+ elif close_syntax_used_chars < len(syntax):
+ # if we didn't use up the entire closing delimiter, mark it as unused
unused_opens[em_type][delim_run] = close_syntax_used_chars
opens[em_type].append(delim_run)
@@ -2708,13 +2694,17 @@ def run(self, text):
def process_span(
self, open: re.Match, close: re.Match,
offset: int, middle: Optional[re.Match] = None
- ):
+ ) -> Tuple[List[str], Optional[int]]:
'''
Args:
open: the match against the opening delimiter run
close: the match against the closing delimiter run
offset: the number of chars from the opening delimiter that should be skipped when processing
middle: an optional delimiter run in the middle of the span
+
+ Returns:
+ A list of processed tokens, and then the number of chars from the closing syntax that were
+ consumed. If the latter item is None, then assume all chars were consumed
'''
tokens = []
@@ -2769,6 +2759,108 @@ def process_span(
return tokens, close_delim_chars_used
+ def has_middle(
+ self, open: re.Match, close: re.Match, opens: List[re.Match],
+ unused_opens: Dict[re.Match, int], unused_closes: List[re.Match]
+ ) -> Union[Tuple[re.Match, Optional[re.Match]], Literal[False]]:
+ '''
+ Check if an emphasis span has a middle delimiter run, which may change the outer tags
+
+ Args:
+ open: the current opening delimiter run
+ close: the closing delimiter run
+ opens: a list of all opening delimiter runs in the text
+ unused_opens: a mapping of unused opens within the text to their offset values
+ unused_closes: a list of unused closes within the text
+
+ Returns:
+ False if there is no middle run. Otherwise, a tuple of the new opening run and the optional
+ middle span. The middle span may be None if it is invalid
+ '''
+ open_offset = unused_opens.get(open, 0)
+ open_syntax = open.group(1)[open_offset:]
+
+ syntax = close.group(1)
+
+ if len(open_syntax) < len(syntax) and opens:
+ # expand the em span to the left, meaning we're covering additional chars.
+ # check we don't cross an existing span border
+ if not self.body_crosses_span_borders(opens[-1], open):
+ middle = open
+
+ open = opens.pop(-1)
+ open_offset = unused_opens.pop(open, 0)
+ open_syntax = open.group(1)[open_offset:]
+
+ if len(open_syntax) == len(syntax):
+ # if it turns out the previous open is a perfect match then ignore the middle part
+ # eg: **foo*bar**
+ middle = None
+ elif len(open_syntax) > len(syntax) and unused_closes:
+ # check if there is a previous closing delim run in the current body
+ # since this is already within the body we don't need to do a cross-span border check
+ # as we're not expanding into new ground and that is covered later
+ middle = next((i for i in unused_closes if open.end() < i.start() < close.start()), None)
+ else:
+ return False
+
+ return open, middle
+
+ def should_process_imbalanced_delimiter_runs(
+ self, open: re.Match, close: re.Match,
+ delim_runs: Dict[re.Match, Tuple[bool, bool]],
+ unused_opens: Dict[re.Match, int]
+ ):
+ '''
+ Check if an imbalanced delimiter run should be consumed now, or left for a later pass
+
+ Args:
+ open: the opening delimiter run
+ close: the closing delimiter run
+ delim_runs: a mapping of all of the delimiter runs in the text to a tuple of whether
+ they are opening or closing runs
+ unused_opens: a mapping of unused opens within the text to their offset values
+ '''
+ open_offset = unused_opens.get(open, 0)
+ open_syntax = open.group(1)[open_offset:]
+
+ syntax = close.group(1)
+ left, right = delim_runs[close]
+
+ if len(open_syntax) < len(syntax) and len(syntax) >= 3:
+ # if closing syntax is bigger and it's >= three long then focus on closing any
+ # open em spans
+ return True
+
+ try:
+ next_delim_run = tuple(delim_runs.keys())[tuple(delim_runs.keys()).index(close) + 1]
+ except IndexError:
+ # if there is no follow up delimiter run then no point leaving this unused. Process now
+ return True
+
+ if len(open_syntax) < len(syntax) and (
+ # if this run can be an opener, but the next run won't close both of them
+ (left and (
+ not delim_runs[next_delim_run][1]
+ or len(next_delim_run.group(1)) < len(open_syntax) + len(syntax)
+ ))
+ # if the next run is not an opener and won't consume this run
+ and not delim_runs[next_delim_run][0]
+ ):
+ return True
+
+ if len(open_syntax) > len(syntax) and (
+ # if this run can be a closer, but the next run is not a fresh opener
+ (right and not delim_runs[next_delim_run][0])
+ # if the next run is not a closer
+ and not delim_runs[next_delim_run][1]
+ ):
+ return True
+
+ # if there are no unused opens or closes to use up then this is just imbalanced.
+ # mark as unused and leave for later processing
+ return False
+
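For a user-visible picture of the imbalanced case: the GFM spec expects leftover delimiter chars to fall back to literal text. A minimal check (the output shown is the spec's expectation, not a guarantee about any particular build):

    import markdown2

    print(markdown2.markdown('**foo*'))
    # GFM spec expectation: <p>*<em>foo</em></p>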
def delimiter_left_or_right(self, delim_run: re.Match):
run = delim_run.string[max(0, delim_run.start() - 1): delim_run.end() + 1]
syntax = delim_run.group(1)
@@ -3269,37 +3361,60 @@ def run(self, text):
return text
-class CodeFriendly(ItalicAndBoldProcessor):
+class CodeFriendly(GFMItalicAndBoldProcessor):
'''
Disable _ and __ for em and strong.
'''
name = 'code-friendly'
+ order = (Stage.ITALIC_AND_BOLD,), (Stage.ITALIC_AND_BOLD,)
def __init__(self, md, options):
super().__init__(md, options)
# add a prefix to it so we don't interfere with escaped/hashed chars from other stages
- self.hash_table[_hash_text(self.name + '_')] = '_'
- self.hash_table[_hash_text(self.name + '__')] = '__'
+ self.hash_table = {
+ _hash_text(self.name + '_'): '_',
+ _hash_text(self.name + '__'): '__'
+ }
- def sub(self, match: re.Match) -> str:
- syntax = match.group(1)
- # use match.regs because strong/em regex may include preceding text in the match as well
- text: str = match.string[match.regs[1][0]: match.end()]
- if '_' in syntax:
+ def run(self, text):
+ if self.md.order < Stage.ITALIC_AND_BOLD:
+ text = super().run(text)
+ else:
+ orig_text = ''
+ while orig_text != text:
+ orig_text = text
+ for key, substr in self.hash_table.items():
+ text = text.replace(key, substr)
+ return text
+
+ def process_span(self, open: re.Match, close: re.Match, offset: int, middle: re.Match | None = None):
+ text = open.string[open.start(): close.end()]
+ open_syntax = open.group(1)[offset:]
+ close_syntax = close.group(1)
+
+ if len(open_syntax) > 2 or open_syntax != close_syntax:
+ return [text], None
+
+ if '_' in open_syntax:
# if using _this_ syntax, hash it to avoid processing, but don't hash the contents in case of nested syntax
- text = text.replace(syntax, _hash_text(self.name + syntax))
- return text
+ text = text.replace(open_syntax, _hash_text(self.name + open_syntax))
+ return [text], None
elif '_' in text:
# if the text within the bold/em markers contains '_' then hash those chars to protect them from em_re
text = (
- text[len(syntax): -len(syntax)]
+ text[len(open_syntax): -len(close_syntax)]
.replace('__', _hash_text(self.name + '__'))
.replace('_', _hash_text(self.name + '_'))
)
- return syntax + text + syntax
- # if no underscores are present, the text is fine and we can just leave it alone
- return super().sub(match)
+ return [open_syntax, text, close_syntax], None
+
+ return super().process_span(open, close, offset, middle)
+
+ def test(self, text: str):
+ return super().test(text) or (
+ self.hash_table and re.search(r'md5-[0-9a-z]{32}', text)
+ )
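For reference, a minimal sketch of the user-facing behaviour CodeFriendly preserves (output abridged in the comment):

    import markdown2

    html = markdown2.markdown('my_var_name and *em*', extras=['code-friendly'])
    # underscores are left alone; '*em*' still renders as <em>em</em>
    print(html)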
class FencedCodeBlocks(Extra):
@@ -3623,7 +3738,7 @@ def tags(self, lexer_name):
return super().tags(lexer_name)
-class MiddleWordEm(ItalicAndBoldProcessor):
+class MiddleWordEm(GFMItalicAndBoldProcessor):
'''
Allows or disallows emphasis syntax in the middle of words,
defaulting to allow. Disabling this means that `this_text_here` will not be
@@ -3666,8 +3781,10 @@ def __init__(self, md: Markdown, options: Union[dict, bool, None]):
)
# add a prefix to it so we don't interfere with escaped/hashed chars from other stages
- self.hash_table['_'] = _hash_text(self.name + '_')
- self.hash_table['*'] = _hash_text(self.name + '*')
+ self.hash_table = {
+ '_': _hash_text(self.name + '_'),
+ '*': _hash_text(self.name + '*')
+ }
def run(self, text):
if self.options['allowed']:
@@ -3692,6 +3809,11 @@ def sub(self, match: re.Match):
syntax = match.group(1)
return self.hash_table[syntax]
+ def test(self, text: str):
+ return super().test(text) or (
+ self.hash_table and re.search(r'md5-[0-9a-z]{32}', text)
+ )
+
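A quick usage sketch for the extra above; the option shape follows the docstring, and the exact output should be treated as illustrative:

    import markdown2

    print(markdown2.markdown('this_text_here', extras={'middle-word-em': False}))
    # with middle-word emphasis disabled, no <em> is inserted mid-word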
class Numbering(Extra):
'''
From 060d48da4c5cf7188968b9954fddac9d3cc3b748 Mon Sep 17 00:00:00 2001
From: Crozzers
Date: Sun, 7 Dec 2025 17:22:43 +0000
Subject: [PATCH 7/9] Add issues 645, 652, 653 and 654 to gfm test case
---
test/tm-cases/gfm_emphasis.html | 8 ++++++++
test/tm-cases/gfm_emphasis.text | 10 +++++++++-
2 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/test/tm-cases/gfm_emphasis.html b/test/tm-cases/gfm_emphasis.html
index 9078f0f9..0ed679d6 100644
--- a/test/tm-cases/gfm_emphasis.html
+++ b/test/tm-cases/gfm_emphasis.html
@@ -259,3 +259,11 @@
<p>__a<a href="http://foo.bar/?q=__">http://foo.bar/?q=__</a></p>
<p><strong>foo*bar</strong></p>
+
+<p>_foo <strong>bar</strong> baz._bim</p>
+
+<p><strong>__foo</strong> bar <strong>__baz</strong> bim <em>bam</em></p>
+
+<p><strong>foo<em>bar</em></strong></p>
+
+<p><strong><em>foo</em>bar</strong></p>
diff --git a/test/tm-cases/gfm_emphasis.text b/test/tm-cases/gfm_emphasis.text
index 7b88c80c..43e01891 100644
--- a/test/tm-cases/gfm_emphasis.text
+++ b/test/tm-cases/gfm_emphasis.text
@@ -257,4 +257,12 @@ _a `_`_
__a<http://foo.bar/?q=__>
-**foo*bar**
\ No newline at end of file
+**foo*bar**
+
+_foo **bar** baz._bim
+
+**__foo** bar **__baz** bim *bam*
+
+**foo*bar***
+
+***foo*bar**
\ No newline at end of file
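To eyeball the new cases locally (the test harness compares against the .html file above; outputs should match the GFM spec):

    import markdown2

    for case in ('_foo **bar** baz._bim', '**foo*bar***', '***foo*bar**'):
        print(markdown2.markdown(case))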
From 3b19616b3a1bc9e83e3414b629d8258d70ebc233 Mon Sep 17 00:00:00 2001
From: Crozzers
Date: Sun, 7 Dec 2025 19:13:56 +0000
Subject: [PATCH 8/9] Improve performance in repetitive (ReDoS) scenarios by
caching some IAB internal functions
---
lib/markdown2.py | 102 +++++++++++++++++++++++++++++++++--------------
1 file changed, 73 insertions(+), 29 deletions(-)
diff --git a/lib/markdown2.py b/lib/markdown2.py
index c4c2813f..4197ae03 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -121,7 +121,7 @@
from collections import defaultdict, OrderedDict
from abc import ABC, abstractmethod
import functools
-from collections.abc import Iterable
+from collections.abc import Iterable, Iterator
from hashlib import sha256
from random import random
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Type, TypedDict, Union
@@ -2044,11 +2044,13 @@ def _encode_code(self, text: str) -> str:
)
_em_re = re.compile(r"(\*|_)(?=\S)(.*?\S)\1", re.S)
+ _iab_processor = None
@mark_stage(Stage.ITALIC_AND_BOLD)
def _do_italics_and_bold(self, text: str) -> str:
- iab = GFMItalicAndBoldProcessor(self, None)
- if iab.test(text):
- text = iab.run(text)
+ if not self._iab_processor:
+ self._iab_processor = GFMItalicAndBoldProcessor(self, None)
+ if self._iab_processor.test(text):
+ text = self._iab_processor.run(text)
return text
_block_quote_base = r'''
@@ -2595,14 +2597,13 @@ def run(self, text):
index = 0
'''Number of chars of `text` that have been processed so far'''
- # do a quick scan for all delimiter runs, filtering for those that can open/close emphasis
- delim_runs = OrderedDict()
- for delim_run in re.finditer(r'(\*+|_+)', text):
- left, right = self.delimiter_left_or_right(delim_run)
- if left or right:
- delim_runs[delim_run] = (left, right)
+ delim_runs_iter = re.finditer(r'(\*+|_+)', text)
+ next_delim_run = self._next_run(delim_runs_iter)
+
+ while next_delim_run:
+ delim_run, left, right = next_delim_run
+ next_delim_run = self._next_run(delim_runs_iter)
- for delim_run, (left, right) in delim_runs.items():
syntax = delim_run.group(1)
em_type = syntax[0]
@@ -2656,7 +2657,7 @@ def run(self, text):
open_syntax = open.group(1)[open_offset:]
open_start = open.start() + open_offset
elif not self.should_process_imbalanced_delimiter_runs(
- open, delim_run, delim_runs, unused_opens[em_type]
+ open, delim_run, unused_opens[em_type], next_delim_run
):
# if we shouldn't process them now, save these opens for a future pass
unused_opens[em_type][open] = open_offset
@@ -2808,8 +2809,8 @@ def has_middle(
def should_process_imbalanced_delimiter_runs(
self, open: re.Match, close: re.Match,
- delim_runs: Dict[re.Match, Tuple[bool, bool]],
- unused_opens: Dict[re.Match, int]
+ unused_opens: Dict[re.Match, int],
+ next_delim_run: Optional[Tuple[re.Match, Optional[re.Match], Optional[re.Match]]] = None
):
'''
Check if an imbalanced delimiter run should be consumed now, or left for a later pass
@@ -2817,43 +2818,39 @@ def should_process_imbalanced_delimiter_runs(
Args:
open: the opening delimiter run
close: the closing delimiter run
- delim_runs: a mapping of all of the delimiter runs in the text to a tuple of whether
- they are opening or closing runs
unused_opens: a mapping of unused opens within the text to their offset values
+ next_delim_run: the next delimiter run after the closing run
'''
open_offset = unused_opens.get(open, 0)
open_syntax = open.group(1)[open_offset:]
syntax = close.group(1)
- left, right = delim_runs[close]
+ left, right = self.delimiter_left_or_right(close)
if len(open_syntax) < len(syntax) and len(syntax) >= 3:
# if the closing syntax is bigger and it's >= 3 chars long, focus on closing any
# open em spans
return True
- try:
- next_delim_run = tuple(delim_runs.keys())[tuple(delim_runs.keys()).index(close) + 1]
- except IndexError:
- # if there is no follow-up delimiter run there's no point leaving this unused. Process now
+ if next_delim_run is None:
return True
if len(open_syntax) < len(syntax) and (
# if this run can be an opener, but the next run won't close both of them
(left and (
- not delim_runs[next_delim_run][1]
- or len(next_delim_run.group(1)) < len(open_syntax) + len(syntax)
+ not next_delim_run[2]
+ or len(next_delim_run[0].group(1)) < len(open_syntax) + len(syntax)
))
# if the next run is not an opener and won't consume this run
- and not delim_runs[next_delim_run][0]
+ and not next_delim_run[1]
):
return True
if len(open_syntax) > len(syntax) and (
# if this run can be a closer, but the next run is not a fresh opener
- (right and not delim_runs[next_delim_run][0])
+ (right and not next_delim_run[1])
# if the next run is not a closer
- and not delim_runs[next_delim_run][1]
+ and not next_delim_run[2]
):
return True
@@ -2862,8 +2859,22 @@ def should_process_imbalanced_delimiter_runs(
return False
def delimiter_left_or_right(self, delim_run: re.Match):
+ '''
+ Determine if a delimiter run is left or right flanking
+
+ Returns:
+ Tuple of bools indicating left and right flanking respectively
+ '''
run = delim_run.string[max(0, delim_run.start() - 1): delim_run.end() + 1]
- syntax = delim_run.group(1)
+
+ return self._delimiter_left_or_right(run, delim_run.group(1))
+
+ @functools.lru_cache(maxsize=512)
+ def _delimiter_left_or_right(self, run: str, syntax: str):
+ '''
+ Cached version of `delimiter_left_or_right` that massively speeds things up when dealing
+ with many repetitive delimiter runs - eg: in a ReDoS scenario
+ '''
syntax_re = syntax.replace('*', r'\*')
left = (
@@ -2891,12 +2902,45 @@ def delimiter_left_or_right(self, delim_run: re.Match):
return left, right
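The flanking classification can be sanity-checked against a stripped-down standalone version (helper name is made up; whitespace rules only, the punctuation clauses above are deliberately omitted):

    def flanking(text, start, end):
        # simplified GFM flanking: a run opens if not followed by whitespace
        # and closes if not preceded by whitespace
        left = end < len(text) and not text[end].isspace()
        right = start > 0 and not text[start - 1].isspace()
        return left, right

    text = 'a *b* c'
    print(flanking(text, 2, 3))  # (True, False): left-flanking, can open
    print(flanking(text, 4, 5))  # (False, True): right-flanking, can close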
def body_crosses_span_borders(self, open: re.Match, close: re.Match):
- for tag in re.findall(rf'</?({self.md._span_tags})', open.string[open.end(): close.start()]):
- if not self.md._tag_is_closed(tag, open.string[open.end(): close.start()]):
+ '''
+ Checks if the body of an emphasis crosses a span border
+
+ Args:
+ open: the opening delimiter run
+ close: the closing delimiter run
+
+ Returns:
+ True if the emphasis crosses a span border (invalid). False if not
+ '''
+ return self._body_crosses_span_borders(open.string[open.end(): close.start()])
+
+ @functools.lru_cache(maxsize=64)
+ def _body_crosses_span_borders(self, text: str):
+ '''Cached version of `body_crosses_span_borders`'''
+ for tag in re.findall(rf'</?({self.md._span_tags})', text):
+ if not self.md._tag_is_closed(tag, text):
return True
return False
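The guard targets inputs like `*foo <code>bar* baz</code>`, where the emphasis body would swallow an unclosed span tag. A standalone approximation of the counting half of _tag_is_closed (the open/close ordering test is omitted; values are illustrative):

    import re

    body = 'foo <code>bar'  # body of the would-be emphasis '*foo <code>bar*'
    opens = len(re.findall(r'<code(?:.*?)>', body))
    closes = len(re.findall(r'</code>', body))
    print(opens == closes)  # False -> the body crosses a span border; reject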
+ def _next_run(self, delim_runs_iter: Iterator[re.Match]):
+ '''
+ Gets the next delimiter run from an iterator of delimiter runs
+
+ Returns:
+ A tuple containing the run, plus matches indicating whether it is left or right
+ flanking respectively. Returns None if no valid runs are left
+ '''
+ for delim_run in delim_runs_iter:
+ left, right = self.delimiter_left_or_right(delim_run)
+ if left or right:
+ return (delim_run, left, right)
+ return None
+
def test(self, text):
return text.count('*') > 1 or text.count('_') > 1
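A rough way to observe the effect of the caching on pathological inputs; timings are machine-dependent and this is illustrative rather than a benchmark from the patch:

    import time
    import markdown2

    pathological = '*a' * 2000  # thousands of near-identical delimiter runs
    start = time.perf_counter()
    markdown2.markdown(pathological)
    print(f'{time.perf_counter() - start:.3f}s')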
From 749c9cb19800a3bef267d99a6025d948948bac11 Mon Sep 17 00:00:00 2001
From: Crozzers
Date: Sun, 7 Dec 2025 19:16:04 +0000
Subject: [PATCH 9/9] Fix python typing syntax error
---
lib/markdown2.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/markdown2.py b/lib/markdown2.py
index 4197ae03..6af7c929 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -3432,7 +3432,7 @@ def run(self, text):
text = text.replace(key, substr)
return text
- def process_span(self, open: re.Match, close: re.Match, offset: int, middle: re.Match | None = None):
+ def process_span(self, open: re.Match, close: re.Match, offset: int, middle: Optional[re.Match] = None):
text = open.string[open.start(): close.end()]
open_syntax = open.group(1)[offset:]
close_syntax = close.group(1)
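For context on this final fix: PEP 604 `X | Y` unions in annotations are evaluated when the def statement runs, so `re.Match | None` raises `TypeError: unsupported operand type(s) for |` on Python < 3.10 (unless `from __future__ import annotations` is in effect), whereas the typing.Optional spelling also works on older interpreters. A minimal sketch (function name is made up):

    from typing import Optional
    import re

    def process_span_signature_ok(middle: Optional[re.Match] = None):
        # evaluates fine on pre-3.10 interpreters
        return middle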