From bf0a1e29c991bacb40438138a4d86338b7d9acd3 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Sun, 30 Nov 2025 09:49:59 +0000 Subject: [PATCH 1/9] Re-write the IAB processor to implement GFM rules --- lib/markdown2.py | 160 +++++++++++++++++- .../tm-cases/middle_word_em_escaped_char.html | 1 + .../tm-cases/middle_word_em_escaped_char.opts | 1 + .../tm-cases/middle_word_em_escaped_char.text | 1 + 4 files changed, 157 insertions(+), 6 deletions(-) create mode 100644 test/tm-cases/middle_word_em_escaped_char.html create mode 100644 test/tm-cases/middle_word_em_escaped_char.opts create mode 100644 test/tm-cases/middle_word_em_escaped_char.text diff --git a/lib/markdown2.py b/lib/markdown2.py index 71b19f67..3dd19541 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -1123,8 +1123,14 @@ def _strict_tag_block_sub( return result def _tag_is_closed(self, tag_name: str, text: str) -> bool: - # super basic check if number of open tags == number of closing tags - return len(re.findall('<%s(?:.*?)>' % tag_name, text)) == len(re.findall('' % tag_name, text)) + # check if number of open tags == number of close tags + if len(re.findall('<%s(?:.*?)>' % tag_name, text)) != len(re.findall('' % tag_name, text)): + return False + + # check that close tag position is AFTER open tag + close_index = text.find(f' str: @@ -2066,8 +2072,11 @@ def sub(match: re.Match): return f'{prefix}<{syntax}>{contents}' # must go first: - text = self._strong_re.sub(sub, text) - text = self._em_re.sub(sub, text) + # text = self._strong_re.sub(sub, text) + # text = self._em_re.sub(sub, text) + iab = ItalicAndBoldProcessor2(self, None) + if iab.test(text): + text = iab.run(text) return text _block_quote_base = r''' @@ -2581,6 +2590,138 @@ def test(self, text): return self.hash_table and re.search(r'md5-[0-9a-z]{32}', text) +class ItalicAndBoldProcessor2(Extra): + name = 'iabp-2' + order = (Stage.ITALIC_AND_BOLD,), tuple() + + def run(self, text): + for em_type in '*_': + opens = [] + unused_opens = {} + tokens = [] + index = 0 + + delim_runs = tuple(re.finditer(r'([%s]+)' % em_type, text)) + for delim_run in delim_runs: + # first check if it is opening (left flanking) + # or closing (right flanking) run + run = delim_run.string[max(0, delim_run.start() - 1): delim_run.end() + 1] + syntax = delim_run.group(1) + syntax_re = syntax.replace('*', r'\*') + + left = ( + # not followed by whitespace + re.match(r'.*%s\S' % syntax_re, run, re.S) + and ( + # either not followed by punctuation + re.match(r'.*%s[\s\w]' % syntax_re, run, re.S) + # or followed by punct and preceded by punct/whitespace + or re.match(r'(^|[\s\W])%s([^\s\w]|$)' % syntax_re, run, re.S | re.M) + ) + ) + + right = ( + # not preceded by whitespace + re.match(r'\S%s.*' % syntax_re, run, re.S) + and ( + # either not preceded by punct + re.match(r'[\s\w]%s.*' % syntax_re, run, re.S) + # or preceded by punct and followed by whitespace or punct + or re.match(r'[^\s\w]%s(\s|[^\s\w]|$)' % syntax_re, run, re.S | re.M) + ) + ) + + if not (left or right): + continue + + if left and right: + if opens: + # if we have open tags prioritize closing them + left = False + else: + # if we don't, let's open a new one + right = False + + if left: + opens.append(delim_run) + continue + + # close. 
figure out how + if not opens: + tokens.append(delim_run.string[index: delim_run.end()]) + index = delim_run.end() + continue + + # get the opening run + open = opens.pop(-1) + # if the opening run was joined to a previous closing run (eg: **strong***em*) + # then re-use that previous closing run, but ignore the part that was used to + # close the previous emphasis + open_offset = unused_opens.pop(open, 0) + open_syntax = open.group(1)[open_offset:] + open_start = open.start() + open_offset + + # add everything between last emphasis and this one + tokens.append(delim_run.string[index: open_start]) + body = delim_run.string[open.end(): delim_run.start()] + if not all( + self.md._tag_is_closed(tag, body) + for tag in re.findall(rf' len(syntax): + opens.append(open) + unused_opens[open] = open_offset + opens.append(delim_run) + unused_opens[delim_run] = 0 + continue + + # calc what type of emphasis based on the lowest common + # length of the delimiter run + length = min(3, min(len(open_syntax), len(syntax))) + if length == 3: + tokens.append('') + tokens.append(body) + tokens.append('') + else: + tag = 'strong' if length == 2 else 'em' + # add any part of the open that we don't consume + # eg: **one* + tokens.append(open_syntax[:-length]) + tokens.append(f'<{tag}>') + tokens.append(body) + tokens.append(f'') + + # if our closing syntax is longer than our opening that + # means it's joined onto a previous emphasis + # eg: **strong***em* + # This means the current delim_run is not completely "spent". + # Mark this closing run as an opening run for the next em but + # record in `unused_opens` how mmany chars from the run we've + # already used + if len(syntax) > len(open_syntax): + opens.append(delim_run) + unused_opens[delim_run] = length + index = delim_run.start() + length + else: + tokens.append(delim_run.group(1)[length:]) + index = delim_run.end() + + if index < len(text): + tokens.append(text[index:]) + + text = ''.join(tokens) + + return text + + + def test(self, text): + return text.count('*') > 1 or text.count('_') > 1 + + class _LinkProcessorExtraOpts(TypedDict, total=False): '''Options for the `LinkProcessor` extra''' tags: List[str] @@ -3420,14 +3561,21 @@ def __init__(self, md: Markdown, options: Union[dict, bool, None]): options.setdefault('allowed', True) super().__init__(md, options) + escaped_hashes = '|'.join(md._escape_table.values()) + self.middle_word_em_re = re.compile( r''' (?x/y and x\y

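[Editorial note on the hunk above] The markdown2.py change in this patch classifies each delimiter run as left- and/or right-flanking using regexes over a small window around the run. For reference, a minimal character-based sketch of the same GFM flanking rules, assuming the patch's `[\s\w]` / `[^\s\w]` classes stand in for the spec's whitespace/punctuation categories (`classify_delimiter_run` is a hypothetical helper, not part of the patch):

```python
def classify_delimiter_run(text: str, start: int, end: int) -> tuple:
    '''Classify text[start:end] (a run of "*" or "_") per the GFM flanking rules.

    Left-flanking: not followed by whitespace, and either not followed by
    punctuation or preceded by whitespace/punctuation. Right-flanking is
    the mirror image.
    '''
    # the spec treats the ends of the text as whitespace
    before = text[start - 1] if start > 0 else ' '
    after = text[end] if end < len(text) else ' '

    def punct(ch: str) -> bool:
        # mirrors the patch's [^\s\w] class: non-space, non-word
        return not ch.isspace() and not (ch.isalnum() or ch == '_')

    left = not after.isspace() and (
        not punct(after) or before.isspace() or punct(before)
    )
    right = not before.isspace() and (
        not punct(before) or after.isspace() or punct(after)
    )
    return left, right

# '*foo*': the first run can only open, the second can only close;
# 'foo*bar': the intraword run is both left- and right-flanking
```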
diff --git a/test/tm-cases/middle_word_em_escaped_char.opts b/test/tm-cases/middle_word_em_escaped_char.opts new file mode 100644 index 00000000..f540dcd6 --- /dev/null +++ b/test/tm-cases/middle_word_em_escaped_char.opts @@ -0,0 +1 @@ +{'extras': {'middle-word-em': {'allowed': False}}} diff --git a/test/tm-cases/middle_word_em_escaped_char.text b/test/tm-cases/middle_word_em_escaped_char.text new file mode 100644 index 00000000..3548642d --- /dev/null +++ b/test/tm-cases/middle_word_em_escaped_char.text @@ -0,0 +1 @@ +*x*/*y* and *x*\\*y* From 6ade9ab62114e433b4d04fe771553b05e7825044 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Sun, 30 Nov 2025 22:04:22 +0000 Subject: [PATCH 2/9] Get closer to GFM compliance --- lib/markdown2.py | 257 ++++++++++++++++++++++++++++------------------- 1 file changed, 154 insertions(+), 103 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 3dd19541..6cf5132c 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -2596,124 +2596,175 @@ class ItalicAndBoldProcessor2(Extra): def run(self, text): for em_type in '*_': - opens = [] - unused_opens = {} - tokens = [] - index = 0 - - delim_runs = tuple(re.finditer(r'([%s]+)' % em_type, text)) - for delim_run in delim_runs: - # first check if it is opening (left flanking) - # or closing (right flanking) run - run = delim_run.string[max(0, delim_run.start() - 1): delim_run.end() + 1] - syntax = delim_run.group(1) - syntax_re = syntax.replace('*', r'\*') - - left = ( - # not followed by whitespace - re.match(r'.*%s\S' % syntax_re, run, re.S) - and ( - # either not followed by punctuation - re.match(r'.*%s[\s\w]' % syntax_re, run, re.S) - # or followed by punct and preceded by punct/whitespace - or re.match(r'(^|[\s\W])%s([^\s\w]|$)' % syntax_re, run, re.S | re.M) + nesting = True + while nesting: + nesting = False + + opens = [] + buffer = [] + unused_opens = {} + tokens = [] + index = 0 + + for delim_run in re.finditer(r'([%s]+)' % em_type, text): + # first check if it is opening (left flanking) + # or closing (right flanking) run + run = delim_run.string[max(0, delim_run.start() - 1): delim_run.end() + 1] + syntax = delim_run.group(1) + syntax_re = syntax.replace('*', r'\*') + + left = ( + # not followed by whitespace + re.match(r'.*%s\S' % syntax_re, run, re.S) + and ( + # either not followed by punctuation + re.match(r'.*%s[\s\w]' % syntax_re, run, re.S) + # or followed by punct and preceded by punct/whitespace + or re.match(r'(^|[\s\W])%s([^\s\w]|$)' % syntax_re, run, re.S | re.M) + ) ) - ) - right = ( - # not preceded by whitespace - re.match(r'\S%s.*' % syntax_re, run, re.S) - and ( - # either not preceded by punct - re.match(r'[\s\w]%s.*' % syntax_re, run, re.S) - # or preceded by punct and followed by whitespace or punct - or re.match(r'[^\s\w]%s(\s|[^\s\w]|$)' % syntax_re, run, re.S | re.M) + right = ( + # not preceded by whitespace + re.match(r'\S%s.*' % syntax_re, run, re.S) + and ( + # either not preceded by punct + re.match(r'[\s\w]%s.*' % syntax_re, run, re.S) + # or preceded by punct and followed by whitespace or punct + or re.match(r'[^\s\w]%s(\s|[^\s\w]|$)' % syntax_re, run, re.S | re.M) + ) ) - ) - if not (left or right): - continue + if not (left or right): + continue - if left and right: - if opens: - # if we have open tags prioritize closing them - left = False - else: - # if we don't, let's open a new one - right = False + if not right or not opens: + if left: + opens.append(delim_run) + continue - if left: - opens.append(delim_run) - continue + syntax = delim_run.group(1) + + 
open = opens.pop(-1) + # if the opening run was joined to a previous closing run (eg: **strong***em*) + # then re-use that previous closing run, but ignore the part that was used to + # close the previous emphasis + open_offset = unused_opens.pop(open, 0) + open_start = open.start() + open_offset + open_syntax = open.group(1)[open_offset:] + + if open.start() < index: + # this happens with things like `*(**foo**)*`. We process LTR so the strong gets + # processed first (since that has the first closing delimiter). We now have + # `*(foo)*` and now we get round to processing the em. + # It's hard compare the match (against the original text var) to the processed text + # so it's easier to just note down that nesting is detected and re-run the loop + nesting = True + continue - # close. figure out how - if not opens: - tokens.append(delim_run.string[index: delim_run.end()]) - index = delim_run.end() - continue + prev_open = None + + if len(open_syntax) < len(syntax): + # if closing syntax is longer then maybe we can close multiple openers that are queued up + if opens: + prev_open = opens.pop(-1) + prev_open_offset = unused_opens.pop(open, 0) + prev_open_start = prev_open.start() + prev_open_offset + prev_open_syntax = prev_open.group(1)[prev_open_offset:] + + # check the new expanded body doesn't cross span borders + if not all( + self.md._tag_is_closed(tag, delim_run.string[prev_open.end(): open.start()]) + for tag in re.findall( + rf' len(syntax): + # if the opening syntax is bigger than this close won't close all of it. + # Queue both up for later processing + opens.append(open) + unused_opens[open] = open_offset + if left: + opens.append(delim_run) + unused_opens[delim_run] = 0 + continue - # get the opening run - open = opens.pop(-1) - # if the opening run was joined to a previous closing run (eg: **strong***em*) - # then re-use that previous closing run, but ignore the part that was used to - # close the previous emphasis - open_offset = unused_opens.pop(open, 0) - open_syntax = open.group(1)[open_offset:] - open_start = open.start() + open_offset - - # add everything between last emphasis and this one - tokens.append(delim_run.string[index: open_start]) - body = delim_run.string[open.end(): delim_run.start()] - if not all( - self.md._tag_is_closed(tag, body) - for tag in re.findall(rf' len(syntax): - opens.append(open) - unused_opens[open] = open_offset - opens.append(delim_run) - unused_opens[delim_run] = 0 - continue + # ensure the body does not cross span borders + if not all( + self.md._tag_is_closed(tag, body) + for tag in re.findall(rf'') - tokens.append(body) - tokens.append('') - else: - tag = 'strong' if length == 2 else 'em' + # put all the new processing in a buffer array that gets added to `tokens` anyway. 
+ # Not the most efficient but it's convenient having a separate list of everything + # processed and added in the previous iteration + buffer = [] + + # add all the text leading up to the opening delimiter + buffer.append(delim_run.string[index: prev_open_start if prev_open else open_start]) + + # calc what type of emphasis based on the lowest common + # length of the delimiter run + length = min(3, min(len(open_syntax), len(syntax))) # add any part of the open that we don't consume # eg: **one* - tokens.append(open_syntax[:-length]) - tokens.append(f'<{tag}>') - tokens.append(body) - tokens.append(f'') - - # if our closing syntax is longer than our opening that - # means it's joined onto a previous emphasis - # eg: **strong***em* - # This means the current delim_run is not completely "spent". - # Mark this closing run as an opening run for the next em but - # record in `unused_opens` how mmany chars from the run we've - # already used - if len(syntax) > len(open_syntax): - opens.append(delim_run) - unused_opens[delim_run] = length - index = delim_run.start() + length - else: - tokens.append(delim_run.group(1)[length:]) + buffer.append(open_syntax[:-length]) + if length == 3: + buffer.append('') + buffer.append(body) + buffer.append('') + else: + tag = 'strong' if length == 2 else 'em' + # prev_open is defined if this closing syntax is closing multiple openers at once + if prev_open: + if len(prev_open_syntax) == 3: + prev_tag = 'strong' if tag == 'em' else 'em' + else: + prev_tag = 'strong' if len(prev_open_syntax) == 2 else 'em' + buffer.append(f'<{prev_tag}>') + + if len(prev_open_syntax) == 3: + buffer.append(f'<{tag}>') + + buffer.append(delim_run.string[prev_open.end(): open.start()]) + + if len(prev_open_syntax) == 3: + buffer.append(f'') + else: + buffer.append(f'<{tag}>') + + buffer.append(body) + + if len(prev_open_syntax) != 3: + buffer.append(f'') + buffer.append(f'') + else: + buffer.append(f'<{tag}>') + buffer.append(body) + buffer.append(f'') + + # If both syntaxes are equal length then that's easy. 
Remove the open run as it's fully + # processed and consumed, and move on index = delim_run.end() - if index < len(text): - tokens.append(text[index:]) + tokens.extend(buffer) + + if index < len(text): + tokens.append(text[index:]) - text = ''.join(tokens) + text = ''.join(tokens) return text From b3e512de1925ab5f53597072d82b289a0e050e87 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Fri, 5 Dec 2025 21:58:12 +0000 Subject: [PATCH 3/9] Iron out some GFM edge cases --- lib/markdown2.py | 291 ++++++++++++++++++++++++++++------------------- 1 file changed, 174 insertions(+), 117 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 6cf5132c..22e51961 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -2601,39 +2601,18 @@ def run(self, text): nesting = False opens = [] - buffer = [] unused_opens = {} + unused_closes = [] tokens = [] index = 0 - for delim_run in re.finditer(r'([%s]+)' % em_type, text): - # first check if it is opening (left flanking) - # or closing (right flanking) run - run = delim_run.string[max(0, delim_run.start() - 1): delim_run.end() + 1] - syntax = delim_run.group(1) - syntax_re = syntax.replace('*', r'\*') - - left = ( - # not followed by whitespace - re.match(r'.*%s\S' % syntax_re, run, re.S) - and ( - # either not followed by punctuation - re.match(r'.*%s[\s\w]' % syntax_re, run, re.S) - # or followed by punct and preceded by punct/whitespace - or re.match(r'(^|[\s\W])%s([^\s\w]|$)' % syntax_re, run, re.S | re.M) - ) - ) + delim_runs = { + delim_run: self.delimiter_left_or_right(delim_run) + for delim_run in re.finditer(r'([%s]+)' % em_type, text) + } - right = ( - # not preceded by whitespace - re.match(r'\S%s.*' % syntax_re, run, re.S) - and ( - # either not preceded by punct - re.match(r'[\s\w]%s.*' % syntax_re, run, re.S) - # or preceded by punct and followed by whitespace or punct - or re.match(r'[^\s\w]%s(\s|[^\s\w]|$)' % syntax_re, run, re.S | re.M) - ) - ) + for delim_run, (left, right) in delim_runs.items(): + syntax = delim_run.group(1) if not (left or right): continue @@ -2662,112 +2641,190 @@ def run(self, text): nesting = True continue - prev_open = None - - if len(open_syntax) < len(syntax): - # if closing syntax is longer then maybe we can close multiple openers that are queued up - if opens: - prev_open = opens.pop(-1) - prev_open_offset = unused_opens.pop(open, 0) - prev_open_start = prev_open.start() + prev_open_offset - prev_open_syntax = prev_open.group(1)[prev_open_offset:] - - # check the new expanded body doesn't cross span borders - if not all( - self.md._tag_is_closed(tag, delim_run.string[prev_open.end(): open.start()]) - for tag in re.findall( - rf' len(syntax) and unused_closes: + # check if there is a previous closing delim run in the current body + # since this is already within the body we don't need to do a cross-span border check + # as we're not expanding into new ground and that is covered later + middle = next((i for i in unused_closes if open.end() < i.start() < delim_run.start()), None) else: - unused_opens[open] = open_offset - opens.append(open) - unused_opens[delim_run] = 0 - opens.append(delim_run) - continue - elif len(open_syntax) > len(syntax): - # if the opening syntax is bigger than this close won't close all of it. 
- # Queue both up for later processing - opens.append(open) - unused_opens[open] = open_offset - if left: - opens.append(delim_run) - unused_opens[delim_run] = 0 - continue - - body = delim_run.string[open.end(): delim_run.start()] + try: + next_delim_run = tuple(delim_runs.keys())[tuple(delim_runs.keys()).index(delim_run) + 1] + except IndexError: + next_delim_run = None + + if next_delim_run is None: + # if there is no follow up delimiter run then no point leaving this unused. Process now + pass + elif len(open_syntax) < len(syntax) and ( + # if this run can be an opener, but the next run won't close both of them + (left and not delim_runs[next_delim_run][1]) + # if the next run is not an opener and won't consume this run + and not delim_runs[next_delim_run][0] + ): + pass + elif len(open_syntax) > len(syntax) and ( + # if this run can be an closer, but the next run is not a fresh opener + (right and not delim_runs[next_delim_run][0]) + # if the next run is not a closer + and not delim_runs[next_delim_run][1] + ): + pass + elif delim_runs[next_delim_run][1] and len(open_syntax) == len(next_delim_run.group(1)): + # of the next run is a closer and matches the length of the opener then that is probably + # a better closer than this run - eg: **foo*bar** or *foo**bar* + opens.append(open) + continue + else: + # if there are no unused opens or closes to use up then this is just imbalanced + # mark as unused and leave for later processing + unused_opens[open] = open_offset + opens.append(open) + if left: + unused_opens[delim_run] = 0 + opens.append(delim_run) + else: + unused_closes.append(delim_run) + continue # ensure the body does not cross span borders - if not all( - self.md._tag_is_closed(tag, body) - for tag in re.findall(rf'') - buffer.append(body) - buffer.append('') - else: - tag = 'strong' if length == 2 else 'em' - # prev_open is defined if this closing syntax is closing multiple openers at once - if prev_open: - if len(prev_open_syntax) == 3: - prev_tag = 'strong' if tag == 'em' else 'em' - else: - prev_tag = 'strong' if len(prev_open_syntax) == 2 else 'em' - buffer.append(f'<{prev_tag}>') + tokens.append(delim_run.string[index: open_start]) - if len(prev_open_syntax) == 3: - buffer.append(f'<{tag}>') + span, close_syntax_used_chars = self.process_span(open, delim_run, open_offset, middle) + tokens.extend(span) + if close_syntax_used_chars < len(syntax): + # if we didn't use up the entire closing delimiter mark it as unused + unused_opens[delim_run] = close_syntax_used_chars + opens.append(delim_run) - buffer.append(delim_run.string[prev_open.end(): open.start()]) + # Move index to end of the used delim run + index = delim_run.start() + close_syntax_used_chars - if len(prev_open_syntax) == 3: - buffer.append(f'') - else: - buffer.append(f'<{tag}>') + if index < len(text): + tokens.append(text[index:]) + + text = ''.join(tokens) - buffer.append(body) + return text - if len(prev_open_syntax) != 3: - buffer.append(f'') - buffer.append(f'') - else: - buffer.append(f'<{tag}>') - buffer.append(body) - buffer.append(f'') + def process_span( + self, open: re.Match, close: re.Match, + offset: int, middle: Optional[re.Match] = None + ): + ''' + Args: + open: the match against the opening delimiter run + close: the match against the closing delimiter run + offset: the number of chars from the opening delimiter that should be skipped when processing + middle: an optional delimiter run in the middle of the span + ''' + tokens = [] - # If both syntaxes are equal length then that's easy. 
Remove the open run as it's fully - # processed and consumed, and move on - index = delim_run.end() + open_syntax = open.group(1)[offset:] + middle_syntax = middle.group(1) if middle else '' + close_syntax = close.group(1) - tokens.extend(buffer) + # calculate what em type the inner and outer emphasis is + outer_syntax_length = min(3, min(len(open_syntax), len(close_syntax))) + inner_syntax_length = min(max(len(open_syntax), len(close_syntax)), len(middle_syntax)) if middle else 0 + # add anything from the opening syntax that will not be consumed + # eg: **one* + tokens.append(open_syntax[:-(outer_syntax_length + inner_syntax_length)]) - if index < len(text): - tokens.append(text[index:]) + if outer_syntax_length == 3: + tokens.append('') + else: + tokens.append(f'<{"strong" if outer_syntax_length == 2 else "em"}>') - text = ''.join(tokens) + if middle: + # outer_tag = 'strong' if outer_syntax_length == 2 else 'em' - return text + # if there is a middle em (eg: ***abc*def**) then do some wrangling to figure + # out where to put the opening/closing inner tags depending on the size of the + # opening delim run + inner_tag = 'strong' if len(middle_syntax) == 2 else 'em' + if len(open_syntax) > len(close_syntax): + tokens.append(f'<{inner_tag}>') + + tokens.append(close.string[open.end(): middle.start()]) + + if len(open_syntax) > len(close_syntax): + tokens.append(f'') + else: + tokens.append(f'<{inner_tag}>') + + tokens.append(close.string[middle.end(): close.start()]) + if len(open_syntax) < len(close_syntax): + tokens.append(f'') + else: + # if no middle em then it's easy. Just add the whole text body + tokens.append(close.string[open.end(): close.start()]) + + if outer_syntax_length == 3: + tokens.append('') + else: + tokens.append(f'') + + # figure out how many chars from the closing delimiter we've actually used + close_delim_chars_used = outer_syntax_length + if middle and len(open_syntax) < len(close_syntax): + # if there's a middle part and it's right-aligned then add that on + close_delim_chars_used += inner_syntax_length + + return tokens, close_delim_chars_used + + def delimiter_left_or_right(self, delim_run: re.Match): + run = delim_run.string[max(0, delim_run.start() - 1): delim_run.end() + 1] + syntax = delim_run.group(1) + syntax_re = syntax.replace('*', r'\*') + + left = ( + # not followed by whitespace + re.match(r'.*%s\S' % syntax_re, run, re.S) + and ( + # either not followed by punctuation + re.match(r'.*%s[\s\w]' % syntax_re, run, re.S) + # or followed by punct and preceded by punct/whitespace + or re.match(r'(^|[\s\W])%s([^\s\w]|$)' % syntax_re, run, re.S | re.M) + ) + ) + + right = ( + # not preceded by whitespace + re.match(r'\S%s.*' % syntax_re, run, re.S) + and ( + # either not preceded by punct + re.match(r'[\s\w]%s.*' % syntax_re, run, re.S) + # or preceded by punct and followed by whitespace or punct + or re.match(r'[^\s\w]%s(\s|[^\s\w]|$)' % syntax_re, run, re.S | re.M) + ) + ) + + return left, right + + def body_crosses_span_borders(self, open: re.Match, close: re.Match): + for tag in re.findall(rf' 1 or text.count('_') > 1 From 366ad8cb32687c95d87aaac19364e27b6f92c243 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Fri, 5 Dec 2025 22:29:18 +0000 Subject: [PATCH 4/9] Acheive near full GFM compliance on iab --- lib/markdown2.py | 228 ++++++++++++++++++++++++----------------------- 1 file changed, 115 insertions(+), 113 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 22e51961..9099a693 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ 
-2595,131 +2595,133 @@ class ItalicAndBoldProcessor2(Extra): order = (Stage.ITALIC_AND_BOLD,), tuple() def run(self, text): - for em_type in '*_': - nesting = True - while nesting: - nesting = False - - opens = [] - unused_opens = {} - unused_closes = [] - tokens = [] - index = 0 - - delim_runs = { - delim_run: self.delimiter_left_or_right(delim_run) - for delim_run in re.finditer(r'([%s]+)' % em_type, text) - } + nesting = True + while nesting: + nesting = False + + opens = {'*': [], '_': []} + unused_opens = {'*': {}, '_': {}} + unused_closes = {'*': [], '_': []} + tokens = [] + index = 0 + + delim_runs = { + delim_run: self.delimiter_left_or_right(delim_run) + for delim_run in re.finditer(r'(\*+|_+)', text) + } + + for delim_run, (left, right) in delim_runs.items(): + syntax = delim_run.group(1) + em_type = syntax[0] + + if not (left or right): + continue - for delim_run, (left, right) in delim_runs.items(): - syntax = delim_run.group(1) + if not right or not opens[em_type]: + if left: + opens[em_type].append(delim_run) + continue - if not (left or right): - continue + syntax = delim_run.group(1) - if not right or not opens: - if left: - opens.append(delim_run) - continue + # grab the open run. If it crosses a span, keep looking backwards + while opens[em_type] and self.body_crosses_span_borders(opens[em_type][-1], delim_run): + opens[em_type].pop(-1) + if not opens[em_type]: + continue + open = opens[em_type].pop(-1) + + # if the opening run was joined to a previous closing run (eg: **strong***em*) + # then re-use that previous closing run, but ignore the part that was used to + # close the previous emphasis + open_offset = unused_opens[em_type].pop(open, 0) + open_start = open.start() + open_offset + open_syntax = open.group(1)[open_offset:] + + if open.start() < index: + # this happens with things like `*(**foo**)*`. We process LTR so the strong gets + # processed first (since that has the first closing delimiter). We now have + # `*(foo)*` and now we get round to processing the em. + # It's hard compare the match (against the original text var) to the processed text + # so it's easier to just note down that nesting is detected and re-run the loop + nesting = True + continue - syntax = delim_run.group(1) - - open = opens.pop(-1) - # if the opening run was joined to a previous closing run (eg: **strong***em*) - # then re-use that previous closing run, but ignore the part that was used to - # close the previous emphasis - open_offset = unused_opens.pop(open, 0) - open_start = open.start() + open_offset - open_syntax = open.group(1)[open_offset:] - - if open.start() < index: - # this happens with things like `*(**foo**)*`. We process LTR so the strong gets - # processed first (since that has the first closing delimiter). We now have - # `*(foo)*` and now we get round to processing the em. 
- # It's hard compare the match (against the original text var) to the processed text - # so it's easier to just note down that nesting is detected and re-run the loop - nesting = True - continue + middle = None + + if len(open_syntax) != len(syntax): + if len(open_syntax) < len(syntax) and opens[em_type]: + # since we are detecting a previous open, we are expanding the em span to the left + # so we should check if we're covering additional chars that we don't cross an + # existing span border + if not self.body_crosses_span_borders(opens[em_type][-1], open): + middle = open + + open = opens[em_type].pop(-1) + open_offset = unused_opens[em_type].pop(open, 0) + open_start = open.start() + open_offset + elif len(open_syntax) > len(syntax) and unused_closes[em_type]: + # check if there is a previous closing delim run in the current body + # since this is already within the body we don't need to do a cross-span border check + # as we're not expanding into new ground and that is covered later + middle = next((i for i in unused_closes[em_type] if open.end() < i.start() < delim_run.start()), None) + else: + try: + next_delim_run = tuple(delim_runs.keys())[tuple(delim_runs.keys()).index(delim_run) + 1] + except IndexError: + next_delim_run = None - middle = None - - if len(open_syntax) != len(syntax): - if len(open_syntax) < len(syntax) and opens: - # since we are detecting a previous open, we are expanding the em span to the left - # so we should check if we're covering additional chars that we don't cross an - # existing span border - if not self.body_crosses_span_borders(opens[-1], open): - middle = open - - open = opens.pop(-1) - open_offset = unused_opens.pop(open, 0) - open_start = open.start() + open_offset - elif len(open_syntax) > len(syntax) and unused_closes: - # check if there is a previous closing delim run in the current body - # since this is already within the body we don't need to do a cross-span border check - # as we're not expanding into new ground and that is covered later - middle = next((i for i in unused_closes if open.end() < i.start() < delim_run.start()), None) + if next_delim_run is None: + # if there is no follow up delimiter run then no point leaving this unused. Process now + pass + elif len(open_syntax) < len(syntax) and ( + # if this run can be an opener, but the next run won't close both of them + (left and not delim_runs[next_delim_run][1]) + # if the next run is not an opener and won't consume this run + and not delim_runs[next_delim_run][0] + ): + pass + elif len(open_syntax) > len(syntax) and ( + # if this run can be an closer, but the next run is not a fresh opener + (right and not delim_runs[next_delim_run][0]) + # if the next run is not a closer + and not delim_runs[next_delim_run][1] + ): + pass + elif delim_runs[next_delim_run][1] and len(open_syntax) == len(next_delim_run.group(1)): + # of the next run is a closer and matches the length of the opener then that is probably + # a better closer than this run - eg: **foo*bar** or *foo**bar* + opens[em_type].append(open) + continue else: - try: - next_delim_run = tuple(delim_runs.keys())[tuple(delim_runs.keys()).index(delim_run) + 1] - except IndexError: - next_delim_run = None - - if next_delim_run is None: - # if there is no follow up delimiter run then no point leaving this unused. 
Process now - pass - elif len(open_syntax) < len(syntax) and ( - # if this run can be an opener, but the next run won't close both of them - (left and not delim_runs[next_delim_run][1]) - # if the next run is not an opener and won't consume this run - and not delim_runs[next_delim_run][0] - ): - pass - elif len(open_syntax) > len(syntax) and ( - # if this run can be an closer, but the next run is not a fresh opener - (right and not delim_runs[next_delim_run][0]) - # if the next run is not a closer - and not delim_runs[next_delim_run][1] - ): - pass - elif delim_runs[next_delim_run][1] and len(open_syntax) == len(next_delim_run.group(1)): - # of the next run is a closer and matches the length of the opener then that is probably - # a better closer than this run - eg: **foo*bar** or *foo**bar* - opens.append(open) - continue + # if there are no unused opens or closes to use up then this is just imbalanced + # mark as unused and leave for later processing + unused_opens[em_type][open] = open_offset + opens[em_type].append(open) + if left: + unused_opens[em_type][delim_run] = 0 + opens[em_type].append(delim_run) else: - # if there are no unused opens or closes to use up then this is just imbalanced - # mark as unused and leave for later processing - unused_opens[open] = open_offset - opens.append(open) - if left: - unused_opens[delim_run] = 0 - opens.append(delim_run) - else: - unused_closes.append(delim_run) - continue - - # ensure the body does not cross span borders - if self.body_crosses_span_borders(open, delim_run): - continue + unused_closes[em_type].append(delim_run) + continue - # add all the text leading up to the opening delimiter - tokens.append(delim_run.string[index: open_start]) + # add all the text leading up to the opening delimiter + tokens.append(delim_run.string[index: open_start]) - span, close_syntax_used_chars = self.process_span(open, delim_run, open_offset, middle) - tokens.extend(span) - if close_syntax_used_chars < len(syntax): - # if we didn't use up the entire closing delimiter mark it as unused - unused_opens[delim_run] = close_syntax_used_chars - opens.append(delim_run) + span, close_syntax_used_chars = self.process_span(open, delim_run, open_offset, middle) + tokens.extend(span) + if close_syntax_used_chars < len(syntax): + # if we didn't use up the entire closing delimiter mark it as unused + unused_opens[em_type][delim_run] = close_syntax_used_chars + opens[em_type].append(delim_run) - # Move index to end of the used delim run - index = delim_run.start() + close_syntax_used_chars + # Move index to end of the used delim run + index = delim_run.start() + close_syntax_used_chars - if index < len(text): - tokens.append(text[index:]) + if index < len(text): + tokens.append(text[index:]) - text = ''.join(tokens) + text = ''.join(tokens) return text From e8e7ced7feea5d2cea53f4ffc19aa7d0dda6c6b6 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Sun, 7 Dec 2025 16:01:27 +0000 Subject: [PATCH 5/9] Acheive near complete GFM compliance --- lib/markdown2.py | 69 +++----- test/tm-cases/gfm_emphasis.html | 261 ++++++++++++++++++++++++++++ test/tm-cases/gfm_emphasis.text | 260 +++++++++++++++++++++++++++ test/tm-cases/hash_html_blocks.html | 3 - 4 files changed, 544 insertions(+), 49 deletions(-) create mode 100644 test/tm-cases/gfm_emphasis.html create mode 100644 test/tm-cases/gfm_emphasis.text diff --git a/lib/markdown2.py b/lib/markdown2.py index 9099a693..95d5a405 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -2046,35 +2046,7 @@ def _encode_code(self, text: str) -> 
str: @mark_stage(Stage.ITALIC_AND_BOLD) def _do_italics_and_bold(self, text: str) -> str: - def sub(match: re.Match): - ''' - regex sub function that checks that the match isn't matching across spans. - The span shouldn't be across a closing or opening HTML tag, although spans within - the span is acceptable. - ''' - contents: str = match.group(2) - # the strong re also checks for leading em chars, so the match may cover some additional text - prefix = match.string[match.start(): match.regs[1][0]] - # look for all possible span HTML tags - for tag in re.findall(rf'abcdef_`, which is across 2 spans - close_index = contents.find(f'{contents}' - - # must go first: - # text = self._strong_re.sub(sub, text) - # text = self._em_re.sub(sub, text) - iab = ItalicAndBoldProcessor2(self, None) + iab = GFMItalicAndBoldProcessor(self, None) if iab.test(text): text = iab.run(text) return text @@ -2590,8 +2562,8 @@ def test(self, text): return self.hash_table and re.search(r'md5-[0-9a-z]{32}', text) -class ItalicAndBoldProcessor2(Extra): - name = 'iabp-2' +class GFMItalicAndBoldProcessor(Extra): + name = 'gfm-italic-and-bold-processor' order = (Stage.ITALIC_AND_BOLD,), tuple() def run(self, text): @@ -2649,6 +2621,7 @@ def run(self, text): middle = None + # if the delimiter runs don't match then we need to figure out how to resolve this if len(open_syntax) != len(syntax): if len(open_syntax) < len(syntax) and opens[em_type]: # since we are detecting a previous open, we are expanding the em span to the left @@ -2659,7 +2632,12 @@ def run(self, text): open = opens[em_type].pop(-1) open_offset = unused_opens[em_type].pop(open, 0) + open_syntax = open.group(1)[open_offset:] open_start = open.start() + open_offset + + if len(open_syntax) == len(syntax): + # if it turns out the previous open is a perfect match then ignore the middle part + middle = None elif len(open_syntax) > len(syntax) and unused_closes[em_type]: # check if there is a previous closing delim run in the current body # since this is already within the body we don't need to do a cross-span border check @@ -2676,7 +2654,10 @@ def run(self, text): pass elif len(open_syntax) < len(syntax) and ( # if this run can be an opener, but the next run won't close both of them - (left and not delim_runs[next_delim_run][1]) + (left and ( + not delim_runs[next_delim_run][1] + or len(next_delim_run.group(1)) < len(open_syntax) + len(syntax) + )) # if the next run is not an opener and won't consume this run and not delim_runs[next_delim_run][0] ): @@ -2688,11 +2669,10 @@ def run(self, text): and not delim_runs[next_delim_run][1] ): pass - elif delim_runs[next_delim_run][1] and len(open_syntax) == len(next_delim_run.group(1)): - # of the next run is a closer and matches the length of the opener then that is probably - # a better closer than this run - eg: **foo*bar** or *foo**bar* - opens[em_type].append(open) - continue + elif len(open_syntax) < len(syntax) and len(syntax) >= 3: + # if closing syntax is bigger and its >= three long then focus on closing any + # open em spans + pass else: # if there are no unused opens or closes to use up then this is just imbalanced # mark as unused and leave for later processing @@ -2743,16 +2723,16 @@ def process_span( close_syntax = close.group(1) # calculate what em type the inner and outer emphasis is - outer_syntax_length = min(3, min(len(open_syntax), len(close_syntax))) + outer_syntax_length = min(len(open_syntax), len(close_syntax)) inner_syntax_length = min(max(len(open_syntax), len(close_syntax)), len(middle_syntax)) 
if middle else 0 # add anything from the opening syntax that will not be consumed # eg: **one* tokens.append(open_syntax[:-(outer_syntax_length + inner_syntax_length)]) - if outer_syntax_length == 3: - tokens.append('') - else: - tokens.append(f'<{"strong" if outer_syntax_length == 2 else "em"}>') + tags = [] + tags += [''] * (outer_syntax_length % 2) + tags += [''] * (outer_syntax_length // 2) + tokens.append(''.join(tags)) if middle: # outer_tag = 'strong' if outer_syntax_length == 2 else 'em' @@ -2779,10 +2759,7 @@ def process_span( # if no middle em then it's easy. Just add the whole text body tokens.append(close.string[open.end(): close.start()]) - if outer_syntax_length == 3: - tokens.append('') - else: - tokens.append(f'') + tokens.append(''.join(reversed(tags)).replace('<', 'foo bar

+ +

a * foo bar*

+ +

a*"foo"*

+ +
    +
  • a *
  • +
+ +

foobar

+ +

5678

+ +

пристанямстремятся

+ +

aa_"bb"_cc

+ +

foo-(bar)

+ +

_foo*

+ +

*foo bar *

+ +

*foo bar +*

+ +

*(*foo)

+ +

(foo)

+ +

foobar

+ +

_foo bar _

+ +

_(_foo)

+ +

(foo)

+ +

foobar

+ +

пристанямстремятся

+ +

foobarbaz

+ +

(bar).

+ +

foo bar

+ +

** foo bar**

+ +

a**"foo"**

+ +

foobar

+ +

foo bar

+ +

__ foo bar__

+ +

__ +foo bar__

+ +

a__"foo"__

+ +

foobar

+ +

5678

+ +

пристанямстремятся

+ +

foo, bar, baz

+ +

foo-(bar)

+ +

**foo bar **

+ +

**(**foo)

+ +

(foo)

+ +

Gomphocarpus (Gomphocarpus physocarpus, syn. +Asclepias physocarpa)

+ +

foo "bar" foo

+ +

foobar

+ +

__foo bar __

+ +

__(__foo)

+ +

(foo)

+ +

foobar

+ +

пристанямстремятся

+ +

foobarbaz

+ +

(bar).

+ +

foo bar

+ +

foo +bar

+ +

foo bar baz

+ +

foo bar baz

+ +

foo bar

+ +

foo bar

+ +

foo bar baz

+ +

foobarbaz

+ +

foobar

+ +

foo bar

+ +

foo bar

+ +

foobar

+ +

foobarbaz

+ +

foobar***baz

+ +

foo bar baz bim bop

+ +

foo bar

+ +

** is not an empty emphasis

+ +

**** is not an empty strong emphasis

+ +

foo bar

+ +

foo +bar

+ +

foo bar baz

+ +

foo bar baz

+ +

foo bar

+ +

foo bar

+ +

foo bar baz

+ +

foobarbaz

+ +

foo bar

+ +

foo bar

+ +

foo bar baz +bim bop

+ +

foo bar

+ +

__ is not an empty emphasis

+ +

____ is not an empty strong emphasis

+ +

foo ***

+ +

foo *

+ +

foo _

+ +

foo *****

+ +

foo *

+ +

foo _

+ +

*foo

+ +

foo*

+ +

*foo

+ +

***foo

+ +

foo*

+ +

foo***

+ +

foo ___

+ +

foo _

+ +

foo *

+ +

foo _____

+ +

foo _

+ +

foo *

+ +

_foo

+ +

foo_

+ +

_foo

+ +

___foo

+ +

foo_

+ +

foo___

+ +

foo

+ +

foo

+ +

foo

+ +

foo

+ +

foo

+ +

foo

+ +

foo

+ +

foo

+ +

foo

+ +

foo _bar baz_

+ +

foo bar *baz bim bam

+ +

**foo bar baz

+ +

*foo bar baz

+ +

*bar*

+ +

_foo bar_

+ +

*

+ +

**

+ +

__

+ +

a *

+ +

a _

+ +

**ahttp://foo.bar/?q=**

+ +

__ahttp://foo.bar/?q=__

+ +

foo*bar

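[Editorial note] The gfm_emphasis.html fixtures above track the GFM spec's emphasis examples one-to-one. A quick spot-check through the public API — a sketch assuming this branch is installed; the expected strings are the GFM-spec outputs the fixtures encode:

```python
import markdown2

md = markdown2.Markdown()
cases = {
    '*foo bar*': '<p><em>foo bar</em></p>',
    '*foo **bar** baz*': '<p><em>foo <strong>bar</strong> baz</em></p>',
    '***foo** bar*': '<p><em><strong>foo</strong> bar</em></p>',
}
for source, expected in cases.items():
    # markdown2 appends trailing newlines; strip before comparing
    assert md.convert(source).strip() == expected, source
```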
diff --git a/test/tm-cases/gfm_emphasis.text b/test/tm-cases/gfm_emphasis.text new file mode 100644 index 00000000..7b88c80c --- /dev/null +++ b/test/tm-cases/gfm_emphasis.text @@ -0,0 +1,260 @@ +*foo bar* + +a * foo bar* + +a*"foo"* + +* a * + +foo*bar* + +5*6*78 + +пристаням_стремятся_ + +aa_"bb"_cc + +foo-_(bar)_ + +_foo* + +*foo bar * + +*foo bar +* + +*(*foo) + +*(*foo*)* + +*foo*bar + +_foo bar _ + +_(_foo) + +_(_foo_)_ + +_foo_bar + +_пристаням_стремятся + +_foo_bar_baz_ + +_(bar)_. + +**foo bar** + +** foo bar** + +a**"foo"** + +foo**bar** + +__foo bar__ + +__ foo bar__ + +__ +foo bar__ + +a__"foo"__ + +foo__bar__ + +5__6__78 + +пристаням__стремятся__ + +__foo, __bar__, baz__ + +foo-__(bar)__ + +**foo bar ** + +**(**foo) + +*(**foo**)* + +**Gomphocarpus (*Gomphocarpus physocarpus*, syn. +*Asclepias physocarpa*)** + +**foo "*bar*" foo** + +**foo**bar + +__foo bar __ + +__(__foo) + +_(__foo__)_ + +__foo__bar + +__пристаням__стремятся + +__foo__bar__baz__ + +__(bar)__. + +*foo [bar](/url)* + +*foo +bar* + +_foo __bar__ baz_ + +_foo _bar_ baz_ + +__foo_ bar_ + +*foo *bar** + +*foo **bar** baz* + +*foo**bar**baz* + +*foo**bar* + +***foo** bar* + +*foo **bar*** + +*foo**bar*** + +foo***bar***baz + +foo******bar*********baz + +*foo **bar *baz* bim** bop* + +*foo [*bar*](/url)* + + +** is not an empty emphasis + +**** is not an empty strong emphasis + +**foo [bar](/url)** + +**foo +bar** + +__foo _bar_ baz__ + +__foo __bar__ baz__ + +____foo__ bar__ + +**foo **bar**** + +**foo *bar* baz** + +**foo*bar*baz** + +***foo* bar** + +**foo *bar*** + +**foo *bar **baz** +bim* bop** + +**foo [*bar*](/url)** + +__ is not an empty emphasis + +____ is not an empty strong emphasis + +foo *** + +foo *\** + +foo *_* + +foo ***** + +foo **\*** + +foo **_** + +**foo* + +*foo** + +***foo** + +****foo* + +**foo*** + +*foo**** + +foo ___ + +foo _\__ + +foo _*_ + +foo _____ + +foo __\___ + +foo __*__ + +__foo_ + +_foo__ + +___foo__ + +____foo_ + +__foo___ + +_foo____ + +**foo** + +*_foo_* + +__foo__ + +_*foo*_ + +****foo**** + +____foo____ + +******foo****** + +***foo*** + +_____foo_____ + +*foo _bar* baz_ + +*foo __bar *baz bim__ bam* + +**foo **bar baz** + +*foo *bar baz* + +*[bar*](/url) + +_foo [bar_](/url) + +* + +** + +__ + +*a `*`* + +_a `_`_ + +**a + +__a + +**foo*bar** \ No newline at end of file diff --git a/test/tm-cases/hash_html_blocks.html b/test/tm-cases/hash_html_blocks.html index 310fe3da..f4a20b0f 100644 --- a/test/tm-cases/hash_html_blocks.html +++ b/test/tm-cases/hash_html_blocks.html @@ -1,9 +1,6 @@

Archons of the Colophon

- -

by Paco Xander Nathan

-
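[Editorial note] The next patch rebases CodeFriendly and MiddleWordEm onto the new GFM processor. Both lean on markdown2's placeholder-hashing trick: literal delimiters are swapped for a unique `md5-`-prefixed token before the emphasis stage runs, then swapped back afterwards. A rough sketch of the round trip — the library's real `_hash_text` is salted, so this stand-in only reproduces the `md5-` + 32-hex token shape that the `test()` methods grep for:

```python
from hashlib import sha256

def _hash_text(s: str) -> str:
    # stand-in for markdown2's salted helper; only the token shape matters here
    return 'md5-' + sha256(s.encode('utf-8')).hexdigest()[:32]

HASH_TABLE = {
    _hash_text('code-friendly__'): '__',
    _hash_text('code-friendly_'): '_',
}

def protect(text: str) -> str:
    # hide literal underscores so the italics/bold stage cannot consume them
    text = text.replace('__', _hash_text('code-friendly__'))
    return text.replace('_', _hash_text('code-friendly_'))

def restore(text: str) -> str:
    # undo the substitution once the emphasis stage has run
    for token, literal in HASH_TABLE.items():
        text = text.replace(token, literal)
    return text

assert restore(protect('a_b__c')) == 'a_b__c'
```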
From 5988b0970137715d6f66cab766b74577ab742c43 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Sun, 7 Dec 2025 17:14:43 +0000 Subject: [PATCH 6/9] Refactor inheritants of original IABP to use new GFM variant. Also refactor the GFM class to be more readable --- lib/markdown2.py | 290 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 206 insertions(+), 84 deletions(-) diff --git a/lib/markdown2.py b/lib/markdown2.py index 95d5a405..c4c2813f 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -2563,6 +2563,10 @@ def test(self, text): class GFMItalicAndBoldProcessor(Extra): + ''' + An upgraded version of the `ItalicAndBoldProcessor` that covers far more edge cases and gets close + to Github Flavoured Markdown compliance. + ''' name = 'gfm-italic-and-bold-processor' order = (Stage.ITALIC_AND_BOLD,), tuple() @@ -2572,24 +2576,39 @@ def run(self, text): nesting = False opens = {'*': [], '_': []} + '''Mapping of em type to a list of opening runs of that em type''' unused_opens = {'*': {}, '_': {}} + ''' + Mapping of em type to another mapping of unused opening runs of that em type. + An unused run is one that has been skipped, or only partially consumed (eg: **foo*) and + could be consumed by another closing run. The inner mapping is a mapping of the + delimiter run to an offset number, which is the number of characters from that run that + have been consumed so far + ''' unused_closes = {'*': [], '_': []} + ''' + Mapping of em type to a list of closing delimiter runs that have not been fully consumed. + EG: *foo*bar* + ''' tokens = [] + '''List of processed spans of text that will be joined to form the new `text`''' index = 0 + '''Number of chars of `text` that has been processed so far''' - delim_runs = { - delim_run: self.delimiter_left_or_right(delim_run) - for delim_run in re.finditer(r'(\*+|_+)', text) - } + # do a quick scan for all delimiter runs, filtering for those that can open/close emphasis + delim_runs = OrderedDict() + for delim_run in re.finditer(r'(\*+|_+)', text): + left, right = self.delimiter_left_or_right(delim_run) + if left or right: + delim_runs[delim_run] = (left, right) for delim_run, (left, right) in delim_runs.items(): syntax = delim_run.group(1) em_type = syntax[0] - if not (left or right): - continue - + # if not a closing run, or there are no opens to consume if not right or not opens[em_type]: + # if it can also be an opening run if left: opens[em_type].append(delim_run) continue @@ -2623,75 +2642,42 @@ def run(self, text): # if the delimiter runs don't match then we need to figure out how to resolve this if len(open_syntax) != len(syntax): - if len(open_syntax) < len(syntax) and opens[em_type]: - # since we are detecting a previous open, we are expanding the em span to the left - # so we should check if we're covering additional chars that we don't cross an - # existing span border - if not self.body_crosses_span_borders(opens[em_type][-1], open): - middle = open - - open = opens[em_type].pop(-1) + has_middle = self.has_middle( + open, delim_run, opens[em_type], + unused_opens[em_type], unused_closes[em_type] + ) + + if has_middle is not False: + middle = has_middle[1] + if has_middle[0] != open: + # only re-assign and re-calc opening offsets if that run HAS changed + open = has_middle[0] open_offset = unused_opens[em_type].pop(open, 0) open_syntax = open.group(1)[open_offset:] open_start = open.start() + open_offset - - if len(open_syntax) == len(syntax): - # if it turns out the previous open is a perfect match then ignore the middle part - middle = 
None - elif len(open_syntax) > len(syntax) and unused_closes[em_type]: - # check if there is a previous closing delim run in the current body - # since this is already within the body we don't need to do a cross-span border check - # as we're not expanding into new ground and that is covered later - middle = next((i for i in unused_closes[em_type] if open.end() < i.start() < delim_run.start()), None) - else: - try: - next_delim_run = tuple(delim_runs.keys())[tuple(delim_runs.keys()).index(delim_run) + 1] - except IndexError: - next_delim_run = None - - if next_delim_run is None: - # if there is no follow up delimiter run then no point leaving this unused. Process now - pass - elif len(open_syntax) < len(syntax) and ( - # if this run can be an opener, but the next run won't close both of them - (left and ( - not delim_runs[next_delim_run][1] - or len(next_delim_run.group(1)) < len(open_syntax) + len(syntax) - )) - # if the next run is not an opener and won't consume this run - and not delim_runs[next_delim_run][0] - ): - pass - elif len(open_syntax) > len(syntax) and ( - # if this run can be an closer, but the next run is not a fresh opener - (right and not delim_runs[next_delim_run][0]) - # if the next run is not a closer - and not delim_runs[next_delim_run][1] - ): - pass - elif len(open_syntax) < len(syntax) and len(syntax) >= 3: - # if closing syntax is bigger and its >= three long then focus on closing any - # open em spans - pass + elif not self.should_process_imbalanced_delimiter_runs( + open, delim_run, delim_runs, unused_opens[em_type] + ): + # if we shouldn't process them now, save these opens for a future pass + unused_opens[em_type][open] = open_offset + opens[em_type].append(open) + if left: + unused_opens[em_type][delim_run] = 0 + opens[em_type].append(delim_run) else: - # if there are no unused opens or closes to use up then this is just imbalanced - # mark as unused and leave for later processing - unused_opens[em_type][open] = open_offset - opens[em_type].append(open) - if left: - unused_opens[em_type][delim_run] = 0 - opens[em_type].append(delim_run) - else: - unused_closes[em_type].append(delim_run) - continue + unused_closes[em_type].append(delim_run) + continue # add all the text leading up to the opening delimiter tokens.append(delim_run.string[index: open_start]) span, close_syntax_used_chars = self.process_span(open, delim_run, open_offset, middle) tokens.extend(span) - if close_syntax_used_chars < len(syntax): - # if we didn't use up the entire closing delimiter mark it as unused + + if close_syntax_used_chars is None: + close_syntax_used_chars = len(syntax) + elif close_syntax_used_chars < len(syntax): + # if we didn't use up the entire closing delimiter, mark it as unused unused_opens[em_type][delim_run] = close_syntax_used_chars opens[em_type].append(delim_run) @@ -2708,13 +2694,17 @@ def run(self, text): def process_span( self, open: re.Match, close: re.Match, offset: int, middle: Optional[re.Match] = None - ): + ) -> Tuple[List[str], Optional[int]]: ''' Args: open: the match against the opening delimiter run close: the match against the closing delimiter run offset: the number of chars from the opening delimiter that should be skipped when processing middle: an optional delimiter run in the middle of the span + + Returns: + A list of processed tokens, and then the number of chars from the closing syntax that were + consumed. 
If the latter item is None, then assume all chars were consumed ''' tokens = [] @@ -2769,6 +2759,108 @@ def process_span( return tokens, close_delim_chars_used + def has_middle( + self, open: re.Match, close: re.Match, opens: List[re.Match], + unused_opens: Dict[re.Match, int], unused_closes: List[re.Match] + ) -> Union[Tuple[re.Match, Optional[re.Match]], Literal[False]]: + ''' + Check if an emphasis span has a middle delimiter run, which may change the outer tags + + Args: + open: the current opening delimiter run + close: the closing delimiter run + opens: a list of all opening delimiter runs in the text + unused_opens: a mapping of unused opens within the text to their offset values + unused_closes: a list of unused closes within the text + + Returns: + False if there is no middle run. Otherwise, a tuple of the new opening run and the optional + middle span. The middle span may be None if it is invalid + ''' + open_offset = unused_opens.get(open, 0) + open_syntax = open.group(1)[open_offset:] + + syntax = close.group(1) + + if len(open_syntax) < len(syntax) and opens: + # expand the em span to the left, meaning we're covering additional chars. + # check we don't cross an existing span border + if not self.body_crosses_span_borders(opens[-1], open): + middle = open + + open = opens.pop(-1) + open_offset = unused_opens.pop(open, 0) + open_syntax = open.group(1)[open_offset:] + + if len(open_syntax) == len(syntax): + # if it turns out the previous open is a perfect match then ignore the middle part + # eg: **foo*bar** + middle = None + elif len(open_syntax) > len(syntax) and unused_closes: + # check if there is a previous closing delim run in the current body + # since this is already within the body we don't need to do a cross-span border check + # as we're not expanding into new ground and that is covered later + middle = next((i for i in unused_closes if open.end() < i.start() < close.start()), None) + else: + return False + + return open, middle + + def should_process_imbalanced_delimiter_runs( + self, open: re.Match, close: re.Match, + delim_runs: Dict[re.Match, Tuple[bool, bool]], + unused_opens: Dict[re.Match, int] + ): + ''' + Check if an imbalanced delimiter run should be consumed now, or left for a later pass + + Args: + open: the opening delimiter run + close: the closing delimiter run + delim_runs: a mapping of all of the delimiter runs in the text to a tuple of whether + they are opening or closing runs + unused_opens: a mapping of unused opens within the text to their offset values + ''' + open_offset = unused_opens.get(open, 0) + open_syntax = open.group(1)[open_offset:] + + syntax = close.group(1) + left, right = delim_runs[close] + + if len(open_syntax) < len(syntax) and len(syntax) >= 3: + # if closing syntax is bigger and its >= three long then focus on closing any + # open em spans + return True + + try: + next_delim_run = tuple(delim_runs.keys())[tuple(delim_runs.keys()).index(close) + 1] + except IndexError: + # if there is no follow up delimiter run then no point leaving this unused. 
Process now + return True + + if len(open_syntax) < len(syntax) and ( + # if this run can be an opener, but the next run won't close both of them + (left and ( + not delim_runs[next_delim_run][1] + or len(next_delim_run.group(1)) < len(open_syntax) + len(syntax) + )) + # if the next run is not an opener and won't consume this run + and not delim_runs[next_delim_run][0] + ): + return True + + if len(open_syntax) > len(syntax) and ( + # if this run can be a closer, but the next run is not a fresh opener + (right and not delim_runs[next_delim_run][0]) + # if the next run is not a closer + and not delim_runs[next_delim_run][1] + ): + return True + + # if there are no unused opens or closes to use up then this is just imbalanced. + # mark as unused and leave for later processing + return False + def delimiter_left_or_right(self, delim_run: re.Match): run = delim_run.string[max(0, delim_run.start() - 1): delim_run.end() + 1] syntax = delim_run.group(1) @@ -3269,37 +3361,60 @@ def run(self, text): return text -class CodeFriendly(ItalicAndBoldProcessor): +class CodeFriendly(GFMItalicAndBoldProcessor): ''' Disable _ and __ for em and strong. ''' name = 'code-friendly' + order = (Stage.ITALIC_AND_BOLD,), (Stage.ITALIC_AND_BOLD,) def __init__(self, md, options): super().__init__(md, options) # add a prefix to it so we don't interfere with escaped/hashed chars from other stages - self.hash_table[_hash_text(self.name + '_')] = '_' - self.hash_table[_hash_text(self.name + '__')] = '__' + self.hash_table = { + _hash_text(self.name + '_'): '_', + _hash_text(self.name + '__'): '__' + } - def sub(self, match: re.Match) -> str: - syntax = match.group(1) - # use match.regs because strong/em regex may include preceding text in the match as well - text: str = match.string[match.regs[1][0]: match.end()] - if '_' in syntax: + def run(self, text): + if self.md.order < Stage.ITALIC_AND_BOLD: + text = super().run(text) + else: + orig_text = '' + while orig_text != text: + orig_text = text + for key, substr in self.hash_table.items(): + text = text.replace(key, substr) + return text + + def process_span(self, open: re.Match, close: re.Match, offset: int, middle: re.Match | None = None): + text = open.string[open.start(): close.end()] + open_syntax = open.group(1)[offset:] + close_syntax = close.group(1) + + if len(open_syntax) > 2 or open_syntax != close_syntax: + return [text], None + + if '_' in open_syntax: # if using _this_ syntax, hash it to avoid processing, but don't hash the contents incase of nested syntax - text = text.replace(syntax, _hash_text(self.name + syntax)) - return text + text = text.replace(open_syntax, _hash_text(self.name + open_syntax)) + return [text], None elif '_' in text: # if the text within the bold/em markers contains '_' then hash those chars to protect them from em_re text = ( - text[len(syntax): -len(syntax)] + text[len(open_syntax): -len(close_syntax)] .replace('__', _hash_text(self.name + '__')) .replace('_', _hash_text(self.name + '_')) ) - return syntax + text + syntax - # if no underscores are present, the text is fine and we can just leave it alone - return super().sub(match) + return [open_syntax, text, close_syntax], None + + return super().process_span(open, close, offset, middle) + + def test(self, text: str): + return super().test(text) or ( + self.hash_table and re.search(r'md5-[0-9a-z]{32}', text) + ) class FencedCodeBlocks(Extra): @@ -3623,7 +3738,7 @@ def tags(self, lexer_name): return super().tags(lexer_name) -class MiddleWordEm(ItalicAndBoldProcessor): +class 
MiddleWordEm(GFMItalicAndBoldProcessor): ''' Allows or disallows emphasis syntax in the middle of words, defaulting to allow. Disabling this means that `this_text_here` will not be @@ -3666,8 +3781,10 @@ def __init__(self, md: Markdown, options: Union[dict, bool, None]): ) # add a prefix to it so we don't interfere with escaped/hashed chars from other stages - self.hash_table['_'] = _hash_text(self.name + '_') - self.hash_table['*'] = _hash_text(self.name + '*') + self.hash_table = { + '_': _hash_text(self.name + '_'), + '*': _hash_text(self.name + '*') + } def run(self, text): if self.options['allowed']: @@ -3692,6 +3809,11 @@ def sub(self, match: re.Match): syntax = match.group(1) return self.hash_table[syntax] + def test(self, text: str): + return super().test(text) or ( + self.hash_table and re.search(r'md5-[0-9a-z]{32}', text) + ) + class Numbering(Extra): ''' From 060d48da4c5cf7188968b9954fddac9d3cc3b748 Mon Sep 17 00:00:00 2001 From: Crozzers Date: Sun, 7 Dec 2025 17:22:43 +0000 Subject: [PATCH 7/9] Add issues 645, 652, 653 and 654 to gfm test case --- test/tm-cases/gfm_emphasis.html | 8 ++++++++ test/tm-cases/gfm_emphasis.text | 10 +++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/test/tm-cases/gfm_emphasis.html b/test/tm-cases/gfm_emphasis.html index 9078f0f9..0ed679d6 100644 --- a/test/tm-cases/gfm_emphasis.html +++ b/test/tm-cases/gfm_emphasis.html @@ -259,3 +259,11 @@
 <p>__a<a href="http://foo.bar/?q=__">http://foo.bar/?q=__</a></p>
 
 <p><strong>foo*bar</strong></p>
+
+<p>_foo <strong>bar</strong> baz._bim</p>
+
+<p><strong>__foo</strong> bar <strong>__baz</strong> bim <em>bam</em></p>
+
+<p><strong>foo<em>bar</em></strong></p>
+
+<p><strong><em>foo</em>bar</strong></p>
diff --git a/test/tm-cases/gfm_emphasis.text b/test/tm-cases/gfm_emphasis.text
index 7b88c80c..43e01891 100644
--- a/test/tm-cases/gfm_emphasis.text
+++ b/test/tm-cases/gfm_emphasis.text
@@ -257,4 +257,12 @@ _a `_`_
 
 __a<http://foo.bar/?q=__>
 
-**foo*bar**
\ No newline at end of file
+**foo*bar**
+
+_foo **bar** baz._bim
+
+**__foo** bar **__baz** bim *bam*
+
+**foo*bar***
+
+***foo*bar**
\ No newline at end of file
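
Reviewer note: the speed-up in the next commit is plain memoisation. The
flanking checks are pure functions of a small window of text around the
delimiter run, so they can be keyed on strings with functools.lru_cache
(re.Match objects would never produce cache hits, since every match object is
distinct). A standalone sketch of the pattern, using simplified stand-in
regexes rather than the patch's exact code:

    import functools
    import re

    @functools.lru_cache(maxsize=512)
    def classify_run(window: str, syntax: str):
        # pure function of (window, syntax): identical windows share one result
        syntax_re = re.escape(syntax)
        left = bool(re.match(r'.*%s\S' % syntax_re, window, re.S))
        right = bool(re.match(r'\S%s.*' % syntax_re, window, re.S))
        return left, right

    # a ReDoS-style input repeats the same tiny window thousands of times,
    # so after the first couple of calls every lookup is a cache hit
    text = '*a' * 10000
    for m in re.finditer(r'\*+', text):
        window = text[max(0, m.start() - 1): m.end() + 1]
        classify_run(window, m.group())

    print(classify_run.cache_info())  # hits vastly outnumber misses
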
From 3b19616b3a1bc9e83e3414b629d8258d70ebc233 Mon Sep 17 00:00:00 2001
From: Crozzers
Date: Sun, 7 Dec 2025 19:13:56 +0000
Subject: [PATCH 8/9] Improve performance in repetitive (ReDoS) scenarios by
 caching some IAB internal functions

---
 lib/markdown2.py | 102 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 73 insertions(+), 29 deletions(-)

diff --git a/lib/markdown2.py b/lib/markdown2.py
index c4c2813f..4197ae03 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -121,7 +121,7 @@
 from collections import defaultdict, OrderedDict
 from abc import ABC, abstractmethod
 import functools
-from collections.abc import Iterable
+from collections.abc import Iterable, Iterator
 from hashlib import sha256
 from random import random
 from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Type, TypedDict, Union
@@ -2044,11 +2044,13 @@ def _encode_code(self, text: str) -> str:
     )
 
     _em_re = re.compile(r"(\*|_)(?=\S)(.*?\S)\1", re.S)
+    _iab_processor = None
 
     @mark_stage(Stage.ITALIC_AND_BOLD)
     def _do_italics_and_bold(self, text: str) -> str:
-        iab = GFMItalicAndBoldProcessor(self, None)
-        if iab.test(text):
-            text = iab.run(text)
+        if not self._iab_processor:
+            self._iab_processor = GFMItalicAndBoldProcessor(self, None)
+        if self._iab_processor.test(text):
+            text = self._iab_processor.run(text)
         return text
 
     _block_quote_base = r'''
@@ -2595,14 +2597,13 @@ def run(self, text):
             index = 0
             '''Number of chars of `text` that has been processed so far'''
 
-            # do a quick scan for all delimiter runs, filtering for those that can open/close emphasis
-            delim_runs = OrderedDict()
-            for delim_run in re.finditer(r'(\*+|_+)', text):
-                left, right = self.delimiter_left_or_right(delim_run)
-                if left or right:
-                    delim_runs[delim_run] = (left, right)
+            delim_runs_iter = re.finditer(r'(\*+|_+)', text)
+            next_delim_run = self._next_run(delim_runs_iter)
+
+            while next_delim_run:
+                delim_run, left, right = next_delim_run
+                next_delim_run = self._next_run(delim_runs_iter)
 
-            for delim_run, (left, right) in delim_runs.items():
                 syntax = delim_run.group(1)
                 em_type = syntax[0]
 
@@ -2656,7 +2657,7 @@ def run(self, text):
                     open_syntax = open.group(1)[open_offset:]
                     open_start = open.start() + open_offset
                 elif not self.should_process_imbalanced_delimiter_runs(
-                    open, delim_run, delim_runs, unused_opens[em_type]
+                    open, delim_run, unused_opens[em_type], next_delim_run
                 ):
                     # if we shouldn't process them now, save these opens for a future pass
                     unused_opens[em_type][open] = open_offset
@@ -2808,8 +2809,8 @@ def has_middle(
     def should_process_imbalanced_delimiter_runs(
         self, open: re.Match, close: re.Match,
-        delim_runs: Dict[re.Match, Tuple[bool, bool]],
-        unused_opens: Dict[re.Match, int]
+        unused_opens: Dict[re.Match, int],
+        next_delim_run: Optional[Tuple[re.Match, Optional[re.Match], Optional[re.Match]]] = None
     ):
         '''
         Check if an imbalanced delimiter run should be consumed now, or left for a later pass
@@ -2817,43 +2818,39 @@ def should_process_imbalanced_delimiter_runs(
         Args:
             open: the opening delimiter run
             close: the closing delimiter run
-            delim_runs: a mapping of all of the delimiter runs in the text to a tuple of whether
-                they are opening or closing runs
             unused_opens: a mapping of unused opens within the text to their offset values
+            next_delim_run: the next delimiter run after the closing run
         '''
         open_offset = unused_opens.get(open, 0)
         open_syntax = open.group(1)[open_offset:]
         syntax = close.group(1)
-        left, right = delim_runs[close]
+        left, right = self.delimiter_left_or_right(close)
 
         if len(open_syntax) < len(syntax) and len(syntax) >= 3:
             # if the closing syntax is bigger and it's at least three chars long then focus on
             # closing any open em spans
             return True
 
-        try:
-            next_delim_run = tuple(delim_runs.keys())[tuple(delim_runs.keys()).index(close) + 1]
-        except IndexError:
-            # if there is no follow up delimiter run then no point leaving this unused. Process now
+        if next_delim_run is None:
             return True
 
         if len(open_syntax) < len(syntax) and (
             # if this run can be an opener, but the next run won't close both of them
             (left and (
-                not delim_runs[next_delim_run][1]
-                or len(next_delim_run.group(1)) < len(open_syntax) + len(syntax)
+                not next_delim_run[2]
+                or len(next_delim_run[0].group(1)) < len(open_syntax) + len(syntax)
             ))
             # if the next run is not an opener and won't consume this run
-            and not delim_runs[next_delim_run][0]
+            and not next_delim_run[1]
         ):
             return True
 
         if len(open_syntax) > len(syntax) and (
             # if this run can be a closer, but the next run is not a fresh opener
-            (right and not delim_runs[next_delim_run][0])
+            (right and not next_delim_run[1])
             # if the next run is not a closer
-            and not delim_runs[next_delim_run][1]
+            and not next_delim_run[2]
        ):
             return True
 
@@ -2862,8 +2859,22 @@ def should_process_imbalanced_delimiter_runs(
         return False
 
     def delimiter_left_or_right(self, delim_run: re.Match):
+        '''
+        Determine whether a delimiter run is left and/or right flanking
+
+        Returns:
+            Tuple whose two items are truthy when the run is left and right flanking respectively
+        '''
         run = delim_run.string[max(0, delim_run.start() - 1): delim_run.end() + 1]
-        syntax = delim_run.group(1)
+
+        return self._delimiter_left_or_right(run, delim_run.group(1))
+
+    @functools.lru_cache(maxsize=512)
+    def _delimiter_left_or_right(self, run: str, syntax: str):
+        '''
+        Cached version of `delimiter_left_or_right` that massively speeds things up when dealing
+        with many repetitive delimiter runs - eg: in a ReDoS scenario
+        '''
         syntax_re = syntax.replace('*', r'\*')
 
         left = (
@@ -2891,12 +2902,45 @@ def delimiter_left_or_right(self, delim_run: re.Match):
         return left, right
 
     def body_crosses_span_borders(self, open: re.Match, close: re.Match):
-        for tag in re.findall(rf' 1 or text.count('_') > 1

From 749c9cb19800a3bef267d99a6025d948948bac11 Mon Sep 17 00:00:00 2001
From: Crozzers
Date: Sun, 7 Dec 2025 19:16:04 +0000
Subject: [PATCH 9/9] Fix Python typing syntax error

---
 lib/markdown2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/markdown2.py b/lib/markdown2.py
index 4197ae03..6af7c929 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -3432,7 +3432,7 @@ def run(self, text):
                     text = text.replace(key, substr)
         return text
 
-    def process_span(self, open: re.Match, close: re.Match, offset: int, middle: re.Match | None = None):
+    def process_span(self, open: re.Match, close: re.Match, offset: int, middle: Optional[re.Match] = None):
         text = open.string[open.start(): close.end()]
         open_syntax = open.group(1)[offset:]
         close_syntax = close.group(1)
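
Note on [PATCH 9/9]: the `X | None` union syntax (PEP 604) is only valid at
runtime on Python 3.10+, and function annotations are evaluated eagerly at
def-time unless `from __future__ import annotations` is active, so the original
spelling breaks imports on older interpreters. A minimal illustration:

    import re
    from typing import Optional

    # On Python 3.9 and older, defining this raises at import time:
    #   TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'
    # def process_span(middle: re.Match | None = None): ...

    # The typing.Optional spelling evaluates fine on every supported version:
    def process_span(middle: Optional[re.Match] = None) -> None:
        ...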