diff --git a/markdownify/__init__.py b/markdownify/__init__.py index c732711..f23a1b3 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -17,7 +17,9 @@ # Extract (leading_nl, content, trailing_nl) from a string # (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here) -re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL) +re_extract_newlines = re.compile( + r'^(\n*)((?:.*[^\n])?)(\n*)$', + flags=re.DOTALL) # Escape miscellaneous special Markdown characters re_escape_misc_chars = re.compile(r'([]\\&<`[>~=+|])') @@ -78,6 +80,7 @@ def abstract_inline_conversion(markup_fn): the text if it looks like an HTML tag. markup_fn is necessary to allow for references to self.strong_em_symbol etc. """ + def implementation(self, el, text, parent_tags): markup_prefix = markup_fn(self) if markup_prefix.startswith('<') and markup_prefix.endswith('>'): @@ -89,12 +92,14 @@ def implementation(self, el, text, parent_tags): prefix, suffix, text = chomp(text) if not text: return '' - return '%s%s%s%s%s' % (prefix, markup_prefix, text, markup_suffix, suffix) + return '%s%s%s%s%s' % (prefix, markup_prefix, + text, markup_suffix, suffix) return implementation def _todict(obj): - return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_')) + return dict((k, getattr(obj, k)) + for k in dir(obj) if not k.startswith('_')) def should_remove_whitespace_inside(el): @@ -170,6 +175,8 @@ class DefaultOptions: strip_document = STRIP strong_em_symbol = ASTERISK sub_symbol = '' + preprocess_fn = None + postprocess_fn = None sup_symbol = '' table_infer_header = False wrap = False @@ -184,6 +191,8 @@ def __init__(self, **options): self.options = _todict(self.DefaultOptions) self.options.update(_todict(self.Options)) self.options.update(options) + self.preprocess_fn = self.options['preprocess_fn'] + self.postprocess_fn = self.options['postprocess_fn'] if self.options['strip'] is not None and 
self.options['convert'] is not None: raise ValueError('You may specify either tags to strip or tags to' ' convert, but not both.') @@ -205,7 +214,8 @@ def process_element(self, node, parent_tags=None): return self.process_tag(node, parent_tags=parent_tags) def process_tag(self, node, parent_tags=None): - # For the top-level element, initialize the parent context with an empty set. + # For the top-level element, initialize the parent context with an + # empty set. if parent_tags is None: parent_tags = set() @@ -226,10 +236,12 @@ def _can_ignore(el): # Non-whitespace text nodes are always processed. return False elif should_remove_inside and (not el.previous_sibling or not el.next_sibling): - # Inside block elements (excluding
), ignore adjacent whitespace elements.
+ # Inside block elements (excluding <pre>), ignore adjacent
+ # whitespace elements.
return True
elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
- # Outside block elements (including <pre>), ignore adjacent whitespace elements.
+ # Outside block elements (including <pre>), ignore adjacent
+ # whitespace elements.
return True
else:
return False
@@ -238,21 +250,24 @@ def _can_ignore(el):
else:
raise ValueError('Unexpected element type: %s' % type(el))
- children_to_convert = [el for el in node.children if not _can_ignore(el)]
+ children_to_convert = [
+ el for el in node.children if not _can_ignore(el)]
# Create a copy of this tag's parent context, then update it to include this tag
# to propagate down into the children.
parent_tags_for_children = set(parent_tags)
parent_tags_for_children.add(node.name)
- # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
+ # if this tag is a heading or table cell, add an '_inline' parent
+ # pseudo-tag
if (
re_html_heading.match(node.name) is not None # headings
or node.name in {'td', 'th'} # table cells
):
parent_tags_for_children.add('_inline')
- # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
+ # if this tag is a preformatted element, add a '_noformat' parent
+ # pseudo-tag
if node.name in {'pre', 'code', 'kbd', 'samp'}:
parent_tags_for_children.add('_noformat')
@@ -274,17 +289,21 @@ def _can_ignore(el):
updated_child_strings = [''] # so the first lookback works
for child_string in child_strings:
# Separate the leading/trailing newlines from the content.
- leading_nl, content, trailing_nl = re_extract_newlines.match(child_string).groups()
+ leading_nl, content, trailing_nl = re_extract_newlines.match(
+ child_string).groups()
# If the last child had trailing newlines and this child has leading newlines,
# use the larger newline count, limited to 2.
if updated_child_strings[-1] and leading_nl:
- prev_trailing_nl = updated_child_strings.pop() # will be replaced by the collapsed value
- num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl)))
+ # will be replaced by the collapsed value
+ prev_trailing_nl = updated_child_strings.pop()
+ num_newlines = min(
+ 2, max(len(prev_trailing_nl), len(leading_nl)))
leading_nl = '\n' * num_newlines
# Add the results to the updated child string list.
- updated_child_strings.extend([leading_nl, content, trailing_nl])
+ updated_child_strings.extend(
+ [leading_nl, content, trailing_nl])
child_strings = updated_child_strings
@@ -292,9 +311,15 @@ def _can_ignore(el):
text = ''.join(child_strings)
# apply this tag's final conversion function
+
+ if self.preprocess_fn and self.should_convert_tag(node.name):
+ text = self.preprocess_fn(node, text, parent_tags=parent_tags)
+
convert_fn = self.get_conv_fn_cached(node.name)
if convert_fn is not None:
text = convert_fn(node, text, parent_tags=parent_tags)
+ if self.postprocess_fn and self.should_convert_tag(node.name):
+ text = self.postprocess_fn(node, text, parent_tags=parent_tags)
return text
@@ -305,16 +330,20 @@ def convert__document_(self, el, text, parent_tags):
elif self.options['strip_document'] == RSTRIP:
text = text.rstrip('\n') # remove trailing separation newlines
elif self.options['strip_document'] == STRIP:
- text = text.strip('\n') # remove leading and trailing separation newlines
+ # remove leading and trailing separation newlines
+ text = text.strip('\n')
elif self.options['strip_document'] is None:
pass # leave leading and trailing separation newlines as-is
else:
- raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])
+ raise ValueError(
+ 'Invalid value for strip_document: %s' %
+ self.options['strip_document'])
return text
def process_text(self, el, parent_tags=None):
- # For the top-level element, initialize the parent context with an empty set.
+ # For the top-level element, initialize the parent context with an
+ # empty set.
if parent_tags is None:
parent_tags = set()
@@ -328,7 +357,8 @@ def process_text(self, el, parent_tags=None):
text = re_newline_whitespace.sub('\n', text)
text = re_whitespace.sub(' ', text)
- # escape special characters if we're not inside a preformatted or code element
+ # escape special characters if we're not inside a preformatted or code
+ # element
if '_noformat' not in parent_tags:
text = self.escape(text, parent_tags)
@@ -364,7 +394,8 @@ def get_conv_fn(self, tag_name):
return None
# Look for an explicitly defined conversion function by tag name first
- convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub("_", tag_name)
+ convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub(
+ "_", tag_name)
convert_fn = getattr(self, convert_fn_name, None)
if convert_fn:
return convert_fn
@@ -373,7 +404,8 @@ def get_conv_fn(self, tag_name):
match = re_html_heading.match(tag_name)
if match:
n = int(match.group(1)) # get value of N from <hN>
- return lambda el, text, parent_tags: self.convert_hN(n, el, text, parent_tags)
+ return lambda el, text, parent_tags: self.convert_hN(
+ n, el, text, parent_tags)
# No conversion function was found
return None
@@ -426,9 +458,11 @@ def convert_a(self, el, text, parent_tags):
if self.options['default_title'] and not title:
title = href
title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
- return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text
+ return '%s[%s](%s%s)%s' % (prefix, text, href,
+ title_part, suffix) if href else text
- convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])
+ convert_b = abstract_inline_conversion(
+ lambda self: 2 * self.options['strong_em_symbol'])
def convert_blockquote(self, el, text, parent_tags):
# handle some early-exit scenarios
@@ -473,7 +507,8 @@ def convert_div(self, el, text, parent_tags):
convert_section = convert_div
- convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])
+ convert_em = abstract_inline_conversion(
+ lambda self: self.options['strong_em_symbol'])
convert_kbd = convert_code
@@ -650,7 +685,8 @@ def convert_pre(self, el, text, parent_tags):
code_language = self.options['code_language']
if self.options['code_language_callback']:
- code_language = self.options['code_language_callback'](el) or code_language
+ code_language = self.options['code_language_callback'](
+ el) or code_language
return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
@@ -669,9 +705,11 @@ def convert_style(self, el, text, parent_tags):
convert_samp = convert_code
- convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol'])
+ convert_sub = abstract_inline_conversion(
+ lambda self: self.options['sub_symbol'])
- convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])
+ convert_sup = abstract_inline_conversion(
+ lambda self: self.options['sup_symbol'])
def convert_table(self, el, text, parent_tags):
return '\n\n' + text.strip() + '\n\n'
@@ -704,9 +742,10 @@ def convert_tr(self, el, text, parent_tags):
and len(el.parent.find_all('tr')) == 1)
)
is_head_row_missing = (
- (is_first_row and not el.parent.name == 'tbody')
- or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
- )
+ (is_first_row and not el.parent.name == 'tbody') or (
+ is_first_row and el.parent.name == 'tbody' and len(
+ el.parent.parent.find_all(
+ ['thead'])) < 1))
overline = ''
underline = ''
full_colspan = 0
@@ -723,7 +762,8 @@ def convert_tr(self, el, text, parent_tags):
# - is headline or
# - headline is missing and header inference is enabled
# print headline underline
- underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
+ underline += '| ' + \
+ ' | '.join(['---'] * full_colspan) + ' |' + '\n'
elif ((is_head_row_missing
and not self.options['table_infer_header'])
or (is_first_row
diff --git a/tests/test_preprocess_postprocess.py b/tests/test_preprocess_postprocess.py
new file mode 100644
index 0000000..c05e4b4
--- /dev/null
+++ b/tests/test_preprocess_postprocess.py
@@ -0,0 +1,128 @@
+from markdownify import markdownify as md
+
+
+def test_preprocess_all_tags():
+    """preprocess_fn output is wrapped by each tag's own Markdown markup."""
+
+    def preprocess(node, text, parent_tags):
+        # Wrap the text of any tag that carries a text-align style in an
+        # [align=...] marker; leave every other tag's text untouched.
+        alignment = ""
+        if 'style' in node.attrs and 'text-align' in node.attrs['style']:
+            style = node.attrs['style']
+            alignment = style.split("text-align:")[1].split(";")[0].strip()
+
+        if alignment:
+            return f"[align={alignment}]{text}[/align]"
+        return text
+
+    # NOTE(review): the HTML tags of this input were lost; it is
+    # reconstructed from the expected output below -- confirm against the
+    # original test.
+    assert md(
+        '<p style="text-align:center">para</p>'
+        '<b style="text-align:left">bold</b>',
+        preprocess_fn=preprocess) == '[align=center]para[/align]\n\n**[align=left]bold[/align]**'
+
+
+def test_postprocess_all_tags():
+    """postprocess_fn wraps each tag's already-converted Markdown."""
+
+    def postprocess(node, text, parent_tags):
+        # Wrap the converted text of any tag that carries a text-align style
+        # in an [align=...] marker; leave every other tag's text untouched.
+        alignment = ""
+        if 'style' in node.attrs and 'text-align' in node.attrs['style']:
+            style = node.attrs['style']
+            alignment = style.split("text-align:")[1].split(";")[0].strip()
+
+        if alignment:
+            return f"[align={alignment}]{text}[/align]"
+        return text
+
+    # NOTE(review): the HTML tags of this input were lost; it is
+    # reconstructed from the expected output below -- confirm against the
+    # original test.
+    assert md(
+        '<p style="text-align:center">para</p>'
+        '<b style="text-align:left">bold</b>',
+        postprocess_fn=postprocess) == '[align=center]\n\npara\n\n[/align][align=left]**bold**[/align]'
+
+
+def test_preprocess_runs_before_conversion():
+    """The <b> markup is applied around the text preprocess_fn returns."""
+
+    def preprocess(node, text, parent_tags):
+        # Only the <b> tag's text is rewritten.
+        if node.name == 'b':
+            return f"PRE_{text}_PRE"
+        return text
+
+    # Default conversion would make this "**bold**"
+    # With preprocessing it should become "**PRE_bold_PRE**"
+    assert md('<b>bold</b>', preprocess_fn=preprocess) == '**PRE_bold_PRE**'
+
+
+def test_postprocess_runs_after_conversion():
+    """postprocess_fn receives the <b> tag's text after it became Markdown."""
+
+    def postprocess(node, text, parent_tags):
+        # Only the <b> tag's converted text is rewritten.
+        if node.name == 'b':
+            return f"POST_{text}_POST"
+        return text
+
+    # Default conversion makes this "**bold**"
+    # With postprocessing it should become "POST_**bold**_POST"
+    assert md(
+        '<b>bold</b>',
+        postprocess_fn=postprocess) == 'POST_**bold**_POST'
+
+
+def test_preprocess_doesnt_prevent_conversion():
+    """A preprocess_fn that edits text still lets the tag be converted."""
+
+    def preprocess(node, text, parent_tags):
+        return text.upper()  # Just modify the text, don't prevent conversion
+
+    # Should still get converted to markdown, just with uppercase content
+    assert md('<b>bold</b>', preprocess_fn=preprocess) == '**BOLD**'
+
+
+def test_postprocess_doesnt_prevent_conversion():
+    """A postprocess_fn that edits text runs on the converted Markdown."""
+
+    def postprocess(node, text, parent_tags):
+        return text.upper()  # Just modify the result, don't prevent conversion
+
+    # Should get normal markdown conversion but then uppercased
+    assert md('<b>bold</b>', postprocess_fn=postprocess) == '**BOLD**'
+
+
+def test_combined_pre_and_post_processing():
+    """Both hooks apply to the same input, preprocessing first."""
+
+    def preprocess(node, text, parent_tags):
+        return f"PRE:{text}:PRE"
+
+    def postprocess(node, text, parent_tags):
+        return f"POST:{text}:POST"
+
+    # The input carries no markup, so the hooks simply wrap the text:
+    # preprocessing runs first and yields "PRE:text:PRE", then
+    # postprocessing wraps that result as "POST:PRE:text:PRE:POST".
+    assert md('text',
+              preprocess_fn=preprocess,
+              postprocess_fn=postprocess) == 'POST:PRE:text:PRE:POST'
+
+
+def test_processing_with_multiple_tags():
+    """preprocess_fn can treat sibling tags differently by node name."""
+
+    def preprocess(node, text, parent_tags):
+        # Prefix the text of <b> and <i>; every other tag passes through.
+        if node.name == 'b':
+            return f"B:{text}"
+        elif node.name == 'i':
+            return f"I:{text}"
+        return text
+
+    # <p><b>bold</b> and <i>italic</i></p>
+    # Should become "**B:bold** and *I:italic*"
+    assert md('<p><b>bold</b> and <i>italic</i></p>',
+              preprocess_fn=preprocess) == '**B:bold** and *I:italic*'
+
+
+def test_processing_with_nested_tags():
+    """postprocess_fn on an outer tag sees its children already converted."""
+
+    def postprocess(node, text, parent_tags):
+        # Only the <p> tag's converted text is prefixed.
+        if node.name == 'p':
+            return f"P:{text}"
+        return text
+
+    # <p><b>bold</b> text</p> normally becomes "**bold** text"
+    # With postprocessing the prefix lands before the paragraph's own
+    # leading blank line, giving "P:\n\n**bold** text"
+    assert md('<p><b>bold</b> text</p>',
+              postprocess_fn=postprocess) == 'P:\n\n**bold** text'