Add support for multiline annotations with single-line prefix ("#")

regisb · regisb · commit 097db99c595b · 2020-09-02T09:24:29.000+02:00
Multiline annotations were previously supported only for multi-line
comments. In Python: """..."""

This introduces multiline annotations for comments prefixed by
single-line comment signs. In Python:

    # .. pii: a multiline annotation
    #   that spans multiple lines.

This makes it possible to wrap long comment lines more naturally, in
particular in Python.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -11,6 +11,11 @@ Change Log
 
 .. There should always be an "Unreleased" section for changes pending release.
 
+[0.6.0] - 2020-08-27
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Add support for multiline annotations for lines prefixed with single-line comment signs ("#")
+
 [0.5.1] - 2020-08-25
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/code_annotations/__init__.py b/code_annotations/__init__.py
@@ -2,4 +2,4 @@
 Extensible tools for parsing annotations in codebases.
 """
 
-__version__ = '0.5.1'
+__version__ = '0.6.0'
diff --git a/code_annotations/extensions/base.py b/code_annotations/extensions/base.py
@@ -42,23 +42,26 @@ class SimpleRegexAnnotationExtension(AnnotationExtension, metaclass=ABCMeta):
     # Javascript and Python extensions for examples.
     lang_comment_definition = None
 
-    """
-    This format string/regex finds all comments in the file. The format tokens will be replaced with the
-    language-specific comment definitions defined in the sub-classes.
-
-    Returns two named values: multiline_comment and singleline_comment.
-    """
+    # This format string/regex finds all comments in the file. The format tokens will be replaced with the
+    # language-specific comment definitions defined in the sub-classes.
+    #
+    # Match groupdict will contain two named subgroups: 'comment' and 'prefixed_comment', of which at most
+    # one will be non-None.
     comment_regex_fmt = r"""
-    {multi_start}   # start of the language-specific multi-line comment (ex. /*)
-    (?P<multiline_comment>
-        [\d\D]*?    # capture all of the characters...
-    )
-    {multi_end}     # until you find the end of the language-specific multi-line comment (ex. */)
-    |               # If you don't find any of those...
-    {single}        # start by finding the single-line comment token (ex. //)
-    (?P<singleline_comment>
-    .*              # and capture all characters until the end of the line
-    )
+        {multi_start}           # start of the language-specific multi-line comment (ex. /*)
+        (?P<comment>            # Look for a multiline comment
+            [\d\D]*?            # capture all of the characters...
+        )
+        {multi_end}             # until you find the end of the language-specific multi-line comment (ex. */)
+        |                       # If you don't find any of those...
+        (?P<prefixed_comment>   # Look for a group of single-line comments
+            (?:                 # Non-capture mode
+                {single}        # start by finding the single-line comment token (ex. //)
+                .*              # and capture all characters until the end of the line
+                \n?             # followed by an optional newline
+                \ *             # and some empty space
+            )*                  # multiple times
+        )
     """
 
     def __init__(self, config, echo):
@@ -79,6 +82,10 @@ def __init__(self, config, echo):
             self.comment_regex_fmt.format(**self.lang_comment_definition),
             flags=re.VERBOSE
         )
+        self.prefixed_comment_regex = re.compile(
+            r"^ *{single}".format(**self.lang_comment_definition),
+            flags=re.MULTILINE
+        )
 
         # Parent class will allow this class to populate self.strings_to_search via
         # calls to _add_annotation_token or _add_annotation_group for each configured
@@ -105,14 +112,14 @@ def search(self, file_handle):
         if any(anno in txt for anno in self.config.annotation_tokens):
             fname = clean_abs_path(file_handle.name, self.config.source_path)
 
-            # Iterate on all comments: both multi- and single-line.
+            # Iterate on all comments: both prefixed- and non-prefixed.
             for match in self.comment_regex.finditer(txt):
                 # Get the line number by counting newlines + 1 (for the first line).
                 # Note that this is the line number of the beginning of the comment, not the
                 # annotation token itself.
                 line = txt.count('\n', 0, match.start()) + 1
-                # Should only be one match
-                comment_content = match.groupdict()["multiline_comment"] or match.groupdict()["singleline_comment"]
+
+                comment_content = self._find_comment_content(match)
                 for inner_match in self.query.finditer(comment_content):
                     try:
                         annotation_token = inner_match.group('token')
@@ -134,3 +141,27 @@ def search(self, file_handle):
                     })
 
         return found_annotations
+
+    def _find_comment_content(self, match):
+        """
+        Return the comment content as text.
+
+        Args:
+            match (re.Match): one of the matches of the self.comment_regex regular expression.
+        """
+        comment_content = match.groupdict()["comment"]
+        if comment_content:
+            return comment_content
+
+        # Find single-line comments and strip comment tokens
+        comment_content = match.groupdict()["prefixed_comment"]
+        return self._strip_single_line_comment_tokens(comment_content)
+
+    def _strip_single_line_comment_tokens(self, content):
+        """
+        Strip the leading single-line comment tokens from a comment text.
+
+        Args:
+            content (str): token-prefixed multi-line comment string.
+        """
+        return self.prefixed_comment_regex.sub("", content)
diff --git a/tests/extensions/python_test_files/multiline_singlelinecomment.pyt b/tests/extensions/python_test_files/multiline_singlelinecomment.pyt
@@ -0,0 +1,7 @@
+# Docstring
+#.. pii: A long description that
+#  spans multiple
+#  lines
+# A comment that is not indented and not part of the above multi-line annotation
+#.. pii_types: id, name
+# Some comment that comes after the multiple-line annotation
diff --git a/tests/extensions/test_base_extensions.py b/tests/extensions/test_base_extensions.py
@@ -28,3 +28,19 @@ def test_nothing_found():
     r = FakeExtension(config, VerboseEcho())
     with open('tests/extensions/base_test_files/empty.foo') as f:
         r.search(f)
+
+
+def test_strip_single_line_comment_tokens():
+    config = FakeConfig()
+
+    extension = FakeExtension(config, VerboseEcho())
+    text = """baz line1
+  baz line2
+bazline3
+baz   line4"""
+    expected_result = """ line1
+ line2
+line3
+   line4"""
+    # pylint: disable=protected-access
+    assert expected_result == extension._strip_single_line_comment_tokens(text)
diff --git a/tests/extensions/test_extension_python.py b/tests/extensions/test_extension_python.py
@@ -76,6 +76,15 @@ def test_grouping_and_choice_failures(test_file, expected_exit_code, expected_me
      Multi-line and multi-paragraph.""")
         ]
     ),
+    (
+        'multiline_singlelinecomment.pyt',
+        [
+            ('.. pii:', """A long description that
+  spans multiple
+  lines"""),
+            ('.. pii_types:', 'id, name'),
+        ]
+    ),
 ])
 def test_multi_line_annotations(test_file, annotations):
     config = AnnotationConfig('tests/test_configurations/.annotations_test')