Skip to content

Commit f373fc9

Browse files
authored
Merge pull request #881 from juanjemdIos/master
New library to handle Markdown unmarking more reliably: MarkdownIt instead of Markdown.
2 parents b988013 + cb58aad commit f373fc9

7 files changed

Lines changed: 92 additions & 41 deletions

File tree

.github/workflows/action-test-before-PR.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,4 @@ jobs:
3131
run: poetry run somef configure -a
3232

3333
- name: Run pytest
34-
run: poetry run pytest -v src/somef/test
34+
run: poetry run pytest -v src/somef/test/

poetry.lock

Lines changed: 39 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ homepage = "https://github.com/KnowledgeCaptureAndDiscovery/somef"
4545
pyyaml = "^6.0.2"
4646
lxml = "^5.1.0"
4747
tomli = "^2.0.1"
48-
48+
markdown-it-py = "^3.0"
49+
4950
[tool.poetry.scripts]
5051
somef = "somef.__main__:cli"
5152

src/somef/header_analysis.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ def extract_header_content(text: str) -> Tuple[pd.DataFrame, str | None]:
138138

139139
content, none_header_content = mardown_parser.extract_content_per_header(text, headers)
140140
parents = mardown_parser.extract_headers_parents(text)
141+
141142
df = pd.DataFrame({
142143
'Header': header_list,
143144
'Content': content,
@@ -395,10 +396,9 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res
395396
source = source[constants.PROP_RESULT][constants.PROP_VALUE]
396397

397398
logging.info("Extracting information using headers - iterating over valid entries")
399+
logging.info("Valid rows: %s", len(valid))
398400

399401
for _, row in valid.iterrows():
400-
401-
# logging.info(f'row value: {row[constants.PROP_VALUE]}')
402402
result = {
403403
constants.PROP_VALUE: row[constants.PROP_VALUE],
404404
constants.PROP_TYPE: constants.TEXT_EXCERPT,

src/somef/somef_cli.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,14 +148,18 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
148148
readme_unfiltered_text = markdown_utils.remove_comments(readme_unfiltered_text)
149149
repository_metadata, string_list = header_analysis.extract_categories(readme_unfiltered_text,
150150
repository_metadata)
151+
152+
logging.info("Extracted categories from headers successfully.")
151153
readme_text_unmarked = markdown_utils.unmark(readme_text)
154+
logging.info("readme text unmarked successfully.")
152155
if not ignore_classifiers and readme_unfiltered_text != '':
153156
logging.info("--> suppervised classification")
154157
repository_metadata = supervised_classification.run_category_classification(readme_unfiltered_text,
155158
threshold,
156159
repository_metadata)
157160
logging.info("--> create excerpts")
158161
excerpts = create_excerpts.create_excerpts(string_list)
162+
logging.info("--> extract text excerpts headers")
159163
excerpts_headers = mardown_parser.extract_text_excerpts_header(readme_unfiltered_text)
160164
header_parents = mardown_parser.extract_headers_parents(readme_unfiltered_text)
161165
score_dict = supervised_classification.run_classifiers(excerpts, file_paths)

src/somef/test/test_JSON_export.py

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -407,36 +407,36 @@ def test_issue_830(self):
407407
# except Exception as e:
408408
# print(f"Failed to delete {cls.json_file}: {e}")
409409

410-
# def test_issue_862(self):
411-
# """Checks if this repository does not gets stuck when labeling headers"""
412-
# somef_cli.run_cli(threshold=0.8,
413-
# ignore_classifiers=False,
414-
# repo_url=None,
415-
# local_repo=test_data_repositories + "componentInstaller",
416-
# doc_src=None,
417-
# in_file=None,
418-
# output=test_data_path + "test_issue_862.json",
419-
# graph_out=None,
420-
# graph_format="turtle",
421-
# codemeta_out=None,
422-
# pretty=True,
423-
# missing=False,
424-
# readme_only=False)
410+
def test_issue_862(self):
411+
"""Checks if this repository does not gets stuck when labeling headers"""
412+
somef_cli.run_cli(threshold=0.8,
413+
ignore_classifiers=False,
414+
repo_url=None,
415+
local_repo=test_data_repositories + "componentInstaller",
416+
doc_src=None,
417+
in_file=None,
418+
output=test_data_path + "test_issue_862.json",
419+
graph_out=None,
420+
graph_format="turtle",
421+
codemeta_out=None,
422+
pretty=True,
423+
missing=False,
424+
readme_only=False)
425425

426-
# text_file = open(test_data_path + "test_issue_862.json", "r")
427-
# data = text_file.read()
428-
# text_file.close()
429-
# json_content = json.loads(data)
426+
text_file = open(test_data_path + "test_issue_862.json", "r")
427+
data = text_file.read()
428+
text_file.close()
429+
json_content = json.loads(data)
430430

431-
# assert "description" in json_content, "Missing 'description' property"
431+
assert "description" in json_content, "Missing 'description' property"
432432

433-
# assert len(json_content["description"]) > 0, "Description list is empty"
433+
assert len(json_content["description"]) > 0, "Description list is empty"
434434

435-
# first_desc = json_content["description"][0]["result"]
436-
# assert "value" in first_desc, "Missing 'value' in description result"
437-
# assert first_desc["value"], "Description 'value' is empty"
435+
first_desc = json_content["description"][0]["result"]
436+
assert "value" in first_desc, "Missing 'value' in description result"
437+
assert first_desc["value"], "Description 'value' is empty"
438438

439-
# os.remove(test_data_path + "test_issue_862.json")
439+
os.remove(test_data_path + "test_issue_862.json")
440440

441441
def test_issue_859(self):
442442
"""Checks whether a repository without content works fine. Must have just some results from the API."""

src/somef/utils/markdown_utils.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import logging
22
from io import StringIO
33
from markdown import Markdown
4+
from markdown_it import MarkdownIt
45
from ..utils import constants
56
import re
67

8+
logging.getLogger("markdown_it").setLevel(logging.WARNING)
79

810
## Markdown to plain text conversion: begin ##
911
# code snippet from https://stackoverflow.com/a/54923798
@@ -20,13 +22,18 @@ def unmark_element(element, stream=None):
2022

2123

2224
# patching Markdown
23-
Markdown.output_formats["plain"] = unmark_element
24-
__md = Markdown(output_format="plain")
25-
__md.stripTopLevelTags = False
25+
# Markdown.output_formats["plain"] = unmark_element
26+
# __md = Markdown(output_format="plain")
27+
# __md.stripTopLevelTags = False
2628

27-
28-
def unmark(text):
29-
return __md.convert(text)
29+
_md = MarkdownIt()
30+
def unmark(text: str) -> str:
31+
tokens = _md.parse(text)
32+
return "".join(
33+
t.content for t in tokens if t.type == "inline"
34+
)
35+
# def unmark(text):
36+
# return __md.convert(text)
3037

3138

3239
def remove_bibtex(string_list):
@@ -63,6 +70,10 @@ def remove_comments(html_text):
6370
-------
6471
Markdown with no HTML comments
6572
"""
66-
comment_pattern = r'<!--(.*?)-->'
67-
html_without_comments = re.sub(comment_pattern, '', html_text, flags=re.DOTALL)
73+
# comment_pattern = r'<!--(.*?)-->'
74+
# # # comment_pattern = r'<!--[\s\S]*?--\s*>'
75+
comment_pattern = r'<!--[\s\S]*?--.*?>'
76+
77+
html_without_comments = re.sub(comment_pattern, '', html_text)
78+
# print(html_without_comments)
6879
return html_without_comments

0 commit comments

Comments (0)