Skip to content

Commit f373fc9

Browse files
authored
Merge pull request #881 from juanjemdIos/master
New library to handle Markdown unmarking more reliably: MarkdownIt instead of Markdown.
2 parents b988013 + cb58aad commit f373fc9

7 files changed

Lines changed: 92 additions & 41 deletions

File tree

.github/workflows/action-test-before-PR.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,4 @@ jobs:
3131
run: poetry run somef configure -a
3232

3333
- name: Run pytest
34-
run: poetry run pytest -v src/somef/test
34+
run: poetry run pytest -v src/somef/test/

poetry.lock

Lines changed: 39 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ homepage = "https://github.com/KnowledgeCaptureAndDiscovery/somef"
4545
pyyaml = "^6.0.2"
4646
lxml = "^5.1.0"
4747
tomli = "^2.0.1"
48-
48+
markdown-it-py = "^3.0"
49+
4950
[tool.poetry.scripts]
5051
somef = "somef.__main__:cli"
5152

src/somef/header_analysis.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ def extract_header_content(text: str) -> Tuple[pd.DataFrame, str | None]:
138138

139139
content, none_header_content = mardown_parser.extract_content_per_header(text, headers)
140140
parents = mardown_parser.extract_headers_parents(text)
141+
141142
df = pd.DataFrame({
142143
'Header': header_list,
143144
'Content': content,
@@ -395,10 +396,9 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res
395396
source = source[constants.PROP_RESULT][constants.PROP_VALUE]
396397

397398
logging.info("Extracting information using headers - iterating over valid entries")
399+
logging.info("Valid rows: %s", len(valid))
398400

399401
for _, row in valid.iterrows():
400-
401-
# logging.info(f'row value: {row[constants.PROP_VALUE]}')
402402
result = {
403403
constants.PROP_VALUE: row[constants.PROP_VALUE],
404404
constants.PROP_TYPE: constants.TEXT_EXCERPT,

src/somef/somef_cli.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,14 +148,18 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
148148
readme_unfiltered_text = markdown_utils.remove_comments(readme_unfiltered_text)
149149
repository_metadata, string_list = header_analysis.extract_categories(readme_unfiltered_text,
150150
repository_metadata)
151+
152+
logging.info("Extracted categories from headers successfully.")
151153
readme_text_unmarked = markdown_utils.unmark(readme_text)
154+
logging.info("readme text unmarked successfully.")
152155
if not ignore_classifiers and readme_unfiltered_text != '':
153156
logging.info("--> suppervised classification")
154157
repository_metadata = supervised_classification.run_category_classification(readme_unfiltered_text,
155158
threshold,
156159
repository_metadata)
157160
logging.info("--> create excerpts")
158161
excerpts = create_excerpts.create_excerpts(string_list)
162+
logging.info("--> extract text excerpts headers")
159163
excerpts_headers = mardown_parser.extract_text_excerpts_header(readme_unfiltered_text)
160164
header_parents = mardown_parser.extract_headers_parents(readme_unfiltered_text)
161165
score_dict = supervised_classification.run_classifiers(excerpts, file_paths)

src/somef/test/test_JSON_export.py

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -407,36 +407,36 @@ def test_issue_830(self):
407407
# except Exception as e:
408408
# print(f"Failed to delete {cls.json_file}: {e}")
409409

410-
# def test_issue_862(self):
411-
# """Checks if this repository does not gets stuck when labeling headers"""
412-
# somef_cli.run_cli(threshold=0.8,
413-
# ignore_classifiers=False,
414-
# repo_url=None,
415-
# local_repo=test_data_repositories + "componentInstaller",
416-
# doc_src=None,
417-
# in_file=None,
418-
# output=test_data_path + "test_issue_862.json",
419-
# graph_out=None,
420-
# graph_format="turtle",
421-
# codemeta_out=None,
422-
# pretty=True,
423-
# missing=False,
424-
# readme_only=False)
410+
def test_issue_862(self):
411+
"""Checks if this repository does not gets stuck when labeling headers"""
412+
somef_cli.run_cli(threshold=0.8,
413+
ignore_classifiers=False,
414+
repo_url=None,
415+
local_repo=test_data_repositories + "componentInstaller",
416+
doc_src=None,
417+
in_file=None,
418+
output=test_data_path + "test_issue_862.json",
419+
graph_out=None,
420+
graph_format="turtle",
421+
codemeta_out=None,
422+
pretty=True,
423+
missing=False,
424+
readme_only=False)
425425

426-
# text_file = open(test_data_path + "test_issue_862.json", "r")
427-
# data = text_file.read()
428-
# text_file.close()
429-
# json_content = json.loads(data)
426+
text_file = open(test_data_path + "test_issue_862.json", "r")
427+
data = text_file.read()
428+
text_file.close()
429+
json_content = json.loads(data)
430430

431-
# assert "description" in json_content, "Missing 'description' property"
431+
assert "description" in json_content, "Missing 'description' property"
432432

433-
# assert len(json_content["description"]) > 0, "Description list is empty"
433+
assert len(json_content["description"]) > 0, "Description list is empty"
434434

435-
# first_desc = json_content["description"][0]["result"]
436-
# assert "value" in first_desc, "Missing 'value' in description result"
437-
# assert first_desc["value"], "Description 'value' is empty"
435+
first_desc = json_content["description"][0]["result"]
436+
assert "value" in first_desc, "Missing 'value' in description result"
437+
assert first_desc["value"], "Description 'value' is empty"
438438

439-
# os.remove(test_data_path + "test_issue_862.json")
439+
os.remove(test_data_path + "test_issue_862.json")
440440

441441
def test_issue_859(self):
442442
"""Checks whether a repository without content works fine. Must have just some results from the API."""

src/somef/utils/markdown_utils.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import logging
22
from io import StringIO
33
from markdown import Markdown
4+
from markdown_it import MarkdownIt
45
from ..utils import constants
56
import re
67

8+
logging.getLogger("markdown_it").setLevel(logging.WARNING)
79

810
## Markdown to plain text conversion: begin ##
911
# code snippet from https://stackoverflow.com/a/54923798
@@ -20,13 +22,18 @@ def unmark_element(element, stream=None):
2022

2123

2224
# patching Markdown
23-
Markdown.output_formats["plain"] = unmark_element
24-
__md = Markdown(output_format="plain")
25-
__md.stripTopLevelTags = False
25+
# Markdown.output_formats["plain"] = unmark_element
26+
# __md = Markdown(output_format="plain")
27+
# __md.stripTopLevelTags = False
2628

27-
28-
def unmark(text):
29-
return __md.convert(text)
29+
_md = MarkdownIt()
30+
def unmark(text: str) -> str:
31+
tokens = _md.parse(text)
32+
return "".join(
33+
t.content for t in tokens if t.type == "inline"
34+
)
35+
# def unmark(text):
36+
# return __md.convert(text)
3037

3138

3239
def remove_bibtex(string_list):
@@ -63,6 +70,10 @@ def remove_comments(html_text):
6370
-------
6471
Markdown with no HTML comments
6572
"""
66-
comment_pattern = r'<!--(.*?)-->'
67-
html_without_comments = re.sub(comment_pattern, '', html_text, flags=re.DOTALL)
73+
# comment_pattern = r'<!--(.*?)-->'
74+
# # # comment_pattern = r'<!--[\s\S]*?--\s*>'
75+
comment_pattern = r'<!--[\s\S]*?--.*?>'
76+
77+
html_without_comments = re.sub(comment_pattern, '', html_text)
78+
# print(html_without_comments)
6879
return html_without_comments

0 commit comments

Comments (0)