Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 48 additions & 1 deletion src/somef/regular_expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,31 @@ def extract_support_channels(readme_text, repository_metadata: Result, readme_so

return repository_metadata

def extract_license_badges(readme_text, repository_metadata, readme_source):
    """
    Extracts license information from choosealicense.com URLs found in README badges.

    Every match of constants.REGEXP_CHOOSE_LICENSE in the README is handed to
    detect_license_spdx; each license it resolves is registered once under
    constants.CAT_LICENSE, even if the same badge/link appears several times.

    Parameters
    ----------
    readme_text: text of the README to scan (an empty or None text is a no-op)
    repository_metadata: result container; must expose add_result(...)
    readme_source: source of the README, stored with every result added

    Returns
    -------
    The same repository_metadata object, with any detected licenses added
    """
    if not readme_text:
        # Nothing to scan; re.finditer would raise TypeError on None.
        return repository_metadata

    seen_spdx_ids = set()
    for match in re.finditer(constants.REGEXP_CHOOSE_LICENSE, readme_text):
        # group(0) is only the "choosealicense.com/licenses/<slug>" fragment
        # (not the full URL); detect_license_spdx extracts the slug from it.
        license_info = detect_license_spdx(match.group(0), 'HEADER')
        if not license_info:
            continue

        spdx_id = license_info['spdx_id']
        if spdx_id in seen_spdx_ids:
            # The same license linked more than once: report it only once.
            continue
        seen_spdx_ids.add(spdx_id)

        result = {
            constants.PROP_VALUE: spdx_id,
            constants.PROP_TYPE: constants.PROP_LICENSE,
            constants.PROP_NAME: license_info['name'],
            constants.PROP_SPDX_ID: spdx_id,
            constants.PROP_URL: license_info['url'],
            constants.PROP_IDENTIFIER: license_info['url'],
        }
        repository_metadata.add_result(
            constants.CAT_LICENSE,
            result,
            1,
            constants.TECHNIQUE_REGULAR_EXPRESSION,
            readme_source
        )
    return repository_metadata

def extract_repo_status(unfiltered_text, repository_metadata: Result, readme_source) -> Result:
"""
Expand Down Expand Up @@ -995,7 +1020,29 @@ def detect_license_spdx(license_text, type):
-------
A JSON dictionary with name and spdx id
"""

match = re.search(constants.REGEXP_CHOOSE_LICENSE, license_text)
if match:
slug = match.group(1).lower().rstrip('/')
for license_name, license_info in constants.LICENSES_DICT.items():
if license_info['spdx_id'].lower() == slug:
spdx_id = license_info['spdx_id']
spdx_url = f"https://spdx.org/licenses/{spdx_id}"
if type == 'JSON':
return {
"name": license_name,
"spdx_id": spdx_id,
"@id": spdx_url,
"url": spdx_url,
"identifier": spdx_url
}
else:
return {
"name": license_name,
"spdx_id": spdx_id,
"identifier": spdx_url,
"url": spdx_url
}

for license_name, license_info in constants.LICENSES_DICT.items():
if re.search(license_info["regex"], license_text, re.IGNORECASE):
spdx_id = license_info['spdx_id']
Expand Down
2 changes: 2 additions & 0 deletions src/somef/somef_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,8 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
repository_metadata, readme_source, def_branch)
repository_metadata = regular_expressions.extract_arxiv_links(readme_unfiltered_text, repository_metadata,
readme_source)
repository_metadata = regular_expressions.extract_license_badges(readme_unfiltered_text, repository_metadata, readme_source)

logging.info("Completed extracting regular expressions")

return repository_metadata
Expand Down
35 changes: 35 additions & 0 deletions src/somef/test/test_JSON_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -1004,3 +1004,38 @@ def test_issue_980_reconciliation_requirements(self):
os.remove(output_path)


def test_issue_533_choosealicense_badge(self):
    """
    Checks that a license badge with a choosealicense.com URL is detected and resolved to SPDX.

    Runs the CLI on README-manim.md, then looks for a license entry whose
    SPDX id is "MIT" and verifies that it was found with the regular
    expression technique.
    """
    output_path = test_data_path + "test_issue_533_choosealicense_badge.json"

    try:
        somef_cli.run_cli(threshold=0.8,
                          ignore_classifiers=False,
                          repo_url=None,
                          local_repo=None,
                          doc_src=test_data_path + "README-manim.md",
                          in_file=None,
                          output=output_path,
                          graph_out=None,
                          graph_format="turtle",
                          codemeta_out=None,
                          pretty=True,
                          missing=False,
                          readme_only=False)

        with open(output_path, "r") as f:
            json_content = json.load(f)

        licenses = json_content.get(constants.CAT_LICENSE, [])

        mit_license = next(
            (entry for entry in licenses
             if entry[constants.PROP_RESULT].get(constants.PROP_SPDX_ID) == "MIT"),
            None
        )
        assert mit_license is not None, f"Expected a MIT license resolved from choosealicense.com badge, got: {licenses}"

        techniques = mit_license.get("technique", [])
        assert constants.TECHNIQUE_REGULAR_EXPRESSION in techniques, f"Expected 'regular_expressions' in techniques, got: {techniques}"
    finally:
        # Clean up even when an assertion fails, so a failed run does not
        # leave the generated JSON behind; the existence check avoids masking
        # an earlier error when the CLI never wrote the file.
        if os.path.exists(output_path):
            os.remove(output_path)
3 changes: 3 additions & 0 deletions src/somef/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@
REGEXP_MPL2 = r'(?i)mozilla\s+public\s+license\s*,?\s*version\s*2\.0'
REGEXP_UNLICENSE = r'(?i)the\s+unlicense'

# Detect choosealicense.com license links (e.g. in README badges).
# Group 1 is the license slug. It is limited to SPDX-style characters
# (alphanumeric runs joined by ".", "+" or "-") so that a delimiter directly
# after a URL without trailing slash -- ")", "]", a quote, ">" -- or a
# sentence-ending period is not captured as part of the slug.
REGEXP_CHOOSE_LICENSE = r'choosealicense\.com/licenses/([A-Za-z0-9]+(?:[.+-][A-Za-z0-9]+)*)'

# Detect organization in authors.md
# REGEXP_LTD_INC = r'\b(inc|ltd|llc|corporation)([.,]|\b)'
REGEXP_LTD_INC = r'\b(inc|ltd|llc|corporation|foundation|community|project|team|group|society|institute|association|consortium|organization|organisation)([.,]|\b)'
Expand Down
Loading