From 420e59854eac5fba7c826a0a69d1564e63d715ba Mon Sep 17 00:00:00 2001
From: Juanje Mendoza
Date: Thu, 7 May 2026 09:45:44 +0200
Subject: [PATCH 1/2] choosealicense in badges

---
 src/somef/regular_expressions.py   | 49 +++++++++++++++++++++++++++++-
 src/somef/test/test_JSON_export.py | 35 +++++++++++++++++++++
 src/somef/utils/constants.py       |  3 ++
 3 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/src/somef/regular_expressions.py b/src/somef/regular_expressions.py
index 5157c147..074e0637 100644
--- a/src/somef/regular_expressions.py
+++ b/src/somef/regular_expressions.py
@@ -181,6 +181,31 @@ def extract_support_channels(readme_text, repository_metadata: Result, readme_so
     return repository_metadata
 
 
+def extract_license_badges(readme_text, repository_metadata, readme_source):
+    """
+    Extracts license information from choosealicense.com URLs found in README badges.
+    """
+    matches = re.finditer(constants.REGEXP_CHOOSE_LICENSE, readme_text)
+    for match in matches:
+        license_url = match.group(0)  # full matched URL fragment, re-parsed by detect_license_spdx
+        license_info = detect_license_spdx(license_url, 'HEADER')
+        if license_info:
+            result = {
+                constants.PROP_VALUE: license_info['spdx_id'],
+                constants.PROP_TYPE: constants.PROP_LICENSE,
+                constants.PROP_NAME: license_info['name'],
+                constants.PROP_SPDX_ID: license_info['spdx_id'],
+                constants.PROP_URL: license_info['url'],
+                constants.PROP_IDENTIFIER: license_info['url'],
+            }
+            repository_metadata.add_result(
+                constants.CAT_LICENSE,
+                result,
+                1,
+                constants.TECHNIQUE_REGULAR_EXPRESSION,
+                readme_source
+            )
+    return repository_metadata
 
 def extract_repo_status(unfiltered_text, repository_metadata: Result, readme_source) -> Result:
     """
@@ -995,7 +1020,29 @@ def detect_license_spdx(license_text, type):
     -------
     A JSON dictionary with name and spdx id
     """
-
+    match = re.search(constants.REGEXP_CHOOSE_LICENSE, license_text)
+    if match:
+        slug = match.group(1).lower()  # compared against lowercased SPDX ids below
+        for license_name, license_info in constants.LICENSES_DICT.items():
+            if license_info['spdx_id'].lower() == slug:
+                spdx_id = license_info['spdx_id']
+                spdx_url = f"https://spdx.org/licenses/{spdx_id}"
+                if type == 'JSON':
+                    return {
+                        "name": license_name,
+                        "spdx_id": spdx_id,
+                        "@id": spdx_url,
+                        "url": spdx_url,
+                        "identifier": spdx_url
+                    }
+                else:
+                    return {
+                        "name": license_name,
+                        "spdx_id": spdx_id,
+                        "identifier": spdx_url,
+                        "url": spdx_url
+                    }
+
     for license_name, license_info in constants.LICENSES_DICT.items():
         if re.search(license_info["regex"], license_text, re.IGNORECASE):
             spdx_id = license_info['spdx_id']
diff --git a/src/somef/test/test_JSON_export.py b/src/somef/test/test_JSON_export.py
index 619061bc..958c36f4 100644
--- a/src/somef/test/test_JSON_export.py
+++ b/src/somef/test/test_JSON_export.py
@@ -1004,3 +1004,38 @@ def test_issue_980_reconciliation_requirements(self):
 
         os.remove(output_path)
 
+    def test_issue_533_choosealicense_badge(self):
+        """
+        Checks that a license badge with a choosealicense.com URL is detected and resolved to SPDX.
+        """
+        output_path = test_data_path + "test_issue_533_choosealicense_badge.json"
+
+        somef_cli.run_cli(threshold=0.8,
+                          ignore_classifiers=False,
+                          repo_url=None,
+                          local_repo=None,
+                          doc_src=test_data_path + "README-manim.md",
+                          in_file=None,
+                          output=output_path,
+                          graph_out=None,
+                          graph_format="turtle",
+                          codemeta_out=None,
+                          pretty=True,
+                          missing=False,
+                          readme_only=False)
+
+        with open(output_path, "r") as f:
+            json_content = json.loads(f.read())
+
+        licenses = json_content.get(constants.CAT_LICENSE, [])
+
+        mit_license = next(
+            (lic for lic in licenses if lic[constants.PROP_RESULT].get(constants.PROP_SPDX_ID) == "MIT"),
+            None
+        )
+        assert mit_license is not None, f"Expected a MIT license resolved from choosealicense.com badge, got: {licenses}"
+
+        techniques = mit_license.get("technique", [])
+        assert constants.TECHNIQUE_REGULAR_EXPRESSION in techniques, f"Expected 'regular_expressions' in techniques, got: {techniques}"
+
+        os.remove(output_path)
diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py
index 94d3ce6e..ecf336a6 100644
--- a/src/somef/utils/constants.py
+++ b/src/somef/utils/constants.py
@@ -77,6 +77,9 @@ REGEXP_MPL2 = r'(?i)mozilla\s+public\s+license\s*,?\s*version\s*2\.0'
 REGEXP_UNLICENSE = r'(?i)the\s+unlicense'
 
 
+# Detect choosealicense.com license links (e.g. README badges); group 1 is the slug, without trailing ')' or quotes
+REGEXP_CHOOSE_LICENSE = r'(?i)choosealicense\.com/licenses/([a-z0-9]+(?:[.+-][a-z0-9]+)*)'
+
 # Detect organization in authors.md
 # REGEXP_LTD_INC = r'\b(inc|ltd|llc|corporation)([.,]|\b)'
 REGEXP_LTD_INC = r'\b(inc|ltd|llc|corporation|foundation|community|project|team|group|society|institute|association|consortium|organization|organisation)([.,]|\b)'
From bb7689680eef7ac6fc95365613d495958c80d9da Mon Sep 17 00:00:00 2001
From: Juanje Mendoza
Date: Thu, 7 May 2026 09:51:11 +0200
Subject: [PATCH 2/2] I forgot the call to the extract license

---
 src/somef/somef_cli.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/somef/somef_cli.py b/src/somef/somef_cli.py
index 4bd4efbc..371489bb 100644
--- a/src/somef/somef_cli.py
+++ b/src/somef/somef_cli.py
@@ -230,6 +230,8 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
                                                                         repository_metadata, readme_source, def_branch)
         repository_metadata = regular_expressions.extract_arxiv_links(readme_unfiltered_text, repository_metadata,
                                                                       readme_source)
+        repository_metadata = regular_expressions.extract_license_badges(readme_unfiltered_text, repository_metadata, readme_source)
+
         logging.info("Completed extracting regular expressions")
 
         return repository_metadata