From e5b7a42c69faa6d574a1dbc112e04491dbf7b32d Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Thu, 7 May 2026 12:03:06 +0200 Subject: [PATCH 1/3] configuration of similarity thresshold. Fixes #112 --- config.json | 3 ++- src/somef/configuration.py | 9 +++++++-- src/somef/header_analysis.py | 25 +++++++++++++------------ src/somef/somef_cli.py | 3 ++- src/somef/test/test_header_analysis.py | 16 ++++++++++++++++ src/somef/utils/constants.py | 6 ++++++ 6 files changed, 46 insertions(+), 16 deletions(-) diff --git a/config.json b/config.json index 2a5effa0..b0e431ef 100644 --- a/config.json +++ b/config.json @@ -2,5 +2,6 @@ "description" : "./models/description.p", "citation" : "./models/citation.p", "installation" : "./models/installation.p", - "invocation" : "./models/invocation.p" + "invocation" : "./models/invocation.p", + "similarity_threshold": 0.8 } \ No newline at end of file diff --git a/src/somef/configuration.py b/src/somef/configuration.py index d0d4004b..599228fe 100644 --- a/src/somef/configuration.py +++ b/src/somef/configuration.py @@ -26,6 +26,8 @@ def get_configuration_file(): if credentials_file.exists(): with credentials_file.open("r") as fh: file_paths = json.load(fh) + if constants.CONF_SIMILARITY_THRESHOLD not in file_paths: + file_paths[constants.CONF_SIMILARITY_THRESHOLD] = constants.CONF_DEFAULT_SIMILARITY_THRESHOLD else: sys.exit("Error: Please provide a config.json file or run somef configure.") return file_paths @@ -53,7 +55,8 @@ def configure(authorization="", invocation=default_invocation, installation=default_installation, citation=default_citation, - base_uri=constants.CONF_DEFAULT_BASE_URI): + base_uri=constants.CONF_DEFAULT_BASE_URI, + similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD): """ Function to configure the main program""" import nltk nltk.download('wordnet') @@ -77,7 +80,8 @@ def configure(authorization="", constants.CONF_INVOCATION: invocation, constants.CONF_INSTALLATION: installation, 
constants.CONF_CITATION: citation, - constants.CONF_BASE_URI: base_uri + constants.CONF_BASE_URI: base_uri, + constants.CONF_SIMILARITY_THRESHOLD: similarity_threshold } if data[constants.CONF_AUTHORIZATION] == "token ": @@ -88,3 +92,4 @@ def configure(authorization="", credentials_file.chmod(0o600) json.dump(data, fh) logging.info("Configuration file saved at "+os.path.dirname(credentials_file)) + diff --git a/src/somef/header_analysis.py b/src/somef/header_analysis.py index a7489a9c..d80acb3d 100644 --- a/src/somef/header_analysis.py +++ b/src/somef/header_analysis.py @@ -14,7 +14,7 @@ pd.options.mode.chained_assignment = None # default='warn' -SIMILARITY_THRESHOLD = 0.8 +# SIMILARITY_THRESHOLD = 0.8 # Define wordnet groups @@ -203,7 +203,7 @@ def find_sim(wordlist, wd): # return maxgroup -def label_header(header): +def label_header(header, similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD): """Function designed to label a header with a subgroup""" # remove punctuation header_clean = header.translate(str.maketrans('', '', string.punctuation)) @@ -213,13 +213,13 @@ def label_header(header): synn = Word(s).synsets if len(synn) > 0: # bestgroup = match_group(synn, group, 0.8) - bestgroup = match_group(synn) + bestgroup = match_group(synn, similarity_threshold) if bestgroup != "" and bestgroup not in label: label.append(bestgroup) return label -def label_parent_headers(parentHeaders): +def label_parent_headers(parentHeaders, similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD): """label the header with a subgroup""" header = "" for value in parentHeaders: @@ -232,7 +232,7 @@ def label_parent_headers(parentHeaders): synn = Word(s).synsets if len(synn) > 0: # bestgroup = match_group(synn, group, 0.8) - bestgroup = match_group(synn) + bestgroup = match_group(synn, similarity_threshold) if bestgroup != "" and bestgroup not in label: label.append(bestgroup) return label @@ -261,13 +261,14 @@ def get_groups() -> Dict[str, List]: WORDNET_GROUPS = 
build_wordnet_groups() return WORDNET_GROUPS -def match_group(word_synsets) -> str: +def match_group(word_synsets,similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD) -> str: best_group = "" best_score = 0.0 for key, synsets in get_groups().items(): score = max_similarity(word_synsets, synsets) - if score > SIMILARITY_THRESHOLD and score > best_score: + # if score > SIMILARITY_THRESHOLD and score > best_score: + if score > similarity_threshold and score > best_score: best_score = score best_group = key @@ -286,7 +287,7 @@ def tokenize_header(text) -> Iterable[str]: clean = text.translate(str.maketrans('', '', string.punctuation)) return clean.strip().split() -def label_text(text: str) -> List[str]: +def label_text(text: str, similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD) -> List[str]: labels: List[str] = [] if isinstance(text, list): @@ -297,7 +298,7 @@ def label_text(text: str) -> List[str]: for token in tokenize_header(text): synsets = get_synsets(token) if synsets: - grp = match_group(synsets) + grp = match_group(synsets, similarity_threshold ) # Skip if the header matches a known false positive for this group # if isinstance(text, list): @@ -410,7 +411,7 @@ def is_false_positive_header(text: str, category: str) -> bool: # logging.error("Error while extracting headers: ", str(e)) # return repository_metadata, [repo_data] -def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Result, List[str]]: +def extract_categories(repo_data: str, repository_metadata: Result, similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD) -> Tuple[Result, List[str]]: logging.info("Extracting information using headers") if not repo_data: @@ -423,8 +424,8 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res logging.warning("File to analyze has no headers") return repository_metadata, [repo_data] - df['Group'] = df['Header'].map(label_text) - df['ParentGroup'] = 
df['ParentHeader'].fillna('').map(label_text) + df['Group'] = df['Header'].map(lambda x: label_text(x, similarity_threshold)) + df['ParentGroup'] = df['ParentHeader'].fillna('').map(lambda x: label_text(x, similarity_threshold)) df.loc[df['Group'].str.len() == 0, 'Group'] = df['ParentGroup'] df = df.drop(columns=['ParentGroup']) diff --git a/src/somef/somef_cli.py b/src/somef/somef_cli.py index 4bd4efbc..aeac866e 100644 --- a/src/somef/somef_cli.py +++ b/src/somef/somef_cli.py @@ -53,6 +53,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc logging.getLogger("urllib3").setLevel(logging.WARNING) file_paths = configuration.get_configuration_file() + similarity_threshold = file_paths.get(constants.CONF_SIMILARITY_THRESHOLD, constants.CONF_DEFAULT_SIMILARITY_THRESHOLD) repo_type = constants.RepositoryType.GITHUB repository_metadata = Result() def_branch = "main" @@ -172,7 +173,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc # remove html comments from unfiltered text (to avoid detecting commented out (wrong) metadata readme_unfiltered_text = markdown_utils.remove_comments(readme_unfiltered_text) repository_metadata, string_list = header_analysis.extract_categories(readme_unfiltered_text, - repository_metadata) + repository_metadata,similarity_threshold) logging.info("Extracted categories from headers successfully.") readme_text_unmarked = markdown_utils.unmark(readme_text) diff --git a/src/somef/test/test_header_analysis.py b/src/somef/test/test_header_analysis.py index b5259205..d494c1bd 100644 --- a/src/somef/test/test_header_analysis.py +++ b/src/somef/test/test_header_analysis.py @@ -149,3 +149,19 @@ def test_extract_headers_with_separators(self): assert 'Installation' in headers assert 'Citation' in headers assert 'Funding' in headers + + + def test_issue_112_similarity_threshold(self): + """ + Checks that the similarity_threshold parameter is respected in header analysis. 
+ """ + with open(test_data_path + "README-manim.md", "r") as data_file: + file_text = data_file.read() + + json_default, _ = extract_categories(file_text, Result(), similarity_threshold=0.8) + assert constants.CAT_INSTALLATION in json_default.results, f"Expected CAT_INSTALLATION with threshold 0.8" + + # threshold 2.0 (extremely high) nothing should be detected via similarity + json_impossible, _ = extract_categories(file_text, Result(), similarity_threshold=2.0) + assert constants.CAT_INSTALLATION not in json_impossible.results, \ + f"Expected no CAT_INSTALLATION with threshold 2.0, got: {json_impossible.results.get(constants.CAT_INSTALLATION)}" diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index 94d3ce6e..69572880 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -541,3 +541,9 @@ class RepositoryType(Enum): DEPENDENCY_TYPE_RUNTIME = "runtime" DEPENDENCY_TYPE_DEVELOPMENT = "development" + +# Fallback used when the config file does not define a similarity threshold; a value set in the config file takes priority over this default. 
+CONF_SIMILARITY_THRESHOLD = "similarity_threshold" +CONF_DEFAULT_SIMILARITY_THRESHOLD = 0.8 + + From fdbd04010f94d14f489af60552ff9db633564905 Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Thu, 7 May 2026 13:10:29 +0200 Subject: [PATCH 2/3] less request to github --- src/somef/test/test_process_repository.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/somef/test/test_process_repository.py b/src/somef/test/test_process_repository.py index 5f807c36..7a4e366b 100644 --- a/src/somef/test/test_process_repository.py +++ b/src/somef/test/test_process_repository.py @@ -140,10 +140,12 @@ def test_no_repository_metadata(self): "https://github.com/oeg-upm/delta-ontology") assert constants.CAT_RELEASES not in github_data.results.keys() + @unittest.skipIf(os.getenv("CI") == "true", "Skipped in CI because it is already verified locally") def test_issue_284_issue_272(self): """Test designed to check if there are errors detecting title or stargazers""" github_data, owner, repo_name, default_br, project_path = process_repository.\ load_online_repository_metadata(Result(), "https://github.com/3b1b/manim") + result_keys = github_data.results.keys() assert ((constants.CAT_STARS in result_keys) and (constants.CAT_FULL_TITLE not in result_keys)) From fd001216579690dd0485bf0516434114ffee475b Mon Sep 17 00:00:00 2001 From: Juanje Mendoza Date: Fri, 8 May 2026 08:52:35 +0200 Subject: [PATCH 3/3] explanation of similarity threshold in readthedocs --- README.md | 4 ++++ docs/usage.md | 27 ++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 68cad3d1..0b4a7976 100644 --- a/README.md +++ b/README.md @@ -362,6 +362,10 @@ The following command extracts all metadata available from [https://github.com/d somef describe -r https://github.com/dgarijo/Widoco/ -o test.json -t 0.8 ``` +We recommend having a high value for the `threshold` parameter, 0.8 (default) or above. 
+Additional configuration parameters (such as the `similarity_threshold` for header analysis) +can be set in `~/.somef/config.json`. See the [usage documentation](https://somef.readthedocs.io/en/latest/usage/) for details. + Try SOMEF in Binder with our sample notebook: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/KnowledgeCaptureAndDiscovery/somef/HEAD?filepath=notebook%2FSOMEF%20Usage%20Example.ipynb) ## Contribute: diff --git a/docs/usage.md b/docs/usage.md index 7dd209a8..b212e972 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -82,8 +82,33 @@ If you prefer to export as a [Codemeta](https://codemeta.github.io/) JSON-LD, ju somef describe -r https://github.com/dgarijo/Widoco/ -c test.json ``` -For more information about the output types supported by SOMEF, please see [the output format help page](https://somef.readthedocs.io/en/latest/output/). +For more information about the output types supported by SOMEF, please see [the output format help page](https://somef.readthedocs.io/en/latest/output/). We recommend having a high value for the `threshold` parameter, 0.8 (default) or above. +## Configuration parameters + +SOMEF uses a configuration file located at `~/.somef/config.json` that can be edited to customize its behavior. +To generate it, run `somef configure`. The following parameters are available: + +### Similarity threshold + +Controls the minimum similarity score required for a README header to be matched to a +category (e.g., installation, usage, license). SOMEF uses WordNet path similarity to +compare header words against known category terms. + +- **Default value**: `0.8` +- **Range**: `0.0` to `1.0` (higher values = stricter matching, lower values = more permissive) + +To change it, edit your `~/.somef/config.json`: + +```json +{ + "similarity_threshold": 0.75 +} +``` + +Note: This parameter is different from the `-t` threshold used in `somef describe`, +which controls the confidence of the supervised classifiers. 
+ To see a live usage example, try our Binder Notebook: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/KnowledgeCaptureAndDiscovery/somef/HEAD?filepath=notebook%2FSOMEF%20Usage%20Example.ipynb) \ No newline at end of file