From e5b7a42c69faa6d574a1dbc112e04491dbf7b32d Mon Sep 17 00:00:00 2001
From: Juanje Mendoza <juanjemd@gmail.com>
Date: Thu, 7 May 2026 12:03:06 +0200
Subject: [PATCH 1/3] Make the similarity threshold configurable. Fixes #112

---
 config.json                            |  3 ++-
 src/somef/configuration.py             |  9 +++++++--
 src/somef/header_analysis.py           | 25 +++++++++++++------------
 src/somef/somef_cli.py                 |  3 ++-
 src/somef/test/test_header_analysis.py | 16 ++++++++++++++++
 src/somef/utils/constants.py           |  6 ++++++
 6 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/config.json b/config.json
index 2a5effa0..b0e431ef 100644
--- a/config.json
+++ b/config.json
@@ -2,5 +2,6 @@
 	"description" : "./models/description.p",
 	"citation" : "./models/citation.p",
 	"installation" : "./models/installation.p",
-	"invocation" : "./models/invocation.p"
+	"invocation" : "./models/invocation.p",
+	"similarity_threshold": 0.8
 }
\ No newline at end of file
diff --git a/src/somef/configuration.py b/src/somef/configuration.py
index d0d4004b..599228fe 100644
--- a/src/somef/configuration.py
+++ b/src/somef/configuration.py
@@ -26,6 +26,8 @@ def get_configuration_file():
     if credentials_file.exists():
         with credentials_file.open("r") as fh:
             file_paths = json.load(fh)
+        if constants.CONF_SIMILARITY_THRESHOLD not in file_paths:
+            file_paths[constants.CONF_SIMILARITY_THRESHOLD] = constants.CONF_DEFAULT_SIMILARITY_THRESHOLD
     else:
         sys.exit("Error: Please provide a config.json file or run somef configure.")
     return file_paths
@@ -53,7 +55,8 @@ def configure(authorization="",
               invocation=default_invocation,
               installation=default_installation,
               citation=default_citation,
-              base_uri=constants.CONF_DEFAULT_BASE_URI):
+              base_uri=constants.CONF_DEFAULT_BASE_URI,
+              similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD):
     """ Function to configure the main program"""
     import nltk
     nltk.download('wordnet')
@@ -77,7 +80,8 @@ def configure(authorization="",
         constants.CONF_INVOCATION: invocation,
         constants.CONF_INSTALLATION: installation,
         constants.CONF_CITATION: citation,
-        constants.CONF_BASE_URI: base_uri
+        constants.CONF_BASE_URI: base_uri,
+        constants.CONF_SIMILARITY_THRESHOLD: similarity_threshold
     }
 
     if data[constants.CONF_AUTHORIZATION] == "token ":
@@ -88,3 +92,4 @@ def configure(authorization="",
         credentials_file.chmod(0o600)
         json.dump(data, fh)
         logging.info("Configuration file saved at "+os.path.dirname(credentials_file))
+
diff --git a/src/somef/header_analysis.py b/src/somef/header_analysis.py
index a7489a9c..d80acb3d 100644
--- a/src/somef/header_analysis.py
+++ b/src/somef/header_analysis.py
@@ -14,7 +14,7 @@
 pd.options.mode.chained_assignment = None  # default='warn'
 
 
-SIMILARITY_THRESHOLD = 0.8
+# The similarity threshold is now passed as a parameter; its default is constants.CONF_DEFAULT_SIMILARITY_THRESHOLD.
 
 
 # Define wordnet groups
@@ -203,7 +203,7 @@ def find_sim(wordlist, wd):
 #     return maxgroup
 
 
-def label_header(header):
+def label_header(header, similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD):
     """Function designed to label a header with a subgroup"""
     # remove punctuation
     header_clean = header.translate(str.maketrans('', '', string.punctuation))
@@ -213,13 +213,13 @@ def label_header(header):
         synn = Word(s).synsets 
         if len(synn) > 0:
             # bestgroup = match_group(synn, group, 0.8)
-            bestgroup = match_group(synn)
+            bestgroup = match_group(synn, similarity_threshold)
             if bestgroup != "" and bestgroup not in label:
                 label.append(bestgroup) 
     return label
 
 
-def label_parent_headers(parentHeaders):
+def label_parent_headers(parentHeaders, similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD):
     """label the header with a subgroup"""
     header = ""
     for value in parentHeaders:
@@ -232,7 +232,7 @@ def label_parent_headers(parentHeaders):
         synn = Word(s).synsets
         if len(synn) > 0:
             # bestgroup = match_group(synn, group, 0.8)
-            bestgroup = match_group(synn)
+            bestgroup = match_group(synn, similarity_threshold)
             if bestgroup != "" and bestgroup not in label:
                 label.append(bestgroup)
     return label
@@ -261,13 +261,14 @@ def get_groups() -> Dict[str, List]:
         WORDNET_GROUPS = build_wordnet_groups()
     return WORDNET_GROUPS
 
-def match_group(word_synsets) -> str:
+def match_group(word_synsets,similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD) -> str:
     best_group = ""
     best_score = 0.0
 
     for key, synsets in get_groups().items():
         score = max_similarity(word_synsets, synsets)
-        if score > SIMILARITY_THRESHOLD and score > best_score:
+        # if score > SIMILARITY_THRESHOLD and score > best_score:
+        if score > similarity_threshold and score > best_score:
             best_score = score
             best_group = key
 
@@ -286,7 +287,7 @@ def tokenize_header(text) -> Iterable[str]:
     clean = text.translate(str.maketrans('', '', string.punctuation)) 
     return clean.strip().split()
 
-def label_text(text: str) -> List[str]:
+def label_text(text: str, similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD) -> List[str]:
     labels: List[str] = []
 
     if isinstance(text, list):
@@ -297,7 +298,7 @@ def label_text(text: str) -> List[str]:
     for token in tokenize_header(text):
         synsets = get_synsets(token)
         if synsets:
-            grp = match_group(synsets)
+            grp = match_group(synsets, similarity_threshold )
             # Skip if the header matches a known false positive for this group
            
             # if isinstance(text, list):
@@ -410,7 +411,7 @@ def is_false_positive_header(text: str, category: str) -> bool:
 #         logging.error("Error while extracting headers: ", str(e))
 #         return repository_metadata, [repo_data]
 
-def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Result, List[str]]:
+def extract_categories(repo_data: str, repository_metadata: Result, similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD) -> Tuple[Result, List[str]]:
     logging.info("Extracting information using headers")
 
     if not repo_data:
@@ -423,8 +424,8 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res
             logging.warning("File to analyze has no headers")
             return repository_metadata, [repo_data]
 
-        df['Group'] = df['Header'].map(label_text)
-        df['ParentGroup'] = df['ParentHeader'].fillna('').map(label_text)
+        df['Group'] = df['Header'].map(lambda x: label_text(x, similarity_threshold))
+        df['ParentGroup'] = df['ParentHeader'].fillna('').map(lambda x: label_text(x, similarity_threshold))
 
         df.loc[df['Group'].str.len() == 0, 'Group'] = df['ParentGroup']
         df = df.drop(columns=['ParentGroup'])
diff --git a/src/somef/somef_cli.py b/src/somef/somef_cli.py
index 4bd4efbc..aeac866e 100644
--- a/src/somef/somef_cli.py
+++ b/src/somef/somef_cli.py
@@ -53,6 +53,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
     logging.getLogger("urllib3").setLevel(logging.WARNING)
 
     file_paths = configuration.get_configuration_file()
+    similarity_threshold = file_paths.get(constants.CONF_SIMILARITY_THRESHOLD, constants.CONF_DEFAULT_SIMILARITY_THRESHOLD)
     repo_type = constants.RepositoryType.GITHUB
     repository_metadata = Result()
     def_branch = "main"
@@ -172,7 +173,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
         # remove html comments from unfiltered text (to avoid detecting commented out (wrong) metadata
         readme_unfiltered_text = markdown_utils.remove_comments(readme_unfiltered_text)
         repository_metadata, string_list = header_analysis.extract_categories(readme_unfiltered_text,
-                                                                              repository_metadata)
+                                                                              repository_metadata,similarity_threshold)
         
         logging.info("Extracted categories from headers successfully.")
         readme_text_unmarked = markdown_utils.unmark(readme_text)
diff --git a/src/somef/test/test_header_analysis.py b/src/somef/test/test_header_analysis.py
index b5259205..d494c1bd 100644
--- a/src/somef/test/test_header_analysis.py
+++ b/src/somef/test/test_header_analysis.py
@@ -149,3 +149,19 @@ def test_extract_headers_with_separators(self):
             assert 'Installation' in headers
             assert 'Citation' in headers
             assert 'Funding' in headers
+
+
+    def test_issue_112_similarity_threshold(self):
+        """
+        Checks that the similarity_threshold parameter is respected in header analysis.
+        """
+        with open(test_data_path + "README-manim.md", "r") as data_file:
+            file_text = data_file.read()
+
+        json_default, _ = extract_categories(file_text, Result(), similarity_threshold=0.8)
+        assert constants.CAT_INSTALLATION in json_default.results, f"Expected CAT_INSTALLATION with threshold 0.8"
+
+        # threshold 2.0 (extremely high) nothing should be detected via similarity
+        json_impossible, _ = extract_categories(file_text, Result(), similarity_threshold=2.0)
+        assert constants.CAT_INSTALLATION not in json_impossible.results, \
+            f"Expected no CAT_INSTALLATION with threshold 2.0, got: {json_impossible.results.get(constants.CAT_INSTALLATION)}"
diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py
index 94d3ce6e..69572880 100644
--- a/src/somef/utils/constants.py
+++ b/src/somef/utils/constants.py
@@ -541,3 +541,9 @@ class RepositoryType(Enum):
 
 DEPENDENCY_TYPE_RUNTIME = "runtime"
 DEPENDENCY_TYPE_DEVELOPMENT = "development"
+
+# Default similarity threshold, used only when the key is missing from the config file (a value in the config file takes priority).
+CONF_SIMILARITY_THRESHOLD = "similarity_threshold"
+CONF_DEFAULT_SIMILARITY_THRESHOLD = 0.8
+
+

From fdbd04010f94d14f489af60552ff9db633564905 Mon Sep 17 00:00:00 2001
From: Juanje Mendoza <juanjemd@gmail.com>
Date: Thu, 7 May 2026 13:10:29 +0200
Subject: [PATCH 2/3] Skip a GitHub-dependent test in CI to make fewer requests to GitHub

---
 src/somef/test/test_process_repository.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/somef/test/test_process_repository.py b/src/somef/test/test_process_repository.py
index 5f807c36..7a4e366b 100644
--- a/src/somef/test/test_process_repository.py
+++ b/src/somef/test/test_process_repository.py
@@ -140,10 +140,12 @@ def test_no_repository_metadata(self):
             "https://github.com/oeg-upm/delta-ontology")
         assert constants.CAT_RELEASES not in github_data.results.keys()
 
+    @unittest.skipIf(os.getenv("CI") == "true", "Skipped in CI because it is already verified locally")
     def test_issue_284_issue_272(self):
         """Test designed to check if there are errors detecting title or stargazers"""
         github_data, owner, repo_name, default_br, project_path = process_repository.\
             load_online_repository_metadata(Result(), "https://github.com/3b1b/manim")
+        
         result_keys = github_data.results.keys()
         assert ((constants.CAT_STARS in result_keys) and (constants.CAT_FULL_TITLE not in result_keys))
 

From fd001216579690dd0485bf0516434114ffee475b Mon Sep 17 00:00:00 2001
From: Juanje Mendoza <juanjemd@gmail.com>
Date: Fri, 8 May 2026 08:52:35 +0200
Subject: [PATCH 3/3] explanation of similarity threshold in readthedocs

---
 README.md     |  4 ++++
 docs/usage.md | 27 ++++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 68cad3d1..0b4a7976 100644
--- a/README.md
+++ b/README.md
@@ -362,6 +362,10 @@ The following command extracts all metadata available from [https://github.com/d
 somef describe -r https://github.com/dgarijo/Widoco/ -o test.json -t 0.8
 ```
 
+We recommend a high value for the `threshold` parameter (`-t`): 0.8 (the default) or above.
+Additional configuration parameters (such as the `similarity_threshold` used for header analysis)
+can be set in `~/.somef/config.json`. See the [usage documentation](https://somef.readthedocs.io/en/latest/usage/) for details.
+
 Try SOMEF in Binder with our sample notebook: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/KnowledgeCaptureAndDiscovery/somef/HEAD?filepath=notebook%2FSOMEF%20Usage%20Example.ipynb)
 
 ## Contribute:
diff --git a/docs/usage.md b/docs/usage.md
index 7dd209a8..b212e972 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -82,8 +82,33 @@ If you prefer to export as a [Codemeta](https://codemeta.github.io/) JSON-LD, ju
 somef describe -r https://github.com/dgarijo/Widoco/ -c test.json
 ```
 
-For more information about the output types supported by SOMEF, please see [the output format help page](https://somef.readthedocs.io/en/latest/output/).
+For more information about the output types supported by SOMEF, please see [the output format help page](https://somef.readthedocs.io/en/latest/output/). 
 
 We recommend having a high value for the `threshold` parameter, 0.8 (default) or above.
 
+## Configuration parameters
+
+SOMEF uses a configuration file located at `~/.somef/config.json` that can be edited to customize its behavior. 
+To generate it, run `somef configure`. The following parameters are available:
+
+### Similarity threshold
+
+Controls the minimum similarity score required for a README header to be matched to a 
+category (e.g., installation, usage, license). SOMEF uses WordNet path similarity to 
+compare header words against known category terms.
+
+- **Default value**: `0.8`
+- **Range**: `0.0` to `1.0` (higher values = stricter matching, lower values = more permissive). A header word matches only when its score is strictly greater than the threshold.
+
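+To change it, add or edit the `similarity_threshold` key in your `~/.somef/config.json`, keeping the other entries (the snippet below shows only this key; if the key is missing, the default is used):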
+
+```json
+{
+    "similarity_threshold": 0.75
+}
+```
+
+Note: This parameter is different from the `-t` threshold used in `somef describe`, 
+which controls the confidence of the supervised classifiers.
+
 To see a live usage example, try our Binder Notebook: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/KnowledgeCaptureAndDiscovery/somef/HEAD?filepath=notebook%2FSOMEF%20Usage%20Example.ipynb)
\ No newline at end of file