Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,10 @@ The following command extracts all metadata available from [https://github.com/d
somef describe -r https://github.com/dgarijo/Widoco/ -o test.json -t 0.8
```

We recommend using a high value for the `threshold` parameter (`-t`): 0.8 (the default) or above.
Additional configuration parameters (such as the `similarity_threshold` for header analysis)
can be set in `~/.somef/config.json`. See the [usage documentation](https://somef.readthedocs.io/en/latest/usage/) for details.

Try SOMEF in Binder with our sample notebook: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/KnowledgeCaptureAndDiscovery/somef/HEAD?filepath=notebook%2FSOMEF%20Usage%20Example.ipynb)

## Contribute:
Expand Down
3 changes: 2 additions & 1 deletion config.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@
"description" : "./models/description.p",
"citation" : "./models/citation.p",
"installation" : "./models/installation.p",
"invocation" : "./models/invocation.p"
"invocation" : "./models/invocation.p",
"similarity_threshold": 0.8
}
27 changes: 26 additions & 1 deletion docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,33 @@ If you prefer to export as a [Codemeta](https://codemeta.github.io/) JSON-LD, ju
somef describe -r https://github.com/dgarijo/Widoco/ -c test.json
```

For more information about the output types supported by SOMEF, please see [the output format help page](https://somef.readthedocs.io/en/latest/output/).
For more information about the output types supported by SOMEF, please see [the output format help page](https://somef.readthedocs.io/en/latest/output/).

We recommend using a high value for the `threshold` parameter (`-t`): 0.8 (the default) or above.

## Configuration parameters

SOMEF uses a configuration file located at `~/.somef/config.json` that can be edited to customize its behavior.
To generate it, run `somef configure`. The following parameters are available:

### Similarity threshold

Controls the similarity score that a README header must exceed (the comparison is strict) to be
matched to a category (e.g., installation, usage, license). SOMEF uses WordNet path similarity to
compare header words against known category terms.

- **Default value**: `0.8`
- **Range**: `0.0` to `1.0` (higher values = stricter matching, lower values = more permissive)

To change it, set the `similarity_threshold` entry in your `~/.somef/config.json`, leaving the other entries in the file unchanged:

```json
{
"similarity_threshold": 0.75
}
```

Note: This parameter is different from the `-t` threshold used in `somef describe`,
which controls the confidence of the supervised classifiers.

To see a live usage example, try our Binder Notebook: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/KnowledgeCaptureAndDiscovery/somef/HEAD?filepath=notebook%2FSOMEF%20Usage%20Example.ipynb)
9 changes: 7 additions & 2 deletions src/somef/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ def get_configuration_file():
if credentials_file.exists():
with credentials_file.open("r") as fh:
file_paths = json.load(fh)
if constants.CONF_SIMILARITY_THRESHOLD not in file_paths:
file_paths[constants.CONF_SIMILARITY_THRESHOLD] = constants.CONF_DEFAULT_SIMILARITY_THRESHOLD
else:
sys.exit("Error: Please provide a config.json file or run somef configure.")
return file_paths
Expand Down Expand Up @@ -53,7 +55,8 @@ def configure(authorization="",
invocation=default_invocation,
installation=default_installation,
citation=default_citation,
base_uri=constants.CONF_DEFAULT_BASE_URI):
base_uri=constants.CONF_DEFAULT_BASE_URI,
similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD):
""" Function to configure the main program"""
import nltk
nltk.download('wordnet')
Expand All @@ -77,7 +80,8 @@ def configure(authorization="",
constants.CONF_INVOCATION: invocation,
constants.CONF_INSTALLATION: installation,
constants.CONF_CITATION: citation,
constants.CONF_BASE_URI: base_uri
constants.CONF_BASE_URI: base_uri,
constants.CONF_SIMILARITY_THRESHOLD: similarity_threshold
}

if data[constants.CONF_AUTHORIZATION] == "token ":
Expand All @@ -88,3 +92,4 @@ def configure(authorization="",
credentials_file.chmod(0o600)
json.dump(data, fh)
logging.info("Configuration file saved at "+os.path.dirname(credentials_file))

25 changes: 13 additions & 12 deletions src/somef/header_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
pd.options.mode.chained_assignment = None # default='warn'


SIMILARITY_THRESHOLD = 0.8
# SIMILARITY_THRESHOLD = 0.8


# Define wordnet groups
Expand Down Expand Up @@ -203,7 +203,7 @@ def find_sim(wordlist, wd):
# return maxgroup


def label_header(header):
def label_header(header, similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD):
"""Function designed to label a header with a subgroup"""
# remove punctuation
header_clean = header.translate(str.maketrans('', '', string.punctuation))
Expand All @@ -213,13 +213,13 @@ def label_header(header):
synn = Word(s).synsets
if len(synn) > 0:
# bestgroup = match_group(synn, group, 0.8)
bestgroup = match_group(synn)
bestgroup = match_group(synn, similarity_threshold)
if bestgroup != "" and bestgroup not in label:
label.append(bestgroup)
return label


def label_parent_headers(parentHeaders):
def label_parent_headers(parentHeaders, similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD):
"""label the header with a subgroup"""
header = ""
for value in parentHeaders:
Expand All @@ -232,7 +232,7 @@ def label_parent_headers(parentHeaders):
synn = Word(s).synsets
if len(synn) > 0:
# bestgroup = match_group(synn, group, 0.8)
bestgroup = match_group(synn)
bestgroup = match_group(synn, similarity_threshold)
if bestgroup != "" and bestgroup not in label:
label.append(bestgroup)
return label
Expand Down Expand Up @@ -261,13 +261,14 @@ def get_groups() -> Dict[str, List]:
WORDNET_GROUPS = build_wordnet_groups()
return WORDNET_GROUPS

def match_group(word_synsets) -> str:
def match_group(word_synsets,similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD) -> str:
best_group = ""
best_score = 0.0

for key, synsets in get_groups().items():
score = max_similarity(word_synsets, synsets)
if score > SIMILARITY_THRESHOLD and score > best_score:
# if score > SIMILARITY_THRESHOLD and score > best_score:
if score > similarity_threshold and score > best_score:
best_score = score
best_group = key

Expand All @@ -286,7 +287,7 @@ def tokenize_header(text) -> Iterable[str]:
clean = text.translate(str.maketrans('', '', string.punctuation))
return clean.strip().split()

def label_text(text: str) -> List[str]:
def label_text(text: str, similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD) -> List[str]:
labels: List[str] = []

if isinstance(text, list):
Expand All @@ -297,7 +298,7 @@ def label_text(text: str) -> List[str]:
for token in tokenize_header(text):
synsets = get_synsets(token)
if synsets:
grp = match_group(synsets)
grp = match_group(synsets, similarity_threshold )
# Skip if the header matches a known false positive for this group

# if isinstance(text, list):
Expand Down Expand Up @@ -410,7 +411,7 @@ def is_false_positive_header(text: str, category: str) -> bool:
# logging.error("Error while extracting headers: ", str(e))
# return repository_metadata, [repo_data]

def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Result, List[str]]:
def extract_categories(repo_data: str, repository_metadata: Result, similarity_threshold=constants.CONF_DEFAULT_SIMILARITY_THRESHOLD) -> Tuple[Result, List[str]]:
logging.info("Extracting information using headers")

if not repo_data:
Expand All @@ -423,8 +424,8 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res
logging.warning("File to analyze has no headers")
return repository_metadata, [repo_data]

df['Group'] = df['Header'].map(label_text)
df['ParentGroup'] = df['ParentHeader'].fillna('').map(label_text)
df['Group'] = df['Header'].map(lambda x: label_text(x, similarity_threshold))
df['ParentGroup'] = df['ParentHeader'].fillna('').map(lambda x: label_text(x, similarity_threshold))

df.loc[df['Group'].str.len() == 0, 'Group'] = df['ParentGroup']
df = df.drop(columns=['ParentGroup'])
Expand Down
3 changes: 2 additions & 1 deletion src/somef/somef_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
logging.getLogger("urllib3").setLevel(logging.WARNING)

file_paths = configuration.get_configuration_file()
similarity_threshold = file_paths.get(constants.CONF_SIMILARITY_THRESHOLD, constants.CONF_DEFAULT_SIMILARITY_THRESHOLD)
repo_type = constants.RepositoryType.GITHUB
repository_metadata = Result()
def_branch = "main"
Expand Down Expand Up @@ -172,7 +173,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc
# remove html comments from unfiltered text (to avoid detecting commented out (wrong) metadata
readme_unfiltered_text = markdown_utils.remove_comments(readme_unfiltered_text)
repository_metadata, string_list = header_analysis.extract_categories(readme_unfiltered_text,
repository_metadata)
repository_metadata,similarity_threshold)

logging.info("Extracted categories from headers successfully.")
readme_text_unmarked = markdown_utils.unmark(readme_text)
Expand Down
16 changes: 16 additions & 0 deletions src/somef/test/test_header_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,3 +149,19 @@ def test_extract_headers_with_separators(self):
assert 'Installation' in headers
assert 'Citation' in headers
assert 'Funding' in headers


def test_issue_112_similarity_threshold(self):
    """
    Checks that the similarity_threshold parameter is respected in header analysis.

    The same README is analysed twice: once with a threshold of 0.8, where the
    installation category is expected in the results, and once with a threshold
    of 2.0, where it is expected to be absent.
    """
    # Read explicitly as UTF-8 so the test does not depend on the platform's default encoding.
    with open(test_data_path + "README-manim.md", "r", encoding="utf-8") as data_file:
        file_text = data_file.read()

    json_default, _ = extract_categories(file_text, Result(), similarity_threshold=0.8)
    assert constants.CAT_INSTALLATION in json_default.results, \
        "Expected CAT_INSTALLATION with threshold 0.8"

    # A threshold of 2.0 is extremely high (above the documented 0.0-1.0 range),
    # so nothing should be detected via similarity.
    json_impossible, _ = extract_categories(file_text, Result(), similarity_threshold=2.0)
    assert constants.CAT_INSTALLATION not in json_impossible.results, \
        f"Expected no CAT_INSTALLATION with threshold 2.0, got: {json_impossible.results.get(constants.CAT_INSTALLATION)}"
2 changes: 2 additions & 0 deletions src/somef/test/test_process_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,12 @@ def test_no_repository_metadata(self):
"https://github.com/oeg-upm/delta-ontology")
assert constants.CAT_RELEASES not in github_data.results.keys()

@unittest.skipIf(os.getenv("CI") == "true", "Skipped in CI because it is already verified locally")
def test_issue_284_issue_272(self):
"""Test designed to check if there are errors detecting title or stargazers"""
github_data, owner, repo_name, default_br, project_path = process_repository.\
load_online_repository_metadata(Result(), "https://github.com/3b1b/manim")

result_keys = github_data.results.keys()
assert ((constants.CAT_STARS in result_keys) and (constants.CAT_FULL_TITLE not in result_keys))

Expand Down
6 changes: 6 additions & 0 deletions src/somef/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,3 +541,9 @@ class RepositoryType(Enum):

DEPENDENCY_TYPE_RUNTIME = "runtime"
DEPENDENCY_TYPE_DEVELOPMENT = "development"

# Config key and fallback value for the header-similarity threshold. The default is used only
# when the key is missing from the config file; a value set in the config file takes priority.
CONF_SIMILARITY_THRESHOLD = "similarity_threshold"
CONF_DEFAULT_SIMILARITY_THRESHOLD = 0.8


Loading