diff --git a/retraction_check/check_bib.py b/retraction_check/check_bib.py index 3f0e93a..3bd2d17 100644 --- a/retraction_check/check_bib.py +++ b/retraction_check/check_bib.py @@ -5,6 +5,15 @@ import io import difflib from typing import List, Literal, TypedDict, Set, Optional +from dataclasses import dataclass + + +@dataclass +class RetractionCheckConfig: + fuzzy_cutoff: float = 0.6 + + +default_config = RetractionCheckConfig() RETRACTION_WATCH_CSV = ( "https://gitlab.com/crossref/retraction-watch-data/-/raw/main/retraction_watch.csv" @@ -90,15 +99,24 @@ def build_retraction_lookup( return titles, dois -def fuzzy_title_match(title: str, titles: set[str]) -> bool: +def fuzzy_title_match( + title: str, titles: set[str], config: Optional[RetractionCheckConfig] = None +) -> bool: + if config is None: + config = default_config if not title: return False - matches = difflib.get_close_matches(title.strip(), titles, n=1) + matches = difflib.get_close_matches( + title.strip(), titles, n=1, cutoff=config.fuzzy_cutoff + ) return bool(matches) def is_retracted( - entry: BibEntry, titles: Set[str], dois: Set[str] + entry: BibEntry, + titles: Set[str], + dois: Set[str], + config: Optional[RetractionCheckConfig] = None, ) -> Optional[MATCH_TYPE]: try: title = entry.get("title", "").strip() @@ -108,7 +126,7 @@ def is_retracted( return None if doi and doi in dois: return "doi" - if fuzzy_title_match(title, titles): + if fuzzy_title_match(title, titles, config=config): return "fuzzy" return None @@ -117,25 +135,25 @@ def check_entry( entry: BibEntry, titles: Optional[set[str]] = None, dois: Optional[set[str]] = None, + config: Optional[RetractionCheckConfig] = None, ) -> Optional[MATCH_TYPE]: - """ - Standalone function to check a single bibtex entry dict for retraction status. - Downloads and builds lookup if titles/dois are not provided. - Returns 'doi', 'fuzzy', or None. - """ + if config is None: + config = default_config if titles is None or dois is None: csv_rows = download_retraction_watch_csv() titles, dois = build_retraction_lookup(csv_rows) - return is_retracted(entry, titles, dois) + return is_retracted(entry, titles, dois, config=config) -def check_bib_file(bib_path: str) -> None: +def check_bib_file( + bib_path: str, config: Optional[RetractionCheckConfig] = None +) -> None: entries = parse_bib_file(bib_path) csv_rows = download_retraction_watch_csv() titles, dois = build_retraction_lookup(csv_rows) matches: dict[str, list[str]] = {"doi": [], "fuzzy": []} for entry in entries: - match_type = is_retracted(entry, titles, dois) + match_type = is_retracted(entry, titles, dois, config) if match_type: matches[match_type].append(entry.get("title", "Unknown Title")) if matches["doi"]: diff --git a/tests/test_check_bib.py b/tests/test_check_bib.py index 2f98d80..58e6478 100644 --- a/tests/test_check_bib.py +++ b/tests/test_check_bib.py @@ -133,6 +133,20 @@ def test_fuzzy_match_empty_title(self): result = fuzzy_title_match("", titles) self.assertFalse(result) + def test_fuzzy_cutoff_config_respected(self): + # Assuming fuzzy_title_match now takes a config or cutoff argument + from retraction_check.check_bib import RetractionCheckConfig + + titles = {"Test Paper Title"} + # With a high cutoff, similar titles should not match + config = RetractionCheckConfig(fuzzy_cutoff=0.95) + result = fuzzy_title_match("Test Pper Titl", titles, config=config) + self.assertFalse(result) + # With a low cutoff, similar titles should match + config = RetractionCheckConfig(fuzzy_cutoff=0.5) + result = fuzzy_title_match("Test Pper Titl", titles, config=config) + self.assertTrue(result) + class TestIsRetracted(unittest.TestCase):