From 3a25ffd5e080e32e5b9144c69d20b0201f548f34 Mon Sep 17 00:00:00 2001 From: codingfabi Date: Thu, 4 Sep 2025 17:05:10 +0200 Subject: [PATCH 1/2] add option to customize fuzzy search cutoff --- retraction_check/check_bib.py | 33 +++++++++++++++++++++------------ tests/test_check_bib.py | 13 +++++++++++++ 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/retraction_check/check_bib.py b/retraction_check/check_bib.py index 3f0e93a..6da443a 100644 --- a/retraction_check/check_bib.py +++ b/retraction_check/check_bib.py @@ -5,6 +5,13 @@ import io import difflib from typing import List, Literal, TypedDict, Set, Optional +from dataclasses import dataclass + +@dataclass +class RetractionCheckConfig: + fuzzy_cutoff: float = 0.6 + +default_config = RetractionCheckConfig() RETRACTION_WATCH_CSV = ( "https://gitlab.com/crossref/retraction-watch-data/-/raw/main/retraction_watch.csv" @@ -90,15 +97,19 @@ def build_retraction_lookup( return titles, dois -def fuzzy_title_match(title: str, titles: set[str]) -> bool: +def fuzzy_title_match( + title: str, titles: set[str], config: RetractionCheckConfig = None +) -> bool: + if config is None: + config = default_config if not title: return False - matches = difflib.get_close_matches(title.strip(), titles, n=1) + matches = difflib.get_close_matches(title.strip(), titles, n=1, cutoff=config.fuzzy_cutoff) return bool(matches) def is_retracted( - entry: BibEntry, titles: Set[str], dois: Set[str] + entry: BibEntry, titles: Set[str], dois: Set[str], config: RetractionCheckConfig = None ) -> Optional[MATCH_TYPE]: try: title = entry.get("title", "").strip() @@ -108,7 +119,7 @@ def is_retracted( return None if doi and doi in dois: return "doi" - if fuzzy_title_match(title, titles): + if fuzzy_title_match(title, titles, config=config): return "fuzzy" return None @@ -117,25 +128,23 @@ def check_entry( entry: BibEntry, titles: Optional[set[str]] = None, dois: Optional[set[str]] = None, + config: RetractionCheckConfig = None ) -> Optional[MATCH_TYPE]: - """ - Standalone function to check a single bibtex entry dict for retraction status. - Downloads and builds lookup if titles/dois are not provided. - Returns 'doi', 'fuzzy', or None. - """ + if config is None: + config = default_config if titles is None or dois is None: csv_rows = download_retraction_watch_csv() titles, dois = build_retraction_lookup(csv_rows) - return is_retracted(entry, titles, dois) + return is_retracted(entry, titles, dois, config=config) -def check_bib_file(bib_path: str) -> None: +def check_bib_file(bib_path: str, config: RetractionCheckConfig = None) -> None: entries = parse_bib_file(bib_path) csv_rows = download_retraction_watch_csv() titles, dois = build_retraction_lookup(csv_rows) matches: dict[str, list[str]] = {"doi": [], "fuzzy": []} for entry in entries: - match_type = is_retracted(entry, titles, dois) + match_type = is_retracted(entry, titles, dois, config) if match_type: matches[match_type].append(entry.get("title", "Unknown Title")) if matches["doi"]: diff --git a/tests/test_check_bib.py b/tests/test_check_bib.py index 2f98d80..858d754 100644 --- a/tests/test_check_bib.py +++ b/tests/test_check_bib.py @@ -133,6 +133,19 @@ def test_fuzzy_match_empty_title(self): result = fuzzy_title_match("", titles) self.assertFalse(result) + def test_fuzzy_cutoff_config_respected(self): + # Assuming fuzzy_title_match now takes a config or cutoff argument + from retraction_check.check_bib import RetractionCheckConfig + titles = {"Test Paper Title"} + # With a high cutoff, similar titles should not match + config = RetractionCheckConfig(fuzzy_cutoff=0.95) + result = fuzzy_title_match("Test Pper Titl", titles, config=config) + self.assertFalse(result) + # With a low cutoff, similar titles should match + config = RetractionCheckConfig(fuzzy_cutoff=0.5) + result = fuzzy_title_match("Test Pper Titl", titles, config=config) + self.assertTrue(result) + class TestIsRetracted(unittest.TestCase): From ab259432387d5e85f08e463c79a57e5760476753 Mon Sep 17 00:00:00 2001 From: codingfabi Date: Thu, 4 Sep 2025 17:38:36 +0200 Subject: [PATCH 2/2] fix linter errors --- retraction_check/check_bib.py | 19 ++++++++++++++----- tests/test_check_bib.py | 1 + 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/retraction_check/check_bib.py b/retraction_check/check_bib.py index 6da443a..3bd2d17 100644 --- a/retraction_check/check_bib.py +++ b/retraction_check/check_bib.py @@ -7,10 +7,12 @@ from typing import List, Literal, TypedDict, Set, Optional from dataclasses import dataclass + @dataclass class RetractionCheckConfig: fuzzy_cutoff: float = 0.6 + default_config = RetractionCheckConfig() RETRACTION_WATCH_CSV = ( @@ -98,18 +100,23 @@ def build_retraction_lookup( def fuzzy_title_match( - title: str, titles: set[str], config: RetractionCheckConfig = None + title: str, titles: set[str], config: Optional[RetractionCheckConfig] = None ) -> bool: if config is None: config = default_config if not title: return False - matches = difflib.get_close_matches(title.strip(), titles, n=1, cutoff=config.fuzzy_cutoff) + matches = difflib.get_close_matches( + title.strip(), titles, n=1, cutoff=config.fuzzy_cutoff + ) return bool(matches) def is_retracted( - entry: BibEntry, titles: Set[str], dois: Set[str], config: RetractionCheckConfig = None + entry: BibEntry, + titles: Set[str], + dois: Set[str], + config: Optional[RetractionCheckConfig] = None, ) -> Optional[MATCH_TYPE]: try: title = entry.get("title", "").strip() @@ -128,7 +135,7 @@ def check_entry( entry: BibEntry, titles: Optional[set[str]] = None, dois: Optional[set[str]] = None, - config: RetractionCheckConfig = None + config: Optional[RetractionCheckConfig] = None, ) -> Optional[MATCH_TYPE]: if config is None: config = default_config @@ -138,7 +145,9 @@ def check_entry( return is_retracted(entry, titles, dois, config=config) -def check_bib_file(bib_path: str, config: RetractionCheckConfig = None) -> None: +def check_bib_file( + bib_path: str, config: Optional[RetractionCheckConfig] = None +) -> None: entries = parse_bib_file(bib_path) csv_rows = download_retraction_watch_csv() titles, dois = build_retraction_lookup(csv_rows) diff --git a/tests/test_check_bib.py b/tests/test_check_bib.py index 858d754..58e6478 100644 --- a/tests/test_check_bib.py +++ b/tests/test_check_bib.py @@ -136,6 +136,7 @@ def test_fuzzy_match_empty_title(self): def test_fuzzy_cutoff_config_respected(self): # Assuming fuzzy_title_match now takes a config or cutoff argument from retraction_check.check_bib import RetractionCheckConfig + titles = {"Test Paper Title"} # With a high cutoff, similar titles should not match config = RetractionCheckConfig(fuzzy_cutoff=0.95)