Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 30 additions & 12 deletions retraction_check/check_bib.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@
import io
import difflib
from typing import List, Literal, TypedDict, Set, Optional
from dataclasses import dataclass


@dataclass
class RetractionCheckConfig:
    """Tunable settings for the retraction check.

    Raises:
        ValueError: If ``fuzzy_cutoff`` is outside the closed interval
            [0.0, 1.0] when the config is constructed.
    """

    # Minimum similarity score a candidate title must reach to count as a
    # fuzzy match; forwarded as ``cutoff`` to ``difflib.get_close_matches``.
    fuzzy_cutoff: float = 0.6

    def __post_init__(self) -> None:
        """Validate field values as soon as the config is built."""
        # difflib rejects cutoffs outside [0.0, 1.0] with a ValueError, but
        # only when a match is attempted. Failing here points at the bad
        # configuration instead of at an unrelated lookup call. The negated
        # chained comparison also rejects NaN.
        if not 0.0 <= self.fuzzy_cutoff <= 1.0:
            raise ValueError(
                f"fuzzy_cutoff must be in [0.0, 1.0], got {self.fuzzy_cutoff!r}"
            )


# Shared fallback used when a caller does not supply its own config.
default_config = RetractionCheckConfig()

RETRACTION_WATCH_CSV = (
"https://gitlab.com/crossref/retraction-watch-data/-/raw/main/retraction_watch.csv"
Expand Down Expand Up @@ -90,15 +99,24 @@ def build_retraction_lookup(
return titles, dois


def fuzzy_title_match(
    title: str, titles: set[str], config: Optional[RetractionCheckConfig] = None
) -> bool:
    """Return True if ``title`` closely resembles any entry in ``titles``.

    Args:
        title: Title to look up. Surrounding whitespace is ignored.
        titles: Candidate titles to compare against.
        config: Settings providing ``fuzzy_cutoff``; ``default_config`` is
            used when omitted.

    Returns:
        True when at least one candidate reaches the configured similarity
        cutoff, False otherwise (including for empty or whitespace-only
        titles).
    """
    if config is None:
        config = default_config
    # Strip before the emptiness check: a whitespace-only title would
    # otherwise be compared as "" and score a perfect match against any
    # empty title present in the lookup set.
    query = (title or "").strip()
    if not query:
        return False
    matches = difflib.get_close_matches(
        query, titles, n=1, cutoff=config.fuzzy_cutoff
    )
    return bool(matches)


def is_retracted(
entry: BibEntry, titles: Set[str], dois: Set[str]
entry: BibEntry,
titles: Set[str],
dois: Set[str],
config: Optional[RetractionCheckConfig] = None,
) -> Optional[MATCH_TYPE]:
try:
title = entry.get("title", "").strip()
Expand All @@ -108,7 +126,7 @@ def is_retracted(
return None
if doi and doi in dois:
return "doi"
if fuzzy_title_match(title, titles):
if fuzzy_title_match(title, titles, config=config):
return "fuzzy"
return None

Expand All @@ -117,25 +135,25 @@ def check_entry(
entry: BibEntry,
titles: Optional[set[str]] = None,
dois: Optional[set[str]] = None,
config: Optional[RetractionCheckConfig] = None,
) -> Optional[MATCH_TYPE]:
"""
Standalone function to check a single bibtex entry dict for retraction status.
Downloads and builds lookup if titles/dois are not provided.
Returns 'doi', 'fuzzy', or None.
"""
if config is None:
config = default_config
if titles is None or dois is None:
csv_rows = download_retraction_watch_csv()
titles, dois = build_retraction_lookup(csv_rows)
return is_retracted(entry, titles, dois)
return is_retracted(entry, titles, dois, config=config)


def check_bib_file(bib_path: str) -> None:
def check_bib_file(
bib_path: str, config: Optional[RetractionCheckConfig] = None
) -> None:
entries = parse_bib_file(bib_path)
csv_rows = download_retraction_watch_csv()
titles, dois = build_retraction_lookup(csv_rows)
matches: dict[str, list[str]] = {"doi": [], "fuzzy": []}
for entry in entries:
match_type = is_retracted(entry, titles, dois)
match_type = is_retracted(entry, titles, dois, config)
if match_type:
matches[match_type].append(entry.get("title", "Unknown Title"))
if matches["doi"]:
Expand Down
14 changes: 14 additions & 0 deletions tests/test_check_bib.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,20 @@ def test_fuzzy_match_empty_title(self):
result = fuzzy_title_match("", titles)
self.assertFalse(result)

def test_fuzzy_cutoff_config_respected(self):
    """The cutoff from RetractionCheckConfig decides whether a near-miss matches."""
    from retraction_check.check_bib import RetractionCheckConfig

    titles = {"Test Paper Title"}
    # Two characters short of the stored title: similarity is about 0.93,
    # which sits between the strict and lenient cutoffs below.
    misspelled = "Test Pper Titl"
    expectations = (
        (0.95, False),  # high cutoff: similar titles should not match
        (0.5, True),  # low cutoff: similar titles should match
    )
    for cutoff, should_match in expectations:
        # subTest reports each cutoff separately, so a failure of the
        # strict case does not hide the outcome of the lenient one.
        with self.subTest(cutoff=cutoff):
            config = RetractionCheckConfig(fuzzy_cutoff=cutoff)
            result = fuzzy_title_match(misspelled, titles, config=config)
            self.assertEqual(result, should_match)


class TestIsRetracted(unittest.TestCase):

Expand Down