def update_baseline(path: Path, limit: int = None, workers: int = 2, retries: int = 3):
    """Download the PubMed update files into ``path``.

    The file listing is read from ``FTP_UPDATE_CWD`` on ``FTP_URL`` via
    anonymous FTP; the files themselves are then fetched over HTTPS by a
    pool of worker threads. A file that already exists in ``path`` is
    skipped, so the function can be re-run to pick up where it left off.

    Args:
        path: Target directory; created if it does not exist.
        limit: When a positive integer, only the first ``limit`` entries of
            the reversed listing are downloaded. ``None`` or a non-positive
            value downloads everything.
        workers: Number of download threads. Values below 1 are treated
            as 1.
        retries: Maximum number of attempts per file. Values below 1 are
            treated as 1 so every file is tried at least once.

    Returns:
        None. One status line per file is printed as downloads finish:
        ``skip <name>``, ``done <name>`` or ``FAILED <name>: <error>``.
    """
    # Accept str as well as Path; the "/" join below needs a Path.
    path = Path(path)

    # Only the listing needs FTP, so the connection is closed before the
    # (potentially long) download phase starts.
    with FTP(host=FTP_URL, user="anonymous") as ftp:
        ftp.cwd(FTP_UPDATE_CWD)
        files = []
        ftp.dir(files.append)

    os.makedirs(str(path), exist_ok=True)
    # NOTE(review): the listing is reversed before the limit is applied,
    # presumably so the most recent update files come first -- confirm
    # against the ordering produced by datautils.dir_to_filenames.
    filenames = list(reversed(datautils.dir_to_filenames(files)))

    if limit is not None and limit > 0:
        filenames = filenames[:limit]
        print(f"Limit set: Downloading first {limit} documents ...")

    # Guard against retries <= 0, which would otherwise skip the download
    # loop entirely and make the worker return None.
    attempts = max(1, retries)

    def download_one(filename):
        target = path / filename
        if target.exists():
            return f"skip {filename}"

        url = "https://" + FTP_URL + FTP_UPDATE_CWD + filename

        for attempt in range(1, attempts + 1):
            try:
                util.download_file(url, target)
            except Exception as e:
                # A failed transfer may leave a truncated file behind; remove
                # it so the exists() check above does not treat it as a
                # finished download on the next run. Best effort only.
                try:
                    target.unlink()
                except OSError:
                    pass
                if attempt == attempts:
                    return f"FAILED {filename}: {e}"
                # Linear back-off before the next attempt.
                time.sleep(2 * attempt)
            else:
                return f"done {filename}"

    # ThreadPoolExecutor rejects max_workers <= 0, so clamp to one thread.
    with ThreadPoolExecutor(max_workers=max(1, workers)) as executor:
        futures = [executor.submit(download_one, fn) for fn in filenames]
        for f in as_completed(futures):
            print(f.result())
b/src/pybool_ir/datasets/pubmed/datautils.py index 0c1ca5c..c8863ef 100644 --- a/src/pybool_ir/datasets/pubmed/datautils.py +++ b/src/pybool_ir/datasets/pubmed/datautils.py @@ -8,6 +8,7 @@ FTP_URL = "ftp.ncbi.nlm.nih.gov" FTP_BASELINE_CWD = "/pubmed/baseline/" +FTP_UPDATE_CWD = "/pubmed/updatefiles/" FTP_PMC_CWD = "/pubmed/baseline/" MESH_YEAR = "2025"