Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 37 additions & 1 deletion src/pybool_ir/datasets/pubmed/baseline.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import os
import time
from ftplib import FTP
from pathlib import Path

from concurrent.futures import ThreadPoolExecutor, as_completed

from pybool_ir import util
from pybool_ir.datasets.pubmed import datautils
from pybool_ir.datasets.pubmed.datautils import FTP_URL, FTP_BASELINE_CWD
from pybool_ir.datasets.pubmed.datautils import FTP_URL, FTP_BASELINE_CWD, FTP_UPDATE_CWD

def download_baseline(path: Path, limit: int = None, workers: int = 2, retries: int = 3):
with FTP(host=FTP_URL, user="anonymous") as ftp:
Expand Down Expand Up @@ -41,3 +42,38 @@ def download_one(filename):
futures = [executor.submit(download_one, fn) for fn in filenames]
for f in as_completed(futures):
print(f.result())


def update_baseline(path: Path, limit: int = None, workers: int = 2, retries: int = 3):
    """Download the PubMed update files into ``path``.

    The file listing is read from ``FTP_UPDATE_CWD`` on ``FTP_URL`` over
    anonymous FTP; the files themselves are then fetched over HTTPS by a
    thread pool. Files already present in ``path`` are skipped. One status
    line (``skip ...``, ``done ...`` or ``FAILED ...``) is printed per file
    as its download finishes.

    Args:
        path: Directory to download into; created if it does not exist.
        limit: If a positive integer, only the first ``limit`` entries of
            the reversed listing are downloaded. ``None`` or a non-positive
            value downloads everything.
        workers: Number of concurrent download threads.
        retries: Maximum number of attempts per file. Values below 1 are
            treated as 1 so that every file is attempted at least once.
    """
    # Accept plain strings as well as Path objects; `path / filename` below
    # requires a Path.
    path = Path(path)

    # The FTP session is only needed for the directory listing; the actual
    # transfers below go over HTTPS, so the connection is closed right away.
    with FTP(host=FTP_URL, user="anonymous") as ftp:
        ftp.cwd(FTP_UPDATE_CWD)
        files = []
        ftp.dir(files.append)

    os.makedirs(str(path), exist_ok=True)
    # Reverse whatever order dir_to_filenames yields (presumably so the most
    # recent update files come first -- confirm against datautils).
    filenames = list(reversed(datautils.dir_to_filenames(files)))

    if limit is not None and limit > 0:
        filenames = filenames[:limit]
        print(f"Limit set: Downloading first {limit} documents ...")

    # With retries <= 0 the attempt loop would never run and download_one
    # would silently return None; always try at least once.
    attempts = max(1, retries)

    def download_one(filename):
        """Fetch one file with retries and return a one-line status string."""
        target = path / filename
        if target.exists():
            return f"skip {filename}"

        url = "https://" + FTP_URL + FTP_UPDATE_CWD + filename

        for attempt in range(1, attempts + 1):
            try:
                util.download_file(url, target)
                return f"done {filename}"
            except Exception as e:
                # A failed transfer may leave a truncated file behind
                # (depends on util.download_file, not visible here). Remove
                # it so the exists() check on a later run does not mistake
                # it for a completed download. The file cannot predate this
                # call because of the exists() guard above.
                try:
                    target.unlink()
                except OSError:
                    # Nothing was written, or it cannot be removed; the
                    # original download error is the one worth reporting.
                    pass
                if attempt == attempts:
                    return f"FAILED {filename}: {e}"
                # Linear back-off before the next attempt.
                time.sleep(2 * attempt)

    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(download_one, fn) for fn in filenames]
        for f in as_completed(futures):
            print(f.result())
1 change: 1 addition & 0 deletions src/pybool_ir/datasets/pubmed/datautils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

# Host name of the NCBI server; used both as the FTP host for directory
# listings and as the host part of the HTTPS download URLs.
FTP_URL = "ftp.ncbi.nlm.nih.gov"
# Server directory holding the PubMed baseline files.
FTP_BASELINE_CWD = "/pubmed/baseline/"
# Server directory holding the PubMed update files.
FTP_UPDATE_CWD = "/pubmed/updatefiles/"
# NOTE(review): identical to FTP_BASELINE_CWD -- confirm this is the intended
# directory for PMC data and not a copy-paste leftover.
FTP_PMC_CWD = "/pubmed/baseline/"

# Presumably the MeSH release year used elsewhere in this module -- confirm.
MESH_YEAR = "2025"
Expand Down
Loading