Skip to content

Commit 519695b

Browse files
committed
Add Open Targets target prioritisation bundle support
1 parent 778b8c1 commit 519695b

11 files changed

Lines changed: 320 additions & 34 deletions

File tree

README.md

Lines changed: 31 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
- Pluggable cache backend architecture (filesystem cache by default).
1010
- API dataset ingestion for paginated JSON endpoints (for example ChEMBL and UniProt).
1111
- HTTP conditional refresh support (`ETag` / `Last-Modified`) when enabled.
12+
- Support for partitioned parquet bundle downloads (for example Open Targets releases).
1213
- Incremental parquet materialization (chunked processing + partitioned parquet parts).
1314
- CLI for listing, fetching, and materializing datasets.
1415
- Query interface for filtered row access from materialized parquet datasets.
@@ -17,7 +18,7 @@
1718

1819
## Included datasets
1920

20-
The default catalog includes local-file/HTTP datasets plus API presets useful in drug discovery, including **ZINC**, **BindingDB**, **ChEMBL**, **UniProt**, **openFDA**, and the **Human Protein Atlas**.
21+
The default catalog includes local-file/HTTP datasets plus API presets useful in drug discovery, including **ZINC**, **BindingDB**, **Open Targets**, **ChEMBL**, **UniProt**, **openFDA**, and the **Human Protein Atlas**.
2122

2223
1. `zinc15_250k` (ZINC)
2324
2. `zinc15_tranche_druglike_instock` (ZINC tranche)
@@ -39,36 +40,38 @@ The default catalog includes local-file/HTTP datasets plus API presets useful in
3940
18. `bindingdb_articles_affinity`
4041
19. `openfda_drug_event_serious`
4142
20. `proteinatlas_human_proteome`
42-
21. `chembl_activity_ki_human`
43-
22. `chembl_activity_ic50_human`
44-
23. `chembl_activity_kd_human`
45-
24. `chembl_activity_ec50_human`
46-
25. `chembl_activity_ac50_human`
47-
26. `chembl_assays_binding_human`
48-
27. `chembl_assays_functional_human`
49-
28. `chembl_assays_adme_human`
50-
29. `chembl_targets_human_single_protein`
51-
30. `chembl_targets_human_protein_complex`
52-
31. `chembl_molecules_phase3plus`
53-
32. `chembl_molecules_phase4`
54-
33. `chembl_molecules_black_box_warning`
55-
34. `chembl_mechanism_phase2plus`
56-
35. `chembl_drug_indications_phase2plus`
57-
36. `chembl_drug_indications_phase3plus`
58-
37. `uniprot_human_reviewed`
59-
38. `uniprot_human_receptors`
60-
39. `uniprot_human_membrane`
61-
40. `uniprot_human_nucleus`
62-
41. `uniprot_human_kinases`
63-
42. `uniprot_human_gpcr`
64-
43. `uniprot_human_ion_channels`
65-
44. `uniprot_human_transporters`
66-
45. `uniprot_human_secreted`
67-
46. `uniprot_human_transcription_factors`
68-
47. `uniprot_human_enzymes`
43+
21. `opentargets_target_prioritisation`
44+
22. `chembl_activity_ki_human`
45+
23. `chembl_activity_ic50_human`
46+
24. `chembl_activity_kd_human`
47+
25. `chembl_activity_ec50_human`
48+
26. `chembl_activity_ac50_human`
49+
27. `chembl_assays_binding_human`
50+
28. `chembl_assays_functional_human`
51+
29. `chembl_assays_adme_human`
52+
30. `chembl_targets_human_single_protein`
53+
31. `chembl_targets_human_protein_complex`
54+
32. `chembl_molecules_phase3plus`
55+
33. `chembl_molecules_phase4`
56+
34. `chembl_molecules_black_box_warning`
57+
35. `chembl_mechanism_phase2plus`
58+
36. `chembl_drug_indications_phase2plus`
59+
37. `chembl_drug_indications_phase3plus`
60+
38. `uniprot_human_reviewed`
61+
39. `uniprot_human_receptors`
62+
40. `uniprot_human_membrane`
63+
41. `uniprot_human_nucleus`
64+
42. `uniprot_human_kinases`
65+
43. `uniprot_human_gpcr`
66+
44. `uniprot_human_ion_channels`
67+
45. `uniprot_human_transporters`
68+
46. `uniprot_human_secreted`
69+
47. `uniprot_human_transcription_factors`
70+
48. `uniprot_human_enzymes`
6971

7072
Most of these are distributed through MoleculeNet/DeepChem mirrors and retain upstream licensing terms.
7173
BindingDB is included as a versioned ZIP-backed TSV snapshot for literature-derived affinity modeling.
74+
Open Targets is included as a versioned parquet-part bundle for target prioritisation workflows.
7275
ChEMBL, UniProt, and openFDA presets are fetched through their public REST APIs and cached locally as JSONL.
7376
ZINC tranche presets aggregate multiple tranche files per dataset (drug-like MW B-K and logP A-K bins,
7477
reactivity A/B/C/E) into one cached tabular source during fetch.

src/refua_data/cache.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,22 @@ def write_json(self, path: Path, payload: dict[str, Any]) -> None:
9292

9393

9494
def sha256_file(path: Path) -> str:
95-
"""Compute the SHA256 checksum of a file."""
95+
"""Compute the SHA256 checksum of a file or directory."""
96+
if path.is_dir():
97+
digest = hashlib.sha256()
98+
for child in sorted(candidate for candidate in path.rglob("*") if candidate.is_file()):
99+
relative = child.relative_to(path).as_posix().encode("utf-8")
100+
digest.update(relative)
101+
digest.update(b"\0")
102+
with child.open("rb") as handle:
103+
while True:
104+
chunk = handle.read(_CHUNK_SIZE)
105+
if not chunk:
106+
break
107+
digest.update(chunk)
108+
digest.update(b"\0")
109+
return digest.hexdigest()
110+
96111
digest = hashlib.sha256()
97112
with path.open("rb") as handle:
98113
while True:

src/refua_data/catalog.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,20 @@ def _zinc_druglike_tranche_urls(
6363
)
6464

6565

66+
def _opentargets_parquet_part_urls(
67+
*,
68+
release: str,
69+
dataset: str,
70+
part_token: str,
71+
part_count: int,
72+
) -> tuple[str, ...]:
73+
base = f"https://ftp.ebi.ac.uk/pub/databases/opentargets/platform/{release}/output/{dataset}"
74+
return tuple(
75+
f"{base}/part-{index:05d}-{part_token}-c000.snappy.parquet"
76+
for index in range(part_count)
77+
)
78+
79+
6680
_DEFAULT_DATASETS = [
6781
DatasetDefinition(
6882
dataset_id="zinc15_250k",
@@ -439,6 +453,41 @@ def _zinc_druglike_tranche_urls(
439453
"subcellular_localization",
440454
),
441455
),
456+
DatasetDefinition(
457+
dataset_id="opentargets_target_prioritisation",
458+
name="Open Targets Target Prioritisation",
459+
description=(
460+
"Open Targets target prioritisation features spanning tractability, "
461+
"safety, novelty, and evidence-linked attributes for therapeutic "
462+
"target ranking."
463+
),
464+
source="Open Targets Platform downloadable parquet",
465+
homepage="https://platform-docs.opentargets.org/data-access/datasets",
466+
license_name="CC0 1.0 (Open Targets Platform data)",
467+
license_url="https://platform-docs.opentargets.org/licence",
468+
urls=_opentargets_parquet_part_urls(
469+
release="25.03",
470+
dataset="target_prioritisation",
471+
part_token="9647e5c1-fd87-47e0-8c5d-3b1429e19b9a",
472+
part_count=16,
473+
),
474+
file_format="parquet",
475+
category="targets",
476+
version="25.03",
477+
filename="target_prioritisation",
478+
url_mode="bundle",
479+
usage_notes=(
480+
"Use for target ranking, tractability-aware portfolio construction, "
481+
"and feature generation for target prioritisation models.",
482+
),
483+
tags=(
484+
"opentargets",
485+
"target_prioritisation",
486+
"tractability",
487+
"safety",
488+
"target_ranking",
489+
),
490+
),
442491
DatasetDefinition(
443492
dataset_id="chembl_activity_ki_human",
444493
name="ChEMBL Human Ki Activities",

src/refua_data/downloader.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,16 @@ def fetch_dataset(
154154
timeout_seconds=timeout_seconds,
155155
)
156156

157+
if dataset.url_mode == "bundle":
158+
return _fetch_bundle_urls(
159+
dataset=dataset,
160+
cache=cache,
161+
raw_path=raw_path,
162+
meta_path=meta_path,
163+
refresh=refresh,
164+
timeout_seconds=timeout_seconds,
165+
)
166+
157167
errors: list[str] = []
158168
for url in dataset.urls:
159169
try:
@@ -302,6 +312,13 @@ def _download_url_to_path(
302312
raise ValueError(f"Unsupported URL scheme for {url}")
303313

304314

315+
def _remove_path(path: Path) -> None:
316+
if path.is_dir():
317+
shutil.rmtree(path)
318+
return
319+
path.unlink(missing_ok=True)
320+
321+
305322
def _fetch_concat_urls(
306323
*,
307324
dataset: DatasetDefinition,
@@ -392,6 +409,74 @@ def _fetch_concat_urls(
392409
)
393410

394411

412+
def _fetch_bundle_urls(
    *,
    dataset: DatasetDefinition,
    cache: CacheBackend,
    raw_path: Path,
    meta_path: Path,
    refresh: bool,
    timeout_seconds: float,
) -> FetchResult:
    """Download every URL of a ``bundle``-mode dataset into one directory.

    Unlike ``concat`` mode, the parts are kept as individual files inside a
    directory at ``raw_path`` (e.g. a set of parquet parts). The download is
    staged into a sibling ``.tmp`` directory and promoted atomically with
    ``os.replace`` so a partial fetch never leaves a half-written bundle at
    ``raw_path``.

    Raises whatever ``_download_url_to_path`` raises on a failed part; the
    staging directory is removed before re-raising.
    """
    raw_path.parent.mkdir(parents=True, exist_ok=True)
    # Stage into "<name>.tmp" next to the final path so os.replace stays on
    # the same filesystem.
    tmp_path = raw_path.with_name(f"{raw_path.name}.tmp")
    if tmp_path.exists():
        # Leftover from an earlier interrupted fetch — start clean.
        _remove_path(tmp_path)
    tmp_path.mkdir(parents=True, exist_ok=True)

    bytes_downloaded = 0
    source_details: list[dict[str, Any]] = []

    try:
        for index, url in enumerate(dataset.urls):
            # Keep the upstream basename when present; otherwise synthesize a
            # stable zero-padded part name so ordering is preserved.
            candidate_name = Path(urlparse(url).path).name
            filename = candidate_name or f"part-{index:05d}"
            detail = _download_url_to_path(
                url=url,
                dest_path=tmp_path / filename,
                timeout_seconds=timeout_seconds,
            )
            source_details.append(detail)
            bytes_downloaded += int(detail.get("bytes_downloaded", 0))

        if raw_path.exists():
            # os.replace cannot overwrite a non-empty directory; clear first.
            _remove_path(raw_path)
        os.replace(tmp_path, raw_path)
    except Exception:
        # Never leave a partially-downloaded staging directory behind.
        _remove_path(tmp_path)
        raise

    # sha256_file handles directories by hashing relative names + contents.
    checksum = sha256_file(raw_path)
    source_url = dataset.urls[0]
    meta = {
        "dataset_id": dataset.dataset_id,
        "version": dataset.version,
        # NOTE(review): "multi_url" presumably mirrors the concat-mode value
        # so downstream metadata consumers treat both uniformly — confirm.
        "source_type": "multi_url",
        "source_url": source_url,
        "source_urls": list(dataset.urls),
        "url_mode": dataset.url_mode,
        "source_count": len(dataset.urls),
        "fetched_at": _utcnow_iso(),
        "refreshed": refresh,
        "bytes_downloaded": bytes_downloaded,
        "sources": source_details,
        "sha256": checksum,
    }
    _write_raw_metadata(cache, meta_path, dataset=dataset, meta=meta)

    return FetchResult(
        dataset_id=dataset.dataset_id,
        version=dataset.version,
        raw_path=raw_path,
        metadata_path=meta_path,
        source_url=source_url,
        cache_hit=False,
        refreshed=refresh,
        bytes_downloaded=bytes_downloaded,
        sha256=checksum,
    )
479+
395480
def _fetch_file_url(
396481
*,
397482
dataset: DatasetDefinition,

src/refua_data/io.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ def iter_dataset_chunks(
3333
chunksize: int,
3434
) -> Iterator[pd.DataFrame]:
3535
"""Yield DataFrame chunks from a dataset raw file."""
36+
if dataset.file_format == "parquet":
37+
yield from _iter_parquet_chunks(raw_path, chunksize=chunksize)
38+
return
39+
3640
if dataset.file_format == "jsonl":
3741
yield from _iter_jsonl_chunks(raw_path, chunksize=chunksize)
3842
return
@@ -70,6 +74,26 @@ def _iter_jsonl_chunks(raw_path: Path, *, chunksize: int) -> Iterator[pd.DataFra
7074
yield prepare_dataframe(chunk)
7175

7276

77+
def _iter_parquet_chunks(raw_path: Path, *, chunksize: int) -> Iterator[pd.DataFrame]:
    """Yield prepared DataFrame chunks from a parquet file or part directory.

    A directory is treated as a bundle of ``*.parquet`` parts read in sorted
    name order; a plain path is read as a single file. Each file is loaded
    whole, then sliced into chunks of at most ``chunksize`` rows.
    """
    if raw_path.is_dir():
        parquet_files = sorted(raw_path.glob("*.parquet"))
    else:
        parquet_files = [raw_path]

    if not parquet_files:
        raise ValueError(f"No parquet files found at '{raw_path}'.")

    for part in parquet_files:
        frame = pd.read_parquet(part)
        if len(frame) <= chunksize:
            # Small (or empty) frames are yielded as-is, without slicing.
            yield prepare_dataframe(frame)
        else:
            for offset in range(0, len(frame), chunksize):
                yield prepare_dataframe(frame.iloc[offset : offset + chunksize])
95+
96+
7397
def _iter_csv_like_from_zip(
7498
raw_path: Path,
7599
*,

src/refua_data/models.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
from typing import Any, Literal
88
from urllib.parse import urlparse
99

10-
TabularFormat = Literal["csv", "tsv", "jsonl"]
10+
TabularFormat = Literal["csv", "tsv", "jsonl", "parquet"]
1111
Compression = Literal["none", "gzip", "zip", "infer"]
1212
ApiPaginationMode = Literal["none", "chembl", "link_header"]
13-
UrlMode = Literal["fallback", "concat"]
13+
UrlMode = Literal["fallback", "concat", "bundle"]
1414

1515
_CATEGORY_USAGE_DEFAULTS: dict[str, str] = {
1616
"compound_library": "Use for compound library curation, screening, and molecular pretraining.",
@@ -94,6 +94,7 @@ def preferred_filename(self) -> str:
9494
"csv": ".csv",
9595
"tsv": ".tsv",
9696
"jsonl": ".jsonl",
97+
"parquet": ".parquet",
9798
}[self.file_format]
9899
return f"{self.dataset_id}{fallback_ext}"
99100

src/refua_data/validation.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ def validate_dataset_sources(
3838
3939
For datasets with multiple URLs in `fallback` mode, probes are attempted in
4040
order and the dataset is considered healthy once one source succeeds.
41-
For datasets in `concat` mode, every configured source must be reachable.
41+
For datasets in `concat` or `bundle` mode, every configured source must be
42+
reachable.
4243
"""
4344
if dataset.api is not None:
4445
return [_probe_api(dataset, dataset.api, timeout_seconds=timeout_seconds)]
@@ -56,7 +57,7 @@ def validate_dataset_sources(
5657
)
5758
]
5859

59-
if dataset.url_mode == "concat":
60+
if dataset.url_mode in {"concat", "bundle"}:
6061
concat_attempts = [
6162
_probe_url(dataset, url, timeout_seconds=timeout_seconds)
6263
for url in dataset.urls

tests/test_catalog.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ def test_default_catalog_contains_core_and_api_datasets() -> None:
2222
assert "openfda_drug_event_serious" in ids
2323
assert "bindingdb_articles_affinity" in ids
2424
assert "proteinatlas_human_proteome" in ids
25+
assert "opentargets_target_prioritisation" in ids
2526
assert "uniprot_human_reviewed" in ids
2627
assert "uniprot_human_receptors" in ids
2728
assert "uniprot_human_membrane" in ids
@@ -30,4 +31,4 @@ def test_default_catalog_contains_core_and_api_datasets() -> None:
3031
assert "uniprot_human_transcription_factors" in ids
3132
assert "uniprot_human_enzymes" in ids
3233
assert "chembl_targets_human_protein_complex" in ids
33-
assert len(datasets) >= 47
34+
assert len(datasets) >= 48

0 commit comments

Comments
 (0)