Skip to content

Commit 519695b

Browse files
committed
Add Open Targets target prioritisation bundle support
1 parent 778b8c1 commit 519695b

11 files changed

Lines changed: 320 additions & 34 deletions

File tree

README.md

Lines changed: 31 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
- Pluggable cache backend architecture (filesystem cache by default).
1010
- API dataset ingestion for paginated JSON endpoints (for example ChEMBL and UniProt).
1111
- HTTP conditional refresh support (`ETag` / `Last-Modified`) when enabled.
12+
- Support for partitioned parquet bundle downloads (for example Open Targets releases).
1213
- Incremental parquet materialization (chunked processing + partitioned parquet parts).
1314
- CLI for listing, fetching, and materializing datasets.
1415
- Query interface for filtered row access from materialized parquet datasets.
@@ -17,7 +18,7 @@
1718

1819
## Included datasets
1920

20-
The default catalog includes local-file/HTTP datasets plus API presets useful in drug discovery, including **ZINC**, **BindingDB**, **ChEMBL**, **UniProt**, **openFDA**, and the **Human Protein Atlas**.
21+
The default catalog includes local-file/HTTP datasets plus API presets useful in drug discovery, including **ZINC**, **BindingDB**, **Open Targets**, **ChEMBL**, **UniProt**, **openFDA**, and the **Human Protein Atlas**.
2122

2223
1. `zinc15_250k` (ZINC)
2324
2. `zinc15_tranche_druglike_instock` (ZINC tranche)
@@ -39,36 +40,38 @@ The default catalog includes local-file/HTTP datasets plus API presets useful in
3940
18. `bindingdb_articles_affinity`
4041
19. `openfda_drug_event_serious`
4142
20. `proteinatlas_human_proteome`
42-
21. `chembl_activity_ki_human`
43-
22. `chembl_activity_ic50_human`
44-
23. `chembl_activity_kd_human`
45-
24. `chembl_activity_ec50_human`
46-
25. `chembl_activity_ac50_human`
47-
26. `chembl_assays_binding_human`
48-
27. `chembl_assays_functional_human`
49-
28. `chembl_assays_adme_human`
50-
29. `chembl_targets_human_single_protein`
51-
30. `chembl_targets_human_protein_complex`
52-
31. `chembl_molecules_phase3plus`
53-
32. `chembl_molecules_phase4`
54-
33. `chembl_molecules_black_box_warning`
55-
34. `chembl_mechanism_phase2plus`
56-
35. `chembl_drug_indications_phase2plus`
57-
36. `chembl_drug_indications_phase3plus`
58-
37. `uniprot_human_reviewed`
59-
38. `uniprot_human_receptors`
60-
39. `uniprot_human_membrane`
61-
40. `uniprot_human_nucleus`
62-
41. `uniprot_human_kinases`
63-
42. `uniprot_human_gpcr`
64-
43. `uniprot_human_ion_channels`
65-
44. `uniprot_human_transporters`
66-
45. `uniprot_human_secreted`
67-
46. `uniprot_human_transcription_factors`
68-
47. `uniprot_human_enzymes`
43+
21. `opentargets_target_prioritisation`
44+
22. `chembl_activity_ki_human`
45+
23. `chembl_activity_ic50_human`
46+
24. `chembl_activity_kd_human`
47+
25. `chembl_activity_ec50_human`
48+
26. `chembl_activity_ac50_human`
49+
27. `chembl_assays_binding_human`
50+
28. `chembl_assays_functional_human`
51+
29. `chembl_assays_adme_human`
52+
30. `chembl_targets_human_single_protein`
53+
31. `chembl_targets_human_protein_complex`
54+
32. `chembl_molecules_phase3plus`
55+
33. `chembl_molecules_phase4`
56+
34. `chembl_molecules_black_box_warning`
57+
35. `chembl_mechanism_phase2plus`
58+
36. `chembl_drug_indications_phase2plus`
59+
37. `chembl_drug_indications_phase3plus`
60+
38. `uniprot_human_reviewed`
61+
39. `uniprot_human_receptors`
62+
40. `uniprot_human_membrane`
63+
41. `uniprot_human_nucleus`
64+
42. `uniprot_human_kinases`
65+
43. `uniprot_human_gpcr`
66+
44. `uniprot_human_ion_channels`
67+
45. `uniprot_human_transporters`
68+
46. `uniprot_human_secreted`
69+
47. `uniprot_human_transcription_factors`
70+
48. `uniprot_human_enzymes`
6971

7072
Most of these are distributed through MoleculeNet/DeepChem mirrors and retain upstream licensing terms.
7173
BindingDB is included as a versioned ZIP-backed TSV snapshot for literature-derived affinity modeling.
74+
Open Targets is included as a versioned parquet-part bundle for target prioritisation workflows.
7275
ChEMBL, UniProt, and openFDA presets are fetched through their public REST APIs and cached locally as JSONL.
7376
ZINC tranche presets aggregate multiple tranche files per dataset (drug-like MW B-K and logP A-K bins,
7477
reactivity A/B/C/E) into one cached tabular source during fetch.

src/refua_data/cache.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,22 @@ def write_json(self, path: Path, payload: dict[str, Any]) -> None:
9292

9393

9494
def sha256_file(path: Path) -> str:
95-
"""Compute the SHA256 checksum of a file."""
95+
"""Compute the SHA256 checksum of a file or directory."""
96+
if path.is_dir():
97+
digest = hashlib.sha256()
98+
for child in sorted(candidate for candidate in path.rglob("*") if candidate.is_file()):
99+
relative = child.relative_to(path).as_posix().encode("utf-8")
100+
digest.update(relative)
101+
digest.update(b"\0")
102+
with child.open("rb") as handle:
103+
while True:
104+
chunk = handle.read(_CHUNK_SIZE)
105+
if not chunk:
106+
break
107+
digest.update(chunk)
108+
digest.update(b"\0")
109+
return digest.hexdigest()
110+
96111
digest = hashlib.sha256()
97112
with path.open("rb") as handle:
98113
while True:

src/refua_data/catalog.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,20 @@ def _zinc_druglike_tranche_urls(
6363
)
6464

6565

66+
def _opentargets_parquet_part_urls(
67+
*,
68+
release: str,
69+
dataset: str,
70+
part_token: str,
71+
part_count: int,
72+
) -> tuple[str, ...]:
73+
base = f"https://ftp.ebi.ac.uk/pub/databases/opentargets/platform/{release}/output/{dataset}"
74+
return tuple(
75+
f"{base}/part-{index:05d}-{part_token}-c000.snappy.parquet"
76+
for index in range(part_count)
77+
)
78+
79+
6680
_DEFAULT_DATASETS = [
6781
DatasetDefinition(
6882
dataset_id="zinc15_250k",
@@ -439,6 +453,41 @@ def _zinc_druglike_tranche_urls(
439453
"subcellular_localization",
440454
),
441455
),
456+
DatasetDefinition(
457+
dataset_id="opentargets_target_prioritisation",
458+
name="Open Targets Target Prioritisation",
459+
description=(
460+
"Open Targets target prioritisation features spanning tractability, "
461+
"safety, novelty, and evidence-linked attributes for therapeutic "
462+
"target ranking."
463+
),
464+
source="Open Targets Platform downloadable parquet",
465+
homepage="https://platform-docs.opentargets.org/data-access/datasets",
466+
license_name="CC0 1.0 (Open Targets Platform data)",
467+
license_url="https://platform-docs.opentargets.org/licence",
468+
urls=_opentargets_parquet_part_urls(
469+
release="25.03",
470+
dataset="target_prioritisation",
471+
part_token="9647e5c1-fd87-47e0-8c5d-3b1429e19b9a",
472+
part_count=16,
473+
),
474+
file_format="parquet",
475+
category="targets",
476+
version="25.03",
477+
filename="target_prioritisation",
478+
url_mode="bundle",
479+
usage_notes=(
480+
"Use for target ranking, tractability-aware portfolio construction, "
481+
"and feature generation for target prioritisation models.",
482+
),
483+
tags=(
484+
"opentargets",
485+
"target_prioritisation",
486+
"tractability",
487+
"safety",
488+
"target_ranking",
489+
),
490+
),
442491
DatasetDefinition(
443492
dataset_id="chembl_activity_ki_human",
444493
name="ChEMBL Human Ki Activities",

src/refua_data/downloader.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,16 @@ def fetch_dataset(
154154
timeout_seconds=timeout_seconds,
155155
)
156156

157+
if dataset.url_mode == "bundle":
158+
return _fetch_bundle_urls(
159+
dataset=dataset,
160+
cache=cache,
161+
raw_path=raw_path,
162+
meta_path=meta_path,
163+
refresh=refresh,
164+
timeout_seconds=timeout_seconds,
165+
)
166+
157167
errors: list[str] = []
158168
for url in dataset.urls:
159169
try:
@@ -302,6 +312,13 @@ def _download_url_to_path(
302312
raise ValueError(f"Unsupported URL scheme for {url}")
303313

304314

315+
def _remove_path(path: Path) -> None:
316+
if path.is_dir():
317+
shutil.rmtree(path)
318+
return
319+
path.unlink(missing_ok=True)
320+
321+
305322
def _fetch_concat_urls(
306323
*,
307324
dataset: DatasetDefinition,
@@ -392,6 +409,74 @@ def _fetch_concat_urls(
392409
)
393410

394411

412+
def _fetch_bundle_urls(
    *,
    dataset: DatasetDefinition,
    cache: CacheBackend,
    raw_path: Path,
    meta_path: Path,
    refresh: bool,
    timeout_seconds: float,
) -> FetchResult:
    """Download every URL of a ``bundle``-mode dataset into one directory.

    Unlike ``concat`` mode, the parts are kept as individual files inside a
    directory at ``raw_path`` (e.g. a set of parquet parts). The download is
    staged into a sibling ``.tmp`` directory and promoted atomically with
    ``os.replace`` so a partial fetch never leaves a half-written bundle at
    ``raw_path``.

    Raises whatever ``_download_url_to_path`` raises on a failed part; the
    staging directory is removed before re-raising.
    """
    raw_path.parent.mkdir(parents=True, exist_ok=True)
    # Stage into "<name>.tmp" next to the final path so os.replace stays on
    # the same filesystem.
    tmp_path = raw_path.with_name(f"{raw_path.name}.tmp")
    if tmp_path.exists():
        # Leftover from an earlier interrupted fetch — start clean.
        _remove_path(tmp_path)
    tmp_path.mkdir(parents=True, exist_ok=True)

    bytes_downloaded = 0
    source_details: list[dict[str, Any]] = []

    try:
        for index, url in enumerate(dataset.urls):
            # Keep the upstream basename when present; otherwise synthesize a
            # stable zero-padded part name so ordering is preserved.
            candidate_name = Path(urlparse(url).path).name
            filename = candidate_name or f"part-{index:05d}"
            detail = _download_url_to_path(
                url=url,
                dest_path=tmp_path / filename,
                timeout_seconds=timeout_seconds,
            )
            source_details.append(detail)
            bytes_downloaded += int(detail.get("bytes_downloaded", 0))

        if raw_path.exists():
            # os.replace cannot overwrite a non-empty directory; clear first.
            _remove_path(raw_path)
        os.replace(tmp_path, raw_path)
    except Exception:
        # Never leave a partially-downloaded staging directory behind.
        _remove_path(tmp_path)
        raise

    # sha256_file handles directories by hashing relative names + contents.
    checksum = sha256_file(raw_path)
    source_url = dataset.urls[0]
    meta = {
        "dataset_id": dataset.dataset_id,
        "version": dataset.version,
        # NOTE(review): "multi_url" presumably mirrors the concat-mode value
        # so downstream metadata consumers treat both uniformly — confirm.
        "source_type": "multi_url",
        "source_url": source_url,
        "source_urls": list(dataset.urls),
        "url_mode": dataset.url_mode,
        "source_count": len(dataset.urls),
        "fetched_at": _utcnow_iso(),
        "refreshed": refresh,
        "bytes_downloaded": bytes_downloaded,
        "sources": source_details,
        "sha256": checksum,
    }
    _write_raw_metadata(cache, meta_path, dataset=dataset, meta=meta)

    return FetchResult(
        dataset_id=dataset.dataset_id,
        version=dataset.version,
        raw_path=raw_path,
        metadata_path=meta_path,
        source_url=source_url,
        cache_hit=False,
        refreshed=refresh,
        bytes_downloaded=bytes_downloaded,
        sha256=checksum,
    )
479+
395480
def _fetch_file_url(
396481
*,
397482
dataset: DatasetDefinition,

src/refua_data/io.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ def iter_dataset_chunks(
3333
chunksize: int,
3434
) -> Iterator[pd.DataFrame]:
3535
"""Yield DataFrame chunks from a dataset raw file."""
36+
if dataset.file_format == "parquet":
37+
yield from _iter_parquet_chunks(raw_path, chunksize=chunksize)
38+
return
39+
3640
if dataset.file_format == "jsonl":
3741
yield from _iter_jsonl_chunks(raw_path, chunksize=chunksize)
3842
return
@@ -70,6 +74,26 @@ def _iter_jsonl_chunks(raw_path: Path, *, chunksize: int) -> Iterator[pd.DataFra
7074
yield prepare_dataframe(chunk)
7175

7276

77+
def _iter_parquet_chunks(raw_path: Path, *, chunksize: int) -> Iterator[pd.DataFrame]:
    """Yield prepared DataFrame chunks from a parquet file or part directory.

    A directory is treated as a bundle of ``*.parquet`` parts read in sorted
    name order; a plain path is read as a single file. Each file is loaded
    whole, then sliced into chunks of at most ``chunksize`` rows.
    """
    if raw_path.is_dir():
        parquet_files = sorted(raw_path.glob("*.parquet"))
    else:
        parquet_files = [raw_path]

    if not parquet_files:
        raise ValueError(f"No parquet files found at '{raw_path}'.")

    for part in parquet_files:
        frame = pd.read_parquet(part)
        if len(frame) <= chunksize:
            # Small (or empty) frames are yielded as-is, without slicing.
            yield prepare_dataframe(frame)
        else:
            for offset in range(0, len(frame), chunksize):
                yield prepare_dataframe(frame.iloc[offset : offset + chunksize])
95+
96+
7397
def _iter_csv_like_from_zip(
7498
raw_path: Path,
7599
*,

src/refua_data/models.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,10 @@
77
from typing import Any, Literal
88
from urllib.parse import urlparse
99

10-
TabularFormat = Literal["csv", "tsv", "jsonl"]
10+
TabularFormat = Literal["csv", "tsv", "jsonl", "parquet"]
1111
Compression = Literal["none", "gzip", "zip", "infer"]
1212
ApiPaginationMode = Literal["none", "chembl", "link_header"]
13-
UrlMode = Literal["fallback", "concat"]
13+
UrlMode = Literal["fallback", "concat", "bundle"]
1414

1515
_CATEGORY_USAGE_DEFAULTS: dict[str, str] = {
1616
"compound_library": "Use for compound library curation, screening, and molecular pretraining.",
@@ -94,6 +94,7 @@ def preferred_filename(self) -> str:
9494
"csv": ".csv",
9595
"tsv": ".tsv",
9696
"jsonl": ".jsonl",
97+
"parquet": ".parquet",
9798
}[self.file_format]
9899
return f"{self.dataset_id}{fallback_ext}"
99100

src/refua_data/validation.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ def validate_dataset_sources(
3838
3939
For datasets with multiple URLs in `fallback` mode, probes are attempted in
4040
order and the dataset is considered healthy once one source succeeds.
41-
For datasets in `concat` mode, every configured source must be reachable.
41+
For datasets in `concat` or `bundle` mode, every configured source must be
42+
reachable.
4243
"""
4344
if dataset.api is not None:
4445
return [_probe_api(dataset, dataset.api, timeout_seconds=timeout_seconds)]
@@ -56,7 +57,7 @@ def validate_dataset_sources(
5657
)
5758
]
5859

59-
if dataset.url_mode == "concat":
60+
if dataset.url_mode in {"concat", "bundle"}:
6061
concat_attempts = [
6162
_probe_url(dataset, url, timeout_seconds=timeout_seconds)
6263
for url in dataset.urls

tests/test_catalog.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ def test_default_catalog_contains_core_and_api_datasets() -> None:
2222
assert "openfda_drug_event_serious" in ids
2323
assert "bindingdb_articles_affinity" in ids
2424
assert "proteinatlas_human_proteome" in ids
25+
assert "opentargets_target_prioritisation" in ids
2526
assert "uniprot_human_reviewed" in ids
2627
assert "uniprot_human_receptors" in ids
2728
assert "uniprot_human_membrane" in ids
@@ -30,4 +31,4 @@ def test_default_catalog_contains_core_and_api_datasets() -> None:
3031
assert "uniprot_human_transcription_factors" in ids
3132
assert "uniprot_human_enzymes" in ids
3233
assert "chembl_targets_human_protein_complex" in ids
33-
assert len(datasets) >= 47
34+
assert len(datasets) >= 48

0 commit comments

Comments
 (0)