Skip to content

Commit 38e18a5

Browse files
committed
Add new ChEMBL and UniProt datasets for AC50 activities, Phase 4 molecules, and nuclear proteins
1 parent 5aca922 commit 38e18a5

3 files changed

Lines changed: 109 additions & 21 deletions

File tree

README.md

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -39,26 +39,29 @@ The default catalog includes local-file/HTTP datasets plus API presets useful in
3939
19. `chembl_activity_ic50_human`
4040
20. `chembl_activity_kd_human`
4141
21. `chembl_activity_ec50_human`
42-
22. `chembl_assays_binding_human`
43-
23. `chembl_assays_functional_human`
44-
24. `chembl_assays_adme_human`
45-
25. `chembl_targets_human_single_protein`
46-
26. `chembl_targets_human_protein_complex`
47-
27. `chembl_molecules_phase3plus`
48-
28. `chembl_molecules_black_box_warning`
49-
29. `chembl_mechanism_phase2plus`
50-
30. `chembl_drug_indications_phase2plus`
51-
31. `chembl_drug_indications_phase3plus`
52-
32. `uniprot_human_reviewed`
53-
33. `uniprot_human_receptors`
54-
34. `uniprot_human_membrane`
55-
35. `uniprot_human_kinases`
56-
36. `uniprot_human_gpcr`
57-
37. `uniprot_human_ion_channels`
58-
38. `uniprot_human_transporters`
59-
39. `uniprot_human_secreted`
60-
40. `uniprot_human_transcription_factors`
61-
41. `uniprot_human_enzymes`
42+
22. `chembl_activity_ac50_human`
43+
23. `chembl_assays_binding_human`
44+
24. `chembl_assays_functional_human`
45+
25. `chembl_assays_adme_human`
46+
26. `chembl_targets_human_single_protein`
47+
27. `chembl_targets_human_protein_complex`
48+
28. `chembl_molecules_phase3plus`
49+
29. `chembl_molecules_phase4`
50+
30. `chembl_molecules_black_box_warning`
51+
31. `chembl_mechanism_phase2plus`
52+
32. `chembl_drug_indications_phase2plus`
53+
33. `chembl_drug_indications_phase3plus`
54+
34. `uniprot_human_reviewed`
55+
35. `uniprot_human_receptors`
56+
36. `uniprot_human_membrane`
57+
37. `uniprot_human_nucleus`
58+
38. `uniprot_human_kinases`
59+
39. `uniprot_human_gpcr`
60+
40. `uniprot_human_ion_channels`
61+
41. `uniprot_human_transporters`
62+
42. `uniprot_human_secreted`
63+
43. `uniprot_human_transcription_factors`
64+
44. `uniprot_human_enzymes`
6265

6366
Most of these are distributed through MoleculeNet/DeepChem mirrors and retain upstream licensing terms.
6467
ChEMBL and UniProt presets are fetched through their public REST APIs and cached locally as JSONL.

src/refua_data/catalog.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,35 @@ def _zinc_druglike_tranche_urls(
475475
),
476476
tags=("api", "chembl", "human", "ec50", "potency"),
477477
),
478+
DatasetDefinition(
479+
dataset_id="chembl_activity_ac50_human",
480+
name="ChEMBL Human AC50 Activities",
481+
description=(
482+
"ChEMBL activity records for human targets with AC50 and pChEMBL "
483+
"values, useful for concentration-response modeling."
484+
),
485+
source="ChEMBL REST API",
486+
homepage="https://www.ebi.ac.uk/chembl/",
487+
license_name="ChEMBL data terms",
488+
license_url="https://www.ebi.ac.uk/chembl/ws",
489+
file_format="jsonl",
490+
category="target_activity",
491+
api=ApiDatasetConfig(
492+
endpoint="https://www.ebi.ac.uk/chembl/api/data/activity.json",
493+
params={
494+
"target_organism": "Homo sapiens",
495+
"standard_type": "AC50",
496+
"pchembl_value__isnull": "false",
497+
},
498+
pagination="chembl",
499+
items_path="activities",
500+
page_size_param="limit",
501+
page_size=1000,
502+
max_pages=40,
503+
max_rows=25_000,
504+
),
505+
tags=("api", "chembl", "human", "ac50", "potency"),
506+
),
478507
DatasetDefinition(
479508
dataset_id="chembl_assays_binding_human",
480509
name="ChEMBL Human Binding Assays",
@@ -640,6 +669,31 @@ def _zinc_druglike_tranche_urls(
640669
),
641670
tags=("api", "chembl", "clinical", "phase3plus"),
642671
),
672+
DatasetDefinition(
673+
dataset_id="chembl_molecules_phase4",
674+
name="ChEMBL Molecules Phase 4",
675+
description=(
676+
"ChEMBL molecules with max clinical phase >= 4, useful for "
677+
"marketed-drug priors and late-stage benchmark sets."
678+
),
679+
source="ChEMBL REST API",
680+
homepage="https://www.ebi.ac.uk/chembl/",
681+
license_name="ChEMBL data terms",
682+
license_url="https://www.ebi.ac.uk/chembl/ws",
683+
file_format="jsonl",
684+
category="compound_library",
685+
api=ApiDatasetConfig(
686+
endpoint="https://www.ebi.ac.uk/chembl/api/data/molecule.json",
687+
params={"max_phase__gte": "4"},
688+
pagination="chembl",
689+
items_path="molecules",
690+
page_size_param="limit",
691+
page_size=1000,
692+
max_pages=30,
693+
max_rows=20_000,
694+
),
695+
tags=("api", "chembl", "clinical", "phase4"),
696+
),
643697
DatasetDefinition(
644698
dataset_id="chembl_molecules_black_box_warning",
645699
name="ChEMBL Molecules with Black Box Warning",
@@ -824,6 +878,34 @@ def _zinc_druglike_tranche_urls(
824878
),
825879
tags=("api", "uniprot", "human", "membrane", "target_family"),
826880
),
881+
DatasetDefinition(
882+
dataset_id="uniprot_human_nucleus",
883+
name="UniProt Human Nuclear Proteins",
884+
description=(
885+
"Reviewed human proteins annotated with nuclear localization for "
886+
"nucleus-focused target enrichment and biology workflows."
887+
),
888+
source="UniProt REST API",
889+
homepage="https://www.uniprot.org/help/api_queries",
890+
license_name="UniProt terms",
891+
license_url="https://www.uniprot.org/help/license",
892+
file_format="jsonl",
893+
category="target_families",
894+
api=ApiDatasetConfig(
895+
endpoint="https://rest.uniprot.org/uniprotkb/search",
896+
params={
897+
"query": "organism_id:9606 AND reviewed:true AND keyword:Nucleus",
898+
"format": "json",
899+
},
900+
pagination="link_header",
901+
items_path="results",
902+
page_size_param="size",
903+
page_size=500,
904+
max_pages=20,
905+
max_rows=8_000,
906+
),
907+
tags=("api", "uniprot", "human", "nucleus", "target_family"),
908+
),
827909
DatasetDefinition(
828910
dataset_id="uniprot_human_kinases",
829911
name="UniProt Human Kinases",

tests/test_catalog.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,20 @@ def test_default_catalog_contains_core_and_api_datasets() -> None:
1111
assert "chembl_activity_ki_human" in ids
1212
assert "chembl_activity_kd_human" in ids
1313
assert "chembl_activity_ec50_human" in ids
14+
assert "chembl_activity_ac50_human" in ids
1415
assert "chembl_assays_functional_human" in ids
1516
assert "chembl_assays_adme_human" in ids
17+
assert "chembl_molecules_phase4" in ids
1618
assert "chembl_molecules_black_box_warning" in ids
1719
assert "chembl_mechanism_phase2plus" in ids
1820
assert "chembl_drug_indications_phase2plus" in ids
1921
assert "chembl_drug_indications_phase3plus" in ids
2022
assert "uniprot_human_reviewed" in ids
2123
assert "uniprot_human_receptors" in ids
2224
assert "uniprot_human_membrane" in ids
25+
assert "uniprot_human_nucleus" in ids
2326
assert "uniprot_human_secreted" in ids
2427
assert "uniprot_human_transcription_factors" in ids
2528
assert "uniprot_human_enzymes" in ids
2629
assert "chembl_targets_human_protein_complex" in ids
27-
assert len(datasets) >= 39
30+
assert len(datasets) >= 42

0 commit comments

Comments
 (0)