@@ -475,6 +475,35 @@ def _zinc_druglike_tranche_urls(
475475 ),
476476 tags = ("api" , "chembl" , "human" , "ec50" , "potency" ),
477477 ),
# Registry entry: human AC50 activity records pulled from the ChEMBL REST API.
DatasetDefinition(
    # --- identity ---
    dataset_id="chembl_activity_ac50_human",
    name="ChEMBL Human AC50 Activities",
    category="target_activity",
    tags=("api", "chembl", "human", "ac50", "potency"),
    description=(
        "ChEMBL activity records for human targets with AC50 and "
        "pChEMBL values, useful for concentration-response modeling."
    ),
    # --- provenance and licensing ---
    source="ChEMBL REST API",
    homepage="https://www.ebi.ac.uk/chembl/",
    license_name="ChEMBL data terms",
    license_url="https://www.ebi.ac.uk/chembl/ws",
    # --- retrieval ---
    file_format="jsonl",
    api=ApiDatasetConfig(
        endpoint="https://www.ebi.ac.uk/chembl/api/data/activity.json",
        # Query filters: human targets, AC50 measurements, and only rows
        # whose pChEMBL value is present (per the description above).
        params={
            "target_organism": "Homo sapiens",
            "standard_type": "AC50",
            "pchembl_value__isnull": "false",
        },
        items_path="activities",
        pagination="chembl",
        page_size_param="limit",
        page_size=1000,
        # NOTE(review): presumably the downloader stops at whichever of
        # max_pages / max_rows is reached first — confirm against the fetcher.
        max_pages=40,
        max_rows=25_000,
    ),
),
478507 DatasetDefinition (
479508 dataset_id = "chembl_assays_binding_human" ,
480509 name = "ChEMBL Human Binding Assays" ,
@@ -640,6 +669,31 @@ def _zinc_druglike_tranche_urls(
640669 ),
641670 tags = ("api" , "chembl" , "clinical" , "phase3plus" ),
642671 ),
# Registry entry: ChEMBL molecules filtered on max clinical phase >= 4.
DatasetDefinition(
    # --- identity ---
    dataset_id="chembl_molecules_phase4",
    name="ChEMBL Molecules Phase 4",
    category="compound_library",
    tags=("api", "chembl", "clinical", "phase4"),
    description=(
        "ChEMBL molecules with max clinical phase >= 4, "
        "useful for marketed-drug priors and late-stage benchmark sets."
    ),
    # --- provenance and licensing ---
    source="ChEMBL REST API",
    homepage="https://www.ebi.ac.uk/chembl/",
    license_name="ChEMBL data terms",
    license_url="https://www.ebi.ac.uk/chembl/ws",
    # --- retrieval ---
    file_format="jsonl",
    api=ApiDatasetConfig(
        endpoint="https://www.ebi.ac.uk/chembl/api/data/molecule.json",
        # Single filter: keep molecules whose max_phase is at least 4.
        params={
            "max_phase__gte": "4",
        },
        items_path="molecules",
        pagination="chembl",
        page_size_param="limit",
        page_size=1000,
        # NOTE(review): presumably the downloader stops at whichever of
        # max_pages / max_rows is reached first — confirm against the fetcher.
        max_pages=30,
        max_rows=20_000,
    ),
),
643697 DatasetDefinition (
644698 dataset_id = "chembl_molecules_black_box_warning" ,
645699 name = "ChEMBL Molecules with Black Box Warning" ,
@@ -824,6 +878,34 @@ def _zinc_druglike_tranche_urls(
824878 ),
825879 tags = ("api" , "uniprot" , "human" , "membrane" , "target_family" ),
826880 ),
# Registry entry: reviewed human proteins carrying the "Nucleus" keyword in UniProt.
DatasetDefinition(
    # --- identity ---
    dataset_id="uniprot_human_nucleus",
    name="UniProt Human Nuclear Proteins",
    category="target_families",
    tags=("api", "uniprot", "human", "nucleus", "target_family"),
    description=(
        "Reviewed human proteins annotated with nuclear localization "
        "for nucleus-focused target enrichment and biology workflows."
    ),
    # --- provenance and licensing ---
    source="UniProt REST API",
    homepage="https://www.uniprot.org/help/api_queries",
    license_name="UniProt terms",
    license_url="https://www.uniprot.org/help/license",
    # --- retrieval ---
    file_format="jsonl",
    api=ApiDatasetConfig(
        endpoint="https://rest.uniprot.org/uniprotkb/search",
        # organism_id 9606 + reviewed:true + keyword:Nucleus, returned as JSON.
        params={
            "query": "organism_id:9606 AND reviewed:true AND keyword:Nucleus",
            "format": "json",
        },
        items_path="results",
        pagination="link_header",
        page_size_param="size",
        page_size=500,
        # NOTE(review): presumably the downloader stops at whichever of
        # max_pages / max_rows is reached first — confirm against the fetcher.
        max_pages=20,
        max_rows=8_000,
    ),
),
827909 DatasetDefinition (
828910 dataset_id = "uniprot_human_kinases" ,
829911 name = "UniProt Human Kinases" ,
0 commit comments