Skip to content

Commit 54cef46

Browse files
committed
update obo_extractor
1 parent 806460b commit 54cef46

File tree

6 files changed

+130
-72
lines changed

6 files changed

+130
-72
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,3 +205,7 @@ cython_debug/
205205
marimo/_static/
206206
marimo/_lsp/
207207
__marimo__/
208+
209+
210+
211+
data/

chebi_utils/obo_extractor.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def _term_data(doc: "fastobo.term.TermFrame") -> dict | None:
2222
Parsed term data, or ``None`` if the term is marked as obsolete.
2323
"""
2424
parents: list[str] = []
25-
has_part: set[str] = set()
25+
relations: dict = dict()
2626
name: str | None = None
2727
smiles: str | None = None
2828
subset: str | None = None
@@ -42,8 +42,10 @@ def _term_data(doc: "fastobo.term.TermFrame") -> dict | None:
4242
if "SMILES" in clause.raw_value() and smiles is None:
4343
smiles = clause.raw_value().split('"')[1]
4444
elif isinstance(clause, fastobo.term.RelationshipClause):
45-
if str(clause.typedef) == "has_part":
46-
has_part.add(_chebi_id_to_str(str(clause.term)))
45+
relation = str(clause.typedef)
46+
if relation not in relations:
47+
relations[relation] = []
48+
relations[relation].append(_chebi_id_to_str(str(clause.term)))
4749
elif isinstance(clause, fastobo.term.IsAClause):
4850
parents.append(_chebi_id_to_str(str(clause.term)))
4951
elif isinstance(clause, fastobo.term.NameClause):
@@ -54,7 +56,7 @@ def _term_data(doc: "fastobo.term.TermFrame") -> dict | None:
5456
return {
5557
"id": _chebi_id_to_str(str(doc.id)),
5658
"parents": parents,
57-
"has_part": has_part,
59+
"relations": relations,
5860
"name": name,
5961
"smiles": smiles,
6062
"subset": subset,
@@ -107,7 +109,14 @@ def build_chebi_graph(filepath: str | Path) -> nx.DiGraph:
107109
for parent_id in term["parents"]:
108110
graph.add_edge(node_id, parent_id, relation="is_a")
109111

110-
for part_id in term["has_part"]:
111-
graph.add_edge(node_id, part_id, relation="has_part")
112+
for relation, parts in term["relations"].items():
113+
for part_id in parts:
114+
graph.add_edge(node_id, part_id, relation=relation)
112115

113116
return graph
117+
118+
def get_hierarchy_subgraph(chebi_graph: nx.DiGraph) -> nx.DiGraph:
119+
"""Subgraph of ChEBI including only edges corresponding to hierarchical relations (is_a). Also removes nodes that are not connected by any is_a edges to other nodes."""
120+
return chebi_graph.edge_subgraph(
121+
(u, v) for u, v, d in chebi_graph.edges(data=True) if d.get("relation") == "is_a"
122+
)

chebi_utils/sdf_extractor.py

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,21 @@
1010
from rdkit import Chem
1111

1212

13-
def _update_mol_valences(mol: Chem.Mol) -> Chem.Mol:
14-
"""Mark all atoms as having no implicit hydrogens to preserve molfile valences."""
15-
for atom in mol.GetAtoms():
16-
atom.SetNoImplicit(True)
13+
def _sanitize_molecule(mol: Chem.Mol) -> Chem.Mol:
14+
"""Sanitize molecule, mirroring the ChEBI molecule processing."""
15+
from chembl_structure_pipeline.standardizer import update_mol_valences
16+
17+
mol = update_mol_valences(mol)
18+
Chem.SanitizeMol(
19+
mol,
20+
sanitizeOps=Chem.SanitizeFlags.SANITIZE_FINDRADICALS
21+
| Chem.SanitizeFlags.SANITIZE_KEKULIZE
22+
| Chem.SanitizeFlags.SANITIZE_SETAROMATICITY
23+
| Chem.SanitizeFlags.SANITIZE_SETCONJUGATION
24+
| Chem.SanitizeFlags.SANITIZE_SETHYBRIDIZATION
25+
| Chem.SanitizeFlags.SANITIZE_SYMMRINGS,
26+
catchErrors=True,
27+
)
1728
return mol
1829

1930

@@ -39,19 +50,7 @@ def _parse_molblock(molblock: str, chebi_id: str | None = None) -> Chem.Mol | No
3950
if mol is None:
4051
warnings.warn(f"Failed to parse molblock for {chebi_id}", stacklevel=2)
4152
return None
42-
mol = _update_mol_valences(mol)
43-
Chem.SanitizeMol(
44-
mol,
45-
sanitizeOps=(
46-
Chem.SanitizeFlags.SANITIZE_FINDRADICALS
47-
| Chem.SanitizeFlags.SANITIZE_KEKULIZE
48-
| Chem.SanitizeFlags.SANITIZE_SETAROMATICITY
49-
| Chem.SanitizeFlags.SANITIZE_SETCONJUGATION
50-
| Chem.SanitizeFlags.SANITIZE_SETHYBRIDIZATION
51-
| Chem.SanitizeFlags.SANITIZE_SYMMRINGS
52-
),
53-
catchErrors=True,
54-
)
53+
mol = _sanitize_molecule(mol)
5554
return mol
5655

5756

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ dependencies = [
1515
"numpy>=1.24",
1616
"pandas>=2.0",
1717
"rdkit>=2022.09",
18+
"chembl_structure_pipeline>=1.2.4",
1819
]
1920

2021
[project.optional-dependencies]

tests/fixtures/sample.obo

Lines changed: 74 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,87 @@
11
format-version: 1.2
2+
date: 06:01:2026 14:33
3+
saved-by: chebi
4+
subsetdef: 1:STAR "Preliminary entries"
5+
subsetdef: 2:STAR "Annotated by 3rd party"
6+
subsetdef: 3:STAR "Manually annotated by ChEBI Team"
7+
synonymtypedef: BRAND:NAME "BRAND NAME"
8+
synonymtypedef: http://purl.obolibrary.org/obo/chebi/INN "INN"
9+
synonymtypedef: IUPAC:NAME "IUPAC NAME"
10+
default-namespace: chebi_ontology
11+
idspace: chemrof https://w3id.org/chemrof/
12+
idspace: foaf http://xmlns.com/foaf/0.1/
13+
idspace: oboInOwl http://www.geneontology.org/formats/oboInOwl#
14+
idspace: terms http://purl.org/dc/terms/
15+
remark: ChEBI subsumes and replaces the Chemical Ontology first. This ontology was developed by Michael Ashburner and Pankaj Jaiswal. Data was curated by The ChEBI Curation Team. For any queries contact chebi-help@ebi.ac.uk
216
ontology: chebi
17+
property_value: foaf:homepage "https://www.ebi.ac.uk/chebi" xsd:anyURI
18+
property_value: owl:versionInfo "248" xsd:decimal
19+
property_value: terms:description "Chemical Entities of Biological Interest, also known as ChEBI, is a chemical database and ontology of molecular entities focused on 'small' chemical compounds, that is part of the Open Biomedical Ontologies (OBO) effort at the European Bioinformatics Institute (EBI)." xsd:string
20+
property_value: terms:license https://creativecommons.org/licenses/by/4.0/
21+
property_value: terms:title "ChEBI Ontology" xsd:string
322

423
[Term]
5-
id: CHEBI:1
6-
name: compound A
7-
def: "A test compound." [TestDB:001]
8-
is_a: CHEBI:2 ! compound B
9-
relationship: has_part CHEBI:3 ! methyl group
10-
property_value: http://purl.obolibrary.org/obo/chebi/smiles "C" xsd:string
24+
id: CHEBI:10
25+
name: (+)-Atherospermoline
26+
subset: 2:STAR
27+
synonym: "(+)-Atherospermoline" RELATED [kegg.compound]
28+
xref: cas:21008-67-3 {source="cas"}
29+
xref: kegg.compound:C11141 {source="kegg.compound"}
30+
xref: knapsack:C00001814 {source="knapsack"}
31+
is_a: CHEBI:133004 ! bisbenzylisoquinoline alkaloid
32+
property_value: chemrof:charge "0" xsd:integer
33+
property_value: chemrof:generalized_empirical_formula "C36H38N2O6" xsd:string
34+
property_value: chemrof:inchi_key_string "XGEAUXVPBXUBKN-NSOVKSMOSA-N" xsd:string
35+
property_value: chemrof:inchi_string "InChI=1S/C36H38N2O6/c1-37-13-11-23-18-31(41-3)32-20-26(23)27(37)15-21-5-8-25(9-6-21)43-30-17-22(7-10-29(30)39)16-28-34-24(12-14-38(28)2)19-33(42-4)35(40)36(34)44-32/h5-10,17-20,27-28,39-40H,11-16H2,1-4H3/t27-,28-/m0/s1" xsd:string
36+
property_value: chemrof:mass "594.708" xsd:decimal
37+
property_value: chemrof:monoisotopic_mass "594.27299" xsd:decimal
38+
property_value: chemrof:smiles_string "COc1cc2c3cc1Oc1c(O)c(OC)cc4c1[C@H](Cc1ccc(O)c(c1)Oc1ccc(cc1)C[C@@H]3N(C)CC2)N(C)CC4" xsd:string
1139

1240
[Term]
13-
id: CHEBI:2
14-
name: compound B
15-
def: "Another test compound." [TestDB:002]
16-
is_a: CHEBI:5 ! root compound
41+
id: CHEBI:133004
42+
name: bisbenzylisoquinoline alkaloid
43+
def: "A type of benzylisoquinoline alkaloid whose structures are built up of two benzylisoquinoline units linked by ether bridges. Various structural patterns resulting from additional bridging between the two units by direct carbon-carbon bridging or by methylenedioxy groups are common." []
44+
subset: 3:STAR
45+
synonym: "bis(benzylisoquinoline) alkaloid" RELATED [chebi]
46+
synonym: "bis(benzylisoquinoline) alkaloids" RELATED [chebi]
47+
synonym: "bis-benzylisoquinoline alkaloid" RELATED [chebi]
48+
synonym: "bis-benzylisoquinoline alkaloids" RELATED [chebi]
49+
synonym: "bisbenzylisoquinoline alkaloids" RELATED [chebi]
50+
xref: pubmed:1955879 {source="pubmed"}
51+
xref: pubmed:2191354 {source="pubmed"}
52+
xref: pubmed:3323421 {source="pubmed"}
53+
is_a: CHEBI:22750 ! benzylisoquinoline alkaloid
1754

1855
[Term]
19-
id: CHEBI:3
20-
name: methyl group
21-
def: "A methyl group." [TestDB:003]
22-
subset: 3_STAR
56+
id: CHEBI:22750
57+
name: benzylisoquinoline alkaloid
58+
def: "Any isoquinoline alkaloid based on a benzylisoquinoline skeleton." []
59+
subset: 3:STAR
60+
synonym: "benzylisoquinoline alkaloids" RELATED [chebi]
61+
is_a: CHEBI:24921 ! isoquinoline alkaloid
2362

2463
[Term]
25-
id: CHEBI:4
26-
name: obsolete term
27-
def: "This term is obsolete." [TestDB:004]
64+
id: CHEBI:27189
2865
is_obsolete: true
2966

3067
[Term]
31-
id: CHEBI:5
32-
name: root compound
33-
def: "The root compound." [TestDB:005]
34-
35-
[Typedef]
36-
id: has_part
37-
name: has part
68+
id: CHEBI:28017
69+
name: starch
70+
alt_id: CHEBI:26750
71+
alt_id: CHEBI:26751
72+
alt_id: CHEBI:9251
73+
def: "The most important reserve polysaccharide found in plants. It is a glucan consisting of amylose and amylopectin." []
74+
subset: 3:STAR
75+
synonym: "amidon" RELATED [chebi]
76+
synonym: "amylum" RELATED [chebi]
77+
synonym: "Staerke" RELATED [chebi]
78+
synonym: "Starch" RELATED [kegg.compound]
79+
xref: cas:9005-25-8 {source="cas"}
80+
xref: kegg.compound:C00369 {source="kegg.compound"}
81+
xref: kegg.drug:D06507 {source="kegg.drug"}
82+
xref: kegg.glycan:G10545 {source="kegg.glycan"}
83+
xref: wikipedia.en:Starch {source="wikipedia.en"}
84+
is_a: CHEBI:37163 ! glucan
85+
relationship: BFO:0000051 CHEBI:28057 ! has part amylopectin
86+
relationship: BFO:0000051 CHEBI:28102 ! has part amylose
87+
relationship: RO:0000087 CHEBI:75771 ! has role mouse metabolite

tests/test_obo_extractor.py

Lines changed: 20 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def test_returns_directed_graph(self):
1818
assert isinstance(g, nx.DiGraph)
1919

2020
def test_correct_number_of_nodes(self):
21-
# CHEBI:4 is obsolete and must be excluded -> 4 nodes remain
21+
# CHEBI:27189 is obsolete -> excluded; 3 explicit + 1 implicit (24921) = 4
2222
g = build_chebi_graph(SAMPLE_OBO)
2323
assert len(g.nodes) == 4
2424

@@ -28,55 +28,50 @@ def test_node_ids_are_strings(self):
2828

2929
def test_expected_nodes_present(self):
3030
g = build_chebi_graph(SAMPLE_OBO)
31-
assert set(g.nodes) == {"1", "2", "3", "5"}
31+
assert set(g.nodes) == {"10", "133004", "22750", "24921"}
3232

3333
def test_obsolete_term_excluded(self):
3434
g = build_chebi_graph(SAMPLE_OBO)
35-
assert "4" not in g.nodes
35+
assert "27189" not in g.nodes
3636

3737
def test_node_name_attribute(self):
3838
g = build_chebi_graph(SAMPLE_OBO)
39-
assert g.nodes["1"]["name"] == "compound A"
40-
assert g.nodes["2"]["name"] == "compound B"
39+
assert g.nodes["10"]["name"] == "(+)-Atherospermoline"
40+
assert g.nodes["133004"]["name"] == "bisbenzylisoquinoline alkaloid"
4141

4242
def test_smiles_extracted_from_property_value(self):
4343
g = build_chebi_graph(SAMPLE_OBO)
44-
assert g.nodes["1"]["smiles"] == "C"
44+
expected = "COc1cc2c3cc1Oc1c(O)c(OC)cc4c1[C@H](Cc1ccc(O)c(c1)Oc1ccc(cc1)C[C@@H]3N(C)CC2)N(C)CC4"
45+
assert g.nodes["10"]["smiles"] == expected
4546

4647
def test_smiles_none_when_absent(self):
4748
g = build_chebi_graph(SAMPLE_OBO)
48-
assert g.nodes["2"]["smiles"] is None
49+
assert g.nodes["133004"]["smiles"] is None
4950

5051
def test_subset_extracted(self):
5152
g = build_chebi_graph(SAMPLE_OBO)
52-
assert g.nodes["3"]["subset"] == "3_STAR"
53-
54-
def test_subset_none_when_absent(self):
55-
g = build_chebi_graph(SAMPLE_OBO)
56-
assert g.nodes["1"]["subset"] is None
53+
assert g.nodes["133004"]["subset"] == "3:STAR"
5754

5855
def test_isa_edge_present(self):
5956
g = build_chebi_graph(SAMPLE_OBO)
60-
# CHEBI:1 is_a CHEBI:2
61-
assert g.has_edge("1", "2")
62-
assert g.edges["1", "2"]["relation"] == "is_a"
57+
# CHEBI:10 is_a CHEBI:133004
58+
assert g.has_edge("10", "133004")
59+
assert g.edges["10", "133004"]["relation"] == "is_a"
6360

64-
def test_has_part_edge_present(self):
61+
def test_isa_chain(self):
6562
g = build_chebi_graph(SAMPLE_OBO)
66-
# CHEBI:1 has_part CHEBI:3
67-
assert g.has_edge("1", "3")
68-
assert g.edges["1", "3"]["relation"] == "has_part"
63+
# CHEBI:133004 is_a CHEBI:22750
64+
assert g.has_edge("133004", "22750")
65+
assert g.edges["133004", "22750"]["relation"] == "is_a"
66+
# CHEBI:22750 is_a CHEBI:24921
67+
assert g.has_edge("22750", "24921")
68+
assert g.edges["22750", "24921"]["relation"] == "is_a"
6969

7070
def test_total_edge_count(self):
7171
g = build_chebi_graph(SAMPLE_OBO)
72-
# 1->2 (is_a), 1->3 (has_part), 2->5 (is_a)
72+
# 10->133004 (is_a), 133004->22750 (is_a), 22750->24921 (is_a)
7373
assert len(g.edges) == 3
7474

75-
def test_typedef_stanza_excluded(self):
76-
g = build_chebi_graph(SAMPLE_OBO)
77-
# "has_part" Typedef id is not numeric CHEBI ID, should not appear as node
78-
assert "has_part" not in g.nodes
79-
8075
def test_xref_lines_do_not_break_parsing(self, tmp_path):
8176
obo_with_xrefs = tmp_path / "xref.obo"
8277
obo_with_xrefs.write_text(

0 commit comments

Comments
 (0)