merge dev

sfluegel05 · sfluegel05 · commit d4cdf02d3061 · 2026-02-27T14:25:03.000+01:00
diff --git a/.gitignore b/.gitignore
@@ -98,7 +98,7 @@ ipython_config.py
 #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
-#uv.lock
+uv.lock
 
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
diff --git a/chebi_utils/__init__.py b/chebi_utils/__init__.py
@@ -1,9 +1,11 @@
+from chebi_utils.dataset_builder import build_labeled_dataset
 from chebi_utils.downloader import download_chebi_obo, download_chebi_sdf
 from chebi_utils.obo_extractor import build_chebi_graph
 from chebi_utils.sdf_extractor import extract_molecules
 from chebi_utils.splitter import create_multilabel_splits
 
 __all__ = [
+    "build_labeled_dataset",
     "download_chebi_obo",
     "download_chebi_sdf",
     "build_chebi_graph",
diff --git a/chebi_utils/dataset_builder.py b/chebi_utils/dataset_builder.py
@@ -0,0 +1,106 @@
+"""Build a labeled dataset by matching molecules to ChEBI ontology classes."""
+
+from __future__ import annotations
+
+from collections import Counter
+
+import networkx as nx
+import pandas as pd
+
+from chebi_utils.obo_extractor import get_hierarchy_subgraph
+
+
+def _count_molecules_per_class(closure: nx.DiGraph, mol_ids: set[str]) -> dict[str, int]:
+    """Count how many molecules fall under each ontology class.
+
+    Uses the precomputed transitive closure so that ancestor look-ups are O(1).
+
+    Parameters
+    ----------
+    closure : nx.DiGraph
+        Transitive closure of the ``is_a`` hierarchy.
+    mol_ids : set[str]
+        ChEBI IDs of molecules with valid ``Mol`` objects.
+
+    Returns
+    -------
+    dict[str, int]
+        Mapping from ChEBI class ID to count of molecules in its subtree.
+    """
+    counts: Counter[str] = Counter()
+    for mid in mol_ids:
+        if mid in closure:
+            for ancestor in closure.successors(mid):
+                counts[ancestor] += 1
+        # The molecule itself always counts for its own class
+        counts[mid] += 1
+    return dict(counts)
+
+
+def build_labeled_dataset(
+    chebi_graph: nx.DiGraph,
+    molecules: pd.DataFrame,
+    min_molecules: int = 50,
+) -> tuple[pd.DataFrame, list[str]]:
+    """Build a labeled dataset matching molecules to ontology classes.
+
+    Each molecule is assigned to every selected label class that it belongs to
+    (directly or through a chain of ``is_a`` relationships).  Only classes with
+    at least *min_molecules* descendant molecules (including indirect
+    descendants) are retained as labels.
+
+    Labels are encoded **one-hot**: the returned DataFrame contains one boolean
+    column per selected label.
+
+    Parameters
+    ----------
+    chebi_graph : nx.DiGraph
+        Full ChEBI ontology graph from :func:`build_chebi_graph`.
+    molecules : pd.DataFrame
+        DataFrame from :func:`extract_molecules` containing at least
+        ``chebi_id`` and ``mol`` columns.
+    min_molecules : int
+        Minimum number of descendant molecules a class must have to be
+        selected as a label (default 50).
+
+    Returns
+    -------
+    tuple[pd.DataFrame, list[str]]
+        A tuple of:
+        - DataFrame with columns ``chebi_id``, ``mol``, and one boolean
+          column per selected label.  Each row represents one molecule.
+        - Sorted list of selected label ChEBI IDs.
+    """
+    # Keep only molecules with a valid Mol object
+    mol_df = molecules[molecules["mol"].notna()].copy()
+    mol_ids = set(mol_df["chebi_id"])
+
+    # Build transitive closure of hierarchy once
+    hierarchy = get_hierarchy_subgraph(chebi_graph)
+    closure = nx.transitive_closure_dag(hierarchy)
+
+    # Determine label set
+    counts = _count_molecules_per_class(closure, mol_ids)
+    labels = {cls for cls, count in counts.items() if count >= min_molecules}
+    sorted_labels = sorted(labels)
+
+    if not labels:
+        return pd.DataFrame(columns=["chebi_id", "mol"]), sorted_labels
+
+    # For each molecule compute its ancestor set (including itself) via closure
+    label_matrix: list[dict[str, bool]] = []
+    for cid in mol_df["chebi_id"]:
+        if cid in closure:
+            ancestors = set(closure.successors(cid)) | {cid}
+        else:
+            ancestors = {cid}
+        mol_labels = ancestors & labels
+        label_matrix.append({lbl: lbl in mol_labels for lbl in sorted_labels})
+
+    label_df = pd.DataFrame(label_matrix, index=mol_df.index)
+    result = pd.concat(
+        [mol_df[["chebi_id", "mol"]].reset_index(drop=True), label_df.reset_index(drop=True)],
+        axis=1,
+    )
+
+    return result, sorted_labels
diff --git a/chebi_utils/sdf_extractor.py b/chebi_utils/sdf_extractor.py
@@ -9,6 +9,8 @@
 import pandas as pd
 from rdkit import Chem
 
+from chebi_utils.obo_extractor import _chebi_id_to_str
+
 
 def _sanitize_molecule(mol: Chem.Mol) -> Chem.Mol:
     """Sanitize molecule, mirroring the ChEBI molecule processing."""
@@ -157,5 +159,14 @@ def extract_molecules(filepath: str | Path) -> pd.DataFrame:
 
     chebi_ids = df["chebi_id"].tolist() if "chebi_id" in df.columns else [None] * len(df)
     df["mol"] = [_parse_molblock(mb, cid) for mb, cid in zip(molblocks, chebi_ids, strict=False)]
+    df["chebi_id"] = df["chebi_id"].apply(_chebi_id_to_str)
+
+    # exclude records without a valid mol, but keep the same columns for consistency
+    df = df[df["mol"].notna()]
 
     return df
+
+
+if __name__ == "__main__":
+    df = extract_molecules("data/chebi.sdf.gz")
+    print(df.head())
diff --git a/tests/test_dataset_builder.py b/tests/test_dataset_builder.py
@@ -0,0 +1,165 @@
+"""Tests for chebi_utils.dataset_builder."""
+
+from __future__ import annotations
+
+import networkx as nx
+import pandas as pd
+import pytest
+from rdkit import Chem
+
+from chebi_utils.dataset_builder import (
+    build_labeled_dataset,
+)
+
+
+def _make_mol(smiles: str) -> Chem.Mol:
+    """Helper to create a sanitised Mol from SMILES."""
+    return Chem.MolFromSmiles(smiles)
+
+
+@pytest.fixture
+def simple_graph() -> nx.DiGraph:
+    """Build a small ChEBI-like directed graph (child -> parent via is_a).
+
+    Hierarchy::
+
+        A ─is_a─> B ─is_a─> D
+        A ─is_a─> C ─is_a─> D
+        E ─is_a─> C
+
+    Ontology descendants:
+        D: {A, B, C, E}
+        C: {A, E}
+        B: {A}
+        A: (none)
+        E: (none)
+    """
+    g = nx.DiGraph()
+    for node in ["A", "B", "C", "D", "E"]:
+        g.add_node(node, name=node, smiles=None, subset=None)
+
+    g.add_edge("A", "B", relation="is_a")
+    g.add_edge("A", "C", relation="is_a")
+    g.add_edge("B", "D", relation="is_a")
+    g.add_edge("C", "D", relation="is_a")
+    g.add_edge("E", "C", relation="is_a")
+    return g
+
+
+@pytest.fixture
+def simple_molecules() -> pd.DataFrame:
+    """Three molecules with IDs A, B, E (matching graph nodes)."""
+    return pd.DataFrame(
+        {
+            "chebi_id": ["A", "B", "E"],
+            "mol": [_make_mol("C"), _make_mol("CC"), _make_mol("CCC")],
+        }
+    )
+
+
+class TestBuildLabeledDataset:
+    def test_returns_dataframe_with_base_columns(self, simple_graph, simple_molecules):
+        df, labels = build_labeled_dataset(simple_graph, simple_molecules, min_molecules=2)
+        assert "chebi_id" in df.columns
+        assert "mol" in df.columns
+        # Label columns should also be present
+        for lbl in labels:
+            assert lbl in df.columns
+
+    def test_one_row_per_molecule(self, simple_graph, simple_molecules):
+        df, _ = build_labeled_dataset(simple_graph, simple_molecules, min_molecules=2)
+        # 3 molecules with valid Mol -> 3 rows
+        assert len(df) == 3
+
+    def test_label_columns_are_boolean(self, simple_graph, simple_molecules):
+        df, labels = build_labeled_dataset(simple_graph, simple_molecules, min_molecules=2)
+        for lbl in labels:
+            assert df[lbl].dtype == bool
+
+    def test_one_hot_values_correct(self, simple_graph, simple_molecules):
+        # Labels (min=2): {B, C, D}
+        # A -> ancestors {A,B,C,D} -> B=True, C=True, D=True
+        # B -> ancestors {B,D}     -> B=True, C=False, D=True
+        # E -> ancestors {E,C,D}   -> B=False, C=True, D=True
+        df, _ = build_labeled_dataset(simple_graph, simple_molecules, min_molecules=2)
+        a_row = df[df["chebi_id"] == "A"].iloc[0]
+        assert a_row["B"] == True  # noqa: E712
+        assert a_row["C"] == True  # noqa: E712
+        assert a_row["D"] == True  # noqa: E712
+
+        b_row = df[df["chebi_id"] == "B"].iloc[0]
+        assert b_row["B"] == True  # noqa: E712
+        assert b_row["C"] == False  # noqa: E712
+        assert b_row["D"] == True  # noqa: E712
+
+        e_row = df[df["chebi_id"] == "E"].iloc[0]
+        assert e_row["B"] == False  # noqa: E712
+        assert e_row["C"] == True  # noqa: E712
+        assert e_row["D"] == True  # noqa: E712
+
+    def test_mol_objects_preserved(self, simple_graph, simple_molecules):
+        df, _ = build_labeled_dataset(simple_graph, simple_molecules, min_molecules=1)
+        for _, row in df.iterrows():
+            assert isinstance(row["mol"], Chem.rdchem.Mol)
+
+    def test_none_mols_are_excluded(self, simple_graph):
+        mol_df = pd.DataFrame(
+            {
+                "chebi_id": ["A", "B"],
+                "mol": [_make_mol("C"), None],
+            }
+        )
+        df, _ = build_labeled_dataset(simple_graph, mol_df, min_molecules=1)
+        assert set(df["chebi_id"]) == {"A"}
+
+    def test_high_threshold_returns_empty(self, simple_graph, simple_molecules):
+        df, labels = build_labeled_dataset(simple_graph, simple_molecules, min_molecules=100)
+        assert df.empty
+        assert labels == []
+
+    def test_molecule_not_in_graph(self, simple_graph):
+        """Molecules with chebi_ids not present in the graph are still handled."""
+        mol_df = pd.DataFrame(
+            {
+                "chebi_id": ["Z"],
+                "mol": [_make_mol("C")],
+            }
+        )
+        df, labels = build_labeled_dataset(simple_graph, mol_df, min_molecules=1)
+        assert "Z" in labels
+        assert df.iloc[0]["Z"] == True  # noqa: E712
+
+    def test_non_isa_edges_ignored(self):
+        """Only is_a edges should be used for hierarchy traversal."""
+        g = nx.DiGraph()
+        for n in ["X", "Y", "Z"]:
+            g.add_node(n, name=n, smiles=None, subset=None)
+        g.add_edge("X", "Y", relation="is_a")
+        g.add_edge("X", "Z", relation="has_part")
+
+        mol_df = pd.DataFrame(
+            {
+                "chebi_id": ["X"],
+                "mol": [_make_mol("C")],
+            }
+        )
+        df, labels = build_labeled_dataset(g, mol_df, min_molecules=1)
+        # X is_a Y, so labels should include X and Y (but NOT Z via has_part)
+        assert set(labels) == {"X", "Y"}
+        assert df.iloc[0]["X"] == True  # noqa: E712
+        assert df.iloc[0]["Y"] == True  # noqa: E712
+
+    def test_empty_molecules_dataframe(self, simple_graph):
+        mol_df = pd.DataFrame(columns=["chebi_id", "mol"])
+        df, labels = build_labeled_dataset(simple_graph, mol_df, min_molecules=1)
+        assert df.empty
+        assert labels == []
+
+    def test_returned_labels_list_sorted(self, simple_graph, simple_molecules):
+        _, labels = build_labeled_dataset(simple_graph, simple_molecules, min_molecules=2)
+        assert labels == ["B", "C", "D"]
+
+    def test_returned_labels_match_columns(self, simple_graph, simple_molecules):
+        df, labels = build_labeled_dataset(simple_graph, simple_molecules, min_molecules=1)
+        label_cols = [c for c in df.columns if c not in ("chebi_id", "mol")]
+        assert label_cols == labels