ChEB-AI
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 48 additions & 0 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 84 additions & 1 deletion b/‎README.md‎
Lines changed: 84 additions & 1 deletion
diff --git a/‎chebi_utils/__init__.py‎
Lines changed: 13 additions & 0 deletions b/‎chebi_utils/__init__.py‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎chebi_utils/downloader.py‎
Lines changed: 53 additions & 0 deletions b/‎chebi_utils/downloader.py‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎chebi_utils/obo_extractor.py‎
Lines changed: 103 additions & 0 deletions b/‎chebi_utils/obo_extractor.py‎
Lines changed: 103 additions & 0 deletions
diff --git a/‎chebi_utils/sdf_extractor.py‎
Lines changed: 85 additions & 0 deletions b/‎chebi_utils/sdf_extractor.py‎
Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,48 @@
+# Continuous-integration workflow: ruff lint/format checks plus the pytest suite.
+name: CI
+
+# Triggered by pushes to any branch and by pull requests targeting any branch.
+on:
+  push:
+    branches: ["**"]
+  pull_request:
+    branches: ["**"]
+
+jobs:
+  # Static checks; run once on a single Python version.
+  lint:
+    name: Lint (ruff)
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      # Only ruff is installed here; the package itself is not needed for linting.
+      - name: Install ruff
+        run: pip install ruff
+
+      # `--check` reports formatting differences without rewriting files.
+      - name: Check formatting
+        run: ruff format --check .
+
+      - name: Check linting
+        run: ruff check .
+
+  # Unit tests; one job per entry in the Python version matrix.
+  test:
+    name: Unit Tests
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      # Editable install of the package together with its `dev` extra.
+      - name: Install package and test dependencies
+        run: pip install -e ".[dev]"
+
+      - name: Run tests
+        run: pytest tests/ -v
@@ -1,2 +1,85 @@
 # python-chebi-utils
-Common processing functionality for the ChEBI ontology (e.g. extraction of molecules, classes and relations).
+
+Common processing functionality for the ChEBI ontology — download data files, extract classes and relations, extract molecules, and generate stratified train/val/test splits.
+
+## Installation
+
+```bash
+pip install chebi-utils
+```
+
+For development (includes `pytest` and `ruff`):
+
+```bash
+pip install -e ".[dev]"
+```
+
+## Features
+
+### Download ChEBI data files
+
+```python
+from chebi_utils import download_chebi_obo, download_chebi_sdf
+
+obo_path = download_chebi_obo(dest_dir="data/")   # downloads chebi.obo
+sdf_path = download_chebi_sdf(dest_dir="data/")   # downloads ChEBI_complete.sdf.gz, saved as chebi.sdf.gz
+```
+
+Files are fetched from the [EBI FTP server](https://ftp.ebi.ac.uk/pub/databases/chebi/).
+
+### Extract ontology classes and relations
+
+```python
+from chebi_utils import extract_classes, extract_relations
+
+classes = extract_classes("chebi.obo")
+# DataFrame: id, name, definition, is_obsolete
+
+relations = extract_relations("chebi.obo")
+# DataFrame: source_id, target_id, relation_type  (is_a, has_role, …)
+```
+
+### Extract molecules
+
+```python
+from chebi_utils import extract_molecules
+
+molecules = extract_molecules("chebi.sdf.gz")
+# DataFrame: chebi_id, name, smiles, inchi, inchikey, formula, charge, mass, …
+```
+
+Both plain `.sdf` and gzip-compressed `.sdf.gz` files are supported.
+
+### Generate train/val/test splits
+
+```python
+from chebi_utils import create_splits
+
+splits = create_splits(molecules, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1)
+train_df = splits["train"]
+val_df   = splits["val"]
+test_df  = splits["test"]
+```
+
+Pass `stratify_col` to preserve class proportions across splits:
+
+```python
+splits = create_splits(classes, stratify_col="is_obsolete", seed=42)
+```
+
+## Running Tests
+
+```bash
+pytest tests/ -v
+```
+
+## Linting
+
+```bash
+ruff check .
+ruff format --check .
+```
+
+## CI/CD
+
+A GitHub Actions workflow (`.github/workflows/ci.yml`) runs on every push and pull request: ruff lint and format checks on Python 3.12, and the full test suite on Python 3.10, 3.11, and 3.12.
@@ -0,0 +1,13 @@
+from chebi_utils.downloader import download_chebi_obo, download_chebi_sdf
+from chebi_utils.obo_extractor import extract_classes, extract_relations
+from chebi_utils.sdf_extractor import extract_molecules
+from chebi_utils.splitter import create_splits
+
+__all__ = [
+    "download_chebi_obo",
+    "download_chebi_sdf",
+    "extract_classes",
+    "extract_relations",
+    "extract_molecules",
+    "create_splits",
+]
@@ -0,0 +1,53 @@
+"""Download ChEBI data files from the EBI FTP server."""
+
+from __future__ import annotations
+
+import urllib.request
+from pathlib import Path
+
+CHEBI_OBO_URL = "https://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.obo"
+CHEBI_SDF_URL = "https://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete.sdf.gz"
+
+
+def download_chebi_obo(dest_dir: str | Path = ".", filename: str = "chebi.obo") -> Path:
+    """Download the ChEBI OBO ontology file from the EBI FTP server.
+
+    Parameters
+    ----------
+    dest_dir : str or Path
+        Directory where the file will be saved (created if it doesn't exist).
+    filename : str
+        Name for the downloaded file.
+
+    Returns
+    -------
+    Path
+        Path to the downloaded file.
+    """
+    dest_dir = Path(dest_dir)
+    dest_dir.mkdir(parents=True, exist_ok=True)
+    dest_path = dest_dir / filename
+    urllib.request.urlretrieve(CHEBI_OBO_URL, dest_path)
+    return dest_path
+
+
+def download_chebi_sdf(dest_dir: str | Path = ".", filename: str = "chebi.sdf.gz") -> Path:
+    """Download the ChEBI SDF file from the EBI FTP server.
+
+    Parameters
+    ----------
+    dest_dir : str or Path
+        Directory where the file will be saved (created if it doesn't exist).
+    filename : str
+        Name for the downloaded file.
+
+    Returns
+    -------
+    Path
+        Path to the downloaded file.
+    """
+    dest_dir = Path(dest_dir)
+    dest_dir.mkdir(parents=True, exist_ok=True)
+    dest_path = dest_dir / filename
+    urllib.request.urlretrieve(CHEBI_SDF_URL, dest_path)
+    return dest_path
@@ -0,0 +1,103 @@
+"""Extract classes and relations from ChEBI OBO ontology files."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pandas as pd
+
+
+def _parse_obo_stanzas(filepath: str | Path) -> list[dict[str, list[str]]]:
+    """Parse an OBO file and return a list of stanza dicts."""
+    stanzas: list[dict[str, list[str]]] = []
+    current_stanza: dict[str, list[str]] | None = None
+
+    with open(filepath, encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line or line.startswith("!"):
+                continue
+            if line.startswith("["):
+                if current_stanza is not None:
+                    stanzas.append(current_stanza)
+                stanza_type = line.strip("[]")
+                current_stanza = {"_type": [stanza_type]}
+            elif current_stanza is not None and ":" in line:
+                key, _, value = line.partition(":")
+                current_stanza.setdefault(key.strip(), []).append(value.strip())
+
+    if current_stanza is not None:
+        stanzas.append(current_stanza)
+
+    return stanzas
+
+
+def extract_classes(filepath: str | Path) -> pd.DataFrame:
+    """Extract ontology classes (terms) from a ChEBI OBO file.
+
+    Parameters
+    ----------
+    filepath : str or Path
+        Path to the ChEBI OBO file.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with columns: id, name, definition, is_obsolete.
+    """
+    stanzas = _parse_obo_stanzas(filepath)
+    rows = []
+    for stanza in stanzas:
+        if stanza.get("_type", [None])[0] != "Term":
+            continue
+        row = {
+            "id": stanza.get("id", [None])[0],
+            "name": stanza.get("name", [None])[0],
+            "definition": stanza.get("def", [None])[0],
+            "is_obsolete": stanza.get("is_obsolete", ["false"])[0] == "true",
+        }
+        rows.append(row)
+    return pd.DataFrame(rows, columns=["id", "name", "definition", "is_obsolete"])
+
+
+def extract_relations(filepath: str | Path) -> pd.DataFrame:
+    """Extract class relations from a ChEBI OBO file.
+
+    Parameters
+    ----------
+    filepath : str or Path
+        Path to the ChEBI OBO file.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with columns: source_id, target_id, relation_type.
+    """
+    stanzas = _parse_obo_stanzas(filepath)
+    rows = []
+
+    for stanza in stanzas:
+        if stanza.get("_type", [None])[0] != "Term":
+            continue
+        source_id = stanza.get("id", [None])[0]
+        if source_id is None:
+            continue
+
+        for is_a_val in stanza.get("is_a", []):
+            target_id = is_a_val.split("!")[0].strip()
+            rows.append({"source_id": source_id, "target_id": target_id, "relation_type": "is_a"})
+
+        for rel_val in stanza.get("relationship", []):
+            parts = rel_val.split()
+            if len(parts) >= 2:
+                rel_type = parts[0]
+                target_id = parts[1].split("!")[0].strip()
+                rows.append(
+                    {
+                        "source_id": source_id,
+                        "target_id": target_id,
+                        "relation_type": rel_type,
+                    }
+                )
+
+    return pd.DataFrame(rows, columns=["source_id", "target_id", "relation_type"])
@@ -0,0 +1,85 @@
+"""Extract molecule data from ChEBI SDF files."""
+
+from __future__ import annotations
+
+import gzip
+from pathlib import Path
+
+import pandas as pd
+
+
+def _iter_sdf_records(filepath: str | Path):
+    """Yield individual SDF records as strings."""
+    opener = gzip.open if str(filepath).endswith(".gz") else open
+    current_record: list[str] = []
+
+    with opener(filepath, "rt", encoding="utf-8") as f:
+        for line in f:
+            current_record.append(line)
+            if line.strip() == "$$$$":
+                yield "".join(current_record)
+                current_record = []
+
+
+def _parse_sdf_record(record: str) -> dict[str, str]:
+    """Parse a single SDF record into a dict of data-item properties."""
+    props: dict[str, str] = {}
+    lines = record.splitlines()
+
+    if lines:
+        props["mol_name"] = lines[0].strip()
+
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+        if line.startswith("> <") and line.rstrip().endswith(">"):
+            key = line.strip()[3:-1]
+            value_lines: list[str] = []
+            i += 1
+            while i < len(lines) and lines[i].strip() not in ("", "$$$$"):
+                value_lines.append(lines[i].strip())
+                i += 1
+            props[key] = "\n".join(value_lines)
+        else:
+            i += 1
+
+    return props
+
+
+def extract_molecules(filepath: str | Path) -> pd.DataFrame:
+    """Extract molecule data from a ChEBI SDF file.
+
+    Supports both plain (``.sdf``) and gzip-compressed (``.sdf.gz``) files.
+
+    Parameters
+    ----------
+    filepath : str or Path
+        Path to the ChEBI SDF (or SDF.gz) file.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with one row per molecule. Columns depend on the properties
+        present in the file. Common columns (renamed for convenience):
+        chebi_id, name, inchi, inchikey, smiles, formula, charge, mass.
+    """
+    records = [_parse_sdf_record(r) for r in _iter_sdf_records(filepath)]
+
+    if not records:
+        return pd.DataFrame()
+
+    df = pd.DataFrame(records)
+
+    rename_map = {
+        "ChEBI ID": "chebi_id",
+        "ChEBI Name": "name",
+        "InChI": "inchi",
+        "InChIKey": "inchikey",
+        "SMILES": "smiles",
+        "Formulae": "formula",
+        "Charge": "charge",
+        "Mass": "mass",
+    }
+    df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})
+
+    return df