diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b528987..c9ca93b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -30,11 +30,5 @@ jobs: python -m pip install --upgrade pip pip install -e .[dev] - - name: ruff lint - run: ruff check src - - - name: ruff format check - run: ruff format --check src - - name: run tests run: pytest -q diff --git a/.gitignore b/.gitignore index ba66f6c..2b0003b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,8 @@ .DS_Store .pytest_cache/ -.ruff_cache/ .vscode/ .idea/ scratch/ __pycache__/ -dist/ \ No newline at end of file +dist/ +lib/ \ No newline at end of file diff --git a/README.md b/README.md index bdccc4c..89cd3c8 100644 --- a/README.md +++ b/README.md @@ -59,28 +59,15 @@ In either case a the output folder will contain a log file together with the res Any column, field, or property in the input file, either CSV, TSV, SDF, or JSON, is preserved as props in the output JSON or JSONL. -Stereochemistry parsing is supported by supplying the `--matchstereochem` flag. This flag annotates the identifier of every identified monomer with R/S and E/Z annotation where applicable. +Stereochemistry parsing is supported by supplying the `-c` flag. -Result JSONs or lines from a JSONL file can be loaded into Python using RetroMol's `Result` class for further downstream analyses: +Result JSONs or lines from a JSONL file can be loaded into Python using RetroMol's `Result` class for further downstream analyses. 
-```python -from retromol.io import Result - -result = Result.from_serialized() - -# e.g., calculate coverage -coverage = result.best_total_coverage() -``` - -Check out the [examples](https://github.com/moltools/RetroMol/tree/main/examples) folder for example scripts demonstrating how to use RetroMol as a library: -* [Read out and align linear monomer readouts](https://github.com/moltools/RetroMol/tree/main/examples/align_compounds.py) -* [Calculate and cluster monomer fingerprints](https://github.com/moltools/RetroMol/tree/main/examples/cluster_compounds.py) +Check out the [examples](https://github.com/moltools/RetroMol/tree/main/examples) folder for example scripts demonstrating how to use RetroMol as a library. ### Using custom rules -RetroMol comes with a default set of retrosynthetic rules for modular natural products. - -You can also provide your own custom rules and configurations. See [the custom rules documentation](docs/customize_rules.md) for details on the YAML formats supported. +RetroMol comes with a default set of retrosynthetic rules for modular natural products. See [the default rules](src/retromol/data/) for examples of the included rules. ## Attribution @@ -101,8 +88,6 @@ in "editable" mode with the development dependencies: git clone git+https://github.com/MolTools/RetroMol.git cd RetroMol pip install -e .[dev] -pip install hatch # if you don't have hatch installed yet; needed for building the package -hatch env create dev # create the development environment ``` You can now make code changes locally and have them immediately available for testing. diff --git a/docs/customize_rules.md b/docs/customize_rules.md deleted file mode 100644 index 82e9a6a..0000000 --- a/docs/customize_rules.md +++ /dev/null @@ -1,142 +0,0 @@ -# Custom rule file formats - -Below are the specifications for custom reaction and matching rule files, as well as wave configuration file, used by RetroMol. These files are written in YAML format. 
Reaction and matching rule files are dumps of lists of rules, while the configuration file structures them into a workflow. - -## Wave configuration - -### Creating a wave_config.yml - -RetroMol processes molecules in waves: ordered stages that (1) apply selected reaction rules to split or transform the current “frontier” structures, and then (2) match the resulting fragments to motif classes. The behavior of each stage is controlled by a YAML file, wave_config.yml. - -If you don’t pass a config, RetroMol uses `retromol/data/default_wave_config.yml` (the same structure as the example below). You can point the CLI at a custom file with --wave-config (see `src/retromol/cli.py`) or pass it via the API (api.run_retromol(..., wave_configs=...)). - -### File structure - -A wave config is a YAML list, where each item is one wave. Waves run top-to-bottom. Each wave supports the keys below. - -Supported keys (per wave): -- `wave_name` (`string`, required): Human-readable label stored on nodes produced in that wave (see `resolve_mol` in `src/retromol/apply.py`). - -- `reaction_groups` (`list[string]`, required): Names of reaction-rule groups to apply in this wave. These must correspond to groups defined in your reaction rules YAML (loaded via `rules.load_rules_from_files(...)`). Examples: preprocessing, linearization, NRP disassembly, PK disassembly, etc. - -- `matching_groups` (`list[string]`, optional): If provided, only matching rules whose groups intersect this list are considered when assigning node identities in this wave (see filtering in `apply.resolve_mol`). If omitted, all matching rules are eligible. Names correspond to groups defined in your matching rules YAML (loaded via `rules.load_rules_from_files(...)`). Examples: amino acid, polyketide building block, glycosylation, etc. 
- -- `only_leaf_nodes` (`bool`, optional; default: true): After applying the reaction rules in this wave, RetroMol selects which reaction-graph nodes to turn into motif graph nodes: - * `true`: use leaf nodes created in the previous wave only (preferred; avoids duplicating intermediate steps). - * `false`: use all nodes created in the previous wave. - - Note: If a “leafs only” wave produces no leaves (e.g., an uncontested rule returns itself), RetroMol falls back to all nodes for that wave. - -- `parse_identified_nodes` (`bool`, optional; default: `false`): -Controls eligibility of nodes for re-parsing at the current deepest nesting level (see `find_eligible_nodes` in `src/retromol/api.py`): - * `false`: skip nodes that already have an identity. - * `true`: allow re-parsing nodes even if they already carry an identity. This is useful when one wave (e.g., linearization) changes structures in a way that reveals new tailoring motifs for the next wave. - - -### Execution model (what happens under the hood) - -- Wave 1 operates on the input molecule to produce the initial motif graph (see `api.run_retromol` > first call to `apply.resolve_mol`). - -- Subsequent waves operate on the current frontier: nodes at the deepest nesting level whose graph is `None` (and, unless `parse_identified_nodes: true`, also have no identity). - -- In each wave: - * Reaction rules are chosen from `reaction_groups`. - * The reaction graph is computed; RetroMol selects leaf or all nodes according to `only_leaf_nodes`. - * Selected nodes are matched to motifs using the (optionally filtered) `matching_groups`. - * Each selected node becomes a node in the motif graph and is annotated with `wave_name`, `identity`, `props`, `smiles`, `smiles_no_tags`, and `tags`. - * For nodes that should be further expanded in later waves, a nested graph is attached. - -Note: Stereochemistry matching is controlled separately (CLI `--matchstereochem` or API `match_stereochemistry`), not in the wave config. 
- -### Minimal example - -A minimal file needs just a name and at least one reaction group: - -```yaml -- wave_name: preprocessing - reaction_groups: - - preprocessing -``` - -This will apply the preprocessing reaction rules and match against all matching rules. - -## Reaction rules - -The default reaction rules can be found at `retromol/data/default_reaction_rules.yml`. - -A minimal reaction rules file looks like: - -```yaml -# reactions.yml -- rid: reverse O-methylation - smarts: "[O;D2:1][CH3:2]>>[O:1].[C:2]" - groups: [preprocessing] - props: {} # optional - -- rid: break ester bond (intermolecular) - smarts: "[C:1][C;!R:2](=[O:3])[O;!R:4][C:5]>>[C:1][C:2](=[O:3])[OH].[OH:4][C:5]" - groups: [linearization] -``` - -```yaml -- rid: # unique human-readable identifier - smarts: # RDKit reaction SMARTS (LHS>>RHS) - groups: [, ...] # arbitrary grouping labels - props: # optional metadata and global conditions - conditions: - # pre-conditions (whole molecule) - reactant: - requires_any: ["", ...] - requires_all: ["", ...] - forbids_any: ["", ...] - min_counts: {"": , ...} - max_counts: {"": , ...} - ring_count: {min: , max: } - atom_count: {min: , max: } - total_charge: {min: , max: } - custom_props: {has_metal: , is_macrocycle: } - # post-conditions (applied to each product) - product: - requires_any: ["", ...] - requires_all: ["", ...] - forbids_any: ["", ...] - min_counts: {"": , ...} - max_counts: {"": , ...} - ring_count: {min: , max: } - atom_count: {min: , max: } - total_charge: {min: , max: } - custom_props: {has_metal: , is_macrocycle: } -``` - -What each field means: -* `rid`: A stable, unique identifier (used for logging/debugging). -* `smarts`: Reaction SMARTS using RDKit’s format, including mapped atoms on both sides. Element identities and multiplicities for each map number must match (the loader enforces this). -* `groups`: Tags used to select rule subsets (preprocessing, linearization, NRP disassembly, etc.). 
-* `props.conditions.reactant`: “Global pre-filter” — the reaction only runs if the whole reactant satisfies these. -* `props.conditions.product`: “Global post-filter” — each product must satisfy these (result dropped otherwise). - -Tip: Use local constraints inside the SMARTS for atom-level logic (e.g., “:1 must not be acyl”), and global conditions for whole-molecule predicates (e.g., “forbid nitro anywhere”). - -## Matching rules - -The default matching rules can be found at `retromol/data/default_matching_rules.yml`. - -A minimal matching rules file looks like: - -```yaml -- rid: valine - mol: "CC(C)C(N)C(=O)O" - groups: [amino_acids] - props: {} -``` - -Matching uses exact topology equality (same atom/bond counts) plus substructure match. - -If you need stereochemistry matching, it’s supported by adding ste_mols in code (optional advanced usage). - -```yaml -- rid: - mol: "" # exact structure to match (full match, not subgraph) - groups: [, ...] - props: {} -``` diff --git a/examples/.gitkeep b/examples/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/examples/align_compounds.py b/examples/align_compounds.py deleted file mode 100644 index 78982be..0000000 --- a/examples/align_compounds.py +++ /dev/null @@ -1,64 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Example script that shows how to align primary sequences of two compounds using RetroMol.""" - -from typing import List, Tuple - -from versalign.aligner import setup_aligner -from versalign.msa import calc_msa -from versalign.printing import format_alignment -from versalign.scoring import create_substituion_matrix_dynamically - -from retromol.api import run_retromol_with_timeout -from retromol.io import Input as RetroMolInput -from retromol.readout import linear_readout - - -def main() -> None: - """Main function to align two compounds.""" - - # First parse both compounds with RetroMol - inp1 = RetroMolInput("dictyostatin", 
r"C[C@H]1CC[C@H]([C@@H]([C@@H](OC(=O)/C=C\C=C\[C@H]([C@H](C[C@@H](/C=C\[C@@H]([C@@H]([C@H](C1)C)O)C)O)O)C)[C@@H](C)/C=C\C=C)C)O") - inp2 = RetroMolInput("discodermolide", r"C[C@H]1[C@@H](OC(=O)[C@@H]([C@H]1O)C)C[C@@H](/C=C\[C@H](C)[C@@H]([C@@H](C)/C=C(/C)\C[C@H](C)[C@H]([C@H](C)[C@H]([C@@H](C)/C=C\C=C)OC(=O)N)O)O)O") - res1 = run_retromol_with_timeout(inp1) # uses default ruleset - res2 = run_retromol_with_timeout(inp2) # uses default ruleset - cov1 = res1.best_total_coverage() - cov2 = res2.best_total_coverage() - print(f"Coverage for dictyostatin: {cov1:.1%}") - print(f"Coverage for discodermolide: {cov2:.1%}") - - # Get linear readouts from both compounds - readout1 = linear_readout(res1, require_identified=True) - readout2 = linear_readout(res2, require_identified=True) - readouts = [("dictyostatin", readout1), ("discodermolide", readout2)] - - # Extract primary sequences from readouts - records: List[Tuple[str, str]] = [] - for label, readout in readouts: - for level_idx, level in enumerate(readout["levels"]): - for path_idx, path in enumerate(level["strict_paths"]): - path = path["ordered_monomers"] - if len(path) <= 3: continue # skip too short paths - seq_fwd = [m["identity"] for m in path] - seq_rev = list(reversed(seq_fwd)) - records.append((f"{label}_{level_idx}_{path_idx}_fwd", seq_fwd)) - records.append((f"{label}_{level_idx}_{path_idx}_rev", seq_rev)) - - # Unzip labels and sequences - labels, seqs = zip(*records) - - # Align sequences using versalign - objs = list(set([x for seq in seqs for x in seq])) + ["-"] # include gap character - objs.sort() - sm, _ = create_substituion_matrix_dynamically(objs) - aligner = setup_aligner(sm, mode="global") - msa, order = calc_msa(aligner, seqs, gap_repr="-") - reordered_labels = [labels[i] for i in order] - print("\n" + format_alignment(msa, names=reordered_labels)) - - # NOTE: see versalign documentation for more alignment options, including building - # more elaborate substitution matrices. 
- - -if __name__ == "__main__": - main() diff --git a/examples/cluster_compounds.py b/examples/cluster_compounds.py deleted file mode 100644 index 3f0e81b..0000000 --- a/examples/cluster_compounds.py +++ /dev/null @@ -1,102 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Example script that shows how to cluster retrobiosynthetic fingerprints of multiple compounds using RetroMol.""" - -from typing import List, Tuple - -import matplotlib.pyplot as plt -import numpy as np -from numpy.typing import NDArray -from scipy.cluster.hierarchy import dendrogram, linkage -from scipy.spatial.distance import squareform - -from retromol.api import run_retromol_with_timeout -from retromol.fingerprint import ( - FingerprintGenerator, - NameSimilarityConfig, - cosine_similarity, - polyketide_family_of -) -from retromol.helpers import iter_json -from retromol.io import Input as RetroMolInput, Result -from retromol.rules import get_path_default_matching_rules - - -COMPOUNDS = [ - ("nocardichelin_B", r"CCCCCCCCCCC/C=C\C(=O)N(CCCCCNC(=O)CCC(=O)N(CCCCCNC(=O)[C@@H]1COC(=N1)C2=CC=CC=C2O)O)O"), - ("desferrioxamin", r"CC(=O)N(O)CCCCCNC(=O)CCC(=O)N(O)CCCCCNC(=O)CCC(=O)N(O)CCCCCN"), - ("erythromycin_C", r"CC[C@@H]1[C@@]([C@@H]([C@H](C(=O)[C@@H](C[C@@]([C@@H]([C@H]([C@@H]([C@H](C(=O)O1)C)O[C@H]2C[C@@]([C@H]([C@@H](O2)C)O)(C)OC)C)O[C@H]3[C@@H]([C@H](C[C@H](O3)C)N(C)C)O)(C)O)C)C)O)(C)O"), - ("megalomycin_A", r"CC[C@@H]1[C@@]([C@@H]([C@H](C(=O)[C@@H](C[C@@]([C@@H]([C@H]([C@@H]([C@H](C(=O)O1)C)O[C@H]2C[C@@]([C@H]([C@@H](O2)C)O)(C)O)C)O[C@H]3[C@@H]([C@H](C[C@H](O3)C)N(C)C)O)(C)O[C@H]4C[C@H]([C@H]([C@@H](O4)C)O)N(C)C)C)C)O)(C)O"), - ("6-deoxyerytrhonolide", r"CC[C@@H]1[C@@H]([C@@H]([C@H](C(=O)[C@@H](C[C@@H]([C@@H]([C@H]([C@@H]([C@H](C(=O)O1)C)O)C)O)C)C)C)O)C"), - ("daptomycin", 
r"CCCCCCCCCC(=O)N[C@@H](CC1=CNC2=CC=CC=C21)C(=O)N[C@H](CC(=O)N)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@H]3[C@H](OC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H](NC(=O)CNC(=O)[C@@H](NC(=O)[C@H](NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)CNC3=O)CCCN)CC(=O)O)C)CC(=O)O)CO)[C@H](C)CC(=O)O)CC(=O)C4=CC=CC=C4N)C"), - ("discodermolide", r"C[C@H]1[C@@H](OC(=O)[C@@H]([C@H]1O)C)C[C@@H](/C=C\[C@H](C)[C@@H]([C@@H](C)/C=C(/C)\C[C@H](C)[C@H]([C@H](C)[C@H]([C@@H](C)/C=C\C=C)OC(=O)N)O)O)O"), - ("dictyostatin", r"C[C@H]1CC[C@H]([C@@H]([C@@H](OC(=O)/C=C\C=C\[C@H]([C@H](C[C@@H](/C=C\[C@@H]([C@@H]([C@H](C1)C)O)C)O)O)C)[C@@H](C)/C=C\C=C)C)O"), - ("anthracimycin", r"C[C@@H]1/C=C\C=C\[C@H](OC(=O)[C@@H](C(=O)/C=C(/[C@H]2[C@@H]1C=C[C@@H]3[C@@H]2CC=C(C3)C)\O)C)C"), - ("chlorotonil", r"C[C@@H]1/C=C\C=C\[C@@H](OC(=O)[C@H](C(=O)C(C(=O)[C@@H]2[C@H]1C=C[C@H]3[C@H]2[C@@H](C=C(C3)C)C)(Cl)Cl)C)C"), - ("avilamycin", r"COC[C@H]1O[C@H]([C@H]([C@H]([C@@H]1O[C@@H]2O[C@@H]([C@@H]([C@@H]([C@H]2O)O[C@H]3C[C@]4(OC5(O[C@@H]4[C@H](O3)C)C[C@H]([C@@H]([C@H](O5)C)O[C@H]6C[C@H]([C@@H]([C@H](O6)C)OC(c7c(OC)c(Cl)c(O)c(Cl)c7C)=O)O)O)C)OC)C)O)OC)O[C@@H]8OC[C@@H]9O[C@@]%10(O[C@@H]([C@@](C(O)C)([C@@H]%11OCO[C@H]%11%10)O)C)O[C@H]9[C@H]8OC(C(C)C)=O") -] - - -def similarity_matrix(fps: NDArray[np.float32]) -> NDArray[np.float32]: - """ - Compute pairwise similarity between binary fingerprints. - - :param fps: 2D numpy array of shape (n_samples, n_features) with binary fingerprints. - :return: 2D numpy array of shape (n_samples, n_samples) with pairwise cosine similarities. 
- """ - n = fps.shape[0] - sim = np.zeros((n, n), dtype=float) - for i in range(n): - a = fps[i] - for j in range(i, n): - b = fps[j] - sim_ij = cosine_similarity(a, b) - sim[i, j] = sim[j, i] = sim_ij - return sim - - -def main() -> None: - """Main function to align two compounds.""" - - # Setup fingerprint generator - path_default_matching_rules = get_path_default_matching_rules() - collapse_by_name = ["glycosylation", "methylation"] - cfg = NameSimilarityConfig(family_of=polyketide_family_of, symmetric=True, family_repeat_scale=1) - generator = FingerprintGenerator( - matching_rules_yaml=path_default_matching_rules, - collapse_by_name=collapse_by_name, - name_similarity=cfg - ) - - # Parse compounds with RetroMol and get fingerprint readouts - labels, fps = [], [] - for name, smiles in COMPOUNDS: - input_data = RetroMolInput(cid=name, repr=smiles) - result = run_retromol_with_timeout(input_data) - cov = result.best_total_coverage() - print(f"Coverage for {name}: {cov:.1%}") - fingerprint = generator.fingerprint_from_result(result, num_bits=512, counted=True) - # One compound can have multiple readouts / mappings - num_mappings = fingerprint.shape[0] - for idx, _ in enumerate(range(num_mappings)): - labels.append(f"{name}_{idx}") - fps.append(fingerprint) - fps_stack = np.vstack(fps) - print(f"Fingerprint array shape: {fps_stack.shape}") - - # Compute similarity matrix and plot dendrogram - cosine_sim_matrix = similarity_matrix(fps_stack) - print(f"Similarity matrix shape: {cosine_sim_matrix.shape}") - distance_matrix = 1.0 - cosine_sim_matrix - distance_matrix = np.maximum(distance_matrix, 0.0) # kill tiny negatives - condensed_distance = squareform(distance_matrix, checks=False) - Z = linkage(condensed_distance, method="average") - plt.figure(figsize=(12, 6)) - dendrogram(Z, labels=labels, leaf_rotation=90, leaf_font_size=10) - plt.ylabel("Cosine distance", fontweight="bold") - plt.subplots_adjust(bottom=0.35) - plt.show() - - -if __name__ == "__main__": - 
main() diff --git a/pyproject.toml b/pyproject.toml index 5363db0..297d208 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,8 @@ dependencies = [ "numpy", "pandas", "pyyaml", - "ijson" + "ijson", + "matplotlib" ] classifiers = [ @@ -47,8 +48,7 @@ build-backend = "hatchling.build" [project.optional-dependencies] dev = [ - "pytest", - "ruff" + "pytest" ] [tool.hatch.metadata] @@ -58,9 +58,10 @@ allow-direct-references = true packages = ["src/retromol"] sources = ["src"] include = [ - "src/retromol/data/default_matching_rules.yml", - "src/retromol/data/default_reaction_rules.yml", - "src/retromol/data/default_wave_config.yml" + "src/retromol/data/mxn_other.yml", + "src/retromol/data/mxn_pks_chiral.yml", + "src/retromol/data/mxn_pks.yml", + "src/retromol/data/rxn.yml" ] [project.scripts] @@ -74,21 +75,4 @@ features = ["dev"] python = "3.10" [tool.hatch.envs.dev.scripts] -lint = "ruff check ." -fmt = "ruff format ." test = "pytest -q" - -# ------------------------- -# Ruff -# ------------------------- -[tool.ruff] -line-length = 120 -target-version = "py310" -extend-exclude = ["build", "dist", ".venv", "venv", "typings"] - -[tool.ruff.lint] -select = ["E", "F", "I", "UP", "B"] -ignore = [] - -[tool.ruff.lint.per-file-ignores] -"tests/data/**/*" = ["E501"] diff --git a/src/retromol/api.py b/src/retromol/api.py deleted file mode 100644 index d050c9f..0000000 --- a/src/retromol/api.py +++ /dev/null @@ -1,150 +0,0 @@ -"""This module provides the main API for the RetroMol package.""" - -import logging -from importlib.resources import files -from typing import Any, cast - -import yaml -from networkx import Graph - -import retromol.data -from retromol.apply import resolve_mol -from retromol.chem import get_tags_mol, smiles_to_mol -from retromol.config import LOGGER_NAME, TIMEOUT_RUN_RETROMOL -from retromol.helpers import timeout_decorator -from retromol.io import Input, Result -from retromol.rules import Rules, load_rules_from_files - - -def find_eligible_nodes( - 
graph: "Graph[int | str]", parse_identified_nodes: bool = False -) -> list[tuple["Graph[int | str]", int | str]]: - """ - Walk `graph` and nested sub-graphs (via node attr 'graph') and return a list of - (parent_graph, node_id) for nodes whose attrs['graph'] is None, but ONLY when - their parent graph is at the current deepest nesting level. - - Effectively: expand the most recently created level of graphs, one wave at a time. - - If parse_identified_nodes is False (default), nodes with a non-None 'identity' - are skipped; if True, identity is ignored. - - :param graph: the root graph to search - :param parse_identified_nodes: whether to include nodes with an 'identity' attribute - :return: a list of tuples containing the parent graph and node ID of eligible nodes - """ - - # Determine the maximum depth of any existing graph in the nesting. - # Depth of the root `graph` is 0; a subgraph inside a node of depth d has depth d+1. - def _max_depth(g: "Graph[int | str]", depth: int = 0) -> int: - md = depth - for _, attrs in g.nodes(data=True): - sub: Any = attrs.get("graph") - if isinstance(sub, Graph): - sub = cast("Graph[int | str]", sub) - md = max(md, _max_depth(sub, depth + 1)) - return md - - max_depth = _max_depth(graph) - - # Collect eligible nodes whose parent graph is exactly at max_depth. - eligible: list[tuple[Graph[int | str], int | str]] = [] - - def _collect(g: "Graph[int | str]", depth: int = 0): - # Only collect from graphs at the current frontier depth. - if depth == max_depth: - for nid, attrs in g.nodes(data=True): - if attrs.get("graph") is None: - if parse_identified_nodes or attrs.get("identity") is None: - eligible.append((g, nid)) - # Recurse to find deeper graphs (to maintain correctness of traversal), - # but we only collect at exactly max_depth. 
- for _, attrs in g.nodes(data=True): - sub = attrs.get("graph") - if isinstance(sub, Graph): - sub = cast("Graph[int | str]", sub) - _collect(sub, depth + 1) - - _collect(graph) - return eligible - - -def run_retromol( - input: Input, - rule_set: Rules | None = None, - wave_configs: list[dict[str, Any]] | None = None, - match_stereochemistry: bool = False, -) -> Result: - """ - Run RetroMol on a given input compound. - - :param input: the input compound to process, as an Input object - :param rule_set: the set of rules to apply, as a Rules object - :param wave_configs: configuration for each wave, as a dictionary mapping wave numbers to config dicts - :param match_stereochemistry: whether to match stereochemistry during processing - :return: a Result object containing the processed graph and metadata - """ - if rule_set is None: - path_rx = str(files(retromol.data).joinpath("default_reaction_rules.yml")) - path_mx = str(files(retromol.data).joinpath("default_matching_rules.yml")) - rule_set = load_rules_from_files(path_rx, path_mx) - - if wave_configs is None: - path_wave_config = str(files(retromol.data).joinpath("default_wave_config.yml")) - with open(path_wave_config) as f: - wave_configs = yaml.safe_load(f) - - logger = logging.getLogger(LOGGER_NAME) - matching_rules = rule_set.get_matching_rules() - - logger.debug(f"Processing input: {input.cid} with SMILES: {input.smi}") - - props = input.props if input.props else {} - motif_graph = None - - reserved_tags: set[int] = set(get_tags_mol(input.mol)) - - # Loop through wave numbers and apply rules to the motif graph, increasing the motif nesting - for wave_config in wave_configs or []: - if motif_graph is None: - # First wave: create the initial motif graph from the input molecule - logger.debug(f"Starting wave {wave_config.get('wave_name', 'unnamed')} on input molecule") - motif_graph = resolve_mol( - input.mol, - reserved_tags, - rule_set.get_reaction_rules(group_names=wave_config.get("reaction_groups", [])), - 
matching_rules, - match_stereochemistry=match_stereochemistry, - wave_config=wave_config, - ) - continue - - # Find all nodes that are eligible for processing: all nodes without an identity and sub-graph - parse_identified_nodes = wave_config.get("parse_identified_nodes", False) - todo = find_eligible_nodes(motif_graph, parse_identified_nodes=parse_identified_nodes) - - # Loop through the eligible nodes and process them - for parent_graph, n_id in todo: - attrs = parent_graph.nodes[n_id] - logger.debug(f"Processing node {n_id} in graph with {parent_graph.number_of_nodes()} nodes") - sub = resolve_mol( - smiles_to_mol(attrs["smiles"]), - reserved_tags, - rule_set.get_reaction_rules(group_names=wave_config.get("reaction_groups", [])), - matching_rules, - match_stereochemistry=match_stereochemistry, - wave_config=wave_config, - ) - attrs["graph"] = sub - - # Return the motif graph as a labeled result - return Result( - input_id=input.cid, - graph=motif_graph if motif_graph is not None else Graph(), - props=props, - sha256_reaction_rules=rule_set.sha256_reaction_rules, - sha256_matching_rules=rule_set.sha256_matching_rules, - ) - - -run_retromol_with_timeout = timeout_decorator(seconds=TIMEOUT_RUN_RETROMOL)(run_retromol) diff --git a/src/retromol/apply.py b/src/retromol/apply.py deleted file mode 100644 index c8ac284..0000000 --- a/src/retromol/apply.py +++ /dev/null @@ -1,572 +0,0 @@ -"""This module contains functions for applying custom rules to molecules.""" - -import itertools -import logging -from copy import deepcopy -from dataclasses import dataclass -from typing import Any - -from networkx import Graph - -from retromol import matching, rules -from retromol.chem import ( - ChemicalReaction, - Mol, - encode_mol, - get_tags_mol, - mol_to_smiles, - neutralize_mol, - smiles_to_mol, -) -from retromol.config import LOGGER_NAME -from retromol.errors import MotifGraphNodeWithoutAttributesError -from retromol.graph import merge_nodes, mol_to_graph -from retromol.io 
import Input as RetroMolInput -from retromol.rules import DummyReactionRule, ReactionRule - - -def _reactive_template_atoms(rxn: ChemicalReaction) -> list[set[int]]: - """ - For each reactant-template in rxn, return the set of template-atom-indices - that actually change (i.e. have a broken/formed bond or disappear/appear). - We return a list: one set per reactant-template in the order they appear. - - :param rxn: RDKit ChemicalReaction object - :return: List of sets, each set contains indices of reactive atoms in the corresponding reactant template - """ - # First, build a map from map‐no -> (reactant_template_idx, reactant_atom_idx) - reactant_maps: dict[ - int, tuple[int, int] - ] = {} # map_no -> (which reactant‐template, which atom‐idx in that template) - for ri in range(rxn.GetNumReactantTemplates()): - templ = rxn.GetReactantTemplate(ri) - for atom in templ.GetAtoms(): - mnum = atom.GetAtomMapNum() - if mnum: - reactant_maps[mnum] = (ri, atom.GetIdx()) - - # Next, build a map from map‐no -> (which product_template_idx, product_atom_idx) - product_maps: dict[int, tuple[int, int]] = {} - for pi in range(rxn.GetNumProductTemplates()): - templ_p = rxn.GetProductTemplate(pi) - for atom in templ_p.GetAtoms(): - mnum = atom.GetAtomMapNum() - if mnum: - product_maps[mnum] = (pi, atom.GetIdx()) - - # Now we scan each reactant‐template atom and see if it "persists" into product with the same adjacency, - # or if its bonding pattern changes, or if it disappears entirely. If any of those are true -> it's reactive. - reactive_sets: list[set[int]] = [set() for _ in range(rxn.GetNumReactantTemplates())] - - # Pre‐compute adjacency‐lists (by map‐number) for reactant vs. 
product - # – build map_no -> set(of neighbor‐map_numbers) in reactant and product - react_adj: dict[int, set[int]] = {} - prod_adj: dict[int, set[int]] = {} - - # Build reactant adjacency by map‐num - for ri in range(rxn.GetNumReactantTemplates()): - templ = rxn.GetReactantTemplate(ri) - for bond in templ.GetBonds(): - a1, a2 = bond.GetBeginAtom(), bond.GetEndAtom() - m1, m2 = a1.GetAtomMapNum(), a2.GetAtomMapNum() - if m1 and m2: - react_adj.setdefault(m1, set()).add(m2) - react_adj.setdefault(m2, set()).add(m1) - - # Build product adjacency by map‐num - for pi in range(rxn.GetNumProductTemplates()): - templ_p = rxn.GetProductTemplate(pi) - for bond in templ_p.GetBonds(): - a1_p, a2_p = bond.GetBeginAtom(), bond.GetEndAtom() - m1, m2 = a1_p.GetAtomMapNum(), a2_p.GetAtomMapNum() - if m1 and m2: - prod_adj.setdefault(m1, set()).add(m2) - prod_adj.setdefault(m2, set()).add(m1) - - # Now: for each map_no in the reactant_templates, check: - # (a) if that map_no does NOT appear in product_maps at all -> the atom was deleted (= reactive) - # (b) if it DOES appear, compare react_adj[map_no] vs. prod_adj[map_no]. - # If they differ -> bond‐pattern changed -> reactive - # (c) also check if atomic number or formal charge changed (rare in a template, but could). - # We compare the two atoms directly. We need to find the reactant‐template Atom and product‐template - # Atom to compare. 
- for mnum, (rtempl_idx, ratom_idx) in reactant_maps.items(): - if mnum not in product_maps: - # Disappeared in the product – this atom is definitely reactive - reactive_sets[rtempl_idx].add(ratom_idx) - else: - # Compare adjacency - react_neighbors = react_adj.get(mnum, set()) - prod_neighbors = prod_adj.get(mnum, set()) - if react_neighbors != prod_neighbors: - reactive_sets[rtempl_idx].add(ratom_idx) - else: - # Check if element or charge changed - (pi, patom_idx) = product_maps[mnum] - react_atom = rxn.GetReactantTemplate(rtempl_idx).GetAtomWithIdx(ratom_idx) - prod_atom = rxn.GetProductTemplate(pi).GetAtomWithIdx(patom_idx) - if ( - react_atom.GetAtomicNum() != prod_atom.GetAtomicNum() - or react_atom.GetFormalCharge() != prod_atom.GetFormalCharge() - ): - # If neither bonding‐pattern nor element‐/charge changed, it is NOT reactive - reactive_sets[rtempl_idx].add(ratom_idx) - - return reactive_sets - - -def _index_uncontested( - mol: Mol, - rls: list[ReactionRule], - failed_combos: set[tuple[int, frozenset[int]]], -) -> list[tuple[ReactionRule, set[int]]]: - """ - Index uncontested reactions for applying preprocessing rules in bulk. - - :param mol: RDKit molecule - :param rls: List of preprocessing rules - :param failed_combos: Set of failed combinations to avoid infinite loops - :return: Uncontested reactions - """ - up_for_election: list[tuple[ReactionRule, set[int], set[int]]] = [] - for rl in rls: - if not rl.rxn: - continue # skip rules without a reaction template - - reactive_inds = _reactive_template_atoms(rl.rxn)[0] - all_reactant_matches: list[tuple[tuple[int, ...], ...]] = [] - all_reactant_matches_reactive_items: list[list[list[int]]] = [] - for template_idx in range(rl.rxn.GetNumReactantTemplates()): - reactant_template = rl.rxn.GetReactantTemplate(template_idx) - reactant_matches: tuple[tuple[int, ...], ...] 
= mol.GetSubstructMatches(reactant_template) - all_reactant_matches.append(reactant_matches) - new_reactant_matches: list[list[int]] = [] - for reactant_match in reactant_matches: - new_reactant_matches.append([reactant_match[idx] for idx in reactive_inds]) - all_reactant_matches_reactive_items.append(new_reactant_matches) - - # Generate all possible match sets, for when reaction template matches multiple sites - match_sets = list(itertools.product(*all_reactant_matches)) - match_sets_reactive_items = list(itertools.product(*all_reactant_matches_reactive_items)) - match_sets = [set(itertools.chain(*match_set)) for match_set in match_sets] - match_sets_reactive_items = [set(itertools.chain(*match_set)) for match_set in match_sets_reactive_items] - for match_set, match_set_reactive_items in zip(match_sets, match_sets_reactive_items, strict=True): - up_for_election.append((rl, match_set, match_set_reactive_items)) - - # Check which reactions with matched templates are uncontested and which are contested - uncontested: list[tuple[ReactionRule, set[int]]] = [] - for i, (rl, match_set, match_set_reactive_items) in enumerate(up_for_election): - # Rules with ring matching conditions are always contested - if rl.has_ring_matching_condition(): - continue - - # Check if match set has overlap with any other match set - # has_overlap = any(match_set.intersection(o) for j, (_, o, o_r) in enumerate(up_for_election) if i != j) - has_overlap = any( - match_set_reactive_items.intersection(o_r) for j, (_, _, o_r) in enumerate(up_for_election) if i != j - ) - if not has_overlap: - uncontested.append((rl, match_set)) - - # Filter out failed combinations to avoid infinite loops - uncontested = [ - (rl, match_set) for rl, match_set in uncontested if (rl.id, frozenset(match_set)) not in failed_combos - ] - - return uncontested - - -def _apply_uncontested( - parent: Mol, - uncontested: list[tuple[rules.ReactionRule, set[int]]], - original_taken_tags: list[int], -) -> tuple[list[Mol], 
set[str], set[tuple[int, frozenset[int]]]]: - """ - Apply uncontested reactions in bulk. - - :param parent: RDKit molecule - :param uncontested: List of uncontested reactions - :param original_taken_tags: List of atom tags from original reactant - :return: List of trtue products, a set of applied reaction ids, and a set of failed combinations - """ - applied_reactions: set[str] = set() - - tags_in_parent: set[int] = set(get_tags_mol(parent)) - - # We make sure all atoms, even the ones not from original reactant, have a - # unique isotope number, so we can track them through consecutive reactions - temp_taken_tags = get_tags_mol(parent) - for atom in parent.GetAtoms(): - if atom.GetIsotope() == 0: - tag = 1 - while tag in original_taken_tags or tag in temp_taken_tags: - tag += 1 - atom.SetIsotope(tag) - temp_taken_tags.append(tag) - - # Validate that all atoms have a unique tag - num_tagged_atoms = len(set(get_tags_mol(parent))) - if num_tagged_atoms != len(parent.GetAtoms()): - raise ValueError("Not all atoms have a unique tag before applying uncontested reactions") - - # Map tags to atomic nums so we can create masks and reassign atomic nums later on - idx_to_tag = {a.GetIdx(): a.GetIsotope() for a in parent.GetAtoms()} - - # All uncontested reactions become a single node in the reaction_graph - products: list[Mol] = [] - failed_combos: set[tuple[int, frozenset[int]]] = set() # keep track of failed combinations to avoid infinite loops - - for rl, match_set in uncontested: - msk = set([idx_to_tag[idx] for idx in match_set]) # create mask for reaction - - # We use the input parent if there are no products, otherwise we have to find out - # which product now contains the mask (i.e., the reaction template for this reaction) - if len(products) != 0: - new_parent: Mol | None = None - for product in products: - product_tags = set(get_tags_mol(product)) - if msk.issubset(product_tags): - new_parent = product - products = [p for p in products if p != product] - break - - 
if new_parent is None: - # raise ValueError("no product found that contains the mask") - # If no product is found, we continue with the next uncontested reaction - continue - - parent = new_parent - - # Register all tags currently taken by atoms in parent - temp_taken_tags_uncontested = get_tags_mol(parent) - - # Newly introduced atoms by one of the uncontested reactions need a unique tag - for atom in parent.GetAtoms(): - if atom.GetIsotope() == 0: # newly introduced atom has tag 0 - # Loop until we find a tag that is not already taken - tag = 1 - while tag in (temp_taken_tags_uncontested + original_taken_tags + temp_taken_tags): - tag += 1 - atom.SetIsotope(tag) - temp_taken_tags_uncontested.append(tag) - - unmasked_parent = deepcopy(parent) # keep original parent for later - results = rl(parent, msk) # apply reaction rule - - try: - if len(results) == 0: - raise ValueError(f"No products from uncontested reaction {rl.rid}") - - if len(results) > 1: - raise ValueError(f"More than one product from uncontested reaction {rl.rid}") - - result = results[0] - applied_reactions.add(rl.rid) # keep track of successfully applied reactions - - # Reset atom tags in products for atoms not in original reactant - for product in result: - for atom in product.GetAtoms(): - if atom.GetIsotope() not in original_taken_tags and atom.GetIsotope() != 0: - atom.SetIsotope(0) - products.append(product) - - except Exception: - # Start function again with the next uncontested reaction - for atom in parent.GetAtoms(): - if atom.GetIsotope() not in original_taken_tags and atom.GetIsotope() != 0: - atom.SetIsotope(0) - products.append(unmasked_parent) - failed_combos.add( - ( - rl.id, - frozenset(match_set), - ) - ) - - for product in products: - # Any tag in product that is not in parent should be 0; otherwise we run into issues with - # the set cover algorithm - for atom in product.GetAtoms(): - if atom.GetIsotope() not in tags_in_parent and atom.GetIsotope() != 0: - atom.SetIsotope(0) - - 
return products, applied_reactions, failed_combos - - -@dataclass -class ProcessingResult: - """ - Data structure for processed data. - - :param enc_to_mol: Maps mol hash to Chem.Mol - :param enc_to_rxn: Maps rxn index to Reaction - :param rxn_graph: As {reactant_mol_encoding: {reaction_encoding: [child_mol_encodings], ...}, ...} - :param applied_rxns: Set of successfully applied reaction ids - """ - - enc_to_mol: dict[str, Mol] # encoding is a canonical SMILES with isotopic tags - enc_to_rxn: dict[int, rules.ReactionRule] - rxn_graph: dict[str, dict[int, list[str]]] - applied_rxns: set[str] - - -class Input: - """ - This class describes the input for a RetroMol run. - """ - - def __init__(self, cid: str, smi: str) -> None: - """ - Initialize the input compound. - - :param cid: Compound identifier - :param smi: SMILES representation - """ - self.cid = cid - - # convert SMILES into RDKit molecule - self.mol = smiles_to_mol(smi) - neutralize_mol(self.mol) - - # store original atom indices as isotope number - for atom in self.mol.GetAtoms(): - idx = atom.GetIdx() - tag = idx + 1 - atom.SetIsotope(tag) - - # store SMILES representation with tags - self._smi = mol_to_smiles(self.mol) - - def get_tags(self) -> list[int]: - """ - Get the atom tags. - - :return: atom tags - """ - return get_tags_mol(self.mol) - - -def process_mol(inp: RetroMolInput, reaction_rules: list[rules.ReactionRule]) -> ProcessingResult: - """ - Apply custom rules to linearize a SMILES string. 
- - :param inp: Input object - :param reaction_rules: List of processing rules - :param reserved_nodes: Set of atom tags that should not be used for remapping - :return: Preprocessed data structures - """ - logger = logging.getLogger(LOGGER_NAME) - - # Setup processing - original_taken_tags = inp.get_tags() - rst_pre = ProcessingResult({}, {}, {}, set()) - num_rxn_nodes = 0 # keeps track of number of reaction nodes in reaction graph - mols = [deepcopy(inp.mol)] # queue of molecules to process - - # Set of (rule id, frozenset of match_set) tuples to track failed combinations - failed_combos: set[tuple[int, frozenset[int]]] = set() # keep track of failed combinations to avoid infinite loops - - # Process queue - while mols: - parent = mols.pop(0) - - # Encode parent molecule - parent_encoding = encode_mol(parent) - if parent_encoding not in rst_pre.enc_to_mol: - rst_pre.enc_to_mol[parent_encoding] = deepcopy(parent) - rst_pre.rxn_graph[parent_encoding] = {} - - # Index uncontested reactions - uncontested = _index_uncontested(parent, reaction_rules, failed_combos) - - # Apply uncontested reactions in bulk - if uncontested: - products, applied_in_bulk, new_failed_combos = _apply_uncontested(parent, uncontested, original_taken_tags) - logger.debug( - f"Uncontested reactions applied ({len(applied_in_bulk)}): {', '.join([rl.rid for rl, _ in uncontested])}" # noqa E501 - ) - logger.debug( - f"Uncontested reactions failed ({len(new_failed_combos)}): {', '.join([str(id) for id, _ in new_failed_combos])}" # noqa E501 - ) - failed_combos.update(new_failed_combos) - - # If all uncontested reactions failed, we continue with the next parent - logger.debug(f"Found {len(products)} product(s) from uncontested reactions") - - # If no reaction was successful, we continue with the next parent - if len(applied_in_bulk) == 0: - logger.debug("No uncontested reactions applied, continuing with next parent") - # Apparently everything failed, so we need to reparse the parent and try again 
with contested reactions - mols.append(parent) - continue - - if products: - rst_pre.applied_rxns.update(applied_in_bulk) - - # All products are now products of our combined reaction node that contains all uncontested reactions - num_rxn_nodes += 1 - rst_pre.enc_to_rxn[num_rxn_nodes] = DummyReactionRule("uncontested") - rst_pre.rxn_graph[parent_encoding][num_rxn_nodes] = list() - for product in products: - product_encoding = encode_mol(product) - if product_encoding not in rst_pre.rxn_graph[parent_encoding][num_rxn_nodes]: - rst_pre.rxn_graph[parent_encoding][num_rxn_nodes].append(product_encoding) - if product_encoding not in rst_pre.enc_to_mol: - rst_pre.enc_to_mol[product_encoding] = deepcopy(product) - rst_pre.rxn_graph[product_encoding] = {} - mols.append(product) - - # Restart loop with new parent - continue - - # Exhaustive reaction_rule application for all contested reactions - for rl in reaction_rules: - results = rl(parent, None) # apply reaction rule - - if results: - logger.debug(f"Contested reaction {rl.rid} applied") - - for result in results: - rst_pre.applied_rxns.add(rl.rid) # keep track of successfully applied reactions - - # Encode reaction node - num_rxn_nodes += 1 - if num_rxn_nodes not in rst_pre.enc_to_rxn: - rst_pre.enc_to_rxn[num_rxn_nodes] = rl - else: - raise ValueError(f"reaction node {num_rxn_nodes} already exists for reaction {rl.rid}") - - # Encode product molecules - rst_pre.rxn_graph[parent_encoding][num_rxn_nodes] = list() - for child in result: - child_encoding = encode_mol(child) - if child_encoding not in rst_pre.rxn_graph[parent_encoding][num_rxn_nodes]: - rst_pre.rxn_graph[parent_encoding][num_rxn_nodes].append(child_encoding) - - # Add child to queue if not already present in encoding_to_mol (i.e., previously processed) - if child_encoding not in rst_pre.enc_to_mol: - rst_pre.enc_to_mol[child_encoding] = deepcopy(child) - rst_pre.rxn_graph[child_encoding] = {} - mols.append(child) - - return rst_pre - - -def resolve_mol( - 
mol: Mol, - reserved_tags: set[int], - reaction_rules: list[rules.ReactionRule], - matching_rules: list[rules.MatchingRule], - match_stereochemistry: bool, - wave_config: dict[str, Any], -) -> "Graph[int | str]": - """ - Apply custom rules to sequence a molecule into motif codes. - - :param mol: RDKit molecule - :param reserved_tags: set of atom tags that should not be used for remapping - :param reaction_rules: list of sequencing rules - :param matching_rules: matching rules for identifying nodes - :param match_stereochemistry: whether to match stereochemistry - :param wave_config: configuration for the current wave - :return: motif graph with merged and possibly identified nodes - """ - logger = logging.getLogger(LOGGER_NAME) - - # Retrieve wave configuration - use_leafs_only = wave_config.get("only_leaf_nodes", True) - matching_groups = wave_config.get("matching_groups", None) - - # Filter matching rules based on priorities if provided - if matching_groups is not None: - matching_rules = [mr for mr in matching_rules if any([True for gr in mr.groups if gr in matching_groups])] - logger.debug(f"Filtered matching rules based on priorities to {len(matching_rules)} rules") - - # Tag atoms without a tag yet with unique tags - mol_to_process = RetroMolInput( - "mol", mol, tag_compound=True, reserved_tags=deepcopy(reserved_tags) - ) # copy because we modify it in place - all_tags = mol_to_process.get_tags() - all_tags.sort() - - logger.debug(f"Processing molecule {mol_to_process.cid} with SMILES {mol_to_process.smi}") - logger.debug(f"Tags in molecule: {all_tags}") - - processing_result = process_mol(mol_to_process, reaction_rules) - - # Decide what nodes to use from processing result - if use_leafs_only: - # Only use leaf nodes from the reaction graph - encoding_to_mol = { - enc: mol - for enc, mol in processing_result.enc_to_mol.items() - if not processing_result.rxn_graph.get(enc) # check if node has children - } - - # If there are no leaf nodes, resort to all nodes 
- # This might happen when an uncontested rule fails and returns itself as product - if not encoding_to_mol: - encoding_to_mol = processing_result.enc_to_mol - - else: - # Use all nodes from the reaction graph - encoding_to_mol = processing_result.enc_to_mol - - # Identify nodes and pick best set of identified nodes - encoding_to_mol_identified = matching.identify_nodes(encoding_to_mol, matching_rules, match_stereochemistry) - identified_nodes = list(encoding_to_mol_identified.keys()) # keys are the identified nodes - - unidentified_nodes = [node for node in encoding_to_mol.keys() if node not in identified_nodes] - best_fit_identified, best_fit_unidentified = matching.solve_exact_cover_with_priority( - encoding_to_mol, identified_nodes, unidentified_nodes, all_tags - ) - - # Create monomer graph for best fit for identified and unidentified nodes - motif_graph: Graph[int | str] = mol_to_graph(mol_to_process.mol, use_tags=True) - - for node in best_fit_identified + best_fit_unidentified: - # Merged nodes have a str key, unmerged nodes have an int key - if not isinstance(node, str): - continue # skip unmerged nodes - node_mol = encoding_to_mol[node] - node_smiles = mol_to_smiles(node_mol) - tags = get_tags_mol(node_mol) - merge_nodes(motif_graph, merged_node_id=node, nodes=tags) # modifies node, merged node has str key - - # Get identity of node - if identity_with_props := encoding_to_mol_identified.get(node, None): - identity = identity_with_props["identity"] - props = identity_with_props["props"] - else: - identity = None - props = {} - - logger.debug(f"Node {node} ({node_smiles}) has identity {identity}") - - # Make sure node_smiles only has reserved tags, if any, for mapping in readout functions - node_mol = smiles_to_mol(node_smiles) - for atom in node_mol.GetAtoms(): - tag = atom.GetIsotope() - if tag not in reserved_tags: - atom.SetIsotope(0) - node_smiles = mol_to_smiles(node_mol) - tags = get_tags_mol(node_mol) - - # Give properties to monomer graph nodes - 
motif_graph.nodes[node]["graph"] = None - motif_graph.nodes[node]["identity"] = identity - motif_graph.nodes[node]["props"] = props - motif_graph.nodes[node]["tags"] = get_tags_mol(node_mol) - motif_graph.nodes[node]["smiles"] = node_smiles - motif_graph.nodes[node]["smiles_no_tags"] = mol_to_smiles(node_mol, remove_tags=True) - motif_graph.nodes[node]["wave_name"] = wave_config.get("wave_name", None) - - # Check if there are ny nodes that have no attrs (something went wrong...) - nodes_without_attrs = [n for n, d in motif_graph.nodes(data=True) if not d] - if nodes_without_attrs: - # This happened sometimes before we explicitly checked for overlapping atom - # tags (see rules.OverlappingAtomTagsError). Might be caused by some other type - # of behavior from RDKit, so we explicitly check for it now. - raise MotifGraphNodeWithoutAttributesError(f"Nodes without attributes found: {nodes_without_attrs}") - - return motif_graph diff --git a/src/retromol/chem.py b/src/retromol/chem.py deleted file mode 100644 index 7982919..0000000 --- a/src/retromol/chem.py +++ /dev/null @@ -1,379 +0,0 @@ -"""This module contains functions for chemical operations using RDKit.""" - -import numpy as np -from numpy.typing import NDArray -from rdkit import Chem, RDLogger -from rdkit.Chem.inchi import MolToInchiKey -from rdkit.Chem.MolStandardize.rdMolStandardize import LargestFragmentChooser, TautomerEnumerator, Uncharger -from rdkit.Chem.rdchem import Atom, BondStereo, BondType, GetPeriodicTable, Mol, PeriodicTable -from rdkit.Chem.rdChemReactions import ChemicalReaction, ReactionFromSmarts -from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator -from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect -from rdkit.Chem.rdmolfiles import MolFromSmarts, MolFromSmiles, MolToSmarts, MolToSmiles -from rdkit.Chem.rdmolops import ( - AssignStereochemistry, - GetMolFrags, - RemoveStereochemistry, - SanitizeMol, - SetBondStereoFromDirections, -) -from 
rdkit.DataStructs.cDataStructs import ExplicitBitVect, TanimotoSimilarity - -RDLogger.DisableLog("rdApp.*") - -# Type aliases for imported functions/classes -__all__ = ["Atom", "Mol", "ChemicalReaction", "ExplicitBitVect"] - - -def get_periodic_table() -> PeriodicTable: - """ - Returns the RDKit periodic table. - - :return: the RDKit periodic table - """ - return GetPeriodicTable() - - -def sanitize_mol(mol: Mol) -> None: - """ - Sanitizes an RDKit molecule. - - :param mol: the molecule to sanitize - .. note:: this function modifies the input molecule in place - """ - SanitizeMol(mol) - - -def smiles_to_mol(smiles: str, retain_largest_fragment: bool = False) -> Mol: - """ - Converts a SMILES string to an RDKit molecule. - - :param smiles: the SMILES string to convert - :param retain_largest_fragment: if True, retains only the largest fragment of the molecule - :return: the RDKit molecule - :raises ValueError: if the SMILES is invalid - """ - mol: Mol | None = MolFromSmiles(smiles) - - match mol: - case None: - raise ValueError(f"invalid SMILES: {smiles}") - case m: - if retain_largest_fragment: - chooser: LargestFragmentChooser = LargestFragmentChooser() - m: Mol = chooser.choose(m) - SanitizeMol(m) - return m - - -def standardize_from_smiles( - smi: str, - keep_stereo: bool = False, - neutralize: bool = True, - tautomer_canon: bool = True, -) -> Mol | None: - """ - Standardize a molecule from its SMILES representation. 
- - :param smi: input SMILES string - :param keep_stereo: whether to retain stereochemistry - :param neutralize: whether to neutralize charges - :param tautomer_canon: whether to canonicalize tautomers - :return: standardized molecule or None if input SMILES is invalid - """ - mol = smiles_to_mol(smi) - mol = largest_fragment(mol) - if neutralize: - mol = Uncharger().uncharge(mol) - if tautomer_canon: - mol = TautomerEnumerator().Canonicalize(mol) - sanitize_mol(mol) - if not keep_stereo: - RemoveStereochemistry(mol) - return mol - - -def smarts_to_mol(smarts: str) -> Mol: - """ - Converts a SMARTS string to an RDKit molecule. - - :param smarts: the SMARTS string to convert - :return: the RDKit molecule - :raises ValueError: if the SMARTS pattern is invalid - """ - mol: Mol | None = MolFromSmarts(smarts) - - if mol is None: - raise ValueError(f"invalid SMARTS: {smarts}") - - return mol - - -def smarts_to_reaction(smarts: str, use_smiles: bool = False) -> ChemicalReaction: - """ - Converts a SMARTS string to an RDKit reaction. - - :param smarts: the SMARTS string to convert - :param use_smiles: whether to interpret the SMARTS as SMILES - :return: the RDKit reaction - :raises ValueError: if the SMARTS pattern is invalid - """ - rxn: ChemicalReaction | None = ReactionFromSmarts(smarts, useSmiles=use_smiles) - - if rxn is None: - raise ValueError(f"invalid reaction SMARTS: {smarts}") - - return rxn - - -def mol_to_smiles(mol: Mol, remove_tags: bool = False, isomeric: bool = True, canonical: bool = True) -> str: - """ - Converts an RDKit molecule to a SMILES string. 
- - :param mol: the molecule to convert - :param remove_tags: whether to remove atom tags (isotopes) from the SMILES - :param isomeric: whether to include isomeric information in the SMILES - :param canonical: whether to generate a canonical SMILES - :return: the SMILES string - """ - if remove_tags: - for atom in mol.GetAtoms(): - atom.SetIsotope(0) - - return MolToSmiles(mol, isomericSmiles=isomeric, canonical=canonical) - - -def mol_to_smarts(mol: Mol) -> str: - """ - Converts an RDKit molecule to a SMARTS string. - - :param mol: the molecule to convert - :return: the SMARTS string - """ - return MolToSmarts(mol) - - -def mol_to_fpr(mol: Mol, rad: int = 2, nbs: int = 2048) -> NDArray[np.int8]: - """ - Converts an RDKit molecule to a Morgan fingerprint. - - :param mol: the molecule to convert - :param rad: the radius of the Morgan fingerprint - :param nbs: the number of bits in the fingerprint - :return: the Morgan fingerprint - """ - gen = GetMorganGenerator(radius=rad, fpSize=nbs, includeChirality=True) - return np.array(gen.GetFingerprint(mol)) - - -def mol_to_inchikey(mol: Mol) -> str: - """ - Converts an RDKit molecule to an InChIKey. - - :param mol: the molecule to convert - :return: the InChIKey - """ - return MolToInchiKey(mol) - - -def calc_tanimoto_similarity(arr1: NDArray[np.int8], arr2: NDArray[np.int8]) -> float: - """ - Calculate the Tanimoto similarity between two fingerprints. - - :param arr1: the first fingerprint - :param arr2: the second fingerprint - :return: the Tanimoto similarity score - """ - assert arr1.shape == arr2.shape - assert np.all(np.logical_or(arr1 == 0, arr1 == 1)) - intersection = np.dot(arr1, arr2) - sum_arr1 = np.sum(arr1) - sum_arr2 = np.sum(arr2) - score = intersection / (sum_arr1 + sum_arr2 - intersection) - assert 0 <= score <= 1 - return score - - -def calc_tanimoto_similarity_rdkit(fp1: ExplicitBitVect, fp2: ExplicitBitVect) -> float: - """ - Calculate the Tanimoto similarity between two RDKit fingerprints. 
- - :param fp1: the first fingerprint - :param fp2: the second fingerprint - :return: the Tanimoto similarity score - """ - return TanimotoSimilarity(fp1, fp2) - - -def encode_mol(mol: Mol) -> str: - """ - Encodes an RDKit molecule as a canonical isomeric SMILES string. - - :param mol: the molecule to encode - :return: the encoded molecule - """ - return MolToSmiles(mol, isomericSmiles=True, canonical=True) - - -def neutralize_mol(mol: Mol) -> None: - """ - Neutralizes formal charges on an RDKit molecule. - - :param mol: the molecule to neutralize - .. note:: this function modifies the input molecule in place - """ - charge_smarts = "[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]" - charge_pattern = smarts_to_mol(charge_smarts) - at_matches = mol.GetSubstructMatches(charge_pattern) - - if len(at_matches) > 0: - for match in at_matches: - at_idx = match[0] # get the atom index from the match tuple - atom = mol.GetAtomWithIdx(at_idx) - if atom.GetSymbol() in ["B"]: - continue # skip boron atoms - charge = atom.GetFormalCharge() - h_count = atom.GetTotalNumHs() - atom.SetFormalCharge(0) - atom.SetNumExplicitHs(h_count - charge) - atom.UpdatePropertyCache() - - -def get_default_valence(anr: int) -> int: - """ - Returns the default valence for an atom number. - - :param anr: the atom number - :return: the default valence - """ - return get_periodic_table().GetDefaultValence(anr) - - -def get_tags_mol(mol: Mol) -> list[int]: - """Get the atom tags from a molecule. - - :param mol: the molecule - :return: unordered set of atom tags - """ - tags: list[int] = [] - for atom in mol.GetAtoms(): - if atom.GetIsotope() != 0: - tags.append(atom.GetIsotope()) - return tags - - -def get_tags_mols(mols: list[Mol]) -> set[int]: - """ - Get the atom tags from a list of molecules. 
- - :param mols: the list of molecules - :return: unordered set of atom tags - """ - tags: set[int] = set() - for mol in mols: - tags.update(get_tags_mol(mol)) - return tags - - -def largest_fragment(mol: Mol) -> Mol: - """ - Return the largest fragment of a molecule (by atom count). - - :param mol: input molecule - :return: largest fragment molecule - """ - frags: tuple[Mol, ...] = GetMolFrags(mol, asMols=True, sanitizeFrags=True) - return max(frags, key=lambda m: m.GetNumAtoms()) if frags else mol - - -def count_fragments(mol: Mol) -> int: - """ - Counts the number of fragments in a molecule. - - :param mol: the molecule to analyze - :return: the number of fragments - """ - frags: tuple[Mol, ...] = GetMolFrags(mol, asMols=True, sanitizeFrags=False) - return len(frags) - - -def prepare_stereo(mol: Mol) -> Mol: - """ - Ensure RDKit has the best shot at assigning stereo. - - picks up R/S from chiral tags or 3D coordinates - - converts wedge/hash directions to bond stereo where relevant - - assigns E/Z on double bonds where substituent directions are known - - :param mol: input molecule - :return: molecule with assigned stereochemistry - """ - mol = Mol(mol) # copy - # If you have 2D/3D coords with wedge bonds, capture bond stereo first - SetBondStereoFromDirections(mol) - # Final pass to set CIP labels & E/Z where possible - AssignStereochemistry(mol, cleanIt=True, force=True, flagPossibleStereoCenters=True) - return mol - - -def stereo_summary(mol: Mol, one_based: bool = True) -> str: - """ - Return a compact stereochemistry summary like: "C@2=R; C@7=S; DB(3-4)=E; DB(10-11)=Z" - - :param mol: input molecule - :param one_based: whether to use one-based atom indexing (default True) - :return: stereochemistry summary string (none if no stereochemistry is present) - """ - # Ensure CIP labels and bond stereo are assigned - mol = prepare_stereo(mol) - - # Chiral centers (R/S or '?') - chiral_bits: list[str] = [] - for idx, _ in Chem.FindMolChiralCenters(mol, 
includeUnassigned=True, useLegacyImplementation=False): - # RDKit stores CIP on atom property "_CIPCode" when defined - atom: Atom = mol.GetAtomWithIdx(idx) - cip = atom.GetProp("_CIPCode") if atom.HasProp("_CIPCode") else None - lbl = cip.upper() if cip else "?" - aid = idx + 1 if one_based else idx - chiral_bits.append(f"C@{aid}={lbl}") - - # Double bond stereo (E/Z, cis/trans, or '?') - dbits: list[str] = [] - for b in mol.GetBonds(): - if b.GetBondType() != BondType.DOUBLE: - continue - st: BondStereo = b.GetStereo() - if st in ( - BondStereo.STEREOE, - BondStereo.STEREOZ, - BondStereo.STEREOCIS, - BondStereo.STEREOTRANS, - ): - a = b.GetBeginAtomIdx() + (1 if one_based else 0) - c = b.GetEndAtomIdx() + (1 if one_based else 0) - if st == BondStereo.STEREOE: - tag = "E" - elif st == BondStereo.STEREOZ: - tag = "Z" - elif st == BondStereo.STEREOCIS: - tag = "cis" - elif st == BondStereo.STEREOTRANS: - tag = "trans" - else: - tag = "?" - dbits.append(f"DB({a}-{c})={tag}") - - if not chiral_bits and not dbits: - return "none" - - return ";".join(chiral_bits + dbits) - - -def ecfp4(mol: Mol, n_bits: int = 2048) -> ExplicitBitVect: - """ - Compute the ECFP4 fingerprint for a molecule. 
- - :param mol: input molecule - :param n_bits: number of bits in the fingerprint - :return: ECFP4 fingerprint as an RDKit ExplicitBitVect - """ - return GetMorganFingerprintAsBitVect(mol, radius=2, nBits=n_bits) diff --git a/src/retromol/chem/__init__.py b/src/retromol/chem/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/retromol/chem/fingerprint.py b/src/retromol/chem/fingerprint.py new file mode 100644 index 0000000..3381eaa --- /dev/null +++ b/src/retromol/chem/fingerprint.py @@ -0,0 +1,38 @@ +"""Module for calculating and comparing chemical fingerprints.""" + +from rdkit.Chem.rdchem import Mol +from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator +from rdkit.DataStructs.cDataStructs import ExplicitBitVect, TanimotoSimilarity + + +def mol_to_morgan_fingerprint( + mol: Mol, + radius: int = 2, + num_bits: int = 2048, + use_chirality: bool = True +) -> ExplicitBitVect: + """ + Compute the Morgan fingerprint for a molecule. + + :param mol: input molecule + :param radius: radius of the Morgan fingerprint + :param num_bits: number of bits in the fingerprint + :param use_chirality: whether to include chirality information + :return: Morgan fingerprint as an RDKit ExplicitBitVect + """ + generator = GetMorganGenerator(radius=radius, fpSize=num_bits, includeChirality=use_chirality) + fingerprint = generator.GetFingerprint(mol) + + return fingerprint + + +def calculate_tanimoto_similarity(fp1: ExplicitBitVect, fp2: ExplicitBitVect) -> float: + """ + Calculate the Tanimoto similarity between two RDKit fingerprints. + + :param fp1: the first fingerprint + :param fp2: the second fingerprint + :return: the Tanimoto similarity score + .. 
note:: perfect similarity returns 1.0, no similarity returns 0.0 + """ + return TanimotoSimilarity(fp1, fp2) diff --git a/src/retromol/chem/masking.py b/src/retromol/chem/masking.py new file mode 100644 index 0000000..94c7362 --- /dev/null +++ b/src/retromol/chem/masking.py @@ -0,0 +1,118 @@ +"""Module for masking atoms in a molecule based on their tags.""" + +from rdkit.Chem.rdchem import Mol + +from retromol.chem.tagging import all_atoms_have_unique_tags + + +def mask_atoms(mol: Mol, mask_tags: set[int]) -> dict[int, int]: + """ + Set atom numbers not in mask to 0 based on atom tags. + + :param mol: the molecule to mask + :param mask_tags: mask of atom tags + :return: mapping of atom tags to pre-mask atomic numbers + """ + # Ensure all atoms have unique tags + if not all_atoms_have_unique_tags(mol): + raise ValueError("all atoms in the molecule must have unique tags before masking") + + # Create mapping of tags to atomic numbers + tag_to_atomic_num = {a.GetIsotope(): a.GetAtomicNum() for a in mol.GetAtoms()} + + # Now we can apply the mask, set atomic numbers to 0 for atoms that do not have a tag in mask_tags + for atom in mol.GetAtoms(): + if atom.GetIsotope() not in mask_tags: + atom.SetAtomicNum(0) + + return tag_to_atomic_num + + +def unmask_atoms(mol: Mol, tag_to_atomic_num: dict[int, int]) -> None: + """ + Restore atomic numbers based on the provided tag to atomic number mapping. + + :param mol: the molecule to unmask + :param tag_to_atomic_num: mapping of atom tags to original atomic numbers + :return: None + .. 
note:: this function modifies the input molecule in place + """ + for atom in mol.GetAtoms(): + if atom.GetIsotope() != 0: # newly added atoms by rxn have isotope 0 + original_atom_num = tag_to_atomic_num.get(atom.GetIsotope(), None) + if original_atom_num is None: + raise ValueError(f"tag {atom.GetIsotope()} not found in tag_to_atomic_num mapping during unmasking") + + atom.SetAtomicNum(original_atom_num) + + # Validate that no atomic numbers are zero after unmasking + curr_atomic_nums = {atom.GetAtomicNum() for atom in mol.GetAtoms()} + if 0 in curr_atomic_nums: + raise ValueError("unmasking failed, some atomic numbers are still zero") + + +def mapped_tags_changed(reactant: Mol, product: Mol) -> set[int]: + """ + Determine which mapped atom tags have changed between reactant and product. + + :param reactant: reactant molecule + :param product: product molecule + :return: set of changed atom tags + .. note:: a tag is considered changed if either the atom with that tag changes + atomic number, or its neighbor signature (by tagged IDs / atom types + bond order) + changes + """ + def _tag_to_idx(m: Mol) -> dict[int, int]: + # Atom tags live in the isotope numbers; ignore zeros + d: dict[int, int] = {} + for a in m.GetAtoms(): + t = a.GetIsotope() + if t: + d[t] = a.GetIdx() + return d + + def _neighbor_sig(m: Mol, ai: int) -> list[tuple[int, float]]: + # Neighbor signature by (neighbor tag or neighbor atomicnum if untagged, bond order) + out: list[tuple[int, float]] = [] + a = m.GetAtomWithIdx(ai) + for b in a.GetBonds(): + nb = b.GetOtherAtomIdx(ai) + na = m.GetAtomWithIdx(nb) + ntag = na.GetIsotope() + key = ntag if ntag else -na.GetAtomicNum() + out.append((key, float(b.GetBondTypeAsDouble()))) + out.sort() + return out + + changed: set[int] = set() + rmap = _tag_to_idx(reactant) + pmap = _tag_to_idx(product) + for t in set(rmap).intersection(pmap): + ra = reactant.GetAtomWithIdx(rmap[t]) + pa = product.GetAtomWithIdx(pmap[t]) + if ra.GetAtomicNum() != 
pa.GetAtomicNum(): + changed.add(t) + continue + if _neighbor_sig(reactant, rmap[t]) != _neighbor_sig(product, pmap[t]): + changed.add(t) + + return changed + + +def is_masked_preserved(reactant: Mol, products: list[Mol], allowed: set[int]) -> bool: + """ + Check that only tags in 'allowed' are changed across all products. + + :param reactant: reactant molecule + :param products: list of product molecules + :param allowed: set of allowed atom tags that can change + :return: True if only allowed tags are changed, False otherwise + """ + if not allowed: + return True # there is no mask, everything is allowed + + changed: set[int] = set() + for pr in products: + changed |= mapped_tags_changed(reactant, pr) + + return changed.issubset(allowed) diff --git a/src/retromol/chem/matching.py b/src/retromol/chem/matching.py new file mode 100644 index 0000000..defc169 --- /dev/null +++ b/src/retromol/chem/matching.py @@ -0,0 +1,26 @@ +"""Module for matching molecules to MatchingRules.""" + +from retromol.chem.mol import Mol +from retromol.model.rules import MatchingRule +from retromol.model.identity import MolIdentity + + +def match_mol( + mol: Mol, + rules: list[MatchingRule], + match_stereochemistry: bool = False +) -> MolIdentity | None: + """ + Match a molecule to a motif. + + :param mol: RDKit molecule to match + :param rules: list of MatchingRule to use for matching + :param match_stereochemistry: whether to consider stereochemistry in matching + :return: MolIdentity | None: the identity if matched, else None + .. 
note:: this function uses a greedy approach to match a molecule to a motif + """ + for rl in rules: + if rl.is_match(mol, match_stereochemistry): + return MolIdentity(matched_rule=rl) + + return None diff --git a/src/retromol/chem/mol.py b/src/retromol/chem/mol.py new file mode 100644 index 0000000..7509ba5 --- /dev/null +++ b/src/retromol/chem/mol.py @@ -0,0 +1,193 @@ +"""Module for RDKit molecule utilities.""" + +from rdkit.Chem.rdchem import Mol +from rdkit.Chem.MolStandardize.rdMolStandardize import TautomerEnumerator, Uncharger +from rdkit.Chem.rdmolfiles import MolFromSmiles, MolFromSmarts, MolToSmiles +from rdkit.Chem.rdmolops import GetMolFrags, RemoveStereochemistry, SanitizeMol +from rdkit.Chem.inchi import MolToInchiKey +from rdkit.Chem.rdmolops import ( + AssignAtomChiralTagsFromStructure, + AssignStereochemistry, + SetBondStereoFromDirections, +) + +from retromol.chem.tagging import remove_tags +from retromol.chem.valence import correct_hydrogens +from retromol.utils.hashing import sha256_hex + + +def sanitize_mol(mol: Mol, fix_hydrogens: bool = False) -> bool: + """ + Sanitizes an RDKit molecule in place, returning success status. + + :param mol: the molecule to sanitize + :param fix_hydrogens: whether to correct hydrogen counts before sanitization + :return: True if sanitization was successful, False otherwise + .. note:: this function mutates the input molecule in place + """ + try: + if fix_hydrogens: + correct_hydrogens(mol) + SanitizeMol(mol) + return True + except Exception: + return False + + +def reassign_stereochemistry(mol: Mol) -> Mol: + """ + Reassign stereochemistry of a molecule without changing its identity. 
+ + :param mol: input molecule + :return: molecule with reassigned stereochemistry + """ + mm = Mol(mol) + + SetBondStereoFromDirections(mm) + + if mm.GetNumConformers() > 0: + # When conformers are present, reassign chiral tags from structure + AssignAtomChiralTagsFromStructure(mm, replaceExistingTags=True) + + AssignStereochemistry(mm, cleanIt=True, force=True, flagPossibleStereoCenters=True) + + return mm + + +def smiles_to_mol(smiles: str) -> Mol: + """ + Converts a SMILES string to an RDKit molecule. + + :param smiles: the SMILES string to convert + :return: the RDKit molecule + :raises ValueError: if the SMILES is invalid + """ + mol = MolFromSmiles(smiles) + + if mol is None: + raise ValueError(f"invalid SMILES: {smiles}") + + return mol + + +def smarts_to_mol(smarts: str) -> Mol: + """ + Converts a SMARTS string to an RDKit molecule. + + :param smarts: the SMARTS string to convert + :return: the RDKit molecule + :raises ValueError: if the SMARTS pattern is invalid + """ + mol: Mol | None = MolFromSmarts(smarts) + + if mol is None: + raise ValueError(f"invalid SMARTS: {smarts}") + + return mol + + +def get_fragments(mol: Mol) -> tuple[Mol, ...]: + """ + Returns the fragments of a molecule. + + :param mol: the molecule to analyze + :return: a tuple of fragment molecules + """ + frags: tuple[Mol, ...] = GetMolFrags(mol, asMols=True, sanitizeFrags=False) + return frags + + +def count_fragments(mol: Mol) -> int: + """ + Counts the number of fragments in a molecule. + + :param mol: the molecule to analyze + :return: the number of fragments + """ + return len(get_fragments(mol)) + + +def largest_fragment(mol: Mol) -> Mol: + """ + Return the largest fragment of a molecule (by atom count). 
+ + :param mol: input molecule + :return: largest fragment molecule + """ + frags = get_fragments(mol) + return max(frags, key=lambda m: m.GetNumAtoms()) if frags else mol + + +def standardize_from_smiles( + smi: str, + keep_stereo: bool = False, + neutralize: bool = True, + tautomer_canon: bool = True, +) -> Mol | None: + """ + Standardize a molecule from its SMILES representation. + + :param smi: input SMILES string + :param keep_stereo: whether to retain stereochemistry + :param neutralize: whether to neutralize charges + :param tautomer_canon: whether to canonicalize tautomers + :return: standardized molecule or None if input SMILES is invalid + """ + mol = smiles_to_mol(smi) + mol = largest_fragment(mol) + + if neutralize: + mol = Uncharger().uncharge(mol) + + if tautomer_canon: + mol = TautomerEnumerator().Canonicalize(mol) + + sanitize_mol(mol, fix_hydrogens=False) + + if not keep_stereo: + RemoveStereochemistry(mol) + + return mol + + +def mol_to_smiles( + mol: Mol, + include_tags: bool = False, + isomeric: bool = True, + canonical: bool = True +) -> str: + """ + Converts an RDKit molecule to a SMILES string. + + :param mol: the molecule to convert + :param include_tags: whether to include tagging information in the SMILES + :param isomeric: whether to include isomeric information in the SMILES + :param canonical: whether to generate a canonical SMILES + :return: the SMILES string + """ + if not include_tags: + mol = remove_tags(mol, in_place=False) + + return MolToSmiles(mol, isomericSmiles=isomeric, canonical=canonical) + + +def mol_to_inchikey(mol: Mol) -> str: + """ + Converts an RDKit molecule to an InChIKey. + + :param mol: the molecule to convert + :return: the InChIKey + """ + return MolToInchiKey(mol) + + +def encode_mol(mol: Mol) -> str: + """ + Encodes an RDKit molecule as a canonical isomeric SMILES string. 
+ + :param mol: the molecule to encode + :return: the encoded molecule + """ + smiles = mol_to_smiles(mol, include_tags=True, isomeric=True, canonical=True) + + return sha256_hex(smiles) diff --git a/src/retromol/chem/reaction.py b/src/retromol/chem/reaction.py new file mode 100644 index 0000000..312b62f --- /dev/null +++ b/src/retromol/chem/reaction.py @@ -0,0 +1,108 @@ +"""Module for RDKit reaction utilities.""" + +from rdkit.Chem.rdChemReactions import ChemicalReaction, ReactionFromSmarts + + +def smarts_to_reaction(smarts: str, use_smiles: bool = False) -> ChemicalReaction: + """ + Converts a SMARTS string to an RDKit reaction. + + :param smarts: the SMARTS string to convert + :param use_smiles: whether to interpret the SMARTS as SMILES + :return: the RDKit reaction + :raises ValueError: if the SMARTS pattern is invalid + """ + rxn = ReactionFromSmarts(smarts, useSmiles=use_smiles) + + if rxn is None: + raise ValueError(f"invalid reaction SMARTS: {smarts}") + + return rxn + + +def reactive_template_atoms(rxn: ChemicalReaction) -> list[set[int]]: + """ + For each reactant-template in rxn, return the set of template-atom-indices + that actually change (i.e. have a broken/formed bond or disappear/appear). + We return a list: one set per reactant-template in the order they appear. 
+ + :param rxn: RDKit ChemicalReaction object + :return: List of sets, each set contains indices of reactive atoms in the corresponding reactant template + """ + # First, build a map from map‐no -> (reactant_template_idx, reactant_atom_idx) + reactant_maps: dict[int, tuple[int, int]] = {} # map_no -> (which reactant‐template, which atom‐idx in that template) + for ri in range(rxn.GetNumReactantTemplates()): + templ = rxn.GetReactantTemplate(ri) + for atom in templ.GetAtoms(): + mnum = atom.GetAtomMapNum() + if mnum: + reactant_maps[mnum] = (ri, atom.GetIdx()) + + # Next, build a map from map‐no -> (which product_template_idx, product_atom_idx) + product_maps: dict[int, tuple[int, int]] = {} + for pi in range(rxn.GetNumProductTemplates()): + templ_p = rxn.GetProductTemplate(pi) + for atom in templ_p.GetAtoms(): + mnum = atom.GetAtomMapNum() + if mnum: + product_maps[mnum] = (pi, atom.GetIdx()) + + # Now we scan each reactant‐template atom and see if it "persists" into product with the same adjacency, + # or if its bonding pattern changes, or if it disappears entirely. If any of those are true -> it's reactive. + reactive_sets: list[set[int]] = [set() for _ in range(rxn.GetNumReactantTemplates())] + + # Pre‐compute adjacency‐lists (by map‐number) for reactant vs. 
product + # – build map_no -> set(of neighbor‐map_numbers) in reactant and product + react_adj: dict[int, set[int]] = {} + prod_adj: dict[int, set[int]] = {} + + # Build reactant adjacency by map‐num + for ri in range(rxn.GetNumReactantTemplates()): + templ = rxn.GetReactantTemplate(ri) + for bond in templ.GetBonds(): + a1, a2 = bond.GetBeginAtom(), bond.GetEndAtom() + m1, m2 = a1.GetAtomMapNum(), a2.GetAtomMapNum() + if m1 and m2: + react_adj.setdefault(m1, set()).add(m2) + react_adj.setdefault(m2, set()).add(m1) + + # Build product adjacency by map‐num + for pi in range(rxn.GetNumProductTemplates()): + templ_p = rxn.GetProductTemplate(pi) + for bond in templ_p.GetBonds(): + a1_p, a2_p = bond.GetBeginAtom(), bond.GetEndAtom() + m1, m2 = a1_p.GetAtomMapNum(), a2_p.GetAtomMapNum() + if m1 and m2: + prod_adj.setdefault(m1, set()).add(m2) + prod_adj.setdefault(m2, set()).add(m1) + + # Now: for each map_no in the reactant_templates, check: + # (a) if that map_no does NOT appear in product_maps at all -> the atom was deleted (= reactive) + # (b) if it DOES appear, compare react_adj[map_no] vs. prod_adj[map_no] + # If they differ -> bond‐pattern changed -> reactive + # (c) also check if atomic number or formal charge changed (rare in a template, but could) + # We compare the two atoms directly. 
We need to find the reactant‐template Atom and product‐template + # Atom to compare + for mnum, (rtempl_idx, ratom_idx) in reactant_maps.items(): + if mnum not in product_maps: + # Disappeared in the product – this atom is definitely reactive + reactive_sets[rtempl_idx].add(ratom_idx) + else: + # Compare adjacency + react_neighbors = react_adj.get(mnum, set()) + prod_neighbors = prod_adj.get(mnum, set()) + if react_neighbors != prod_neighbors: + reactive_sets[rtempl_idx].add(ratom_idx) + else: + # Check if element or charge changed + (pi, patom_idx) = product_maps[mnum] + react_atom = rxn.GetReactantTemplate(rtempl_idx).GetAtomWithIdx(ratom_idx) + prod_atom = rxn.GetProductTemplate(pi).GetAtomWithIdx(patom_idx) + if ( + react_atom.GetAtomicNum() != prod_atom.GetAtomicNum() + or react_atom.GetFormalCharge() != prod_atom.GetFormalCharge() + ): + # If neither bonding‐pattern nor element‐/charge changed, it is NOT reactive + reactive_sets[rtempl_idx].add(ratom_idx) + + return reactive_sets diff --git a/src/retromol/chem/tagging.py b/src/retromol/chem/tagging.py new file mode 100644 index 0000000..902cef1 --- /dev/null +++ b/src/retromol/chem/tagging.py @@ -0,0 +1,73 @@ +"""Module for RDKit atom tagging utilities.""" + +from rdkit.Chem.rdchem import Mol + + +def tag_mol(mol: Mol) -> None: + """ + Tags the atoms in an RDKit molecule with unique isotope-based tags. + + :param mol: the molecule to tag + .. note:: this function modifies the input molecule in place + """ + for i, atom in enumerate(mol.GetAtoms(), start=1): + atom.SetIsotope(i) + + +def remove_tags(mol: Mol, in_place: bool = False) -> Mol: + """ + Removes atom tags from an RDKit molecule. 
+ + :param mol: the molecule to process + :param in_place: whether to modify the input molecule in place + :return: the molecule without atom tags + """ + if not in_place: + mol = Mol(mol) + + for atom in mol.GetAtoms(): + atom.SetIsotope(0) + + return mol + + +def get_tags_mol(mol: Mol) -> set[int]: + """ + Get the atom tags from a molecule. + + :param mol: the molecule + :return: unordered set of atom tags + """ + tags: set[int] = set() + for atom in mol.GetAtoms(): + if atom.GetIsotope() != 0: + tags.add(atom.GetIsotope()) + + return tags + + +def get_tags_mols(mols: list[Mol]) -> set[int]: + """ + Get the atom tags from a list of molecules. + + :param mols: the list of molecules + :return: unordered set of atom tags + """ + tags: set[int] = set() + for mol in mols: + tags.update(get_tags_mol(mol)) + + return tags + + +def all_atoms_have_unique_tags(mol: Mol) -> bool: + """ + Check if all atoms in a molecule have unique tags. + + :param mol: the molecule + :return: True if all atoms have unique tags, False otherwise + """ + num_atoms = mol.GetNumAtoms() + curr_tags = get_tags_mol(mol) + + return len(curr_tags) == num_atoms diff --git a/src/retromol/chem/valence.py b/src/retromol/chem/valence.py new file mode 100644 index 0000000..2479b28 --- /dev/null +++ b/src/retromol/chem/valence.py @@ -0,0 +1,47 @@ +"""Module for chemical valence rules and validation.""" + +from rdkit.Chem.rdchem import Mol +from rdkit.Chem.rdchem import GetPeriodicTable, PeriodicTable + + +PERIODIC_TABLE: PeriodicTable = GetPeriodicTable() + + +def get_default_valence(atom_num: int) -> int: + """ + Returns the default valence for a given atom number. + + :param atom_num: the atomic number + :return: the default valence + """ + return PERIODIC_TABLE.GetDefaultValence(atom_num) + + +def correct_hydrogens(mol: Mol) -> None: + """ + Correct explicit hydrogens on atoms based on valence rules. + + :param mol: the RDKit molecule + .. 
note:: this function modifies the molecule in place + """ + for atom in mol.GetAtoms(): + # Skip aromatic and charged atoms + if atom.GetIsAromatic() or atom.GetFormalCharge() != 0: + continue + + # Skip phosphorus and sulfur (can have expanded valence) + if atom.GetAtomicNum() in {15, 16}: + continue + + # Check if atom complies with valence rules, otherwise adjust explicit hydrogens + valence_bonds = int(sum([bond.GetValenceContrib(atom) for bond in atom.GetBonds()])) + default_valence = get_default_valence(atom.GetAtomicNum()) + num_hs = atom.GetNumExplicitHs() + + if default_valence - valence_bonds < num_hs: + new_valence = default_valence - valence_bonds + + if new_valence < 0: + raise ValueError("new valence for atom is negative") + + atom.SetNumExplicitHs(new_valence) diff --git a/src/retromol/cli.py b/src/retromol/cli.py index 4b34197..b382185 100644 --- a/src/retromol/cli.py +++ b/src/retromol/cli.py @@ -4,31 +4,29 @@ import json import logging import os -import os.path as osp from collections import Counter from datetime import datetime from typing import Any -import yaml from tqdm import tqdm +from rdkit import RDLogger -from retromol.api import run_retromol_with_timeout -from retromol.config import LOGGER_LEVEL, LOGGER_NAME -from retromol.drawing import draw_result -from retromol.io import Input as RetroMolInput -from retromol.io import Result -from retromol.readout import linear_readout_with_timeout -from retromol.rules import ( - get_path_default_matching_rules, - get_path_default_reaction_rules, - get_path_default_wave_config, - load_rules_from_files, -) -from retromol.streaming import run_retromol_stream, stream_json_records, stream_sdf_records, stream_table_rows from retromol.version import __version__ +from retromol.utils.logging import setup_logging, add_file_handler +from retromol.model.rules import RuleSet +from retromol.model.submission import Submission +from retromol.model.result import Result +from retromol.model.readout import 
LinearReadout +from retromol.pipelines.parsing import run_retromol_with_timeout +from retromol.io.streaming import run_retromol_stream, stream_sdf_records, stream_table_rows, stream_json_records +from retromol.chem.mol import encode_mol +from retromol.visualization.reaction_graph import visualize_reaction_graph -logger = logging.getLogger(LOGGER_NAME) -logger.setLevel(LOGGER_LEVEL) + +log = logging.getLogger(__name__) + + +RDLogger.DisableLog('rdApp.*') # disable RDKit warnings def cli() -> argparse.Namespace: @@ -39,49 +37,11 @@ def cli() -> argparse.Namespace: """ parser = argparse.ArgumentParser(add_help=False) parser.add_argument("-o", "--outdir", type=str, required=True, help="output directory for results") + parser.add_argument("-l", "--log-level", type=str, choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="logging level (default: INFO)") parser.add_argument("-h", "--help", action="help", help="show cli options") parser.add_argument("-v", "--version", action="version", version=f"%(prog)s {__version__}") - - parser.add_argument( - "-rx", - "--reaction-rules", - type=str, - required=False, - default=get_path_default_reaction_rules(), - help="path to reaction rules yaml", - ) - parser.add_argument( - "-rm", - "--matching-rules", - type=str, - required=False, - default=get_path_default_matching_rules(), - help="path to matching rules yaml", - ) - parser.add_argument( - "-wc", - "--wave-config", - type=str, - required=False, - default=get_path_default_wave_config(), - help="path to wave configuration yaml", - ) - - # Flags - parser.add_argument( - "-C", - "--matchstereochem", - action="store_true", - help="match stereochemistry in the input SMILES (default: False)", - ) - parser.add_argument("-V", "--verbose", action="store_true", help="enable verbose output") - parser.add_argument( - "-D", - "--check-duplicates", - action="store_true", - help="check for duplicate items in matching rules", - ) + parser.add_argument("-c", 
action="store_true", help="match stereochemistry in the input SMILES (default: False)") # Create two subparsers 'single' and 'batch' subparsers = parser.add_subparsers(dest="mode", required=True) @@ -96,180 +56,100 @@ def cli() -> argparse.Namespace: input_group.add_argument("-s", "--sdf", type=str, help="path to an SDF file containing compounds to process") input_group.add_argument("-t", "--table", type=str, help="path to a CSV/TSV file containing compounds to process") input_group.add_argument("-j", "--json", type=str, help="path to a JSONL file containing compounds to process") - batch_parser.add_argument( - "--batch-size", - type=int, - default=2000, - help="max tasks buffered before dispatch (default: 2000)", - ) + batch_parser.add_argument("--batch-size", type=int, default=2000, help="max tasks buffered before dispatch (default: 2000)") batch_parser.add_argument("--chunksize", type=int, default=20000, help="rows per CSV/TSV chunk (default: 20000)") - batch_parser.add_argument( - "--pool-chunksize", - type=int, - default=50, - help="chunksize hint for imap_unordered (default: 50)", - ) - batch_parser.add_argument( - "--maxtasksperchild", - type=int, - default=2000, - help="recycle worker after N tasks (default: 2000)", - ) - batch_parser.add_argument( - "--results", - choices=["files", "jsonl"], - default="jsonl", - help="write each result to a file or append to JSONL (default: jsonl)", - ) - batch_parser.add_argument( - "--jsonl-path", - type=str, - default=None, - help="path to results jsonl (default: /results.jsonl)", - ) + batch_parser.add_argument("--pool-chunksize", type=int, default=50, help="chunksize hint for imap_unordered (default: 50)") + batch_parser.add_argument("--maxtasksperchild", type=int, default=2000, help="recycle worker after N tasks (default: 2000)") + batch_parser.add_argument("--results", choices=["files", "jsonl"], default="jsonl", help="write each result to a file or append to JSONL (default: jsonl)") + 
batch_parser.add_argument("--jsonl-path", type=str, default=None, help="path to results jsonl (default: /results.jsonl)") batch_parser.add_argument("--no-tqdm", action="store_true", help="disable progress bars for lowest overhead") - batch_parser.add_argument( - "--rdkit-fast", - action="store_true", - help="use fast SDF parse (sanitize=False, removeHs=True); we’ll sanitize only when needed", - ) + batch_parser.add_argument("--rdkit-fast", action="store_true", help="use fast SDF parse (sanitize=False, removeHs=True); we'll sanitize only when needed") # Only read when input type is table - batch_parser.add_argument( - "--separator", - type=str, - choices=["comma", "tab"], - default="comma", - help="separator for table file (default: ',')", - ) - batch_parser.add_argument( - "--id-col", - type=str, - default="inchikey", - help="name of the column containing InChIKeys (default: 'inchikey')", - ) - batch_parser.add_argument( - "--smiles-col", - type=str, - default="smiles", - help="name of the column containing SMILES strings (default: 'smiles')", - ) + batch_parser.add_argument("--separator", type=str, choices=["comma", "tab"], default="comma", help="separator for table file (default: ',')") + batch_parser.add_argument("--id-col", type=str, default="inchikey", help="name of the column containing InChIKeys (default: 'inchikey')") + batch_parser.add_argument("--smiles-col", type=str, default="smiles", help="name of the column containing SMILES strings (default: 'smiles')") # Batch mode also allows for parallel processing - batch_parser.add_argument( - "-w", - "--workers", - type=int, - default=1, - help="number of worker processes to use (default: 1)", - ) + batch_parser.add_argument("-w", "--workers", type=int, default=1, help="number of worker processes to use (default: 1)") return parser.parse_args() -def setup_logger(log_file_path: str, verbose: bool) -> logging.Logger: +def _open_jsonl(outdir: str, jsonl_path: str | None) -> tuple[Any, str]: """ - Sets up a logger 
that ONLY uses the handlers you attach, - and won't bubble up to the root logger. - - :param log_file_path: path to the log file - :param verbose: if True, also log to stdout - :return: configured logger instance + Open a JSONL file for appending results. + + :param outdir: str: output directory + :param jsonl_path: str | None: path to JSONL file, or None to use default + :return: tuple[file handle, path]: opened file handle and the path used """ - # Remove old log file - if os.path.exists(log_file_path): - os.remove(log_file_path) - - # Remove any handlers that were already attached - if logger.hasHandlers(): - logger.handlers.clear() - - # Common formatter - fmt = logging.Formatter("[%(asctime)s][%(levelname)s]: %(message)s") - - # File handler - fh = logging.FileHandler(log_file_path) - fh.setLevel(LOGGER_LEVEL) - fh.setFormatter(fmt) - logger.addHandler(fh) - - # Only add stream handler if verbose is True - if verbose: - sh = logging.StreamHandler() - sh.setLevel(LOGGER_LEVEL) - sh.setFormatter(fmt) - logger.addHandler(sh) - - return logger - - -def _open_jsonl(outdir: str, jsonl_path: str | None) -> tuple[Any, str]: path = jsonl_path or os.path.join(outdir, "results.jsonl") os.makedirs(os.path.dirname(path), exist_ok=True) return open(path, "a", buffering=1), path # line-buffered -def _write_result_file(outdir: str, inchikey: str, payload: dict[str, Any] | None) -> None: - with open(os.path.join(outdir, f"result_{inchikey}.json"), "w") as f: - json.dump(payload, f, indent=0) # indent=0 faster than 4 - - def main() -> None: """ Main entry point for the CLI. 
""" - # Parse command line arguments and set up logging start_time = datetime.now() + + # Parse command line arguments and set up logging args = cli() + + # Create output directory if it doesn't exist os.makedirs(args.outdir, exist_ok=True) - log_file_path = osp.join(args.outdir, "_retromol.log") # add underscore to make log file appear at top of folder - logger = setup_logger(log_file_path, args.verbose) - logger.debug(f"command line arguments: {args}") - - # Load rules from files - path_reaction_rules = args.reaction_rules - path_matching_rules = args.matching_rules - rule_set = load_rules_from_files(path_reaction_rules, path_matching_rules) - logger.info(f"Loaded rule set: {rule_set}") - - # Check for duplicates if flag is set - if args.check_duplicates: - rule_set.check_for_duplicates() - logger.info("Checked for duplicates in the rule set. Please remove duplicates for better performance.") - - # Load wave configuration - with open(args.wave_config) as f: - wave_configs = yaml.safe_load(f) - logger.info(f"Loaded {len(wave_configs)} wave configuration(s) from {args.wave_config}") - logger.debug(f"Loaded wave configuration: {wave_configs}") + + # Setup logging + setup_logging(level=args.log_level) + + # If log file exists, remove it + log_fp = os.path.join(args.outdir, "retromol.log") + if os.path.exists(log_fp): + os.remove(log_fp) + + # Add file handler to log to file + add_file_handler(log_fp, level=args.log_level) + + # Log command line arguments + log.info("command line arguments:") + for arg, val in vars(args).items(): + log.info(f"\t{arg}: {val}") + + # Load default ruleset + ruleset = RuleSet.load_default(match_stereochemistry=args.c) + log.info(f"loaded default ruleset: {ruleset}") result_counts: Counter[str] = Counter() # Single mode if args.mode == "single": - mol = RetroMolInput("target", args.smiles, props={}) - result: Result = run_retromol_with_timeout(mol, rule_set, wave_configs, args.matchstereochem) - logger.info(f"Result: {result}") - 
serialized_result = result.serialize() - with open(osp.join(args.outdir, "result.json"), "w") as f: - json.dump(serialized_result, f, indent=4) - - summary = result.summarize_by_depth() - with open(osp.join(args.outdir, "summary.json"), "w") as f: - json.dump(summary, f, indent=4) - - best_total_cov = result.best_total_coverage(round_to=2) - logger.info(f"Best total coverage (rounded to 2 decimals): {best_total_cov}") - - draw_result(result, args.outdir, background_color="#fffaf6") - - logger.info("Writing linear readouts to log...") - linear_readout = linear_readout_with_timeout(result) - for level in linear_readout["levels"]: - for path in level["strict_paths"]: - ordered_monomers = path["ordered_monomers"] - names = [m["identity"] for m in ordered_monomers] - logger.info(f"{level['dfs_index']} - {level['depth']} - {names}") + submission = Submission(args.smiles, props={}) + result: Result = run_retromol_with_timeout(submission, ruleset) + log.info(f"result: {result}") + + # Report on coverage as percentage of tags identified + coverage = result.calculate_coverage() + log.info(f"coverage: {coverage:.2%}") + + # Get linear readout; print summary + linear_readout = LinearReadout.from_result(result) + out_assembly_graph_fig = os.path.join(args.outdir, "assembly_graph.png") + linear_readout.assembly_graph.draw(show_unassigned=True, savepath=out_assembly_graph_fig) + log.info(f"linear readout: {linear_readout}") + + # Visualize reaction graph + root = encode_mol(result.submission.mol) + visualize_reaction_graph( + result.reaction_graph, + html_path=os.path.join(args.outdir, "reaction_graph.html"), + root_enc=root + ) + + result_dict = result.to_dict() + with open(os.path.join(args.outdir, "result.json"), "w") as f: + json.dump(result_dict, f, indent=4) result_counts["successes"] += 1 @@ -296,38 +176,32 @@ def main() -> None: jsonl_path = None if args.results == "jsonl": jsonl_fh, jsonl_path = _open_jsonl(args.outdir, args.jsonl_path) - logger.info(f"Appending results 
to JSONL file at: {jsonl_path}") + log.info(f"Appending results to JSONL file at: {jsonl_path}") result_counts = Counter() processed_in_current_batch = 0 for evt in run_retromol_stream( - # Config - rule_set=rule_set, # already loaded above - wave_configs=wave_configs, # already loaded above - match_stereo=args.matchstereochem, - # Data & schema + ruleset=ruleset, row_iter=source_iter, - id_col=id_col, smiles_col=smiles_col, - # Concurrency knobs workers=args.workers, batch_size=args.batch_size, pool_chunksize=args.pool_chunksize, maxtasksperchild=args.maxtasksperchild, ): - # evt has: inchikey, result (dict or None), error (str or None) - input_id = evt.inchikey + # evt has: result (dict or None) and error (str or None) if evt.error is not None: - logger.error(f"Error {input_id}: {evt.error}") + log.error(evt.error) result_counts["errors"] += 1 - else: - if args.results == "files": - _write_result_file(args.outdir, input_id, evt.result) - else: - jsonl_fh.write(json.dumps({"inchikey": input_id, "result": evt.result}) + "\n") + elif evt.result is not None: + # Result is already serialized as dict + jsonl_fh.write(json.dumps(evt.result) + "\n") result_counts["successes"] += 1 + else: + log.error("received empty result with no error message") + result_counts["failures"] += 1 # Progress pbar_inner.update(1) @@ -346,18 +220,19 @@ def main() -> None: if jsonl_fh: jsonl_fh.close() - logger.info(f"Streaming complete. Summary: {dict(result_counts)}") + log.info(f"Streaming complete. Summary: {dict(result_counts)}") else: - logger.error("Either --smiles or --database must be provided.") + log.error("either --smiles or --database must be provided") - logger.info(f"Processing complete. 
Summary of results: {dict(result_counts)}") + log.info("processing complete") + log.info(f"summary of results: {dict(result_counts)}") # Wrap up end_time = datetime.now() run_time = end_time - start_time - logger.info(f"start time: {start_time}, end time: {end_time}, run time: {run_time}") - logger.info("Goodbye.") + log.info(f"start time: {start_time}, end time: {end_time}, run time: {run_time}") + log.info("goodbye") if __name__ == "__main__": diff --git a/src/retromol/config.py b/src/retromol/config.py deleted file mode 100644 index 72aa3c4..0000000 --- a/src/retromol/config.py +++ /dev/null @@ -1,23 +0,0 @@ -"""This module defines global constants and configuration for the RetroMol package.""" - -import logging -import os - -# Global logger name for RetroMol -LOGGER_NAME = "retromol" -LOGGER_LEVEL = int(os.getenv("LOGGER_LEVEL", logging.INFO)) - - -# Default timeout for running RetroMol on a single molecule -DEFAULT_TIMEOUT_RUN_RETROMOL = 5 # seconds -TIMEOUT_RUN_RETROMOL = int(os.getenv("TIMEOUT", DEFAULT_TIMEOUT_RUN_RETROMOL)) - - -# Default timeout for computing optimal mappings -DEFAULT_TIMEOUT_OPTIMAL_MAPPINGS = 30 # seconds -TIMEOUT_OPTIMAL_MAPPINGS = int(os.getenv("TIMEOUT_OPTIMAL_MAPPINGS", DEFAULT_TIMEOUT_OPTIMAL_MAPPINGS)) - - -# Default timeout for computing linear readouts -DEFAULT_TIMEOUT_LINEAR_READOUT = 30 # seconds -TIMEOUT_LINEAR_READOUT = int(os.getenv("TIMEOUT_LINEAR_READOUT", DEFAULT_TIMEOUT_LINEAR_READOUT)) diff --git a/src/retromol/data/default_matching_rules.yml b/src/retromol/data/default_matching_rules.yml deleted file mode 100644 index 84c4014..0000000 --- a/src/retromol/data/default_matching_rules.yml +++ /dev/null @@ -1,2611 +0,0 @@ - -- rid: methylation - mol: C - groups: - - small tailoring - -- rid: oxidation - mol: O - groups: - - small tailoring - -- rid: amination - mol: N - groups: - - small tailoring - -- rid: cyanide - mol: C#N - groups: - - small tailoring - -- rid: boronation - mol: B - groups: - - small tailoring - -- 
rid: phosphonylation - mol: OP(O)(O)O - groups: - - small tailoring - -- rid: fluorination - mol: F - groups: - - halogenation - -- rid: chlorination - mol: Cl - groups: - - halogenation - -- rid: bromination - mol: Br - groups: - - halogenation - -- rid: iodination - mol: I - groups: - - halogenation - -- rid: methanol - mol: CO - groups: - - hydrogenation - -- rid: ethanol - mol: CCO - groups: - - hydrogenation - -- rid: propanol - mol: CCCO - groups: - - hydrogenation - -- rid: formaldehyde - mol: C=O - groups: - - acetalization - -- rid: carbamic acid - mol: NC(=O)O - groups: - - acetylation - -- rid: carbonic acid - mol: OC(=O)O - groups: - - acetylation - - urea bond - props: - - residue urea bond formation - -- rid: acetic acid - mol: CC(=O)O - groups: - - acetylation - - fatty acid - - polyketide starter - -- rid: glyceric acid - mol: OC(CO)C(O)=O - groups: - - fatty acid - - polyketide starter - - acetylation - -- rid: propanoic acid - mol: CCC(O)=O - groups: - - fatty acid - - polyketide starter - -- rid: glycosylation - mol: CC1CC(N)C(O)C(O)O1 - groups: - - glycosylation - -- rid: glycosylation - mol: CC1OC(O)CC(N)C1O - groups: - - glycosylation - -- rid: glycosylation - mol: CC1OC(O)CC(C)(O)C1O - groups: - - glycosylation - -- rid: glycosylation - mol: CC1CC(O)C(O)C(O)O1 - groups: - - glycosylation - -- rid: glycosylation - mol: NC1C(O)OC(O)C(O)C1O - groups: - - glycosylation - -- rid: glycosylation - mol: OC1COC(O)C(O)C1O - groups: - - glycosylation - -- rid: glycosylation - mol: OCC1OC(O)C(O)C(O)C1O - groups: - - glycosylation - -- rid: glycosylation - mol: CC1OC(O)CCC1O - groups: - - glycosylation - -- rid: glycosylation - mol: CC1OC(O)CC(O)C1O - groups: - - glycosylation - -- rid: glycosylation - mol: CC1OC(O)C(O)C(O)C1O - groups: - - glycosylation - -- rid: glycosylation - mol: CC1OC(O)C(O)C(N)C1O - groups: - - glycosylation - -- rid: glycosylation - mol: CC(C1(C(C(C(OC1C)O)O)O)O)O - groups: - - glycosylation - -- rid: glycosylation - mol: 
CC(C1(O)C(O)C(O)C(O)OC1C)=O - groups: - - glycosylation - -- rid: glycosylation - mol: NC1C(C(C(OC1O)CO)O)O - groups: - - glycosylation - -- rid: glycosylation - mol: NC1COC(CC1O)O - groups: - - glycosylation - -- rid: glycosylation - mol: OC1C(O)C(N)C(C)OC1O - groups: - - glycosylation - -- rid: glycosylation - mol: CC1OC(CC(C1S)O)O - groups: - - glycosylation - -- rid: glycosylation - mol: CC1(N)CC(O)OC(C)C1O - groups: - - glycosylation - -- rid: glycosylation - mol: NC1C(O)C(O)C(O)C(N)C1 - groups: - - glycosylation - -- rid: glycosylation - mol: O=CC1(O)C(CO)OC(O)C1O - groups: - - glycosylation - -# TODO: annoying that we have to specify both tautomers here -- rid: glycosylation - mol: NC(N)=NC1C(O)C(O)C(O)C(N=C(N)N)C1O - groups: - - glycosylation - -- rid: glycosylation - mol: N=C(N)NC1C(O)C(NC(=N)N)C(O)C(O)C1O - groups: - - glycosylation - -- rid: glycosylation - mol: OCC1OC(O)C(O)C1O # ribose - groups: - - glycosylation - -- rid: glycosylation - mol: NCC1OC(C(C(C1O)O)N)O - groups: - - glycosylation - -- rid: arginine - mol: NC(N)=NCCCC(N)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - positively charged amino acid - props: - - tautomer - -- rid: arginine - mol: N=C(N)NCCCC(N)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - positively charged amino acid - props: - - tautomer - -- rid: histidine - mol: NC(Cc1cnc[nH]1)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - positively charged amino acid - -- rid: 3-hydroxy histdine - mol: NC(C(c1[nH]cnc1)O)C(O)=O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - - positively charged amino acid - -- rid: lysine - mol: NCCCCC(N)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - positively charged amino acid - -- rid: aspartic acid - mol: NC(CC(=O)O)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - negatively 
charged amino acid - -- rid: glutamic acid - mol: NC(CCC(=O)O)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - negatively charged amino acid - -- rid: serine - mol: NC(CO)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - polar uncharged amino acid - -- rid: threonine - mol: CC(O)C(N)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - polar uncharged amino acid - -- rid: asparagine - mol: NC(=O)CC(N)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - polar uncharged amino acid - -- rid: glutamine - mol: NC(=O)CCC(N)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - polar uncharged amino acid - -- rid: glycine - mol: NCC(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - -- rid: proline - mol: O=C(O)C1CCCN1 - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - -- rid: cysteine - mol: NC(CS)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - -- rid: leucine - mol: CC(C)CC(N)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - hydrophobic amino acid - -- rid: isoleucine - mol: CCC(C)C(N)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - hydrophobic amino acid - -- rid: alanine - mol: CC(N)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - hydrophobic amino acid - -- rid: valine - mol: CC(C)C(N)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - hydrophobic amino acid - -- rid: methionine - mol: CSCCC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - hydrophobic amino acid - -- rid: phenylalanine - mol: NC(Cc1ccccc1)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - hydrophobic amino acid - -- rid: 
tryptophan - mol: NC(Cc1c[nH]c2ccccc12)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - hydrophobic amino acid - -- rid: tyrosine - mol: NC(Cc1ccc(O)cc1)C(=O)O - groups: - - amino acid - - alpha amino acid - - proteinogenic amino acid - - hydrophobic amino acid - -- rid: 3,4-dihydroxyphenylalanine - mol: NC(C(O)=O)Cc1c(O)cc(O)cc1 - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: kynurenine - mol: Nc1ccccc1C(=O)C[C@H](N)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: ornithine - mol: '[NH2]CCCC([NH2])C(=O)O' - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: ornithine alkaloid - mol: 'NC(NCCCCO)=N' - groups: - - non-amino acid A-domain substrate - -- rid: bicyclic guanidine alkaloid - mol: 'CCCCCCCCCC1N=C(N2CCCC2=C1C(O)=O)N' - groups: - - non-amino acid A-domain substrate - - fatty acid - -- rid: ornithine alkaloid - mol: 'NC(N)=NCCCCO' - groups: - - non-amino acid A-domain substrate - -- rid: dehydrobutyrine - mol: CC=C(C(O)=O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: dehydrohomoserine - mol: NC(C(O)=O)C=CO - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: N6-formyl-N6-hydroxylysine - mol: NC(CCCCN(O)C=O)C(O)=O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2,3-diaminopropionate - mol: NCC(N)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxyquinaldic acid - mol: O=C(O)c1nc2ccccc2cc1O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: beta-alanine - mol: NCCC(=O)O - groups: - - amino acid - - beta amino acid - - non-proteinogenic amino acid - -- rid: salicylic acid - mol: O=C(O)c1ccccc1O - groups: - - phenolic acid - - non-amino acid A-domain substrate - -- rid: 2,3-dihydroxybenzoic acid 
- mol: O=C(O)c1cccc(O)c1O - groups: - - phenolic acid - - non-amino acid A-domain substrate - -- rid: 3,5-dihydroxybenzoic acid - mol: O=C(c1cc(O)cc(O)c1)O - groups: - - phenolic acid - - non-amino acid A-domain substrate - -- rid: isobutyric acid - mol: CC(C)C(=O)O - groups: - - fatty acid - - polyketide starter - -- rid: lactic acid - mol: CC(O)C(=O)O - groups: - - fatty acid - - polyketide starter - -- rid: 2-methyl lactic acid - mol: CC(C)(O)C(=O)O - groups: - - fatty acid - - polyketide starter - -- rid: 2,2-dimethylpropanoic acid - mol: CC(C)(C)C(=O)O - groups: - - fatty acid - - polyketide starter - -- rid: butanoic acid - mol: CCCC(O)=O - groups: - - fatty acid - -- rid: 3-methylbutanoic acid - mol: CC(CC(O)=O)C - groups: - - fatty acid - - polyketide starter - -- rid: 4-amino-2-hydroxybutanoic acid - mol: NCCC(C(O)=O)O - groups: - - fatty acid - -- rid: 3-butynoic acid - mol: C#CCC(=O)O - groups: - - fatty acid - - polyketide starter - -- rid: pentanoic acid - mol: CCCCC(O)=O - groups: - - fatty acid - -- rid: hexanoic acid - mol: CCCCCC(O)=O - groups: - - fatty acid - -- rid: heptanoic acid - mol: CCCCCCC(O)=O - groups: - - fatty acid - -- rid: octanoic acid - mol: CCCCCCCC(O)=O - groups: - - fatty acid - -- rid: nonanoic acid - mol: CCCCCCCCC(O)=O - groups: - - fatty acid - -- rid: 2-hydroxy-3-aminononanoic acid - mol: CCCCCCCC(N)C(O)C(=O)O - groups: - - fatty acid - -- rid: decanoic acid - mol: CCCCCCCCCC(O)=O - groups: - - fatty acid - -- rid: dec-3-enoic acid - mol: CCCCCCC=CCC(=O)O - groups: - - fatty acid - -- rid: 3-hydroxydecanoic acid - mol: CCCCCCCC(O)CC(=O)O - groups: - - fatty acid - -- rid: dodecanoic acid - mol: CCCCCCCCCCCC(=O)O - groups: - - fatty acid - -- rid: tridecanoic acid - mol: CCCCCCCCCCCCC(=O)O - groups: - - fatty acid - -- rid: 3-hydroxytridecanoic acid - mol: O=C(O)CC(O)CCCCCCCCCC - groups: - - fatty acid - -- rid: tetradecanoic acid - mol: CCCCCCCCCCCCCC(=O)O - groups: - - fatty acid - -- rid: tetradec-2-enoic acid - mol: 
CCCCCCCCCCCC=CC(=O)O - groups: - - fatty acid - -- rid: pentadecanoic acid - mol: CCCCCCCCCCCCCCC(=O)O - groups: - - fatty acid - -- rid: hexadecanoic acid - mol: CCCCCCCCCCCCCCCC(=O)O - groups: - - fatty acid - -- rid: 2,3-dihydroxyhexadecanoic acid - mol: CCCCCCCCCCCCCC(C(C(=O)O)O)O - groups: - - fatty acid - -- rid: 3-hydroxy-2,2-dimethyldecanoic acid - mol: CCCCCCCC(C(C)(C(O)=O)C)O - groups: - - fatty acid - -- rid: 2-methylprop-2-enoic acid - mol: C=C(C)C(=O)O - groups: - - polyketide starter - -- rid: cyclopentane-1,2-dicarboxylic acid - mol: O=C(O)C1CCCC1C(=O)O - groups: - - polyketide starter - -- rid: furan-3-carboxylic acid - mol: O=C(O)c1ccoc1 - groups: - - polyketide starter - -- rid: benzoic acid - mol: O=C(O)c1ccccc1 - groups: - - phenolic acid - - polyketide starter - -- rid: 2-phenylacetic acid - mol: O=C(O)Cc1ccccc1 - groups: - - phenolic acid - - polyketide starter - -- rid: orsellinic acid - mol: O=C(O)c1c(O)cc(O)cc1C - groups: - - phenolic acid - - polyketide starter - -- rid: 3-amino-5-hydroxybenzoic acid - mol: Nc1cc(C(O)=O)cc(O)c1 - groups: - - amino acid - - non-proteinogenic amino acid - - phenolic acid - - polyketide starter - -- rid: 2-(2,6-dioxopiperidin-4-yl)acetic acid - mol: O=C1NC(CC(CC(O)=O)C1)=O - groups: - - polyketide starter - -- rid: artificial amino acid harzianic acid - mol: NC(C(O)=O)CC(C(O)=O)(C(C)C)O - groups: - - artificial amino acid - -- rid: trans-2-hexenoic acid - mol: CCCC=CC(O)=O - groups: - - fatty acid - -- rid: 4-hydroxyphenylglycine - mol: NC(C(=O)O)c1ccc(O)cc1 - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2,4-diaminobytyric acid - mol: NCCC(N)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: beta-hydroxytyrosine - mol: NC(C(=O)O)C(O)c1ccc(O)cc1 - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3,5-dihydroxyphenylglycine - mol: NC(C(=O)O)c1cc(O)cc(O)c1 - groups: - - amino acid - - alpha 
amino acid - - non-proteinogenic amino acid - -- rid: pipecolic acid - mol: O=C(O)C1CCCCN1 - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: N5-formyl-N5-hydroxyornithine - mol: NC(CCCN(O)C=O)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: N5-hydroxyornithine - mol: NC(CCCNO)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: anthranilic acid - mol: Nc1ccccc1C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-amino-3-hydroxy-4-(4-nitrophenyl)butanoic acid - mol: C1=CC(=CC=C1CC(C(C(=O)O)N)O)[N+](=O)[O-] - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2,6-diamino-57-dihydroxy-heptanoic acid - mol: C(CC(C(=O)O)N)C(C(CO)N)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-methylhex-2-enoic acid - mol: CCC(C)C=CC(=O)O - groups: - - fatty acid - -- rid: 2-amino-4-decenoic acid - mol: CCCCCC=CCC(C(=O)O)N - groups: - - fatty acid - -- rid: 1-(1,1-dimethylallyl)-tryptophan - mol: CC(C)(C=C)N1C=C(C2=CC=CC=C21)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 1-aminocyclopropane-1-carboxylic acid - mol: C(O)(=O)C1(CC1)(N) - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 1-pyrroline-5-carboxylic acid - mol: O=C(O)C1N=CCC1 - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 10,14-dimethyloctadecanoic acid - mol: OC(CCCCCCCCC(C)CCCC(C)CCCC)=O - groups: - - fatty acid - -- rid: 2,3-diaminobutyric acid - mol: NC(C)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2,3-dihydroxy-para-aminobenzoic acid - mol: C1=CC(=C(C(=C(N)1)O)O)C(=O)O - groups: - - non-amino acid A-domain substrate - -- rid: 2,4-dihydroxypentanoic acid - mol: CC(CC(C(=O)O)O)O - 
groups: - - non-amino acid A-domain substrate - - fatty acid - -- rid: 2-(1-methylcyclopropyl)-glycine - mol: CC1(CC1)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-amino-3,5-dimethyl-4-hexenoic Acid - mol: CC(C=C(C)C)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-amino-3-hydroxycyclopent-2-enone - mol: C1CC(=O)C(=C1O)N - groups: - - non-amino acid A-domain substrate - -- rid: 2-amino-6-hydroxy-4-methyl-8-oxodecanoic acid - mol: CCC(=O)CC(CC(C)CC(C(=O)O)N)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-aminoadipic acid - mol: C(CC(C(=O)O)N)CC(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-aminobutyric acid - mol: CCC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-aminoisobutyric acid - mol: O=C(O)C(N)(C)C - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-carboxy-6-hydroxyoctahydroindole - mol: N1[C@H](C(=O)O)CC2CCC(O)CC12 - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-chloro-3,5-dihydroxy-4-methylphenylglycine - mol: CC1=C(O)C(Cl)=C(C=C(O)1)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-chlorobenzoic acid - mol: C1=CC=C(C(=C1)C(=O)O)Cl - groups: - - non-amino acid A-domain substrate - -- rid: 2-hydroxy-4-methylpentanoic acid - mol: CC(C)CC(C(=O)O)O - groups: - - fatty acid - - non-amino acid A-domain substrate - -- rid: 2-hydroxypent-4-enoic acid - mol: C=CCC(C(=O)O)O - groups: - - fatty acid - - non-amino acid A-domain substrate - -- rid: 2-ketoglutaric acid - mol: C(CC(=O)O)C(=O)C(=O)O - groups: - - fatty acid - - non-amino acid A-domain substrate - -- rid: 2-ketoisocaproic acid - mol: O=C(C(=O)O)CC(C)C - groups: - - fatty acid - - non-amino acid A-domain substrate - -- 
rid: 2-ketoisovaleric acid - mol: O=C(C(=O)O)C(C)C - groups: - - fatty acid - - non-amino acid A-domain substrate - -- rid: 2-methylserine - mol: CC(CO)(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-sulfamoylacetic acid - mol: C(C(=O)O)S(=O)(=O)N - groups: - - non-amino acid A-domain substrate - -- rid: 2-hydroxy-3-methylpentanoic acid - mol: CCC(C)C(C(=O)O)O - groups: - - fatty acid - - non-amino acid A-domain substrate - -- rid: 2-amino-4-hexenoic acid - mol: CC=CCC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-amino-8-oxodecanoic acid - mol: CCC(=O)CCCCCC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-amino-9,10-epoxy-8-oxodecanoic acid - mol: C1C(O1)C(=O)CCCCCC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-aminodecanoic acid - mol: CCCCCCCCC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-aminododecanoic acid - mol: CCCCCCCCCCC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-aminooctanoic acid - mol: CCCCCCC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-hydroxyisovaleric acid - mol: CC(C)C(C(=O)O)O - groups: - - fatty acid - - non-amino acid A-domain substrate - -- rid: 2-methyl-3-oxobutyrine - mol: CC(=O)C(C)(N)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3,3-dihomo-4-methoxytyrosine - mol: NC(CCCC1=CC=C(OC)C=C1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3,3-dihomophenylalanine - mol: NC(CCCC1=CC=CC=C1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3,3-dihomotyrosine - mol: NC(CCCC1=CC=C(O)C=C1)C(=O)O - groups: - - amino acid - - alpha amino 
acid - - non-proteinogenic amino acid - -- rid: 3,4-dehydrolysine - mol: C(CCN)=CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3,4-dihydroxybenzoic acid - mol: C1=CC(=C(C=C1C(=O)O)O)O - groups: - - non-amino acid A-domain substrate - -- rid: 3,5-dichloro-4-hydroxyphenylglycine - mol: C1=C(Cl)C(=C(Cl)C=C1C(C(=O)O)N)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-(2-nitrocyclopropylalanine) - mol: C1C(C1[N+](=O)[O-])CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-(3ridyl)-alanine - mol: C1=CC(=CN=C1)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-amino-2,4-dihydroxybenzoic acid - mol: C1=CC(=C(C(=C1C(=O)O)O)N)O - groups: - - non-amino acid A-domain substrate - -- rid: 3-amino-4-hydroxybenzoic acid - mol: C1=CC(=C(C=C1C(=O)O)N)O - groups: - - non-amino acid A-domain substrate - -- rid: 3-amino-6-hydroxy-2-pridone - mol: C1CC(NC(=O)C1N)O - groups: - - non-amino acid A-domain substrate - -- rid: 3-chlorotyrosine - mol: C1=C(Cl)C(=CC=C1CC(C(=O)O)N)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxy-4-methylproline - mol: CC1C(O)C(NC1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxy-O-methyl-5-methyltyrosine - mol: C1=C(O)C(=C(C)C=C1CC(C(=O)O)N)OC - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxy-O-methyltyrosine - mol: C1=C(O)C(=CC=C1CC(C(=O)O)N)OC - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxy-para-aminobenzoic acid - mol: C1=CC(=C(C=C1C(=O)O)O)N - groups: - - non-amino acid A-domain substrate - -- rid: 3-hydroxyaspartic acid - mol: NC(C(C(=O)O)O)(C(=O)O) - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 2-hydroxyglycine - 
mol: NC(C(O)=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxyglutamine - mol: C(C(C(C(=O)O)N)O)C(=O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxyglutamic acid - mol: NC(C(CC(O)=O)O)C(O)=O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxykynurenine - mol: C1=CC(=C(C(=C1)O)N)C(=O)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxyleucine - mol: CC(C)C(C(C(=O)O)N)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxypicolinic acid - mol: C1=CC(=C(N=C1)C(=O)O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxytyrosine - mol: C1=CC(=C(C=C1CC(C(=O)O)N)O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxyvaline - mol: CC(O)(C)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-methoxyanthranilic acid - mol: COC1=CC=CC(=C1N)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-methoxyaspartic acid - mol: NC(C(C(=O)O)OC)(C(=O)O) - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-methylasparagine - mol: CC(C(C(=O)O)N)C(=O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-methylaspartic acid - mol: CC(C(C(=O)O)N)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-methylglutamic acid - mol: CC(CC(=O)O)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-methylleucine - mol: CC(C)C(C)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-nitrotyrosine - mol: C1=CC(=C(C=C1CC(C(=O)O)N)[N+](=O)[O-])O - groups: - - 
amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-chloroproline - mol: C1C(Cl)C(NC1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxy-2,4-diaminobutyric acid - mol: NCC(O)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxyasparagine - mol: NC(C(O)=O)C(O)C(N)=O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxyhomotyrosine - mol: C1=CC(=CC=C1CC(C(C(=O)O)N)O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-methylbeta-alanine - mol: NCC(C)C(=O)O - groups: - - amino acid - - beta amino acid - - non-proteinogenic amino acid - -- rid: 3,4-dichloroproline - mol: ClC1C(Cl)C(NC1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3,4-dihydroxyhomotyrosine - mol: C1=CC(=CC=C1C(O)C(C(C(=O)O)N)O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-aminobutyric acid - mol: CC(CC(=O)O)N - groups: - - amino acid - - beta amino acid - - non-proteinogenic amino acid - -- rid: 3-cyclohex-2-enylalanine - mol: C1C=CC(CC1)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxy-4-methyloctanoic acid - mol: CCCCC(C(CC(O)=O)O)C - groups: - - fatty acid - - non-amino acid A-domain substrate - -- rid: 3-hydroxy-6-chlorohistidine - mol: C1=C(NC(Cl)=N1)C(C(C(=O)O)N)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxypipecolic acid - mol: C1CC(C(NC1)C(=O)O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxyproline - mol: OC1C(NCC1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-methylproline - mol: CC1C(NCC1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic 
amino acid - -- rid: 4,5-dehydroarginine - mol: O=C(O)C(N)CC=CNC(N)=N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4,5-dihydroxyornithine - mol: C(C(C(=O)O)N)C(C(N)O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-acetamidopyrrole-2-carboxylic acid - mol: CC(=O)NC1=CNC(=C1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-amino-2-hydroxy-3-isopropoxybenzoic acid - mol: CC(C)OC1=C(C=CC(=C1O)C(=O)O)N - groups: - - non-amino acid A-domain substrate - -- rid: 4-aminobutyric acid - mol: NCCCC(=O)O - groups: - - amino acid - - non-proteinogenic amino acid - -- rid: 4-aminophenylalanine - mol: C1=CC(=CC=C1CC(C(=O)O)N)N - groups: - - amino acid - - non-proteinogenic amino acid - -- rid: 4-chlorobenzoic acid - mol: C1=CC(=CC=C1C(=O)O)Cl - groups: - - non-amino acid A-domain substrate - -- rid: 4-hydroxy-3-nitrobenzoic acid - mol: C1=CC(=C(C=C1C(=O)O)[N+](=O)[O-])O - groups: - - non-amino acid A-domain substrate - -- rid: 4-hydroxy-D-kynurenine - mol: C1=C(O)C=C(C(=C1)C(=O)CC(C(=O)O)N)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-hydroxybenzoic acid - mol: C1=CC(=CC=C1C(=O)O)O - groups: - - non-amino acid A-domain substrate - -- rid: 4-hydroxyglutamine - mol: C(C(O)C(=O)N)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-hydroxyindole-3-carboxylic acid - mol: c1cc2c(c(c1)O)c(c[nH]2)C(=O)O - groups: - - amino acid - - beta amino acid - - non-proteinogenic amino acid - -- rid: 4-hydroxyphenylpyruvic acid - mol: C1=CC(=CC=C1CC(=O)C(=O)O)O - groups: - - non-amino acid A-domain substrate - -- rid: 4-hydroxythreonine - mol: C(C(C(C(=O)O)N)O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-hydroxyvaline - mol: CC(CO)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - 
-- rid: 4-methoxytryptophan - mol: C1=CC=C2C(=C1OC)C(=CN2)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-nitrotryptophan - mol: C1=CC=C2C(=C1[N+](=O)[O-])C(=CN2)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-oxoproline - mol: C1C(NCC1=O)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-butenyl-4-methylthreonine - mol: CC=CCC(C)C(C(C(=O)O)N)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-hydroxyproline - mol: C1C(NCC1O)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-propylproline - mol: CCCC1CC(NC1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4,5-dihydroxy-2-aminopentanoic acid - mol: OC(CC(C(=O)O)N)CO - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-acetyl-5-methylproline - mol: CC(=O)OC1CC(NC(C)1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-hydroxylysine - mol: NCCC(O)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-methylazetidine-2-carboxylic acid - mol: CC1CC(N1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-methylproline - mol: CC1CC(NC1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 4-propenylproline - mol: CC=CC1CC(NC1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 5-hydroxylysine - mol: NCC(CCC(C(O)=O)N)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 5,5-dimethylpipecolic acid - mol: C1C(C)(C)CNC(C1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 5-aminolevulinic acid - mol: 
C(CC(=O)O)C(=O)CN - groups: - - amino acid - - non-proteinogenic amino acid - -- rid: 5-chloroanthranilic acid - mol: C1=CC(=C(C=C1Cl)C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 5-chlorotryptophan - mol: C1=CC2=C(C=C1Cl)C(=CN2)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 5-methoxytyrosine - mol: C1=C(OC)C(=CC=C1CC(C(=O)O)N)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 5-methylorsellinic acid - mol: C=1(C=C(C(=C(C1C)C)C(=O)O)O)O - groups: - - non-amino acid A-domain substrate - -- rid: 5-methylproline - mol: C1CC(NC(C)1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 6,7-dichlorotryptophan - mol: C1=C(Cl)C(Cl)=C2C(=C1)C(=CN2)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 6-chloro-4-hydroxy-1-methyl-indole-3-carboxylic acid - mol: C(O)1=C(Cl)C=C2C(=C1)C(=CN(C)2)C(=O)O - groups: - - amino acid - - beta amino acid - - non-proteinogenic amino acid - -- rid: 6-chloro-4-hydroxyindole-3-carboxylic acid - mol: c(Cl)1cc2c(c(c1)O)c(c[nH]2)C(=O)O - groups: - - amino acid - - beta amino acid - - non-proteinogenic amino acid - -- rid: 6-chlorotryptophan - mol: C1=C(Cl)C=C2C(=C1)C(=CN2)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 6-hydroxy-tetrahydro-isoquinoline-3-carboxylic acid - mol: C1C(NCC2=C1C=C(C=C2)O)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 6-methylsalicylic acid - mol: CC1=C(C(=CC=C1)O)C(=O)O - groups: - - non-amino acid A-domain substrate - -- rid: 6-methyl-pipecolic acid - mol: C1CC(C)NC(C1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: An acid hydrazine polyene (intermediate 14) - mol: OC(=O)CCC(=O)NNCC(=O)O - groups: - - amino acid - - non-proteinogenic 
amino acid - -- rid: Compound 4 (formed by the decarboxylative condensation of L-Phe and succinyl-CoA) - mol: C1=CC=C(C=C1)CC(C(=O)CCC(=O)O)N - groups: - - amino acid - - non-proteinogenic amino acid - -- rid: isovaline - mol: CCC(C)(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: lysergic acid - mol: CN1CC(C=C2C1CC3=CNC4=CC=CC2=C34)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: N-(1-methyl)-tryptophan - mol: C1=CC=C2C(=C1)C(=CN(C)2)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: N-(1-propargyl)-tryptophan - mol: C1=CC=C2C(=C1)C(=CN(CC#C)2)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: N-formylglycine - mol: C(C(=O)O)NC=O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: N-hydroxyvaline - mol: CC(C)C(C(=O)O)NO - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: N-methylphenylalanine - mol: CNC(CC1=CC=CC=C1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: N-methyltyrosine - mol: C1=CC(=CC=C1CC(C(=O)O)NC)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: N1-methoxytryptophan - mol: C1=CC=C2C(=C1)C(=CN(OC)2)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: N5-acetyl-hydroxyornithine - mol: CC(=O)N(CCCC(C(=O)O)N)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: N5-nitroso-N5-hydroxyornithine - mol: O=NN(CCCC(C(=O)O)N)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: N5-anhydromevalonyl-N5-hydroxyornithine - mol: C(CC(C(=O)O)N)CN(O)C(=O)C=C(C)CCO - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: N6-hydroxylysine - mol: C(CCNO)CC(C(=O)O)N - 
groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: 3-hydroxy-3-methylproline - mol: OC(C)1C(NCC1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: aza-beta-tyrosine - mol: C1=CC(=NC=C1O)C(CC(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: beta-methyltryptophan - mol: CC(C1=CNC2=CC=CC=C21)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: beta-phenylalanine - mol: C1=CC=C(C=C1)C(CC(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: adenosylmethionine - mol: C[S+](CCC(C(=O)[O-])N)CC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: beta-hydroxycyclohex-2-enylalanine - mol: C1C=CC(CC1)C(O)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: beta-hydroxyenduracididine - mol: C1C(NC(=N1)N)C(O)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: beta-methylphenylalanine - mol: CC(C1=CC=CC=C1)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: beta-tyrosine - mol: C1=CC(=CC=C1C(CC(=O)O)N)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: alaninol - mol: CC(CO)N - groups: - - non-amino acid A-domain substrate - -- rid: argininol - mol: NC(CO)CCCN=C(N)N - groups: - - non-amino acid A-domain substrate - -- rid: azetidine-2-carboxylic acid - mol: O=C(O)C1NCC1 - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: benzoxazolinate - mol: c1ccc2c(c1)nc(o2)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: beta-hydroxy-3-hydroxy-O-methyl-5-methyltyrosine - mol: C1=C(C)C(=C(O)C=C1C(O)C(C(=O)O)N)OC - groups: - - amino acid - - alpha 
amino acid - - non-proteinogenic amino acid - -- rid: beta-hydroxy-gamma-methyl-hexadecanoic acid - mol: CCCCCCCCCCCCC(C)C(O)CC(=O)O - groups: - - fatty acid - - non-amino acid A-domain substrate - -- rid: beta-hydroxyarginine - mol: C(C(O)C(C(=O)O)N)CN=C(N)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: beta-hydroxyphenylalanine - mol: OC(C1=CC=CC=C1)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: beta-lysine - mol: C(CC(CC(=O)O)N)CN - groups: - - amino acid - - beta amino acid - - non-proteinogenic amino acid - -- rid: betaine - mol: C[N+](C)(C)CC(=O)O - groups: - - non-amino acid A-domain substrate - -- rid: caffeic acid - mol: OC(=O)C=Cc1ccc(O)c(O)c1 - groups: - - non-amino acid A-domain substrate - -- rid: capreomycidine - mol: C1CN=C(NC1C(C(=O)O)N)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: cinnamic acid - mol: C1=CC=C(C=C1)C=CC(=O)O - groups: - - non-amino acid A-domain substrate - -- rid: citrulline - mol: C(CC(C(=O)O)N)CNC(=O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: colletorin D acid - mol: CC1=CC(=C(C(=C1C(=O)O)O)CC=C(C)C)O - groups: - - non-amino acid A-domain substrate - -- rid: coumaric acid - mol: C1=CC(=CC=C1C=CC(=O)O)O - groups: - - non-amino acid A-domain substrate - -- rid: cysteic acid - mol: C(C(C(=O)O)N)S(=O)(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: dehydroarginine - mol: C(CN=C(N)N)C=C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: dehydrophenylalanine - mol: NC(=CC1=CC=CC=C1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: dehydrotryptophan - mol: C1=CC=C2C(=C1)C(=CN2)C=C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: dehydrovaline - mol: CC(=C(C(=O)O)N)C 
- groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: dehydroalanine - mol: C=C(N)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: dihydrolysergic acid - mol: CN1CC(CC2C1CC3=CNC4=CC=CC2=C34)C(=O)O - groups: - - amino acid - - beta amino acid - - non-proteinogenic amino acid - -- rid: dimethylsulfoniopropionic acid - mol: C[S+](C)CCC(=O)O - groups: - - non-amino acid A-domain substrate - -- rid: enduracididine - mol: C1C(NC(=N1)N)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: fumaric acid - mol: C(=CC(=O)O)C(=O)O - groups: - - non-amino acid A-domain substrate - -- rid: glycolic acid - mol: C(C(=O)O)O - groups: - - non-amino acid A-domain substrate - -- rid: grifolic acid - mol: CC(C)=CCCC(C)=CCCC(C)=CCC1=C(O)C=C(C)C(C(=O)O)=C(O)1 - groups: - - non-amino acid A-domain substrate - -- rid: homophenylalanine - mol: C1=CC=C(C=C1)CCC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: homoserine - mol: C(CO)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: homotyrosine - mol: C1=CC(=CC=C1CCC(C(=O)O)N)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: homocysteine - mol: NC(CCS)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: indole pyruvic acid - mol: C1=CC=C2C(=C1)C(=CN2)CC(=O)C(=O)O - groups: - - amino acid - -- rid: leucinol - mol: CC(C)CC(CO)N - groups: - - non-amino acid A-domain substrate - -- rid: linoleic acid - mol: CCCCCC=CCC=CCCCCCCCC(=O)O - groups: - - fatty acid - - non-amino acid A-domain substrate - -- rid: malonic acid - mol: O=C(O)CC(=O)O - groups: - - fatty acid - -- rid: malic acid - mol: C(C(C(=O)O)O)C(=O)O - groups: - - non-amino acid A-domain substrate - -- rid: malonamate - mol: NC(=O)CC(=O)O - groups: - - non-amino acid A-domain 
substrate - -- rid: meta-tyrosine - mol: C1=CC(=CC(=C1)O)CC(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: methylglutaconyl hydroxyornithine - mol: CC(=CC(=O)N(CCCC(C(=O)O)N)O)CC(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: nicotinic acid - mol: C1=CC(=CN=C1)C(=O)O - groups: - - non-amino acid A-domain substrate - -- rid: norcoronamic acid - mol: CC1CC1(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: ochratoxin beta - mol: CC1CC2=C(C(=C(C=C2)C(=O)O)O)C(=O)O1 - groups: - - non-amino acid A-domain substrate - -- rid: p-hydroxybenzoylformic acid - mol: C1=CC(=CC=C1C(=O)C(=O)O)O - groups: - - non-amino acid A-domain substrate - -- rid: p-hydroxymandelate - mol: C1=CC(=CC=C1C(C(=O)O)O)O - groups: - - non-amino acid A-domain substrate - -- rid: para-aminobenzoic acid - mol: O=C(O)c1ccc(N)cc1 - groups: - - amino acid - - non-proteinogenic amino acid - -- rid: phenazine-1,6-dicarboxylic acid - mol: C1=CC(=C2C(=C1)N=C3C(=N2)C=CC=C3C(=O)O)C(=O)O - groups: - - non-amino acid A-domain substrate - -- rid: phenylalaninol - mol: C1=CC=C(C=C1)CC(CO)N - groups: - - non-amino acid A-domain substrate - -- rid: phenylglycine - mol: C1=CC=C(C=C1)C(C(=O)O)N - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: phenyllactic acid - mol: C1=CC=C(C=C1)CC(C(=O)O)O - groups: - - non-amino acid A-domain substrate - -- rid: phenylpyruvic acid - mol: C1=CC=C(C=C1)CC(=O)C(=O)O - groups: - - non-amino acid A-domain substrate - -- rid: piperazic acid - mol: C1CC(NNC1)C(=O)O - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - -- rid: piperonylic acid - mol: OC(=O)c1ccc2OCOc2c1 - groups: - - non-amino acid A-domain substrate - -- rid: pyrrole-2-carboxylic acid - mol: C1=CNC(=C1)C(=O)O - groups: - - amino acid - - beta amino acid - - non-proteinogenic amino acid - -- rid: pyruvic acid - mol: 
CC(=O)C(=O)O - groups: - - non-amino acid A-domain substrate - -- rid: quinoxaline-2-carboxylic acid - mol: C1=CC=C2C(=C1)N=CC(=N2)C(=O)O - groups: - - non-amino acid A-domain substrate - -- rid: succinic semialdehyde - mol: C(CC(=O)O)C=O - groups: - - non-amino acid A-domain substrate - -- rid: succinyl-hydrazinoacetic acid - mol: NN=CC=CC=CC=CC=CC=CC(=O)O - groups: - - amino acid - - non-proteinogenic amino acid - -- rid: tricarballylic acid - mol: C(C(CC(=O)O)C(=O)O)C(=O)O - groups: - - non-amino acid A-domain substrate - -- rid: ustethylinic acid - mol: c1(C)c(O)c(C(=O)O)c(CC)cc(O)1 - groups: - - non-amino acid A-domain substrate - -- rid: valine isocyanide - mol: CC(C)C([N+]#[C-])C(O)=O - groups: - - non-amino acid A-domain substrate - -- rid: valinol - mol: CC(C)C(CO)N - groups: - - non-amino acid A-domain substrate - -- rid: parahydroxy phenylacetic acid - mol: O=C(O)Cc1ccc(O)cc1 - groups: - - phenolic acid - -- rid: tryptophan alkaloid - mol: O=C(O)C(CC12O)N[C@@H]2Nc3c1cccc3 - groups: - - amino acid - - alpha amino acid - - non-proteinogenic amino acid - - indole alkaloid - -- rid: guanidine-containing fatty acid - mol: N=C(NCCCCCC(O)=O)N - groups: - - fatty acid - - guanidine - - non-amino acid A-domain substrate - -- rid: guanidine-containing fatty acid - mol: N=C(NCCCC=CC(O)=O)N - groups: - - fatty acid - - guanidine - - non-amino acid A-domain substrate - -- rid: guanidine-containing fatty acid - mol: N=C(N)NCCCC(=O)O - groups: - - fatty acid - - guanidine - - non-amino acid A-domain substrate - -- rid: guanidine-containing fatty acid - mol: N=C(N)NCC(=O)O - groups: - - fatty acid - - guanidine - - non-amino acid A-domain substrate - -- rid: isoleucinol - mol: CCC(C(N)CO)C - groups: - - other - -- rid: putrescin - mol: NCCCCN - groups: - - other - -- rid: cadaverine - mol: NCCCCCN - groups: - - other - -- rid: N-(5-aminopentyl)hydroxylamine - mol: NCCCCCNO - groups: - - other - -- rid: butanedioic acid - mol: O=C(O)CCC(O)=O - groups: - - other - -- rid: 
3-formamido-2-hydroxybenzoic acid - mol: O=CNc1cccc(C(=O)O)c1O - groups: - - other - -- rid: 2-phenylethanamine - mol: NCCc1ccccc1 - groups: - - other - -- rid: homocysteamine - mol: NCCCS - groups: - - other - -- rid: valienol - mol: OC1C=C(C(C(C1O)O)O)CO - groups: - - other - - carbohydrate - -- rid: valienone - mol: OC1C=C(C(C(C1O)O)=O)CO - groups: - - other - - carbohydrate - -- rid: streptamine - mol: NC1CC(C(C(C1O)O)O)N - groups: - - other - - carbohydrate - -- rid: A1 - mol: O=C(O)CC(=O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A1 - mol: O=C(O)C=C(O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A2 - mol: CC(C(=O)O)C(=O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A2 - mol: CC(C(=O)O)=C(O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A3 - mol: CC(C)(C(=O)O)C(=O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A4 - mol: CCC(C(=O)O)C(=O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A5 - mol: O=C(O)C(O)C(=O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A5 - mol: O=C(C(O)=C(SO)O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A6 - mol: CC(O)(C(=O)O)C(=O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A7 - mol: O=C(O)C(CO)C(=O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A7 - mol: O=C(O)C(CO)=C(O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A8 - mol: C=C(CC(=O)O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A8 - mol: CC(=CC(=O)O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A9 - 
mol: C=C(SO)C(C)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A10 - mol: C=C(SO)C(O)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: A11 - mol: C=C(SO)C(=O)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: B1 - mol: O=C(O)CC(O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: B2 - mol: CC(C(=O)O)C(O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: B3 - mol: CC(C)(C(=O)O)C(O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: B4 - mol: CCC(C(=O)O)C(O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: B5 - mol: O=C(O)C(O)C(O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: B6 - mol: CC(O)(C(=O)O)C(O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: B7 - mol: O=C(O)C(CO)C(O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: B8 - mol: CC(O)(CC(=O)O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: B9 - mol: CC(C(=O)O)C(C)(O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: B10 - mol: CC(O)(SO)C(O)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: B11 - mol: O=C(O)C(=O)C(O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: B12 - mol: NC(C(=O)O)C(O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: B12 - mol: OSC(O)C(C(=O)O)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: C1 - mol: O=C(O)C=CSO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: C2 - mol: 
CC(=CSO)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: C4 - mol: CCC(=CSO)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: C5 - mol: O=C(C(C(O)=O)=CSO)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: C6 - mol: O=C(O)C(=CSO)CO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D1 - mol: O=C(O)CCSO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D2 - mol: CC(CSO)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D3 - mol: CC(C)(CSO)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D4 - mol: CCC(CSO)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D5 - mol: O=C(O)C(O)CSO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D6 - mol: CC(O)(CSO)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D7 - mol: O=C(O)C(CO)CSO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D8 - mol: CC(CC(=O)O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D9 - mol: CC(SO)C(O)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D11 - mol: O=C(O)C(=O)CSO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D11 - mol: O=C(O)C(O)=CSO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D12 - mol: CC(O)C(CSO)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D13 - mol: C=C(C(O)=O)CSO - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D14 - mol: OSCC(CC=O)C(=O)O - groups: - - polyketide 
building block - props: - - novel polyketide encoding - -- rid: D15 - mol: O=C(C(C(C)SO)=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: D15 - mol: O=C(C(O)=C(C)SO)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: E1 - mol: OSC(N)CC(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: E2 - mol: OSC(N)C(C)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: E3 - mol: OSC(N)C(C)(C)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: E4 - mol: OSC(N)C(CC)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: E5 - mol: OSC(N)C(C)(O)C(=O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - -- rid: E6 - mol: NC(C(O)=O)CSO - groups: - - polyketide building block - props: - - novel polyketide encoding - - flipped E1 - -- rid: E7 - mol: NC(C(C(O)=O)O)SO - groups: - - polyketide building block - props: - - novel polyketide encoding - - flipped E1 - -- rid: other polyketide building block - mol: O=C(C(C(C(O)=O)SO)O)O - groups: - - polyketide building block - props: - - novel polyketide encoding - - BGC0002987 diff --git a/src/retromol/data/default_reaction_rules.yml b/src/retromol/data/default_reaction_rules.yml deleted file mode 100644 index c5d1b2a..0000000 --- a/src/retromol/data/default_reaction_rules.yml +++ /dev/null @@ -1,484 +0,0 @@ -- rid: break ester bond (intermolecular) - smarts: '[*:6]-[C,c:1]-[O;!R:2]-[C;!R:3](=[O:4])-[C,c:5]>>[*:6]-[C,c:1]-[OH:2].[O]-[C:3](=[O:4])-[C,c:5]' - groups: - - disconnect macrostructures - -- rid: break thio-ester bond (intermolecular) - smarts: '[*:6]-[C,c:1]-[S;!R:2]-[C;!R:3](=[O:4])-[C,c:5]>>[*:6]-[C,c:1]-[SH:2].[O]-[C:3](=[O:4])-[C,c:5]' - groups: - - disconnect macrostructures - -- rid: break N-O glycosidic bond (intermolecular) - smarts: 
'[C:1][NH1:2][O:3][C:4]>>[C:1][NH2:2].[OH:3][C:4]' - groups: - - disconnect macrostructures - -- rid: break glycosidic bond - smarts: '[C,c:1][O:2][C:3]1[O:4][C:5][C:6][C:7][C:8]1>>[C,c:1][OH:2].[OH][C:3]1[O:4][C:5][C:6][C:7][C:8]1' - groups: - - disconnect macrostructures - -- rid: break furanosidic bond - smarts: '[C,c:1][O:2][C:3]1[O:4][C:5][C:7][C:8]1>>[C,c:1][OH:2].[OH][C:3]1[O:4][C:5][C:7][C:8]1' - groups: - - disconnect macrostructures - -- rid: break glycosidic ester bond - smarts: '[C:8](=[O:9])[O:7][C:1]1[C:2][O:3][C:4][C:5][C:6]1>>[C:8](=[O:9])[OH:7].[O][C:1]1[C:2][O:3][C:4][C:5][C:6]1' - groups: - - disconnect macrostructures - -- rid: break N-glycosidic linkage - smarts: '[O:2][C:3]1[O:4][C:5][C:6]([N:9][C:10])[C:7][C:8]1>>[O:2][C:3]1[O:4][C:5][C:6]([N:9])[C:7][C:8]1.[O][C:10]' - groups: - - disconnect macrostructures - -- rid: reverse epoxidation - smarts: '[C:1]1[O:2][C:3]1>>[C:1]=[C:3].[O:2]' - groups: - - preprocessing - - linearization - -- rid: reverse O-methylation - smarts: '[O&D2:1][CH3:2]>>[O:1].[C:2]' - groups: - - preprocessing - -- rid: reverse N-methylation - smarts: '[N:1][CH3:2]>>[N:1].[C:2]' - groups: - - preprocessing - -- rid: reverse S-methylation - smarts: '[S:1][CH3:2]>>[S:1].[C:2]' - groups: - - preprocessing - -- rid: reverse C-methylation amino acid - smarts: '[N:1][C:2]([CH3:3])([*:4])[C:5](=[O:6])[O:7]>>[N:1][C:2]([*:4])[C:5](=[O:6])[O:7].[CH3:3]' - groups: - - preprocessing - -- rid: reverse monohalogenation - smarts: '[C,c:1]([Cl,Br,F,I:2])>>[C,c:1].[Cl,Br,F,I:2]' - groups: - - preprocessing - -- rid: reverse carbamic acid tailoring (type 1) - smarts: '[NH2:1]-[CH0:2](=[O:3])-[OH0:4]-[*:5]>>[OH1:4]-[*:5].[NH2:1]-[CH0:2](=[O:3])[OH]' - groups: - - preprocessing - - postprocessing linearization - -- rid: reverse carbamic acid tailoring (type 2) - smarts: '[NH2:1]-[C:2](=[O:3])-[N:4]-[*:5]>>[N:4]-[*:5].[NH2:1]-[C:2](=[O:3])[OH]' - groups: - - preprocessing - - postprocessing linearization - -- rid: reverse carbinolamide 
tailoring - smarts: '[OH:1]-[C:2]1[C:3][C:4][O:5][C:6](=[O:7])[N:8]1>>[O:1]=[C:2][C:3][C:4][OH:5].[O][C:6](=[O:7])[N:8]' - groups: - - preprocessing - - postprocessing linearization - props: - references: - - 'PMID:12060743' - -- rid: reverse acetic acid tailoring - smarts: '[CH3:1]-[CH0:2](=[O:3])-[OH0:4]-[*:5]>>[OH1:4]-[*:5].[CH3:1]-[CH0:2](=[O:3])[OH]' - groups: - - preprocessing - - postprocessing linearization - -- rid: reverse carboxylic acid tailoring - smarts: '[OH1:1]-[CH0:2](=[O:3])-[NH1:4]-[*:5]>>[NH2:4]-[*:5].[OH1:1]-[CH0:2](=[O:3])[OH]' - groups: - - preprocessing - - postprocessing linearization - -- rid: reverse acylation (type 1) - smarts: '[CH3:1][C:2](=[O:3])[O,N:4][*:5]>>[CH3:1][C:2](=[O:3])[OH].[O,N:4][*:5]' - groups: - - preprocessing - - postprocessing linearization - -- rid: reverse acylation (type 2) - smarts: '[CH1:1]([CH3:6])([CH3:7])[C:2](=[O:3])[O,N:4][*:5]>>[CH1:1]([CH3:6])([CH3:7])[C:2](=[O:3])[OH].[O,N:4][*:5]' - groups: - - preprocessing - - postprocessing linearization - -- rid: reverse acylation (type 3) - smarts: '[*:1][NH1:2][CH2:3][CH3:4]>>[*:1][NH2:2].[OH][CH2:3][CH3:4]' - groups: - - preprocessing - - postprocessing linearization - -- rid: reverse cyanide tailoring - smarts: '[C:1][C&D2:2]#[N&D1:3]>>[C:1].[C:2]#[N:3]' - groups: - - preprocessing - -- rid: reverse hydroxyl sulfonation - smarts: '[*:1][S:2](=[O:3])(=[O:4])[OH:5]>>[*:1].[OH][S:2](=[O:3])(=[O:4])[OH:5]' - groups: - - preprocessing - -- rid: reverse hydroxyl phosphonylation - smarts: '[C:3][O:1][P:2]>>[C:3][O:1].[O][P:2]' - groups: - - preprocessing - -- rid: reverse 1,3-dioxane formation (cyclic acetal) - smarts: '[*:6][C:1]1~[C:2]~[C:3]([*:7])[O:4][C:8][O:5]1>>[*:6][C:1]([OH:5])~[C:2]~[C:3]([OH:4])[*:7].[C:8]=[O]' - groups: - - preprocessing - -- rid: reverse 1,3-dioxolane formation (spiro-bis(pyranose) system) - smarts: '[*:6][C:1]1~[C:2]([*:7])[O:4][C:8][O:5]1>>[*:6][C:1]([OH:5])~[C:2]([OH:4])[*:7].[C:8][OH]' - groups: - - preprocessing - -- rid: reverse 
2,3-diaminopropionate amidation - smarts: '[NH1:1][CH1:2]([C:3](=[O:4])[NH2:5])[CH2:6][NH2:7]>>[NH1:1][CH1:2]([C:3](=[O:4])[OH])[CH2:6][NH2:7].[N:5]' - groups: - - preprocessing - -- rid: reverse hydrogenation tailoring - smarts: '[CH2:1][CH1:2]([OH1:3])[CH2:4][OH1:5]>>[CH2:1][CH0:2](=[OH0:3])[OH1].[CH3:4][OH1:5]' - groups: - - preprocessing - props: - conditions: - reactant: - # Reverse hydrogenation tailoring is fallback for when default reactive group for starting PK disassembly is not present - forbids_any: - - '[C](=[O])-[OH]' # carboxylic group - - '[C](=[O])-[O]-[C]' # still cyclized carboxylic group - -- rid: reverse etherification - smarts: '[*:1]-[CH0:2]1(-[OH:3])-[O:4]-[CH1:5](-[*:6])-[C:7][C:8][C:9]1>>([*:1]-[CH0:2]1=[OH0:3].[OH1:4]-[CH1:5](-[*:6])-[C:7][C:8][C:9]1)' - groups: - - linearization - -- rid: reverse terminal etherification - smarts: '[CH1:2]1(-[OH:3])-[O:4]-[CH1:5](-[*:6])-[C:7][C:8][C:9]1>>([CH0:2]1(=[OH0:3])[OH].[OH1:4]-[CH1:5](-[*:6])-[C:7][C:8][C:9]1)' - groups: - - linearization - -- rid: reverse etherification (6-ring; no hydroxyl group) - smarts: '[C:1]-[C:2]([C:5]~[C:6]~[C:7]1)-[O:3]-[C:4]1[C:8][C:9]>>([C:1]-[C:2]([C:5][C:6][C:7]1)-[OH0:3].[C:4]1=[C:8][C:9])' - groups: - - linearization - -- rid: reverse etherification (5-ring; no hydroxyl group) - smarts: '[C:1]-[C:2]([C:5]~[C:7]1)-[O:3]-[C:4]1[C:8][C:9]>>([C:1]-[C:2]([C:5][C:7]1)-[OH0:3].[C:4]1=[C:8][C:9])' - groups: - - linearization - -- rid: reverse macrolactonization - smarts: '[C;R:1][C;R:2](=[O:3])[O;R:4][C;R:5]>>([C:1][C:2](=[O:3])[OH].[OH:4][C:5])' - groups: - - linearization - -- rid: break ester bond (intermolecular) - smarts: '[C,c:1][C;!R:2](=[O:3])[O;!R:4][C:5]>>[C,c:1][C:2](=[O:3])[OH].[OH:4][C:5]' - groups: - - linearization - -- rid: break aromatic ester bond (intramolecular) - smarts: '[c:1]1[c:2][c:3][c:4][c:5](=[O:7])[o:6]1>>([OH:6][C:1]=[C:2]-[C:3]=[C:4]-[C:5](=[O:7])[OH])' - groups: - - linearization - -- rid: reverse macrolactonethionization - smarts: 
'[C;R:1][C;R:2](=[O:3])[S;R:4][C;R:5]>>([C:1][C:2](=[O:3])[OH].[SH:4][C:5])' - groups: - - linearization - -- rid: break thio-ester bond (intermolecular) - smarts: '[C:1][C;!R:2](=[O:3])[S;!R:4][C:5]>>[C:1][C:2](=[O:3])[OH].[SH:4][C:5]' - groups: - - linearization - -- rid: reverse carbocyclization (type 1) - smarts: '[*:1][C:2]1[C:3]([*:4])[C:5]=[C:6][CH1:7]([C;R:8])[C:9]([C;R:10])1>>([*:1][C:2]=1.[C:3]([*:4])=[C:5]-[C:6]=[CH1:7]([C;R:8]).[C:9]([C;R:10])=1)' - groups: - - linearization - -- rid: reverse carbocyclization (type 2) - smarts: '[*:1][C:2]1[C:3]([*:4])[C:5]=[C:6][C:7](=[C;R:8][C;R:11])[C:9]([C;R:10])1>>([*:1][C:2]=1.[C:3]([*:4])=[C:5]-[C:6]=[C:7](-[C;R:8]=[C;R:11]).[C:9]([C;R:10])=1)' - groups: - - linearization - -- rid: reverse carbocycle oxidation - smarts: '[#6:1]1(-[#6:4](=[#6:6]-[#6:7](-[#8:3])-[#6:8](-[#6:10])-[#6:2]-1-[#6:9])-[*:11])-[*:12]>>[#6:1]1(-[#6:4](-[#6:6]=[#6:7]-[#6:8](-[#6:10])-[#6:2]-1-[#6:9])-[*:11])-[*:12].[#8:3]' - groups: - - linearization - -- rid: reverse heterocyclization - smarts: '[C:1]1[C:2][N:3][C:4](=[O:5])[C:6]=1([*:7])>>[C:1]([OH])(=[O])[C:2][N:3][C:4](=[O:5])[C:6]([*:7])' - groups: - - linearization - -- rid: break dilsufide bridge (intramolecular) - smarts: '[C;R:1][C:2][S:3][S:4][C:5][C;R:6]>>([C;R:1][C:2][SH:3].[SH:4][C:5][C;R:6])' - groups: - - linearization - -- rid: break disulfide bridge (intermolecular) - smarts: '[C:1][C:2][S:3][S:4][C:5][C:6]>>[C;R:1][C:2][SH:3].[SH:4][C:5][C;R:6]' - groups: - - linearization - -- rid: break threonine-cysteine bridge (intramolecular) - smarts: '[C:1](=[O:2])[C:3]([N:4])[CH1:5]([CH3:6])[S:7][CH2:8][C:9]([N:10])[C:11]=[O:12]>>([C:1](=[O:2])[C:3]([N:4])[CH1:5]([CH3:6])[OH].[SH:7][CH2:8][C:9]([N:10])[C:11]=[O:12])' - groups: - - linearization - -- rid: break threonine-cysteine bridge (intermolecular) - smarts: '[C:1](=[O:2])[C:3]([N:4])[CH1:5]([CH3:6])[S:7][CH2:8][C:9]([N:10])[C:11]=[O:12]>>[C:1](=[O:2])[C:3]([N:4])[CH1:5]([CH3:6])[OH].[SH:7][CH2:8][C:9]([N:10])[C:11]=[O:12]' - 
groups: - - linearization - -- rid: break serine-cysteine bridge (intramolecular) - smarts: '[C:1](=[O:2])[C:3]([N:4])[CH2:5][S:7][CH2:8][C:9]([N:10])[C:11]=[O:12]>>([C:1](=[O:2])[C:3]([N:4])[CH2:5][SH:7].[OH][CH2:8][C:9]([N:10])[C:11]=[O:12])' - groups: - - linearization - -- rid: break serine-cysteine bridge (intermolecular) - smarts: '[C:1](=[O:2])[C:3]([N:4])[CH2:5][S:7][CH2:8][C:9]([N:10])[C:11]=[O:12]>>[C:1](=[O:2])[C:3]([N:4])[CH2:5][SH:7].[OH][CH2:8][C:9]([N:10])[C:11]=[O:12]' - groups: - - linearization - -- rid: break cysteine bond (intramolecular) - smarts: '[O:1][C:2](=[O:3])[C:4][C:5][SH0:6][C:7]>>[O:1][C:2](=[O:3])[C:4][C:5][SH1:6].[C:7]' - groups: - - linearization - -- rid: reverse reduction (type 1) - smarts: '[NH2:1][CH0:2]([OH:3])[C:4](=[O:5])[OH:6]>>[NH2:1][CH1:2][C:4](=[O:5])[OH:6].[OH2:3]' - groups: - - linearization - -- rid: reverse reduction (type 2) - smarts: '[NH2:1][CH0:2]([OH:3])[CH2:4][C:5](=[O:6])[OH:7]>>[NH2:1][CH1:2][CH2:4][C:5](=[O:6])[OH:7].[OH2:3]' - groups: - - linearization - -- rid: reverse oxazole formation - smarts: '[C,c:1][c:2]1[o:3][c:4][c:5]([C:6])[n:7]1>>[C,c:1][CH0:2](=[O:3])[NH1:7][CH1:5]([CH2:4][OH1])[C:6]' - groups: - - linearization - -- rid: reverse oxazoline formation - smarts: '[C,c:1][C:2]=1[OH0:3][C:4][C:5]([C:6])[N:7]1>>[C,c:1][CH0:2](=[O:3])[NH1:7][CH1:5]([CH2:4][OH1])[C:6]' - groups: - - linearization - -- rid: reverse thiazole formation - smarts: '[C:1][c:2]1[s:3][c:4][c:5][n:7]1>>([C:1]-[CH0:2]-1(=O).[SH1:3]-[CH2:4]-[CH1:5]-[NH1:7]-1)' - groups: - - linearization - -- rid: reverse thiazoline tautomerization - smarts: '[OH0:1]=[C:2]1-[S:3]-[C:4]-[C:5](-[C:6])-[N:7]-1>>[OH1:1]-[C:2]1-[S:3]-[C:4]-[C:5](-[C:6])-[N:7]=1' - groups: - - linearization - -- rid: reverse thiazoline formation (type 1) - smarts: '[*:1][C:2]=1[SH0:3][C:4][C:5][N:7]1>>([*:1]-[CH0:2]-1(=O).[SH1:3]-[CH2:4]-[C:5]-[NH1:7]-1)' - groups: - - linearization - -- rid: reverse thiazoline formation (type 2) - smarts: 
'[*:1][C:2]([C:3][SH0:4]1)[NH1:5][C:6]1[*:7]>>[*:1][C:2]([C:3][SH1:4])[NH1:5][CH0:6](=[O])[*:7]' - groups: - - linearization - -- rid: reverse tetramate formation (type 1) - smarts: '[*:1]-[C:2](-[OH:3])=[C:4]1[C:5](=[O:6])[N:7][C:8][C:9]1=[O:10]>>[*:1]-[C:2](=[OH0:3])-[CH2:4][C:5](=[O:6])[N:7][C:8][C:9](-[OH])=[O:10]' - groups: - - linearization - -- rid: reverse tetramate formation (type 2) - smarts: '[*:1]-[C:2](-[OH:3])-[C:4]=1[C:5](=[O:6])[N:7][C:8][C:9]1-[OH:10]>>[*:1]-[C:2](=[OH0:3])-[CH2:4][C:5](=[O:6])[N:7][C:8][C:9](-[OH])=[O:10]' - groups: - - linearization - -- rid: reverse tetronate formation (type 1) - smarts: '[*:1]-[C:2](-[OH:3])=[C:4]1[C:5](=[O:6])[O:7][C:8][C:9]1=[O:10]>>[*:1]-[C:2](=[OH0:3])-[CH2:4][C:5](=[O:6])[O:7][C:8][C:9](-[OH])=[O:10]' - groups: - - linearization - -- rid: reverse tetronate formation (type 2) - smarts: '[*:1]-[C:2](-[OH:3])-[C:4]=1[C:5](=[O:6])[O:7][C:8][C:9]1-[OH:10]>>[*:1]-[C:2](=[OH0:3])-[CH2:4][C:5](=[O:6])[O:7][C:8][C:9](-[OH])=[O:10]' - groups: - - linearization - -- rid: reverse tetronate formation (type 3; spirotetronate) - smarts: '[C:9]=[C:1]1[O:2][C:3](=[O:4])[C:5](-[O:6])=[C:7](-[OH:8])1>>([C:9]=[C:1]1[O:2][C:3](=[O:4])[C:5](-[O:6]).[OH][C:7](=[O:8])1)' - groups: - - linearization - -- rid: reverse acetylation - smarts: '[CH2:1]=[C:2](-[OH:3])[C:4](=[O:5])-[OH:6]>>[OH][CH2:1]-[C:2](-[OH:3])[C:4](=[O:5])-[OH:6]' - groups: - - linearization - props: - examples: - - chlorothricin biosynthesis - -- rid: reverse kirromycin-like substructure formation - smarts: '[c:1]1([C:2](=[O:8])[*:9])[c:3](~[O:10])[c:4][c:5][n:6][c:7](~[O:11])1>>[CH0:3](=[O:10])(-[OH])[CH2:4][CH2:5][NH1:6][CH0:7](=[O:11])[CH2:1]([CH0:2](=[O:8])[*:9])' - groups: - - linearization - -- rid: reverse beta-lactam formation - smarts: '[C:1][N:2][C:3]1[C:4]2[S:5][C:6][C:7]([C:8])[N:9]2[C:10]1=[O:11]>>([C:1][N:2][C:3]1[C:4][S:5].[C:6][C:7]([C:8])[N:9][C:10]1=[O:11])' - groups: - - linearization - -- rid: reverse DAOC-synthetase - smarts: 
'[C:1][N:2][C:3]1[C:4]2[S:5][C:6][CH0:7]=[CH0:8]([C:9])[N:10]2[C:11]1=[O:12]>>[C:1][N:2][C:3]1[C:4]2[S:5][CH0:7]([CH3:6])[CH1:8]([C:9])[N:10]2[C:11]1=[O:12]' - groups: - - linearization - -- rid: reverse salinosporamide-like substructure formation - smarts: '[*:1][C:2]1[C:3](=[O:4])[N:5][C:6]([C:7](=[O:8])2)([*:9])[C:10]1([C:11])[O:12]2>>[*:1][C:2][C:3](=[O:4])[OH].[N:5][C:6]([C:7](=[O:8])[OH])([*:9]).[C:10]([C:11])(=[O])[OH:12]' - groups: - - linearization - -- rid: reverse cyclization on lysine-like substructure - smarts: '[C:1]1(-[N:2])[C:3][C:4][C:5][C:6][N:7]([OH:8])[C:9](=[O:10])1>>([C:1](-[N:2])([C:9](=[O:10])([OH]))[C:3][C:4][C:5][C:6][N:7]([OH0:8]))' - groups: - - linearization - -- rid: open terminal aromatic ring - smarts: '[c:1]1[c:2]([C:7](=[O:8])([OH:9]))[c:3]([C:10])[c:4][c:5][c:6]1>>([C:1]1=[C:2]([C:7](=[O:8])([OH:9])).[C:3]([C:10])=[C:4][C:5]=[C:6]1)' - groups: - - linearization - -- rid: reverse spirocycle formation (type 1) - smarts: '[#6:1]1(-[*:14])-[#8:6]-[#6:5]2(-[#8:7]-[#6:11](-[#6:13]-[*:12])-[#6:10]-[#6:9]-[#6:8]-2)-[#6:4]-[#6:3]-[#6:2]-1>>[#6:1](-[*:14])(-[#8:6])-[#6:2]-[#6:3]-[#6:4]-[#6:5](=[#8:7])-[#6:8]-[#6:9]-[#6:10]-[#6:11]=[#6:13]-[*:12]' - groups: - - linearization - -- rid: reverse spirocycle formation (type 2) - smarts: '[#6:1]1(-[*:13])-[#8:6]-[#6:5]2(-[#8:7]-[#6:10](-[#6:9]-[#6:8]-2)-[#6:12]-[*:11])-[#6:4]-[#6:3]-[#6:2]-1>>[#6:1](-[*:13])(-[#8:6])-[#6:2]-[#6:3]-[#6:4]-[#6:5](=[#8:7])-[#6:8]-[#6:9]-[#6:10]=[#6:12]-[*:11]' - groups: - - linearization - -- rid: reversed cyclized hydroxy ornithine - smarts: '[C:2]1[C:3](=[O:4])[NH0:5]([OH:7])[C:6][C:8][C:9]1>>[NH2:5]([OH:7])[C:6][C:8][C:9][C:2][C:3](=[O:4])[O]' - groups: - - linearization - -- rid: reverse atrop-abyssomycin C like formation - smarts: '[C;R:1]=[C;R:2][O;R:3][C;R:4][C;R:5][O:6]>>([C;R:1]=[C;R:2][OH1:3].[C;R:4]1[C;R:5][OH0:6]1)' - groups: - - linearization - -- rid: reverse dies-alder (intramolecular) - smarts: 
'[*:9][C:1]1-[C:2]=[C:3]-[C:4](-[C:5])-[C:6]-[C:7]1-[*:8]>>([*:9][C:1]=[C:2]-[C:3]=[C:4]-[C:5].[C:6]=[C:7]-[*:8])' - groups: - - linearization - -- rid: reverse dies-alder (intermolecular) - smarts: '[*:9][C:1]1-[C:2]=[C:3]-[C:4](-[C:5])-[C:6]-[C:7]1-[*:8]>>[*:9][C:1]=[C:2]-[C:3]=[C:4]-[C:5].[C:6]=[C:7]-[*:8]' - groups: - - linearization - -- rid: reverse aryl amicoumacin-like - smarts: '[C:1][c:2]1[c:3]([OH:9])[c:4][c:5][c:6][c:7]1[C:8]>>[C:1][C:2]=[C:3]([OH:9])[C:4]=[C:5][C:6]=[C:7][C:8]' - groups: - - linearization - -- rid: reverse spiroborate - smarts: '[O:1]1[C:2][C:3][O:4][B:5]12[O:6][C:7][C:8][O:9]2>>([OH1:1][C:2][C:3][OH1:4].[OH1:6][C:7][C:8][OH1:9]).[B:5]' - groups: - - linearization - -- rid: reverse cremimycin-like substructure - smarts: '[C:1]1[C:2][C:3]([O:4])[C:5](-,=[O:6])-,=[C:7]1>>[C:1]=[C:2][C:3][C:5](-[OH:6])-[C:7].[O:4]' - groups: - - linearization - props: - references: - - 'DOI:10.1002/cbic.201300370' - -- rid: reversed NRP biosynthesis (alpha amino acid; intermolecular) - smarts: '[C,c:2][C:3](=[O:4])[NH1:5][C,c;!$(C=O):6]>>[C,c:2][C:3](=[O:4])[O].[NH2:5][C,c:6]' - groups: - - NRP disassembly - -- rid: reversed NRP biosynthesis (alpha amino acid; intramolecular) - smarts: '[C,c:2][C;R:3](=[O:4])[NH1;R:5][C,c;!$(C=O):6]>>([C,c:2][C:3](=[O:4])[O].[NH2:5][C,c:6])' - groups: - - NRP disassembly - -- rid: reverse NRP biosynthesis (aromatic nitrogen; intermolecular) - smarts: '[*:1][C:2](=[O:3])[n:4]>>[*:1][C:2](=[O:3])[OH].[nH:4]' - groups: - - NRP disassembly - -- rid: reverse NRP biosynthesis (aromatic nitrogen; intramolecular) - smarts: '[*:1][C:2](=[O:3])[n:4]>>([*:1][C:2](=[O:3])[OH].[nH:4])' - groups: - - NRP disassembly - -- rid: reverse urea bond formation (intermolecular) - smarts: '[C:1][NH1:2][C:3](=[O:4])[NH1:5][C:6]>>[C:1][NH2:2].[NH2:5][C:6].[OH][C:3](=[O:4])[OH]' - groups: - - NRP disassembly - -- rid: reverse urea bond formation (intramolecular) - smarts: 
'[C:1][NH1:2][C:3](=[O:4])[NH1:5][C:6]>>([C:1][NH2:2].[NH2:5][C:6]).[OH][C:3](=[O:4])[OH]' - groups: - - NRP disassembly - -- rid: reversed NRP biosynthesis (alpha amino acid; proline-like) - smarts: '[*:1][C:2](=[O:3])[NH0:4][C:5][C:6](=[O:7])[OH:8]>>[*:1][C:2](=[O:3])[OH].[NH1:4][C:5][C:6](=[O:7])[OH:8]' - groups: - - NRP disassembly - -- rid: reversed NRP biosynthesis (beta amino acid) - smarts: '[*:1][C:2](=[O:3])[NH1:4][C:5][CH2:6][C:7](=[O:8])[OH:9]>>[*:1][C:2](=[O:3])[OH].[NH2:4][C:5][CH2:6][C:7](=[O:8])[OH:9]' - groups: - - NRP disassembly - -- rid: break amide bond (hydroxamic) - smarts: '[C:1][C:2](=[O:3])[N:4](-[OH:5])[C:6]>>[C:1][C:2](=[O:3])-[OH].[N:4](-[OH:5])[C:6]' - groups: - - NRP disassembly - -- rid: reversed polyketide synthesis (saturated) - smarts: '[C,c:1][C;!R:2]-[C;!R:3]-[C:4](=[O:5])[OH:6]>>[C,c:1]C(=O)[OH].[OH][S][C:2]-[C:3]-[C:4](=[O:5])[OH:6]' - groups: - - PK disassembly - -- rid: reversed polyketide synthesis (saturated anhydride) - smarts: '[C,c:1][C;!R:2]-[C;!R:3]-[C:4](=[O:5])[CH2:6][CH3:7]>>[C,c:1]C(=O)[OH].[OH][S][C:2]-[C:3]-[C:4](=[O:5])[OH].[C](=[O])([OH])[C:6][C:7]' - groups: - - PK disassembly - -- rid: reversed polyketide synthesis (unsaturated) - smarts: '[C,c:1][C;!R:2]=[C;!R:3]-[C:4](=[O:5])[OH:6]>>[C,c:1]C(=O)[OH].[OH][S][C:2]=[C:3]-[C:4](=[O:5])[OH:6]' - groups: - - PK disassembly - -- rid: reversed polyketide synthesis (unsaturated and shifted type 1) - smarts: '[C,c:1]=[C;!R:2]-[C;!R:3]-[C:4](=[O:5])[OH:6]>>[C,c:1]C(=O)[OH].[OH][S][C:2]=[C:3]-[C:4](=[O:5])[OH:6]' - groups: - - PK disassembly - -- rid: reversed polyketide synthesis (unsaturated and shifted type 2) - smarts: '[C:7][C:1]=[C;!R:2]-[C&D4;!R:3]-[C:4](=[O:5])[OH:6]>>[C:7]=[C:1]C(=O)[OH].[OH][S][C:2]-[C:3]-[C:4](=[O:5])[OH:6]' - groups: - - PK disassembly - -- rid: reversed polyketide synthesis (shifted and late stage oxidation) - smarts: '[C,c:1]=[C;!R:2]-[C;!R:3](-[OH:7])-[C:4](=[O:5])[OH:6]>>[C,c:1]C(=O)[OH].[OH][S][C:2]=[C:3]-[C:4](=[O:5])[OH:6].[O:7]' - 
groups: - - PK disassembly \ No newline at end of file diff --git a/src/retromol/data/default_wave_config.yml b/src/retromol/data/default_wave_config.yml deleted file mode 100644 index 4db2635..0000000 --- a/src/retromol/data/default_wave_config.yml +++ /dev/null @@ -1,83 +0,0 @@ -- wave_name: disconnect macrostructures - reaction_groups: - - disconnect macrostructures - -# Remove tailoring moieties -- wave_name: preprocessing - only_leaf_nodes: true - parse_identified_nodes: false - - reaction_groups: - - preprocessing - - matching_groups: - - acetalization - - acetylation - - glycosylation - - halogenation - - hydrogenation - - small tailoring # e.g., methylation, epoxidation etc. - -# Linearize molecule by reversing cyclizations; no matching groups here -- wave_name: linearization - only_leaf_nodes: true - parse_identified_nodes: false - reaction_groups: - - linearization - -# Linearization might reveal new tailoring groups, so do another tailoring removal wave -- wave_name: postprocessing linearization - only_leaf_nodes: true - parse_identified_nodes: false - - reaction_groups: - - postprocessing linearization - - matching_groups: - - acetylation - - small tailoring # e.g., methylation, epoxidation etc. 
- - glycosylation - -# Break down peptide(-like) structures into amino acids -- wave_name: NRP disassembly - only_leaf_nodes: true - parse_identified_nodes: false - - reaction_groups: - - NRP disassembly - - matching_groups: - - amino acid - - artificial amino acid - - # Carboxylic acids that are not amino acids - - fatty acid - - phenolic acid - - non-amino acid A-domain substrate - - # Unassigned - - other - - # Residue urea bond formation - - urea bond - -# Break down polyketide(-like) structures into polyketide building blocks -- wave_name: PK disassembly - only_leaf_nodes: false - parse_identified_nodes: true - - reaction_groups: - - PK disassembly - - matching_groups: - - polyketide building block - - polyketide starter - - # Also match these in case they initiate a mixed NRP-PK assembly line; - # we omit fatty acids here to prioritize their processing as PK building blocks in this wave - - amino acid - - artificial amino acid - - non-amino acid A-domain substrate - - # Unassigned - - other diff --git a/src/retromol/data/mxn_other.yml b/src/retromol/data/mxn_other.yml new file mode 100644 index 0000000..0174965 --- /dev/null +++ b/src/retromol/data/mxn_other.yml @@ -0,0 +1,1120 @@ + +- name: methylation + smiles: "C" + +- name: oxidation + smiles: "O" + +- name: amination + smiles: "N" + +- name: cyanide + smiles: "C#N" + +- name: boronation + smiles: "B" + +- name: phosphonylation + smiles: "OP(O)(O)O" + +- name: fluorination + smiles: "F" + family_tokens: ["halogenation"] + +- name: chlorination + smiles: "Cl" + family_tokens: ["halogenation"] + +- name: bromination + smiles: "Br" + family_tokens: ["halogenation"] + +- name: iodination + smiles: "I" + family_tokens: ["halogenation"] + +- name: methanol + smiles: "CO" + +- name: ethanol + smiles: "CCO" + +- name: propanol + smiles: "CCCO" + +- name: formaldehyde + smiles: "C=O" + +- name: carbamic acid + smiles: "NC(=O)O" + +- name: carbonic acid + smiles: "OC(=O)O" + +- name: acetic acid + smiles: "CC(=O)O" 
+ +- name: glyceric acid + smiles: "OC(CO)C(O)=O" + +- name: propanoic acid + smiles: "CCC(O)=O" + +- name: 4-amino-6-methyloxane-2,3-diol + smiles: "CC1CC(N)C(O)C(O)O1" + family_tokens: ["glycosylation"] + +- name: sugar + smiles: "CC1OC(O)CC(N)C1O" + family_tokens: ["glycosylation"] + +- name: 4,6-dimethyloxane-2,4,5-triol + smiles: "CC1OC(O)CC(C)(O)C1O" + family_tokens: ["glycosylation"] + +- name: 6-methyloxane-2,3,4-triol + smiles: "CC1CC(O)C(O)C(O)O1" + family_tokens: ["glycosylation"] + +- name: sugar + smiles: "NC1C(O)OC(O)C(O)C1O" + family_tokens: ["glycosylation"] + +- name: arabinose + smiles: "OC1COC(O)C(O)C1O" + family_tokens: ["glycosylation"] + +- name: glucose + smiles: "OCC1OC(O)C(O)C(O)C1O" + family_tokens: ["glycosylation"] + +- name: 6-methyloxane-2,5-diol + smiles: "CC1OC(O)CCC1O" + family_tokens: ["glycosylation"] + +- name: 6-methyloxane-2,4,5-triol + smiles: "CC1OC(O)CC(O)C1O" + family_tokens: ["glycosylation"] + +- name: rhamnose + smiles: "CC1OC(O)C(O)C(O)C1O" + family_tokens: ["glycosylation"] + +- name: sugar + smiles: "CC1OC(O)C(O)C(N)C1O" + family_tokens: ["glycosylation"] + +- name: sugar + smiles: "CC(C1(C(C(C(OC1C)O)O)O)O)O" + family_tokens: ["glycosylation"] + +- name: sugar + smiles: "CC(C1(O)C(O)C(O)C(O)OC1C)=O" + family_tokens: ["glycosylation"] + +- name: glucosamine + smiles: "NC1C(C(C(OC1O)CO)O)O" + family_tokens: ["glycosylation"] + +- name: sugar + smiles: "NC1COC(CC1O)O" + family_tokens: ["glycosylation"] + +- name: 5-amino-6-methyloxane-2,3,4-triol + smiles: "OC1C(O)C(N)C(C)OC1O" + family_tokens: ["glycosylation"] + +- name: 6-methyl-5-sulfanyloxane-2,4-diol + smiles: "CC1OC(CC(C1S)O)O" + family_tokens: ["glycosylation"] + +- name: 4-amino-4,6-dimethyloxane-2,5-diol + smiles: "CC1(N)CC(O)OC(C)C1O" + family_tokens: ["glycosylation"] + +- name: 4,6-diaminocyclohexane-1,2,3-triol + smiles: "NC1C(O)C(O)C(O)C(N)C1" + family_tokens: ["glycosylation"] + +- name: sugar + smiles: "O=CC1(O)C(CO)OC(O)C1O" + family_tokens: 
["glycosylation"] + +# Tautomers +- name: 2-[3-(diaminomethylideneamino)-2,4,5,6-tetrahydroxycyclohexyl]guanidine + smiles: "NC(N)=NC1C(O)C(O)C(O)C(N=C(N)N)C1O" + family_tokens: ["glycosylation"] +- name: 2-[3-(diaminomethylideneamino)-2,4,5,6-tetrahydroxycyclohexyl]guanidine + smiles: "N=C(N)NC1C(O)C(NC(=N)N)C(O)C(O)C1O" + family_tokens: ["glycosylation"] + +- name: ribose + smiles: "OCC1OC(O)C(O)C1O" + family_tokens: ["glycosylation"] + +- name: 3-amino-6-(aminomethyl)oxane-2,4,5-triol + smiles: "NCC1OC(C(C(C1O)O)N)O" + family_tokens: ["glycosylation"] + +- name: sugar + smiles: "OC1CC(C)(OC)C(O)C(C)O1" + family_tokens: ["glycosylation"] + +- name: 4-dimethylamino-6-methyloxane-2,3-diol + smiles: "OC1C(O)C(N(C)C)CC(C)O1" + family_tokens: ["glycosylation"] + +# Tautomers +- name: arginine + smiles: "NC(N)=NCCCC(N)C(=O)O" +- name: arginine + smiles: "N=C(N)NCCCC(N)C(=O)O" + +- name: histidine + smiles: "NC(Cc1cnc[nH]1)C(=O)O" + +- name: 3-hydroxy histdine + smiles: "NC(C(c1[nH]cnc1)O)C(O)=O" + +- name: lysine + smiles: "NCCCCC(N)C(=O)O" + +- name: aspartic acid + smiles: "NC(CC(=O)O)C(=O)O" + +- name: glutamic acid + smiles: "NC(CCC(=O)O)C(=O)O" + +- name: serine + smiles: "NC(CO)C(=O)O" + +- name: threonine + smiles: "CC(O)C(N)C(=O)O" + +- name: asparagine + smiles: "NC(=O)CC(N)C(=O)O" + +- name: glutamine + smiles: "NC(=O)CCC(N)C(=O)O" + +- name: glycine + smiles: "NCC(=O)O" + +- name: proline + smiles: "O=C(O)C1CCCN1" + +- name: cysteine + smiles: "NC(CS)C(=O)O" + +- name: leucine + smiles: "CC(C)CC(N)C(=O)O" + +- name: isoleucine + smiles: "CCC(C)C(N)C(=O)O" + +- name: alanine + smiles: "CC(N)C(=O)O" + +- name: valine + smiles: "CC(C)C(N)C(=O)O" + +- name: methionine + smiles: "CSCCC(C(=O)O)N" + +- name: phenylalanine + smiles: "NC(Cc1ccccc1)C(=O)O" + +- name: tryptophan + smiles: "NC(Cc1c[nH]c2ccccc12)C(=O)O" + +- name: tyrosine + smiles: "NC(Cc1ccc(O)cc1)C(=O)O" + +- name: 3,4-dihydroxyphenylalanine + smiles: "NC(C(O)=O)Cc1c(O)cc(O)cc1" + +- name: kynurenine 
+ smiles: "Nc1ccccc1C(=O)C[C@H](N)C(=O)O" + +- name: ornithine + smiles: "[NH2]CCCC([NH2])C(=O)O" + +- name: ornithine alkaloid + smiles: "NC(NCCCCO)=N" + +- name: bicyclic guanidine alkaloid + smiles: "CCCCCCCCCC1N=C(N2CCCC2=C1C(O)=O)N" + +- name: ornithine alkaloid + smiles: "NC(N)=NCCCCO" + +- name: dehydrobutyrine + smiles: "CC=C(C(O)=O)N" + +- name: dehydrohomoserine + smiles: "NC(C(O)=O)C=CO" + +- name: N6-formyl-N6-hydroxylysine + smiles: "NC(CCCCN(O)C=O)C(O)=O" + +- name: 2,3-diaminopropionate + smiles: "NCC(N)C(=O)O" + +- name: 3-hydroxyquinaldic acid + smiles: "O=C(O)c1nc2ccccc2cc1O" + +- name: beta-alanine + smiles: "NCCC(=O)O" + +- name: salicylic acid + smiles: "O=C(O)c1ccccc1O" + +- name: 2,3-dihydroxybenzoic acid + smiles: "O=C(O)c1cccc(O)c1O" + +- name: 3,5-dihydroxybenzoic acid + smiles: "O=C(c1cc(O)cc(O)c1)O" + +- name: isobutyric acid + smiles: "CC(C)C(=O)O" + +- name: lactic acid + smiles: "CC(O)C(=O)O" + +- name: 2-methyl lactic acid + smiles: "CC(C)(O)C(=O)O" + +- name: 2,2-dimethylpropanoic acid + smiles: "CC(C)(C)C(=O)O" + terminal: false + +- name: butanoic acid + smiles: "CCCC(O)=O" + terminal: false + +- name: 3-methylbutanoic acid + smiles: "CC(CC(O)=O)C" + terminal: false + +- name: 4-amino-2-hydroxybutanoic acid + smiles: "NCCC(C(O)=O)O" + terminal: false + +- name: 3-butynoic acid + smiles: "C#CCC(=O)O" + terminal: false + +- name: pentanoic acid + smiles: "CCCCC(O)=O" + terminal: false + +- name: hexanoic acid + smiles: "CCCCCC(O)=O" + terminal: false + +- name: heptanoic acid + smiles: "CCCCCCC(O)=O" + terminal: false + +- name: octanoic acid + smiles: "CCCCCCCC(O)=O" + terminal: false + +- name: nonanoic acid + smiles: "CCCCCCCCC(O)=O" + terminal: false + +- name: 2-hydroxy-3-aminononanoic acid + smiles: "CCCCCCCC(N)C(O)C(=O)O" + terminal: false + +- name: decanoic acid + smiles: "CCCCCCCCCC(O)=O" + terminal: false + +- name: dec-3-enoic acid + smiles: "CCCCCCC=CCC(=O)O" + terminal: false + +- name: 3-hydroxydecanoic acid + smiles: 
"CCCCCCCC(O)CC(=O)O" + terminal: false + +- name: dodecanoic acid + smiles: "CCCCCCCCCCCC(=O)O" + terminal: false + +- name: tridecanoic acid + smiles: "CCCCCCCCCCCCC(=O)O" + terminal: false + +- name: 3-hydroxytridecanoic acid + smiles: "O=C(O)CC(O)CCCCCCCCCC" + terminal: false + +- name: tetradecanoic acid + smiles: "CCCCCCCCCCCCCC(=O)O" + terminal: false + +- name: tetradec-2-enoic acid + smiles: "CCCCCCCCCCCC=CC(=O)O" + terminal: false + +- name: pentadecanoic acid + smiles: "CCCCCCCCCCCCCCC(=O)O" + terminal: false + +- name: hexadecanoic acid + smiles: "CCCCCCCCCCCCCCCC(=O)O" + terminal: false + +- name: 2,3-dihydroxyhexadecanoic acid + smiles: "CCCCCCCCCCCCCC(C(C(=O)O)O)O" + terminal: false + +- name: 3-hydroxy-2,2-dimethyldecanoic acid + smiles: "CCCCCCCC(C(C)(C(O)=O)C)O" + terminal: false + +- name: trans-2-hexanoic acid + smiles: "CCCC=CC(O)=O" + terminal: false + +- name: orsellinic acid + smiles: "O=C(O)c1c(O)cc(O)cc1C" + terminal: false + +- name: 6-methylsalicylic acid + smiles: "CC1=C(C(=CC=C1)O)C(=O)O" + terminal: false + +- name: 2-methylprop-2-enoic acid + smiles: "C=C(C)C(=O)O" + +- name: cyclopentane-1,2-dicarboxylic acid + smiles: "O=C(O)C1CCCC1C(=O)O" + +- name: furan-3-carboxylic acid + smiles: "O=C(O)c1ccoc1" + +- name: benzoic acid + smiles: "O=C(O)c1ccccc1" + +- name: 2-phenylacetic acid + smiles: "O=C(O)Cc1ccccc1" + +- name: 3-amino-5-hydroxybenzoic acid + smiles: "Nc1cc(C(O)=O)cc(O)c1" + +- name: 2-(2,6-dioxopiperidin-4-yl)acetic acid + smiles: "O=C1NC(CC(CC(O)=O)C1)=O" + +- name: artificial amino acid harzianic acid + smiles: "NC(C(O)=O)CC(C(O)=O)(C(C)C)O" + +- name: 4-hydroxyphenylglycine + smiles: "NC(C(=O)O)c1ccc(O)cc1" + +- name: 2,4-diaminobytyric acid + smiles: "NCCC(N)C(=O)O" + +- name: beta-hydroxytyrosine + smiles: "NC(C(=O)O)C(O)c1ccc(O)cc1" + +- name: 3,5-dihydroxyphenylglycine + smiles: "NC(C(=O)O)c1cc(O)cc(O)c1" + +- name: pipecolic acid + smiles: "O=C(O)C1CCCCN1" + +- name: N5-formyl-N5-hydroxyornithine + smiles: 
"NC(CCCN(O)C=O)C(=O)O" + +- name: N5-hydroxyornithine + smiles: "NC(CCCNO)C(=O)O" + +- name: anthranilic acid + smiles: "Nc1ccccc1C(=O)O" + +- name: 2-amino-3-hydroxy-4-(4-nitrophenyl)butanoic acid + smiles: "C1=CC(=CC=C1CC(C(C(=O)O)N)O)[N+](=O)[O-]" + +- name: 2,6-diamino-57-dihydroxy-heptanoic acid + smiles: "C(CC(C(=O)O)N)C(C(CO)N)O" + +- name: 4-methylhex-2-enoic acid + smiles: "CCC(C)C=CC(=O)O" + +- name: 2-amino-4-decenoic acid + smiles: "CCCCCC=CCC(C(=O)O)N" + +- name: 1-(1,1-dimethylallyl)-tryptophan + smiles: "CC(C)(C=C)N1C=C(C2=CC=CC=C21)CC(C(=O)O)N" + +- name: 1-aminocyclopropane-1-carboxylic acid + smiles: "C(O)(=O)C1(CC1)(N)" + +- name: 1-pyrroline-5-carboxylic acid + smiles: "O=C(O)C1N=CCC1" + +- name: 10,14-dimethyloctadecanoic acid + smiles: "OC(CCCCCCCCC(C)CCCC(C)CCCC)=O" + +- name: 2,3-diaminobutyric acid + smiles: "NC(C)C(C(=O)O)N" + +- name: 2,3-dihydroxy-para-aminobenzoic acid + smiles: "C1=CC(=C(C(=C(N)1)O)O)C(=O)O" + +- name: 2,4-dihydroxypentanoic acid + smiles: "CC(CC(C(=O)O)O)O" + +- name: 2-(1-methylcyclopropyl)-glycine + smiles: "CC1(CC1)C(C(=O)O)N" + +- name: 2-amino-3,5-dimethyl-4-hexenoic Acid + smiles: "CC(C=C(C)C)C(C(=O)O)N" + +- name: 2-amino-3-hydroxycyclopent-2-enone + smiles: "C1CC(=O)C(=C1O)N" + +- name: 2-amino-6-hydroxy-4-methyl-8-oxodecanoic acid + smiles: "CCC(=O)CC(CC(C)CC(C(=O)O)N)O" + +- name: 2-aminoadipic acid + smiles: "C(CC(C(=O)O)N)CC(=O)O" + +- name: 2-aminobutyric acid + smiles: "CCC(C(=O)O)N" + +- name: 2-aminoisobutyric acid + smiles: "O=C(O)C(N)(C)C" + +- name: 2-carboxy-6-hydroxyoctahydroindole + smiles: "N1[C@H](C(=O)O)CC2CCC(O)CC12" + +- name: 2-chloro-3,5-dihydroxy-4-methylphenylglycine + smiles: "CC1=C(O)C(Cl)=C(C=C(O)1)C(C(=O)O)N" + +- name: 2-chlorobenzoic acid + smiles: "C1=CC=C(C(=C1)C(=O)O)Cl" + +- name: 2-hydroxy-4-methylpentanoic acid + smiles: "CC(C)CC(C(=O)O)O" + +- name: 2-hydroxypent-4-enoic acid + smiles: "C=CCC(C(=O)O)O" + +- name: 2-ketoglutaric acid + smiles: "C(CC(=O)O)C(=O)C(=O)O" + +- 
name: 2-ketoisocaproic acid + smiles: "O=C(C(=O)O)CC(C)C" + +- name: 2-ketoisovaleric acid + smiles: "O=C(C(=O)O)C(C)C" + +- name: 2-methylserine + smiles: "CC(CO)(C(=O)O)N" + +- name: 2-sulfamoylacetic acid + smiles: "C(C(=O)O)S(=O)(=O)N" + +- name: 2-hydroxy-3-methylpentanoic acid + smiles: "CCC(C)C(C(=O)O)O" + +- name: 2-amino-4-hexenoic acid + smiles: "CC=CCC(C(=O)O)N" + +- name: 2-amino-8-oxodecanoic acid + smiles: "CCC(=O)CCCCCC(C(=O)O)N" + +- name: 2-amino-9,10-epoxy-8-oxodecanoic acid + smiles: "C1C(O1)C(=O)CCCCCC(C(=O)O)N" + +- name: 2-aminodecanoic acid + smiles: "CCCCCCCCC(C(=O)O)N" + +- name: 2-aminododecanoic acid + smiles: "CCCCCCCCCCC(C(=O)O)N" + +- name: 2-aminooctanoic acid + smiles: "CCCCCCC(C(=O)O)N" + +- name: 2-hydroxyisovaleric acid + smiles: "CC(C)C(C(=O)O)O" + +- name: 2-methyl-3-oxobutyrine + smiles: "CC(=O)C(C)(N)C(=O)O" + +- name: 3,3-dihomo-4-methoxytyrosine + smiles: "NC(CCCC1=CC=C(OC)C=C1)C(=O)O" + +- name: 3,3-dihomophenylalanine + smiles: "NC(CCCC1=CC=CC=C1)C(=O)O" + +- name: 3,3-dihomotyrosine + smiles: "NC(CCCC1=CC=C(O)C=C1)C(=O)O" + +- name: 3,4-dehydrolysine + smiles: "C(CCN)=CC(C(=O)O)N" + +- name: 3,4-dihydroxybenzoic acid + smiles: "C1=CC(=C(C=C1C(=O)O)O)O" + +- name: 3,5-dichloro-4-hydroxyphenylglycine + smiles: "C1=C(Cl)C(=C(Cl)C=C1C(C(=O)O)N)O" + +- name: 3-(2-nitrocyclopropylalanine) + smiles: "C1C(C1[N+](=O)[O-])CC(C(=O)O)N" + +- name: 3-(3ridyl)-alanine + smiles: "C1=CC(=CN=C1)CC(C(=O)O)N" + +- name: 3-amino-2,4-dihydroxybenzoic acid + smiles: "C1=CC(=C(C(=C1C(=O)O)O)N)O" + +- name: 3-amino-4-hydroxybenzoic acid + smiles: "C1=CC(=C(C=C1C(=O)O)N)O" + +- name: 3-amino-6-hydroxy-2-pridone + smiles: "C1CC(NC(=O)C1N)O" + +- name: 3-chlorotyrosine + smiles: "C1=C(Cl)C(=CC=C1CC(C(=O)O)N)O" + +- name: 3-hydroxy-4-methylproline + smiles: "CC1C(O)C(NC1)C(=O)O" + +- name: 3-hydroxy-O-methyl-5-methyltyrosine + smiles: "C1=C(O)C(=C(C)C=C1CC(C(=O)O)N)OC" + +- name: 3-hydroxy-O-methyltyrosine + smiles: "C1=C(O)C(=CC=C1CC(C(=O)O)N)OC" + 
+- name: 3-hydroxy-para-aminobenzoic acid + smiles: "C1=CC(=C(C=C1C(=O)O)O)N" + +- name: 3-hydroxyaspartic acid + smiles: "NC(C(C(=O)O)O)(C(=O)O)" + +- name: 2-hydroxyglycine + smiles: "NC(C(O)=O)O" + +- name: 3-hydroxyglutamine + smiles: "C(C(C(C(=O)O)N)O)C(=O)N" + +- name: 3-hydroxyglutamic acid + smiles: "NC(C(CC(O)=O)O)C(O)=O" + +- name: 3-hydroxykynurenine + smiles: "C1=CC(=C(C(=C1)O)N)C(=O)CC(C(=O)O)N" + +- name: 3-hydroxyleucine + smiles: "CC(C)C(C(C(=O)O)N)O" + +- name: 3-hydroxypicolinic acid + smiles: "C1=CC(=C(N=C1)C(=O)O)O" + +- name: 3-hydroxytyrosine + smiles: "C1=CC(=C(C=C1CC(C(=O)O)N)O)O" + +- name: 3-hydroxyvaline + smiles: "CC(O)(C)C(C(=O)O)N" + +- name: 3-methoxyanthranilic acid + smiles: "COC1=CC=CC(=C1N)C(=O)O" + +- name: 3-methoxyaspartic acid + smiles: "NC(C(C(=O)O)OC)(C(=O)O)" + +- name: 3-methylasparagine + smiles: "CC(C(C(=O)O)N)C(=O)N" + +- name: 3-methylaspartic acid + smiles: "CC(C(C(=O)O)N)C(=O)O" + +- name: 3-methylglutamic acid + smiles: "CC(CC(=O)O)C(C(=O)O)N" + +- name: 3-methylleucine + smiles: "CC(C)C(C)C(C(=O)O)N" + +- name: 3-nitrotyrosine + smiles: "C1=CC(=C(C=C1CC(C(=O)O)N)[N+](=O)[O-])O" + +- name: 3-chloroproline + smiles: "C1C(Cl)C(NC1)C(=O)O" + +- name: 3-hydroxy-2,4-diaminobutyric acid + smiles: "NCC(O)C(C(=O)O)N" + +- name: 3-hydroxyasparagine + smiles: "NC(C(O)=O)C(O)C(N)=O" + +- name: 3-hydroxyhomotyrosine + smiles: "C1=CC(=CC=C1CC(C(C(=O)O)N)O)O" + +- name: 3-methylbeta-alanine + smiles: "NCC(C)C(=O)O" + +- name: 3,4-dichloroproline + smiles: "ClC1C(Cl)C(NC1)C(=O)O" + +- name: 3,4-dihydroxyhomotyrosine + smiles: "C1=CC(=CC=C1C(O)C(C(C(=O)O)N)O)O" + +- name: 3-aminobutyric acid + smiles: "CC(CC(=O)O)N" + +- name: 3-cyclohex-2-enylalanine + smiles: "C1C=CC(CC1)CC(C(=O)O)N" + +- name: 3-hydroxy-4-methyloctanoic acid + smiles: "CCCCC(C(CC(O)=O)O)C" + +- name: 3-hydroxy-6-chlorohistidine + smiles: "C1=C(NC(Cl)=N1)C(C(C(=O)O)N)O" + +- name: 3-hydroxypipecolic acid + smiles: "C1CC(C(NC1)C(=O)O)O" + +- name: 3-hydroxyproline 
+ smiles: "OC1C(NCC1)C(=O)O" + +- name: 3-methylproline + smiles: "CC1C(NCC1)C(=O)O" + +- name: 4,5-dehydroarginine + smiles: "O=C(O)C(N)CC=CNC(N)=N" + +- name: 4,5-dihydroxyornithine + smiles: "C(C(C(=O)O)N)C(C(N)O)O" + +- name: 4-acetamidopyrrole-2-carboxylic acid + smiles: "CC(=O)NC1=CNC(=C1)C(=O)O" + +- name: 4-amino-2-hydroxy-3-isopropoxybenzoic acid + smiles: "CC(C)OC1=C(C=CC(=C1O)C(=O)O)N" + +- name: 4-aminobutyric acid + smiles: "NCCCC(=O)O" + +- name: 4-aminophenylalanine + smiles: "C1=CC(=CC=C1CC(C(=O)O)N)N" + +- name: 4-chlorobenzoic acid + smiles: "C1=CC(=CC=C1C(=O)O)Cl" + +- name: 4-hydroxy-3-nitrobenzoic acid + smiles: "C1=CC(=C(C=C1C(=O)O)[N+](=O)[O-])O" + +- name: 4-hydroxy-D-kynurenine + smiles: "C1=C(O)C=C(C(=C1)C(=O)CC(C(=O)O)N)N" + +- name: 4-hydroxybenzoic acid + smiles: "C1=CC(=CC=C1C(=O)O)O" + +- name: 4-hydroxyglutamine + smiles: "C(C(O)C(=O)N)C(C(=O)O)N" + +- name: 4-hydroxyindole-3-carboxylic acid + smiles: "c1cc2c(c(c1)O)c(c[nH]2)C(=O)O" + +- name: 4-hydroxyphenylpyruvic acid + smiles: "C1=CC(=CC=C1CC(=O)C(=O)O)O" + +- name: 4-hydroxythreonine + smiles: "C(C(C(C(=O)O)N)O)O" + +- name: 4-hydroxyvaline + smiles: "CC(CO)C(C(=O)O)N" + +- name: 4-methoxytryptophan + smiles: "C1=CC=C2C(=C1OC)C(=CN2)CC(C(=O)O)N" + +- name: 4-nitrotryptophan + smiles: "C1=CC=C2C(=C1[N+](=O)[O-])C(=CN2)CC(C(=O)O)N" + +- name: 4-oxoproline + smiles: "C1C(NCC1=O)C(=O)O" + +- name: 4-butenyl-4-methylthreonine + smiles: "CC=CCC(C)C(C(C(=O)O)N)O" + +- name: 4-hydroxyproline + smiles: "C1C(NCC1O)C(=O)O" + +- name: 4-propylproline + smiles: "CCCC1CC(NC1)C(=O)O" + +- name: 4,5-dihydroxy-2-aminopentanoic acid + smiles: "OC(CC(C(=O)O)N)CO" + +- name: 4-acetyl-5-methylproline + smiles: "CC(=O)OC1CC(NC(C)1)C(=O)O" + +- name: 4-hydroxylysine + smiles: "NCCC(O)CC(C(=O)O)N" + +- name: 4-methylazetidine-2-carboxylic acid + smiles: "CC1CC(N1)C(=O)O" + +- name: 4-methylproline + smiles: "CC1CC(NC1)C(=O)O" + +- name: 4-propenylproline + smiles: "CC=CC1CC(NC1)C(=O)O" + +- name: 
5-hydroxylysine + smiles: "NCC(CCC(C(O)=O)N)O" + +- name: 5,5-dimethylpipecolic acid + smiles: "C1C(C)(C)CNC(C1)C(=O)O" + +- name: 5-aminolevulinic acid + smiles: "C(CC(=O)O)C(=O)CN" + +- name: 5-chloroanthranilic acid + smiles: "C1=CC(=C(C=C1Cl)C(=O)O)N" + +- name: 5-chlorotryptophan + smiles: "C1=CC2=C(C=C1Cl)C(=CN2)CC(C(=O)O)N" + +- name: 5-methoxytyrosine + smiles: "C1=C(OC)C(=CC=C1CC(C(=O)O)N)O" + +- name: 5-methylorsellinic acid + smiles: "C=1(C=C(C(=C(C1C)C)C(=O)O)O)O" + +- name: 5-methylproline + smiles: "C1CC(NC(C)1)C(=O)O" + +- name: 6,7-dichlorotryptophan + smiles: "C1=C(Cl)C(Cl)=C2C(=C1)C(=CN2)CC(C(=O)O)N" + +- name: 6-chloro-4-hydroxy-1-methyl-indole-3-carboxylic acid + smiles: "C(O)1=C(Cl)C=C2C(=C1)C(=CN(C)2)C(=O)O" + +- name: 6-chloro-4-hydroxyindole-3-carboxylic acid + smiles: "c(Cl)1cc2c(c(c1)O)c(c[nH]2)C(=O)O" + +- name: 6-chlorotryptophan + smiles: "C1=C(Cl)C=C2C(=C1)C(=CN2)CC(C(=O)O)N" + +- name: 6-hydroxy-tetrahydro-isoquinoline-3-carboxylic acid + smiles: "C1C(NCC2=C1C=C(C=C2)O)C(=O)O" + +- name: 6-methyl-pipecolic acid + smiles: "C1CC(C)NC(C1)C(=O)O" + +- name: An acid hydrazine polyene (intermediate 14) + smiles: "OC(=O)CCC(=O)NNCC(=O)O" + +- name: Compound 4 (formed by the decarboxylative condensation of L-Phe and succinyl-CoA) + smiles: "C1=CC=C(C=C1)CC(C(=O)CCC(=O)O)N" + +- name: isovaline + smiles: "CCC(C)(C(=O)O)N" + +- name: lysergic acid + smiles: "CN1CC(C=C2C1CC3=CNC4=CC=CC2=C34)C(=O)O" + +- name: N-(1-methyl)-tryptophan + smiles: "C1=CC=C2C(=C1)C(=CN(C)2)CC(C(=O)O)N" + +- name: N-(1-propargyl)-tryptophan + smiles: "C1=CC=C2C(=C1)C(=CN(CC#C)2)CC(C(=O)O)N" + +- name: N-formylglycine + smiles: "C(C(=O)O)NC=O" + +- name: N-hydroxyvaline + smiles: "CC(C)C(C(=O)O)NO" + +- name: N-methylphenylalanine + smiles: "CNC(CC1=CC=CC=C1)C(=O)O" + +- name: N-methyltyrosine + smiles: "C1=CC(=CC=C1CC(C(=O)O)NC)O" + +- name: N1-methoxytryptophan + smiles: "C1=CC=C2C(=C1)C(=CN(OC)2)CC(C(=O)O)N" + +- name: N5-acetyl-hydroxyornithine + smiles: 
"CC(=O)N(CCCC(C(=O)O)N)O" + +- name: N5-nitroso-N5-hydroxyornithine + smiles: "O=NN(CCCC(C(=O)O)N)O" + +- name: N5-anhydromevalonyl-N5-hydroxyornithine + smiles: "C(CC(C(=O)O)N)CN(O)C(=O)C=C(C)CCO" + +- name: N6-hydroxylysine + smiles: "C(CCNO)CC(C(=O)O)N" + +- name: 3-hydroxy-3-methylproline + smiles: "OC(C)1C(NCC1)C(=O)O" + +- name: aza-beta-tyrosine + smiles: "C1=CC(=NC=C1O)C(CC(=O)O)N" + +- name: beta-methyltryptophan + smiles: "CC(C1=CNC2=CC=CC=C21)C(C(=O)O)N" + +- name: beta-phenylalanine + smiles: "C1=CC=C(C=C1)C(CC(=O)O)N" + +- name: adenosylmethionine + smiles: "C[S+](CCC(C(=O)[O-])N)CC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)O" + +- name: beta-hydroxycyclohex-2-enylalanine + smiles: "C1C=CC(CC1)C(O)C(C(=O)O)N" + +- name: beta-hydroxyenduracididine + smiles: "C1C(NC(=N1)N)C(O)C(C(=O)O)N" + +- name: beta-methylphenylalanine + smiles: "CC(C1=CC=CC=C1)C(C(=O)O)N" + +- name: beta-tyrosine + smiles: "C1=CC(=CC=C1C(CC(=O)O)N)O" + +- name: alaninol + smiles: "CC(CO)N" + +- name: argininol + smiles: "NC(CO)CCCN=C(N)N" + +- name: azetidine-2-carboxylic acid + smiles: "O=C(O)C1NCC1" + +- name: benzoxazolinate + smiles: "c1ccc2c(c1)nc(o2)C(=O)O" + +- name: beta-hydroxy-3-hydroxy-O-methyl-5-methyltyrosine + smiles: "C1=C(C)C(=C(O)C=C1C(O)C(C(=O)O)N)OC" + +- name: beta-hydroxy-gamma-methyl-hexadecanoic acid + smiles: "CCCCCCCCCCCCC(C)C(O)CC(=O)O" + +- name: beta-hydroxyarginine + smiles: "C(C(O)C(C(=O)O)N)CN=C(N)N" + +- name: beta-hydroxyphenylalanine + smiles: "OC(C1=CC=CC=C1)C(C(=O)O)N" + +- name: beta-lysine + smiles: "C(CC(CC(=O)O)N)CN" + +- name: betaine + smiles: "C[N+](C)(C)CC(=O)O" + +- name: caffeic acid + smiles: "OC(=O)C=Cc1ccc(O)c(O)c1" + +- name: capreomycidine + smiles: "C1CN=C(NC1C(C(=O)O)N)N" + +- name: cinnamic acid + smiles: "C1=CC=C(C=C1)C=CC(=O)O" + +- name: citrulline + smiles: "C(CC(C(=O)O)N)CNC(=O)N" + +- name: colletorin D acid + smiles: "CC1=CC(=C(C(=C1C(=O)O)O)CC=C(C)C)O" + +- name: coumaric acid + smiles: "C1=CC(=CC=C1C=CC(=O)O)O" + +- name: cysteic 
acid + smiles: "C(C(C(=O)O)N)S(=O)(=O)O" + +- name: dehydroarginine + smiles: "C(CN=C(N)N)C=C(C(=O)O)N" + +- name: dehydrophenylalanine + smiles: "NC(=CC1=CC=CC=C1)C(=O)O" + +- name: dehydrotryptophan + smiles: "C1=CC=C2C(=C1)C(=CN2)C=C(C(=O)O)N" + +- name: dehydrovaline + smiles: "CC(=C(C(=O)O)N)C" + +- name: dehydroalanine + smiles: "C=C(N)C(=O)O" + +- name: dihydrolysergic acid + smiles: "CN1CC(CC2C1CC3=CNC4=CC=CC2=C34)C(=O)O" + +- name: dimethylsulfoniopropionic acid + smiles: "C[S+](C)CCC(=O)O" + +- name: enduracididine + smiles: "C1C(NC(=N1)N)CC(C(=O)O)N" + +- name: fumaric acid + smiles: "C(=CC(=O)O)C(=O)O" + +- name: glycolic acid + smiles: "C(C(=O)O)O" + +- name: grifolic acid + smiles: "CC(C)=CCCC(C)=CCCC(C)=CCC1=C(O)C=C(C)C(C(=O)O)=C(O)1" + +- name: homophenylalanine + smiles: "C1=CC=C(C=C1)CCC(C(=O)O)N" + +- name: homoserine + smiles: "C(CO)C(C(=O)O)N" + +- name: homotyrosine + smiles: "C1=CC(=CC=C1CCC(C(=O)O)N)O" + +- name: homocysteine + smiles: "NC(CCS)C(=O)O" + +- name: indole pyruvic acid + smiles: "C1=CC=C2C(=C1)C(=CN2)CC(=O)C(=O)O" + +- name: leucinol + smiles: "CC(C)CC(CO)N" + +- name: linoleic acid + smiles: "CCCCCC=CCC=CCCCCCCCC(=O)O" + +- name: malonic acid + smiles: "O=C(O)CC(=O)O" + +- name: malic acid + smiles: "C(C(C(=O)O)O)C(=O)O" + +- name: malonamate + smiles: "NC(=O)CC(=O)O" + +- name: meta-tyrosine + smiles: "C1=CC(=CC(=C1)O)CC(C(=O)O)N" + +- name: methylglutaconyl hydroxyornithine + smiles: "CC(=CC(=O)N(CCCC(C(=O)O)N)O)CC(=O)O" + +- name: nicotinic acid + smiles: "C1=CC(=CN=C1)C(=O)O" + +- name: norcoronamic acid + smiles: "CC1CC1(C(=O)O)N" + +- name: ochratoxin beta + smiles: "CC1CC2=C(C(=C(C=C2)C(=O)O)O)C(=O)O1" + +- name: p-hydroxybenzoylformic acid + smiles: "C1=CC(=CC=C1C(=O)C(=O)O)O" + +- name: p-hydroxymandelate + smiles: "C1=CC(=CC=C1C(C(=O)O)O)O" + +- name: para-aminobenzoic acid + smiles: "O=C(O)c1ccc(N)cc1" + +- name: phenazine-1,6-dicarboxylic acid + smiles: "C1=CC(=C2C(=C1)N=C3C(=N2)C=CC=C3C(=O)O)C(=O)O" + +- name: 
phenylalaninol + smiles: "C1=CC=C(C=C1)CC(CO)N" + +- name: phenylglycine + smiles: "C1=CC=C(C=C1)C(C(=O)O)N" + +- name: phenyllactic acid + smiles: "C1=CC=C(C=C1)CC(C(=O)O)O" + +- name: phenylpyruvic acid + smiles: "C1=CC=C(C=C1)CC(=O)C(=O)O" + +- name: piperazic acid + smiles: "C1CC(NNC1)C(=O)O" + +- name: piperonylic acid + smiles: "OC(=O)c1ccc2OCOc2c1" + +- name: pyrrole-2-carboxylic acid + smiles: "C1=CNC(=C1)C(=O)O" + +- name: pyruvic acid + smiles: "CC(=O)C(=O)O" + +- name: quinoxaline-2-carboxylic acid + smiles: "C1=CC=C2C(=C1)N=CC(=N2)C(=O)O" + +- name: succinic semialdehyde + smiles: "C(CC(=O)O)C=O" + +- name: succinyl-hydrazinoacetic acid + smiles: "NN=CC=CC=CC=CC=CC=CC(=O)O" + +- name: tricarballylic acid + smiles: "C(C(CC(=O)O)C(=O)O)C(=O)O" + +- name: ustethylinic acid + smiles: "c1(C)c(O)c(C(=O)O)c(CC)cc(O)1" + +- name: valine isocyanide + smiles: "CC(C)C([N+]#[C-])C(O)=O" + +- name: valinol + smiles: "CC(C)C(CO)N" + +- name: parahydroxy phenylacetic acid + smiles: "O=C(O)Cc1ccc(O)cc1" + +- name: tryptophan alkaloid + smiles: "O=C(O)C(CC12O)N[C@@H]2Nc3c1cccc3" + +- name: guanidine-containing fatty acid + smiles: "N=C(NCCCCCC(O)=O)N" + +- name: guanidine-containing fatty acid + smiles: "N=C(NCCCC=CC(O)=O)N" + +- name: guanidine-containing fatty acid + smiles: "N=C(N)NCCCC(=O)O" + +- name: guanidine-containing fatty acid + smiles: "N=C(N)NCC(=O)O" + +- name: isoleucinol + smiles: "CCC(C(N)CO)C" + +- name: putrescin + smiles: "NCCCCN" + +- name: cadaverine + smiles: "NCCCCCN" + +- name: N-(5-aminopentyl)hydroxylamine + smiles: "NCCCCCNO" + family_tokens: ["siderophore"] + +- name: butanedioic acid + smiles: "O=C(O)CCC(O)=O" + family_tokens: ["siderophore"] + +- name: 3-formamido-2-hydroxybenzoic acid + smiles: "O=CNc1cccc(C(=O)O)c1O" + +- name: 2-phenylethanamine + smiles: "NCCc1ccccc1" + +- name: homocysteamine + smiles: "NCCCS" + +- name: valienol + smiles: "OC1C=C(C(C(C1O)O)O)CO" + +- name: valienone + smiles: "OC1C=C(C(C(C1O)O)=O)CO" + +- name: 
streptamine + smiles: "NC1CC(C(C(C1O)O)O)N" diff --git a/src/retromol/data/mxn_pks.yml b/src/retromol/data/mxn_pks.yml new file mode 100644 index 0000000..c618b67 --- /dev/null +++ b/src/retromol/data/mxn_pks.yml @@ -0,0 +1,224 @@ +- name: A1 + smiles: "O=C(O)CC(=O)SO" + ancestor_tokens: ["PKS", "A"] +- name: A1 + smiles: "O=C(O)C=C(O)SO" + ancestor_tokens: ["PKS", "A"] + +- name: A2 + smiles: "CC(C(=O)O)C(=O)SO" + ancestor_tokens: ["PKS", "A"] +- name: A2 + smiles: "CC(C(=O)O)=C(O)SO" + ancestor_tokens: ["PKS", "A"] + +- name: A3 + smiles: "CC(C)(C(=O)O)C(=O)SO" + ancestor_tokens: ["PKS", "A"] + +- name: A4 + smiles: "CCC(C(=O)O)C(=O)SO" + ancestor_tokens: ["PKS", "A"] + +- name: A5 + smiles: "O=C(O)C(O)C(=O)SO" + ancestor_tokens: ["PKS", "A"] +- name: A5 + smiles: "O=C(C(O)=C(SO)O)O" + ancestor_tokens: ["PKS", "A"] + +- name: A6 + smiles: "CC(O)(C(=O)O)C(=O)SO" + ancestor_tokens: ["PKS", "A"] + +- name: A7 + smiles: "O=C(O)C(CO)C(=O)SO" + ancestor_tokens: ["PKS", "A"] +- name: A7 + smiles: "O=C(O)C(CO)=C(O)SO" + ancestor_tokens: ["PKS", "A"] + +- name: A8 + smiles: "C=C(CC(=O)O)SO" + ancestor_tokens: ["PKS", "A"] +- name: A8 + smiles: "CC(=CC(=O)O)SO" + ancestor_tokens: ["PKS", "A"] + +- name: A9 + smiles: "C=C(SO)C(C)C(=O)O" + ancestor_tokens: ["PKS", "A"] + +- name: A10 + smiles: "C=C(SO)C(O)C(=O)O" + ancestor_tokens: ["PKS", "A"] + +- name: A11 + smiles: "C=C(SO)C(=O)C(=O)O" + ancestor_tokens: ["PKS", "A"] + +- name: B1 + smiles: "O=C(O)CC(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B2 + smiles: "CC(C(=O)O)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B3 + smiles: "CC(C)(C(=O)O)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B4 + smiles: "CCC(C(=O)O)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B5 + smiles: "O=C(O)C(O)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B6 + smiles: "CC(O)(C(=O)O)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B7 + smiles: "O=C(O)C(CO)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B8 + smiles: 
"CC(O)(CC(=O)O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B9 + smiles: "CC(C(=O)O)C(C)(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B10 + smiles: "CC(O)(SO)C(O)C(=O)O" + ancestor_tokens: ["PKS", "B"] + +- name: B11 + smiles: "O=C(O)C(=O)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B12 + smiles: "NC(C(=O)O)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B13 + smiles: "OSC(O)C(C(=O)O)C(=O)O" + ancestor_tokens: ["PKS", "B"] + +- name: C1 + smiles: "O=C(O)C=CSO" + ancestor_tokens: ["PKS", "C"] + +- name: C2 + smiles: "CC(=CSO)C(=O)O" + ancestor_tokens: ["PKS", "C"] + +- name: C4 + smiles: "CCC(=CSO)C(=O)O" + ancestor_tokens: ["PKS", "C"] + +- name: C7 + smiles: "O=C(O)C(=CSO)CO" + ancestor_tokens: ["PKS", "C"] + +- name: C13 + smiles: "O=C(C(C(O)=O)=CSO)O" + ancestor_tokens: ["PKS", "C"] + +- name: D1 + smiles: "O=C(O)CCSO" + ancestor_tokens: ["PKS", "D"] + +- name: D2 + smiles: "CC(CSO)C(=O)O" + ancestor_tokens: ["PKS", "D"] + +- name: D3 + smiles: "CC(C)(CSO)C(=O)O" + ancestor_tokens: ["PKS", "D"] + +- name: D4 + smiles: "CCC(CSO)C(=O)O" + ancestor_tokens: ["PKS", "D"] + +- name: D5 + smiles: "O=C(O)C(O)CSO" + ancestor_tokens: ["PKS", "D"] + +- name: D6 + smiles: "CC(O)(CSO)C(=O)O" + ancestor_tokens: ["PKS", "D"] + +- name: D7 + smiles: "O=C(O)C(CO)CSO" + ancestor_tokens: ["PKS", "D"] + +- name: D8 + smiles: "CC(CC(=O)O)SO" + ancestor_tokens: ["PKS", "D"] + +- name: D10 + smiles: "CC(SO)C(O)C(=O)O" + ancestor_tokens: ["PKS", "D"] + +- name: D11 + smiles: "O=C(O)C(=O)CSO" + ancestor_tokens: ["PKS", "D"] +- name: D11 + smiles: "O=C(O)C(O)=CSO" + ancestor_tokens: ["PKS", "D"] + +- name: D14 + smiles: "CC(O)C(CSO)C(=O)O" + ancestor_tokens: ["PKS", "D"] + +- name: D15 + smiles: "C=C(C(O)=O)CSO" + ancestor_tokens: ["PKS", "D"] + +- name: D16 + smiles: "OSCC(CC=O)C(=O)O" + ancestor_tokens: ["PKS", "D"] + +- name: D17 + smiles: "O=C(C(C(C)SO)=O)O" + ancestor_tokens: ["PKS", "D"] +- name: D17 + smiles: "O=C(C(O)=C(C)SO)O" + ancestor_tokens: ["PKS", "D"] + +- 
name: A + smiles: "OSC(N)CC(=O)O" + family_tokens: ["PKS"] + +- name: A + smiles: "OSC(N)C(C)C(=O)O" + family_tokens: ["PKS"] + +- name: A + smiles: "OSC(N)C(C)(C)C(=O)O" + family_tokens: ["PKS"] + +- name: A + smiles: "OSC(N)C(CC)C(=O)O" + family_tokens: ["PKS"] + +- name: A + smiles: "OSC(N)C(C)(O)C(=O)O" + family_tokens: ["PKS"] + +- name: A + smiles: "NC(C(O)=O)CSO" + family_tokens: ["PKS"] + +- name: A + smiles: "NC(C(C(O)=O)O)SO" + family_tokens: ["PKS"] + +- name: A + smiles: "O=C(C(C(C(O)=O)SO)O)O" + family_tokens: ["PKS"] diff --git a/src/retromol/data/mxn_pks_chiral.yml b/src/retromol/data/mxn_pks_chiral.yml new file mode 100644 index 0000000..d2ded39 --- /dev/null +++ b/src/retromol/data/mxn_pks_chiral.yml @@ -0,0 +1,452 @@ +- name: A1 + smiles: "O=C(O)CC(=O)SO" + ancestor_tokens: ["PKS", "A"] +- name: A1 + smiles: "O=C(O)C=C(O)SO" + ancestor_tokens: ["PKS", "A"] + +- name: A2^R + smiles: "C[C@@H](C(SO)=O)C(O)=O" + ancestor_tokens: ["PKS", "A"] +- name: A2^S + smiles: "C[C@H](C(SO)=O)C(O)=O" + ancestor_tokens: ["PKS", "A"] +- name: A2 + smiles: "CC(C(=O)O)C(=O)SO" + ancestor_tokens: ["PKS", "A"] +- name: A2 + smiles: "CC(C(=O)O)=C(O)SO" + ancestor_tokens: ["PKS", "A"] + +- name: A3 + smiles: "CC(C)(C(=O)O)C(=O)SO" + ancestor_tokens: ["PKS", "A"] + +- name: A4^R + smiles: "CC[C@@H](C(SO)=O)C(O)=O" + ancestor_tokens: ["PKS", "A"] +- name: A4^S + smiles: "CC[C@H](C(SO)=O)C(O)=O" + ancestor_tokens: ["PKS", "A"] +- name: A4 + smiles: "CCC(C(=O)O)C(=O)SO" + ancestor_tokens: ["PKS", "A"] + +- name: A5^R + smiles: "O=C([C@H](C(SO)=O)O)O" + ancestor_tokens: ["PKS", "A"] +- name: A5^S + smiles: "O=C([C@@H](C(SO)=O)O)O" + ancestor_tokens: ["PKS", "A"] +- name: A5 + smiles: "O=C(O)C(O)C(=O)SO" + ancestor_tokens: ["PKS", "A"] +- name: A5 + smiles: "O=C(C(O)=C(SO)O)O" + ancestor_tokens: ["PKS", "A"] + +- name: A6^R + smiles: "C[C@@](C(O)=O)(C(SO)=O)O" + ancestor_tokens: ["PKS", "A"] +- name: A6^S + smiles: "C[C@](C(O)=O)(C(SO)=O)O" + ancestor_tokens: ["PKS", "A"] +- 
name: A6 + smiles: "CC(O)(C(=O)O)C(=O)SO" + ancestor_tokens: ["PKS", "A"] + +- name: A7^R + smiles: "O=C([C@H](C(SO)=O)CO)O" + ancestor_tokens: ["PKS", "A"] +- name: A7^S + smiles: "O=C([C@@H](C(SO)=O)CO)O" + ancestor_tokens: ["PKS", "A"] +- name: A7 + smiles: "O=C(O)C(CO)C(=O)SO" + ancestor_tokens: ["PKS", "A"] +- name: A7 + smiles: "O=C(O)C(CO)=C(O)SO" + ancestor_tokens: ["PKS", "A"] + +- name: A8 + smiles: "C=C(CC(=O)O)SO" + ancestor_tokens: ["PKS", "A"] +- name: A8 + smiles: "CC(=CC(=O)O)SO" + ancestor_tokens: ["PKS", "A"] + +- name: A9^R + smiles: "C=C([C@@H](C(O)=O)C)SO" + ancestor_tokens: ["PKS", "A"] +- name: A9^S + smiles: "C=C([C@H](C(O)=O)C)SO" + ancestor_tokens: ["PKS", "A"] +- name: A9 + smiles: "C=C(SO)C(C)C(=O)O" + ancestor_tokens: ["PKS", "A"] + +- name: A10^R + smiles: "C=C([C@@H](C(O)=O)O)SO" + ancestor_tokens: ["PKS", "A"] +- name: A10^S + smiles: "C=C([C@H](C(O)=O)O)SO" + ancestor_tokens: ["PKS", "A"] +- name: A10 + smiles: "C=C(SO)C(O)C(=O)O" + ancestor_tokens: ["PKS", "A"] + +- name: A11 + smiles: "C=C(SO)C(=O)C(=O)O" + ancestor_tokens: ["PKS", "A"] + +- name: B^R1 + smiles: "O=C(C[C@@H](SO)O)O" + ancestor_tokens: ["PKS", "B"] +- name: B^S1 + smiles: "O=C(C[C@H](SO)O)O" + ancestor_tokens: ["PKS", "B"] +- name: B1 + smiles: "O=C(O)CC(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B^R2^R + smiles: "C[C@@H]([C@@H](SO)O)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B^R2^S + smiles: "C[C@H]([C@@H](SO)O)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B^R2 + smiles: "CC([C@@H](SO)O)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B^S2^R + smiles: "C[C@@H]([C@H](SO)O)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B^S2^S + smiles: "C[C@H]([C@H](SO)O)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B^S2 + smiles: "CC([C@H](SO)O)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B2 + smiles: "CC(C(=O)O)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B^R3 + smiles: "CC(C(O)=O)([C@@H](SO)O)C" + ancestor_tokens: ["PKS", "B"] +- name: B^S3 + smiles: 
"CC(C(O)=O)([C@H](SO)O)C" + ancestor_tokens: ["PKS", "B"] +- name: B3 + smiles: "CC(C)(C(=O)O)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B^R4^R + smiles: "CC[C@@H]([C@@H](SO)O)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B^R4^S + smiles: "CC[C@H]([C@@H](SO)O)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B^S4^R + smiles: "CC[C@@H]([C@H](SO)O)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B^S4^S + smiles: "CC[C@H]([C@H](SO)O)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B4 + smiles: "CCC(C(=O)O)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B^R5^R + smiles: "O=C([C@H]([C@@H](SO)O)O)O" + ancestor_tokens: ["PKS", "B"] +- name: B^R5^S + smiles: "O=C([C@@H]([C@@H](SO)O)O)O" + ancestor_tokens: ["PKS", "B"] +- name: B^S5^R + smiles: "O=C([C@H]([C@H](SO)O)O)O" + ancestor_tokens: ["PKS", "B"] +- name: B^S5^S + smiles: "O=C([C@@H]([C@H](SO)O)O)O" + ancestor_tokens: ["PKS", "B"] +- name: B5 + smiles: "O=C(O)C(O)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B^R6^R + smiles: "C[C@@](C(O)=O)([C@@H](SO)O)O" + ancestor_tokens: ["PKS", "B"] +- name: B^R6^S + smiles: "C[C@](C(O)=O)([C@@H](SO)O)O" + ancestor_tokens: ["PKS", "B"] +- name: B^S6^R + smiles: "C[C@@](C(O)=O)([C@H](SO)O)O" + ancestor_tokens: ["PKS", "B"] +- name: B^S6^S + smiles: "C[C@](C(O)=O)([C@H](SO)O)O" + ancestor_tokens: ["PKS", "B"] +- name: B6 + smiles: "CC(O)(C(=O)O)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B^R7^R + smiles: "O=C([C@H]([C@@H](SO)O)CO)O" + ancestor_tokens: ["PKS", "B"] +- name: B^R7^S + smiles: "O=C([C@@H]([C@@H](SO)O)CO)O" + ancestor_tokens: ["PKS", "B"] +- name: B^S7^R + smiles: "O=C([C@H]([C@H](SO)O)CO)O" + ancestor_tokens: ["PKS", "B"] +- name: B^S7^S + smiles: "O=C([C@@H]([C@H](SO)O)CO)O" + ancestor_tokens: ["PKS", "B"] +- name: B7 + smiles: "O=C(O)C(CO)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B^R8 + smiles: "C[C@](CC(O)=O)(SO)O" + ancestor_tokens: ["PKS", "B"] +- name: B^S8 + smiles: "C[C@@](CC(O)=O)(SO)O" + ancestor_tokens: ["PKS", "B"] +- name: 
B8 + smiles: "CC(O)(CC(=O)O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B^R9^R + smiles: "C[C@@H]([C@](O)(SO)C)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B^R9^S + smiles: "C[C@H]([C@](O)(SO)C)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B^S9^R + smiles: "C[C@@H]([C@@](O)(SO)C)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B^S9^S + smiles: "C[C@H]([C@@](O)(SO)C)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B9 + smiles: "CC(C(=O)O)C(C)(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B^R10^R + smiles: "C[C@@](SO)([C@@H](C(O)=O)O)O" + ancestor_tokens: ["PKS", "B"] +- name: B^R10^S + smiles: "C[C@@](SO)([C@H](C(O)=O)O)O" + ancestor_tokens: ["PKS", "B"] +- name: B^S10^R + smiles: "C[C@](SO)([C@@H](C(O)=O)O)O" + ancestor_tokens: ["PKS", "B"] +- name: B^S10^S + smiles: "C[C@](SO)([C@H](C(O)=O)O)O" + ancestor_tokens: ["PKS", "B"] +- name: B10 + smiles: "CC(O)(SO)C(O)C(=O)O" + ancestor_tokens: ["PKS", "B"] + +- name: B11 + smiles: "O=C(O)C(=O)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B^R12^R + smiles: "N[C@@H]([C@@H](SO)O)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B^R12^S + smiles: "N[C@H]([C@@H](SO)O)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B^S12^R + smiles: "N[C@@H]([C@H](SO)O)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B^S12^S + smiles: "N[C@H]([C@H](SO)O)C(O)=O" + ancestor_tokens: ["PKS", "B"] +- name: B12 + smiles: "NC(C(=O)O)C(O)SO" + ancestor_tokens: ["PKS", "B"] + +- name: B^R13 + smiles: "OS[C@H](C(C(O)=O)C(O)=O)O" + ancestor_tokens: ["PKS", "B"] +- name: B^S13 + smiles: "OS[C@@H](C(C(O)=O)C(O)=O)O" + ancestor_tokens: ["PKS", "B"] +- name: B13 + smiles: "OSC(O)C(C(=O)O)C(=O)O" + ancestor_tokens: ["PKS", "B"] + +- name: C1 + smiles: "O=C(O)C=CSO" + ancestor_tokens: ["PKS", "C"] + +- name: C2 + smiles: "CC(=CSO)C(=O)O" + ancestor_tokens: ["PKS", "C"] + +- name: C4 + smiles: "CCC(=CSO)C(=O)O" + ancestor_tokens: ["PKS", "C"] + +- name: C7 + smiles: "O=C(O)C(=CSO)CO" + ancestor_tokens: ["PKS", "C"] + +- name: C13 + smiles: 
"O=C(C(C(O)=O)=CSO)O" + ancestor_tokens: ["PKS", "C"] + +- name: D1 + smiles: "O=C(O)CCSO" + ancestor_tokens: ["PKS", "D"] + +- name: D2^R + smiles: "C[C@H](C(O)=O)CSO" + ancestor_tokens: ["PKS", "D"] +- name: D2^S + smiles: "C[C@@H](C(O)=O)CSO" + ancestor_tokens: ["PKS", "D"] +- name: D2 + smiles: "CC(CSO)C(=O)O" + ancestor_tokens: ["PKS", "D"] + +- name: D3 + smiles: "CC(C)(CSO)C(=O)O" + ancestor_tokens: ["PKS", "D"] + +- name: D4^R + smiles: "CC[C@H](C(O)=O)CSO" + ancestor_tokens: ["PKS", "D"] +- name: D4^S + smiles: "CC[C@@H](C(O)=O)CSO" + ancestor_tokens: ["PKS", "D"] +- name: D4 + smiles: "CCC(CSO)C(=O)O" + ancestor_tokens: ["PKS", "D"] + +- name: D5^R + smiles: "O=C([C@H](CSO)O)O" + ancestor_tokens: ["PKS", "D"] +- name: D5^S + smiles: "O=C([C@@H](CSO)O)O" + ancestor_tokens: ["PKS", "D"] +- name: D5 + smiles: "O=C(O)C(O)CSO" + ancestor_tokens: ["PKS", "D"] + +- name: D6^R + smiles: "C[C@](CSO)(C(O)=O)O" + ancestor_tokens: ["PKS", "D"] +- name: D6^S + smiles: "C[C@@](CSO)(C(O)=O)O" + ancestor_tokens: ["PKS", "D"] +- name: D6 + smiles: "CC(O)(CSO)C(=O)O" + ancestor_tokens: ["PKS", "D"] + +- name: D7^R + smiles: "O=C([C@H](CSO)CO)O" + ancestor_tokens: ["PKS", "D"] +- name: D7^S + smiles: "O=C([C@@H](CSO)CO)O" + ancestor_tokens: ["PKS", "D"] +- name: D7 + smiles: "O=C(O)C(CO)CSO" + ancestor_tokens: ["PKS", "D"] + +- name: D^R8 + smiles: "C[C@@H](SO)CC(O)=O" + ancestor_tokens: ["PKS", "D"] +- name: D^S8 + smiles: "C[C@H](SO)CC(O)=O" + ancestor_tokens: ["PKS", "D"] +- name: D8 + smiles: "CC(CC(=O)O)SO" + ancestor_tokens: ["PKS", "D"] + +- name: D^R10^R + smiles: "C[C@H]([C@@H](C(O)=O)O)SO" + ancestor_tokens: ["PKS", "D"] +- name: D^R10^S + smiles: "C[C@H]([C@H](C(O)=O)O)SO" + ancestor_tokens: ["PKS", "D"] +- name: D^S10^R + smiles: "C[C@@H]([C@@H](C(O)=O)O)SO" + ancestor_tokens: ["PKS", "D"] +- name: D^S10^S + smiles: "C[C@@H]([C@H](C(O)=O)O)SO" + ancestor_tokens: ["PKS", "D"] +- name: D10 + smiles: "CC(SO)C(O)C(=O)O" + ancestor_tokens: ["PKS", "D"] + +- name: D11 
+ smiles: "O=C(O)C(=O)CSO" + ancestor_tokens: ["PKS", "D"] +- name: D11 + smiles: "O=C(O)C(O)=CSO" + ancestor_tokens: ["PKS", "D"] + +- name: D14 + smiles: "CC(O)C(CSO)C(=O)O" + ancestor_tokens: ["PKS", "D"] + +- name: D15 + smiles: "C=C(C(O)=O)CSO" + ancestor_tokens: ["PKS", "D"] + +- name: D16^R + smiles: "OSC[C@@H](C(O)=O)CC=O" + ancestor_tokens: ["PKS", "D"] +- name: D16^S + smiles: "OSC[C@H](C(O)=O)CC=O" + ancestor_tokens: ["PKS", "D"] +- name: D16 + smiles: "OSCC(CC=O)C(=O)O" + ancestor_tokens: ["PKS", "D"] + +- name: D^R17 + smiles: "O=C(O)C([C@H](SO)C)=O" + ancestor_tokens: ["PKS", "D"] +- name: D^S17 + smiles: "O=C(O)C([C@@H](SO)C)=O" + ancestor_tokens: ["PKS", "D"] +- name: D17 + smiles: "O=C(C(C(C)SO)=O)O" + ancestor_tokens: ["PKS", "D"] +- name: D17 + smiles: "O=C(C(O)=C(C)SO)O" + ancestor_tokens: ["PKS", "D"] + +- name: A + smiles: "OSC(N)CC(=O)O" + family_tokens: ["PKS"] + +- name: A + smiles: "OSC(N)C(C)C(=O)O" + family_tokens: ["PKS"] + +- name: A + smiles: "OSC(N)C(C)(C)C(=O)O" + family_tokens: ["PKS"] + +- name: A + smiles: "OSC(N)C(CC)C(=O)O" + family_tokens: ["PKS"] + +- name: A + smiles: "OSC(N)C(C)(O)C(=O)O" + family_tokens: ["PKS"] + +- name: A + smiles: "NC(C(O)=O)CSO" + family_tokens: ["PKS"] + +- name: A + smiles: "NC(C(C(O)=O)O)SO" + family_tokens: ["PKS"] + +- name: A + smiles: "O=C(C(C(C(O)=O)SO)O)O" + family_tokens: ["PKS"] diff --git a/src/retromol/data/rxn.yml b/src/retromol/data/rxn.yml new file mode 100644 index 0000000..29faee6 --- /dev/null +++ b/src/retromol/data/rxn.yml @@ -0,0 +1,324 @@ +- name: break ester bond (intermolecular) + smarts: "[*:6]-[C,c:1]-[O;!R:2]-[C;!R:3](=[O:4])-[C,c:5]>>[*:6]-[C,c:1]-[OH:2].[O]-[C:3](=[O:4])-[C,c:5]" + allowed_in_bulk: true + +- name: break thio-ester bond (intermolecular) + smarts: "[*:6]-[C,c:1]-[S;!R:2]-[C;!R:3](=[O:4])-[C,c:5]>>[*:6]-[C,c:1]-[SH:2].[O]-[C:3](=[O:4])-[C,c:5]" + allowed_in_bulk: true + +- name: break N-O glycosidic bond (intermolecular) + smarts: 
"[C:1][NH1:2][O:3][C:4]>>[C:1][NH2:2].[OH:3][C:4]" + allowed_in_bulk: true + +- name: break glycosidic bond + smarts: "[C,c:1][O:2][C:3]1[O:4][C:5][C:6][C:7][C:8]1>>[C,c:1][OH:2].[OH][C:3]1[O:4][C:5][C:6][C:7][C:8]1" + allowed_in_bulk: true + +- name: break furanosidic bond + smarts: "[C,c:1][O:2][C:3]1[O:4][C:5][C:7][C:8]1>>[C,c:1][OH:2].[OH][C:3]1[O:4][C:5][C:7][C:8]1" + allowed_in_bulk: true + +- name: break glycosidic ester bond + smarts: "[C:8](=[O:9])[O:7][C:1]1[C:2][O:3][C:4][C:5][C:6]1>>[C:8](=[O:9])[OH:7].[O][C:1]1[C:2][O:3][C:4][C:5][C:6]1" + allowed_in_bulk: true + +- name: break N-glycosidic linkage + smarts: "[O:2][C:3]1[O:4][C:5][C:6]([N:9][C:10])[C:7][C:8]1>>[O:2][C:3]1[O:4][C:5][C:6]([N:9])[C:7][C:8]1.[O][C:10]" + allowed_in_bulk: true + +- name: reverse epoxidation + smarts: "[C:1]1[O:2][C:3]1>>[C:1]=[C:3].[O:2]" + allowed_in_bulk: true + +- name: reverse O-methylation + smarts: "[O&D2:1][CH3:2]>>[O:1].[C:2]" + allowed_in_bulk: true + +- name: reverse N-methylation + smarts: "[N:1][CH3:2]>>[N:1].[C:2]" + allowed_in_bulk: true + +- name: reverse S-methylation + smarts: "[S:1][CH3:2]>>[S:1].[C:2]" + allowed_in_bulk: true + +- name: reverse C-methylation amino acid + smarts: "[N:1][C:2]([CH3:3])([*:4])[C:5](=[O:6])[O:7]>>[N:1][C:2]([*:4])[C:5](=[O:6])[O:7].[CH3:3]" + allowed_in_bulk: true + +- name: reverse halogenation + smarts: "[C,c:1]([Cl,Br,F,I:2])>>[C,c:1].[Cl,Br,F,I:2]" + allowed_in_bulk: true + +- name: reverse carbamic acid tailoring (type 1) + smarts: "[NH2:1]-[CH0:2](=[O:3])-[OH0:4]-[*:5]>>[OH1:4]-[*:5].[NH2:1]-[CH0:2](=[O:3])[OH]" + allowed_in_bulk: true + +- name: reverse carbamic acid tailoring (type 2) + smarts: "[NH2:1]-[C:2](=[O:3])-[N:4]-[*:5]>>[N:4]-[*:5].[NH2:1]-[C:2](=[O:3])[OH]" + allowed_in_bulk: true + +- name: reverse carbinolamide tailoring + smarts: "[OH:1]-[C:2]1[C:3][C:4][O:5][C:6](=[O:7])[N:8]1>>[O:1]=[C:2][C:3][C:4][OH:5].[O][C:6](=[O:7])[N:8]" + allowed_in_bulk: true + props: + references: + - "PMID:12060743" + +- name: 
reverse acetic acid tailoring + smarts: "[CH3:1]-[CH0:2](=[O:3])-[OH0:4]-[*:5]>>[OH1:4]-[*:5].[CH3:1]-[CH0:2](=[O:3])[OH]" + allowed_in_bulk: true + +- name: reverse carboxylic acid tailoring + smarts: "[OH1:1]-[CH0:2](=[O:3])-[NH1:4]-[*:5]>>[NH2:4]-[*:5].[OH1:1]-[CH0:2](=[O:3])[OH]" + allowed_in_bulk: true + +- name: reverse acylation (type 1) + smarts: "[CH3:1][C:2](=[O:3])[O,N:4][*:5]>>[CH3:1][C:2](=[O:3])[OH].[O,N:4][*:5]" + allowed_in_bulk: true + +- name: reverse acylation (type 2) + smarts: "[CH1:1]([CH3:6])([CH3:7])[C:2](=[O:3])[O,N:4][*:5]>>[CH1:1]([CH3:6])([CH3:7])[C:2](=[O:3])[OH].[O,N:4][*:5]" + allowed_in_bulk: true + +- name: reverse acylation (type 3) + smarts: "[*:1][NH1:2][CH2:3][CH3:4]>>[*:1][NH2:2].[OH][CH2:3][CH3:4]" + allowed_in_bulk: true + +- name: reverse cyanide tailoring + smarts: "[C:1][C&D2:2]#[N&D1:3]>>[C:1].[C:2]#[N:3]" + allowed_in_bulk: true + +- name: reverse hydroxyl sulfonation + smarts: "[*:1][S:2](=[O:3])(=[O:4])[OH:5]>>[*:1].[OH][S:2](=[O:3])(=[O:4])[OH:5]" + allowed_in_bulk: true + +- name: reverse hydroxyl phosphonylation + smarts: "[C:3][O:1][P:2]>>[C:3][O:1].[O][P:2]" + allowed_in_bulk: true + +- name: reverse 1,3-dioxane formation (cyclic acetal) + smarts: "[*:6][C:1]1~[C:2]~[C:3]([*:7])[O:4][C:8][O:5]1>>[*:6][C:1]([OH:5])~[C:2]~[C:3]([OH:4])[*:7].[C:8]=[O]" + +- name: reverse 1,3-dioxolane formation (spiro-bis(pyranose) system) + smarts: "[*:6][C:1]1~[C:2]([*:7])[O:4][C:8][O:5]1>>[*:6][C:1]([OH:5])~[C:2]([OH:4])[*:7].[C:8][OH]" + +- name: reverse 2,3-diaminopropionate amidation + smarts: "[N:1][CH1:2]([C:3](=[O:4])[NH2:5])[CH2:6][NH2:7]>>[NH1:1][CH1:2]([C:3](=[O:4])[OH])[CH2:6][NH2:7].[N:5]" + +- name: reverse hydrogenation tailoring + smarts: "[CH2:1][CH1:2]([OH1:3])[CH2:4][OH1:5]>>[CH2:1][CH0:2](=[OH0:3])[OH1].[CH3:4][OH1:5]" + +- name: reverse etherification + smarts: "[*:1]-[CH0:2]1(-[OH:3])-[O:4]-[CH1:5](-[*:6])-[C:7][C:8][C:9]1>>([*:1]-[CH0:2]1=[OH0:3].[OH1:4]-[CH1:5](-[*:6])-[C:7][C:8][C:9]1)" + +- name: reverse 
terminal etherification + smarts: "[CH1:2]1(-[OH:3])-[O:4]-[CH1:5](-[*:6])-[C:7][C:8][C:9]1>>([CH0:2]1(=[OH0:3])[OH].[OH1:4]-[CH1:5](-[*:6])-[C:7][C:8][C:9]1)" + +- name: reverse etherification (6-ring; no hydroxyl group) + smarts: "[C:1]-[C:2]([C:5]~[C:6]~[C:7]1)-[O:3]-[C:4]1[C:8][C:9]>>([C:1]-[C:2]([C:5][C:6][C:7]1)-[OH0:3].[C:4]1=[C:8][C:9])" + +- name: reverse etherification (5-ring; no hydroxyl group) + smarts: "[C:1]-[C:2]([C:5]~[C:7]1)-[O:3]-[C:4]1[C:8][C:9]>>([C:1]-[C:2]([C:5][C:7]1)-[OH0:3].[C:4]1=[C:8][C:9])" + +- name: reverse macrolactonization + smarts: "[C;R:1][C;R:2](=[O:3])[O;R:4][C;R:5]>>([C:1][C:2](=[O:3])[OH].[OH:4][C:5])" + +- name: break ester bond (intermolecular) + smarts: "[C,c:1][C;!R:2](=[O:3])[O;!R:4][C:5]>>[C,c:1][C:2](=[O:3])[OH].[OH:4][C:5]" + +- name: break aromatic ester bond (intramolecular) + smarts: "[c:1]1[c:2][c:3][c:4][c:5](=[O:7])[o:6]1>>([OH:6][C:1]=[C:2]-[C:3]=[C:4]-[C:5](=[O:7])[OH])" + +- name: reverse macrolactonethionization + smarts: "[C;R:1][C;R:2](=[O:3])[S;R:4][C;R:5]>>([C:1][C:2](=[O:3])[OH].[SH:4][C:5])" + +- name: break thio-ester bond (intermolecular) + smarts: "[C:1][C;!R:2](=[O:3])[S;!R:4][C:5]>>[C:1][C:2](=[O:3])[OH].[SH:4][C:5]" + +- name: reverse carbocyclization (type 1) + smarts: "[*:1][C:2]1[C:3]([*:4])[C:5]=[C:6][CH1:7]([C;R:8])[C:9]([C;R:10])1>>([*:1][C:2]=1.[C:3]([*:4])=[C:5]-[C:6]=[CH1:7]([C;R:8]).[C:9]([C;R:10])=1)" + +- name: reverse carbocyclization (type 2) + smarts: "[*:1][C:2]1[C:3]([*:4])[C:5]=[C:6][C:7](=[C;R:8][C;R:11])[C:9]([C;R:10])1>>([*:1][C:2]=1.[C:3]([*:4])=[C:5]-[C:6]=[C:7](-[C;R:8]=[C;R:11]).[C:9]([C;R:10])=1)" + +- name: reverse carbocycle oxidation + smarts: "[#6:1]1(-[#6:4](=[#6:6]-[#6:7](-[#8:3])-[#6:8](-[#6:10])-[#6:2]-1-[#6:9])-[*:11])-[*:12]>>[#6:1]1(-[#6:4](-[#6:6]=[#6:7]-[#6:8](-[#6:10])-[#6:2]-1-[#6:9])-[*:11])-[*:12].[#8:3]" + +- name: reverse heterocyclization + smarts: "[C:1]1[C:2][N:3][C:4](=[O:5])[C:6]=1([*:7])>>[C:1]([OH])(=[O])[C:2][N:3][C:4](=[O:5])[C:6]([*:7])" + +- 
name: break dilsufide bridge (intramolecular) + smarts: "[C;R:1][C:2][S:3][S:4][C:5][C;R:6]>>([C;R:1][C:2][SH:3].[SH:4][C:5][C;R:6])" + allowed_in_bulk: true + +- name: break disulfide bridge (intermolecular) + smarts: "[C:1][C:2][S:3][S:4][C:5][C:6]>>[C;R:1][C:2][SH:3].[SH:4][C:5][C;R:6]" + allowed_in_bulk: true + +- name: break threonine-cysteine bridge (intramolecular) + smarts: "[C:1](=[O:2])[C:3]([N:4])[CH1:5]([CH3:6])[S:7][CH2:8][C:9]([N:10])[C:11]=[O:12]>>([C:1](=[O:2])[C:3]([N:4])[CH1:5]([CH3:6])[OH].[SH:7][CH2:8][C:9]([N:10])[C:11]=[O:12])" + allowed_in_bulk: true + +- name: break threonine-cysteine bridge (intermolecular) + smarts: "[C:1](=[O:2])[C:3]([N:4])[CH1:5]([CH3:6])[S:7][CH2:8][C:9]([N:10])[C:11]=[O:12]>>[C:1](=[O:2])[C:3]([N:4])[CH1:5]([CH3:6])[OH].[SH:7][CH2:8][C:9]([N:10])[C:11]=[O:12]" + allowed_in_bulk: true + +- name: break serine-cysteine bridge (intramolecular) + smarts: "[C:1](=[O:2])[C:3]([N:4])[CH2:5][S:7][CH2:8][C:9]([N:10])[C:11]=[O:12]>>([C:1](=[O:2])[C:3]([N:4])[CH2:5][SH:7].[OH][CH2:8][C:9]([N:10])[C:11]=[O:12])" + allowed_in_bulk: true + +- name: break serine-cysteine bridge (intermolecular) + smarts: "[C:1](=[O:2])[C:3]([N:4])[CH2:5][S:7][CH2:8][C:9]([N:10])[C:11]=[O:12]>>[C:1](=[O:2])[C:3]([N:4])[CH2:5][SH:7].[OH][CH2:8][C:9]([N:10])[C:11]=[O:12]" + allowed_in_bulk: true + +- name: break cysteine bond (intramolecular) + smarts: "[O:1][C:2](=[O:3])[C:4][C:5][SH0:6][C:7]>>[O:1][C:2](=[O:3])[C:4][C:5][SH1:6].[C:7]" + allowed_in_bulk: true + +- name: reverse reduction (type 1) + smarts: "[NH2:1][CH0:2]([OH:3])[C:4](=[O:5])[OH:6]>>[NH2:1][CH1:2][C:4](=[O:5])[OH:6].[OH2:3]" + +- name: reverse reduction (type 2) + smarts: "[NH2:1][CH0:2]([OH:3])[CH2:4][C:5](=[O:6])[OH:7]>>[NH2:1][CH1:2][CH2:4][C:5](=[O:6])[OH:7].[OH2:3]" + +- name: reverse oxazole formation + smarts: "[C,c:1][c:2]1[o:3][c:4][c:5]([C:6])[n:7]1>>[C,c:1][CH0:2](=[O:3])[NH1:7][CH1:5]([CH2:4][OH1])[C:6]" + +- name: reverse oxazoline formation + smarts: 
"[C,c:1][C:2]=1[OH0:3][C:4][C:5]([C:6])[N:7]1>>[C,c:1][CH0:2](=[O:3])[NH1:7][CH1:5]([CH2:4][OH1])[C:6]" + +- name: reverse thiazole formation + smarts: "[C:1][c:2]1[s:3][c:4][c:5][n:7]1>>([C:1]-[CH0:2]-1(=O).[SH1:3]-[CH2:4]-[CH1:5]-[NH1:7]-1)" + +- name: reverse thiazoline tautomerization + smarts: "[OH0:1]=[C:2]1-[S:3]-[C:4]-[C:5](-[C:6])-[N:7]-1>>[OH1:1]-[C:2]1-[S:3]-[C:4]-[C:5](-[C:6])-[N:7]=1" + +- name: reverse thiazoline formation (type 1) + smarts: "[*:1][C:2]=1[SH0:3][C:4][C:5][N:7]1>>([*:1]-[CH0:2]-1(=O).[SH1:3]-[CH2:4]-[C:5]-[NH1:7]-1)" + +- name: reverse thiazoline formation (type 2) + smarts: "[*:1][C:2]([C:3][SH0:4]1)[NH1:5][C:6]1[*:7]>>[*:1][C:2]([C:3][SH1:4])[NH1:5][CH0:6](=[O])[*:7]" + +- name: reverse tetramate formation (type 1) + smarts: "[*:1]-[C:2](-[OH:3])=[C:4]1[C:5](=[O:6])[N:7][C:8][C:9]1=[O:10]>>[*:1]-[C:2](=[OH0:3])-[CH2:4][C:5](=[O:6])[N:7][C:8][C:9](-[OH])=[O:10]" + +- name: reverse tetramate formation (type 2) + smarts: "[*:1]-[C:2](-[OH:3])-[C:4]=1[C:5](=[O:6])[N:7][C:8][C:9]1-[OH:10]>>[*:1]-[C:2](=[OH0:3])-[CH2:4][C:5](=[O:6])[N:7][C:8][C:9](-[OH])=[O:10]" + +- name: reverse tetramate formation (type 3; aromatic) + smarts: "[*:1]-[C:2](=[O:3])[c:4]1[c:5](-[OH:6])[n:7][c:8][c:9]1-[OH:10]>>[*:1]-[C:2](=[OH0:3])-[CH2:4][C:5](=[O:6])[N:7][C:8][C:9](-[OH])=[O:10]" + +- name: reverse tetronate formation (type 1) + smarts: "[*:1]-[C:2](-[OH:3])=[C:4]1[C:5](=[O:6])[O:7][C:8][C:9]1=[O:10]>>[*:1]-[C:2](=[OH0:3])-[CH2:4][C:5](=[O:6])[O:7][C:8][C:9](-[OH])=[O:10]" + +- name: reverse tetronate formation (type 2) + smarts: "[*:1]-[C:2](-[OH:3])-[C:4]=1[C:5](=[O:6])[O:7][C:8][C:9]1-[OH:10]>>[*:1]-[C:2](=[OH0:3])-[CH2:4][C:5](=[O:6])[O:7][C:8][C:9](-[OH])=[O:10]" + +- name: reverse tetronate formation (type 3; spirotetronate) + smarts: "[C:9]=[C:1]1[O:2][C:3](=[O:4])[C:5](-[O:6])=[C:7](-[OH:8])1>>([C:9]=[C:1]1[O:2][C:3](=[O:4])[C:5](-[O:6]).[OH][C:7](=[O:8])1)" + +- name: reverse acetylation + smarts: 
"[CH2:1]=[C:2](-[OH:3])[C:4](=[O:5])-[OH:6]>>[OH][CH2:1]-[C:2](-[OH:3])[C:4](=[O:5])-[OH:6]" + +- name: reverse kirromycin-like substructure formation + smarts: "[c:1]1([C:2](=[O:8])[*:9])[c:3](~[O:10])[c:4][c:5][n:6][c:7](~[O:11])1>>[CH0:3](=[O:10])(-[OH])[CH2:4][CH2:5][NH1:6][CH0:7](=[O:11])[CH2:1]([CH0:2](=[O:8])[*:9])" + +- name: reverse beta-lactam formation + smarts: "[N:1][C:2]1[C:3]2[S:4][C:5][C:6]([C:7])[N:8]2[C:9]1=[O:10]>>([N:1][C:2]1[C:3][S:4].[C:5][C:6]([C:7])[N:8][C:9]1=[O:10])" + +- name: reverse DAOC-synthetase + smarts: "[C:1][N:2][C:3]1[C:4]2[S:5][C:6][CH0:7]=[CH0:8]([C:9])[N:10]2[C:11]1=[O:12]>>[C:1][N:2][C:3]1[C:4]2[S:5][CH0:7]([CH3:6])[CH1:8]([C:9])[N:10]2[C:11]1=[O:12]" + +- name: reverse salinosporamide-like substructure formation + smarts: "[*:1][C:2]1[C:3](=[O:4])[N:5][C:6]([C:7](=[O:8])2)([*:9])[C:10]1([C:11])[O:12]2>>[*:1][C:2][C:3](=[O:4])[OH].[N:5][C:6]([C:7](=[O:8])[OH])([*:9]).[C:10]([C:11])(=[O])[OH:12]" + +- name: reverse cyclization on lysine-like substructure + smarts: "[C:1]1(-[N:2])[C:3][C:4][C:5][C:6][N:7]([OH:8])[C:9](=[O:10])1>>([C:1](-[N:2])([C:9](=[O:10])([OH]))[C:3][C:4][C:5][C:6][N:7]([OH0:8]))" + +- name: open terminal aromatic ring + smarts: "[c:1]1[c:2]([C:7](=[O:8])([OH:9]))[c:3]([C:10])[c:4][c:5][c:6]1>>([C:1]1=[C:2]([C:7](=[O:8])([OH:9])).[C:3]([C:10])=[C:4][C:5]=[C:6]1)" + +- name: reverse spirocycle formation (type 1) + smarts: "[#6:1]1(-[*:14])-[#8:6]-[#6:5]2(-[#8:7]-[#6:11](-[#6:13]-[*:12])-[#6:10]-[#6:9]-[#6:8]-2)-[#6:4]-[#6:3]-[#6:2]-1>>[#6:1](-[*:14])(-[#8:6])-[#6:2]-[#6:3]-[#6:4]-[#6:5](=[#8:7])-[#6:8]-[#6:9]-[#6:10]-[#6:11]=[#6:13]-[*:12]" + +- name: reverse spirocycle formation (type 2) + smarts: "[#6:1]1(-[*:13])-[#8:6]-[#6:5]2(-[#8:7]-[#6:10](-[#6:9]-[#6:8]-2)-[#6:12]-[*:11])-[#6:4]-[#6:3]-[#6:2]-1>>[#6:1](-[*:13])(-[#8:6])-[#6:2]-[#6:3]-[#6:4]-[#6:5](=[#8:7])-[#6:8]-[#6:9]-[#6:10]=[#6:12]-[*:11]" + +- name: reversed cyclized hydroxy ornithine + smarts: 
"[C:2]1[C:3](=[O:4])[NH0:5]([OH:7])[C:6][C:8][C:9]1>>[NH2:5]([OH:7])[C:6][C:8][C:9][C:2][C:3](=[O:4])[O]" + +- name: reverse atrop-abyssomycin C like formation + smarts: "[C;R:1]=[C;R:2][O;R:3][C;R:4][C;R:5][O:6]>>([C;R:1]=[C;R:2][OH1:3].[C;R:4]1[C;R:5][OH0:6]1)" + +- name: reverse dies-alder (intramolecular) + smarts: "[*:9][C:1]1-[C:2]=[C:3]-[C:4](-[C:5])-[C:6]-[C:7]1-[*:8]>>([*:9][C:1]=[C:2]-[C:3]=[C:4]-[C:5].[C:6]=[C:7]-[*:8])" + +- name: reverse dies-alder (intermolecular) + smarts: "[*:9][C:1]1-[C:2]=[C:3]-[C:4](-[C:5])-[C:6]-[C:7]1-[*:8]>>[*:9][C:1]=[C:2]-[C:3]=[C:4]-[C:5].[C:6]=[C:7]-[*:8]" + +- name: reverse aryl amicoumacin-like + smarts: "[C:1][c:2]1[c:3]([OH:9])[c:4][c:5][c:6][c:7]1[C:8]>>[C:1][C:2]=[C:3]([OH:9])[C:4]=[C:5][C:6]=[C:7][C:8]" + +- name: reverse spiroborate + smarts: "[O:1]1[C:2][C:3][O:4][B:5]12[O:6][C:7][C:8][O:9]2>>([OH1:1][C:2][C:3][OH1:4].[OH1:6][C:7][C:8][OH1:9]).[B:5]" + +- name: reverse cremimycin-like substructure + smarts: "[C:1]1[C:2][C:3]([O:4])[C:5](-,=[O:6])-,=[C:7]1>>[C:1]=[C:2][C:3][C:5](-[OH:6])-[C:7].[O:4]" + props: + references: + - "DOI:10.1002/cbic.201300370" + +- name: reversed NRP biosynthesis (alpha amino acid; intermolecular) + smarts: "[C,c:2][C:3](=[O:4])[NH1:5][C,c;!$(C=O):6]>>[C,c:2][C:3](=[O:4])[O].[NH2:5][C,c:6]" + allowed_in_bulk: true + +- name: reversed NRP biosynthesis (alpha amino acid; intramolecular) + smarts: "[C,c:2][C;R:3](=[O:4])[NH1;R:5][C,c;!$(C=O):6]>>([C,c:2][C:3](=[O:4])[O].[NH2:5][C,c:6])" + allowed_in_bulk: true + +- name: reverse NRP biosynthesis (aromatic nitrogen; intermolecular) + smarts: "[*:1][C:2](=[O:3])[n:4]>>[*:1][C:2](=[O:3])[OH].[nH:4]" + allowed_in_bulk: true + +- name: reverse NRP biosynthesis (aromatic nitrogen; intramolecular) + smarts: "[*:1][C:2](=[O:3])[n:4]>>([*:1][C:2](=[O:3])[OH].[nH:4])" + allowed_in_bulk: true + +- name: reverse urea bond formation (intermolecular) + smarts: "[C:1][NH1:2][C:3](=[O:4])[NH1:5][C:6]>>[C:1][NH2:2].[NH2:5][C:6].[OH][C:3](=[O:4])[OH]" + 
allowed_in_bulk: true + +- name: reverse urea bond formation (intramolecular) + smarts: "[C:1][NH1:2][C:3](=[O:4])[NH1:5][C:6]>>([C:1][NH2:2].[NH2:5][C:6]).[OH][C:3](=[O:4])[OH]" + allowed_in_bulk: true + +- name: reversed NRP biosynthesis (alpha amino acid; proline-like) + smarts: "[*:1][C:2](=[O:3])[NH0:4][C:5][C:6](=[O:7])[OH:8]>>[*:1][C:2](=[O:3])[OH].[NH1:4][C:5][C:6](=[O:7])[OH:8]" + allowed_in_bulk: true + +- name: reversed NRP biosynthesis (beta amino acid) + smarts: "[*:1][C:2](=[O:3])[NH1:4][C:5][CH2:6][C:7](=[O:8])[OH:9]>>[*:1][C:2](=[O:3])[OH].[NH2:4][C:5][CH2:6][C:7](=[O:8])[OH:9]" + allowed_in_bulk: true + +- name: break amide bond (hydroxamic) + smarts: "[C:1][C:2](=[O:3])[N:4](-[OH:5])[C:6]>>[C:1][C:2](=[O:3])-[OH].[N:4](-[OH:5])[C:6]" + allowed_in_bulk: true + +- name: reversed polyketide synthesis (saturated) + smarts: "[C,c:1][C;!R:2]-[C;!R:3]-[C:4](=[O:5])[OH:6]>>[C,c:1]C(=O)[OH].[OH][S][C:2]-[C:3]-[C:4](=[O:5])[OH:6]" + +- name: reversed polyketide synthesis (saturated anhydride) + smarts: "[C,c:1][C;!R:2]-[C;!R:3]-[C:4](=[O:5])[CH2:6][CH3:7]>>[C,c:1]C(=O)[OH].[OH][S][C:2]-[C:3]-[C:4](=[O:5])[OH].[C](=[O])([OH])[C:6][C:7]" + +- name: reversed polyketide synthesis (unsaturated) + smarts: "[C,c:1][C;!R:2]=[C;!R:3]-[C:4](=[O:5])[OH:6]>>[C,c:1]C(=O)[OH].[OH][S][C:2]=[C:3]-[C:4](=[O:5])[OH:6]" + +- name: reversed polyketide synthesis (unsaturated and shifted type 1) + smarts: "[C,c:1]=[C;!R:2]-[C;!R:3]-[C:4](=[O:5])[OH:6]>>[C,c:1]C(=O)[OH].[OH][S][C:2]=[C:3]-[C:4](=[O:5])[OH:6]" + +- name: reversed polyketide synthesis (unsaturated and shifted type 2) + smarts: "[C:7][C:1]=[C;!R:2]-[C&D4;!R:3]-[C:4](=[O:5])[OH:6]>>[C:7]=[C:1]C(=O)[OH].[OH][S][C:2]-[C:3]-[C:4](=[O:5])[OH:6]" + +- name: reversed polyketide synthesis (shifted and late stage oxidation) + smarts: "[C,c:1]=[C;!R:2]-[C;!R:3](-[OH:7])-[C:4](=[O:5])[OH:6]>>[C,c:1]C(=O)[OH].[OH][S][C:2]=[C:3]-[C:4](=[O:5])[OH:6].[O:7]" \ No newline at end of file diff --git a/src/retromol/drawing.py 
b/src/retromol/drawing.py deleted file mode 100644 index a9006f4..0000000 --- a/src/retromol/drawing.py +++ /dev/null @@ -1,172 +0,0 @@ -"""Module for RetroMol results drawing.""" - -import logging -import os -from copy import deepcopy -from enum import Enum - -from rdkit.Chem.Draw.rdMolDraw2D import MolDraw2DSVG, MolDrawOptions - -from retromol import chem, config, io, readout - - -class Palette(Enum): - Red = (230, 25, 75) - Blue = (0, 130, 200) - Green = (60, 180, 75) - Maroon = (128, 0, 0) - Brown = (170, 110, 40) - Olive = (128, 128, 0) - Teal = (0, 128, 128) - Navy = (0, 0, 128) - Orange = (245, 130, 48) - Yellow = (255, 225, 25) - Lime = (210, 245, 60) - Cyan = (70, 240, 240) - Purple = (145, 30, 180) - Magenta = (240, 50, 230) - Pink = (255, 190, 212) - Apricot = (255, 215, 180) - Beige = (255, 250, 200) - Mint = (170, 255, 195) - Lavender = (220, 190, 255) - - def hex(self, alpha: float) -> str: - """ - Get hex representation of the color with specified alpha transparency. - - :param alpha: alpha transparency (0.0 to 1.0) - :return: hex color string with alpha - """ - return f"#{self.value[0]:02x}{self.value[1]:02x}{self.value[2]:02x}{int(alpha * 255):02x}" - - def normalize(self, min_val: float = 0.0, max_val: float = 255.0) -> tuple[float, float, float]: - """ - Get normalized RGB tuple of the color. - - :param min_val: minimum value for normalization - :param max_val: maximum value for normalization - :return: normalized RGB tuple - """ - r, g, b = self.value - return ( - (r - min_val) / (max_val - min_val), - (g - min_val) / (max_val - min_val), - (b - min_val) / (max_val - min_val), - ) - - -def hex_to_rgb_tuple(hex_str: str) -> tuple[float, float, float]: - """ - Convert hex color string to normalized RGB tuple. - - :param hex_str: hex color string (e.g. 
"#ff5733" or "#ff5733ff") - :return: normalized RGB tuple - """ - hex_str = hex_str.lstrip("#") - if len(hex_str) == 6: - r, g, b = int(hex_str[0:2], 16), int(hex_str[2:4], 16), int(hex_str[4:6], 16) - elif len(hex_str) == 8: - r, g, b = int(hex_str[0:2], 16), int(hex_str[2:4], 16), int(hex_str[4:6], 16) - # alpha = int(hex_str[6:8], 16) # Alpha is ignored in this function - else: - raise ValueError(f"Invalid hex color string: {hex_str}") - return (r / 255.0, g / 255.0, b / 255.0) - - -def draw_result( - result: io.Result, - out_dir: str, - base_name: str = "optimal_mapping", - window_size: tuple[int, int] = (800, 800), - background_color: str | None = None, -) -> None: - """ - Draw optimal mappings for a RetroMol result. - - For each optimal mapping of identified nodes to the input molecule, generate - a 2D drawing highlighting the atoms and bonds covered by each node in a - distinct color. - - :param result: RetroMol Result object containing the input molecule and identified nodes - :param out_dir: directory to save the output SVG files - :param base_name: base name for the output files; an index will be appended for each mapping - :param window_size: size of the drawing window (width, height) - :param background_color: optional background color for the drawing in - hex format (e.g. "#ffffff" for white). 
If None, defaults to transparent - :return: None - """ - logger = logging.getLogger(config.LOGGER_NAME) - - optimal_mappings = readout.optimal_mappings_with_timeout(result) - logger.info(f"{len(optimal_mappings)} optimal mapping(s) found.") - - # Retrieve input SMILES with tags - input_smi = result.get_input_smiles(remove_tags=False) - input_mol = chem.smiles_to_mol(input_smi) - - # If no optimal mappings, just draw the input molecule - if len(optimal_mappings) == 0: - optimal_mappings.append({}) - - for map_idx, o_m in enumerate(optimal_mappings): - out_path = os.path.join(out_dir, f"{base_name}_{map_idx + 1}.svg") - - drawing: MolDraw2DSVG = MolDraw2DSVG(*window_size) - palette = [c.normalize() for c in Palette] - - atoms_to_highlight: list[int] = [] - bonds_to_highlight: list[int] = [] - atom_highlight_colors: dict[int, tuple[float, float, float]] = {} - bond_highlight_colors: dict[int, tuple[float, float, float]] = {} - - for n_idx, node in enumerate(o_m.get("nodes", [])): - color = palette[n_idx % len(palette)] - n_tags = node["tags"] - - logger.info(f"Mapping {map_idx + 1} - node {n_idx + 1}: {node['identity']} {n_tags}") - - for atom in input_mol.GetAtoms(): - a_tag = atom.GetIsotope() - if a_tag in n_tags: - a_idx = atom.GetIdx() - atoms_to_highlight.append(a_idx) - atom_highlight_colors[a_idx] = color - - for bond in input_mol.GetBonds(): - b_begin_idx = bond.GetBeginAtom() - b_end_idx = bond.GetEndAtom() - b_begin_tag = b_begin_idx.GetIsotope() - b_end_tag = b_end_idx.GetIsotope() - if b_begin_tag in n_tags and b_end_tag in n_tags: - b_idx = bond.GetIdx() - bonds_to_highlight.append(b_idx) - bond_highlight_colors[b_idx] = color - - options: MolDrawOptions = drawing.drawOptions() - if background_color is not None: - options.setBackgroundColour(hex_to_rgb_tuple(background_color)) - options.useBWAtomPalette() - - # Remove isotopic labels for drawing - cp_input_mol = deepcopy(input_mol) - for atom in cp_input_mol.GetAtoms(): - atom.SetIsotope(0) - - 
drawing.DrawMolecule( - cp_input_mol, - highlightAtoms=atoms_to_highlight, - highlightBonds=bonds_to_highlight, - highlightAtomColors=atom_highlight_colors, - highlightBondColors=bond_highlight_colors, - ) - - drawing.FinishDrawing() - svg_str = drawing.GetDrawingText().replace("svg:", "") - - with open(out_path, "w") as f: - f.write(svg_str) - logger.info(f"Wrote drawing to {out_path}") - - # Close the drawing to free up memory - drawing = None diff --git a/src/retromol/errors.py b/src/retromol/errors.py deleted file mode 100644 index 0fea62a..0000000 --- a/src/retromol/errors.py +++ /dev/null @@ -1,28 +0,0 @@ -"""This module defines custom exceptions for the RetroMol package.""" - - -class FunctionTimeoutError(Exception): - """Custom exception for function timeout.""" - - pass - - -class MotifGraphNodeWithoutAttributesError(Exception): - """Custom error for when a motif graph node is created without attributes.""" - - def __init__(self, message: str) -> None: - """ - Initialize the error with a message. - - :param message: the error message - """ - super().__init__(message) - self.message = message - - def __str__(self) -> str: - """ - Return the string representation of the error. 
- - :return: the error message - """ - return f"MotifGraphNodeWithoutAttributesError: {self.message}" diff --git a/src/retromol/fingerprint.py b/src/retromol/fingerprint.py deleted file mode 100644 index ba8a089..0000000 --- a/src/retromol/fingerprint.py +++ /dev/null @@ -1,845 +0,0 @@ -"""This module contains functions for generating hashed fingerprints from k-mers.""" - -import hashlib -import os -import pickle -import re -import struct -from collections.abc import Callable, Iterable, Sequence -from datetime import datetime -from itertools import combinations -from typing import Any, TypeVar - -import numpy as np -import yaml -from numpy.typing import NDArray - -from retromol.graph import iter_kmers -from retromol.helpers import blake64_hex, sha256_hex -from retromol.io import Result -from retromol.monomer_collapse import ( - Group, - NameSimilarityConfig, - assign_to_existing_groups, - collapse_monomers_order_invariant, -) -from retromol.readout import mapping_to_graph, optimal_mappings -from retromol.rules import MatchingRule - -T = TypeVar("T") - - -def _norm_token(tok: object, none_sentinel: str = "") -> bytes: - """ - Turn a token (possibly None) into stable bytes. - - :param tok: token to normalize (str, int, float, or None) - :param none_sentinel: string to use for None tokens - :return: bytes representation of the token - """ - if tok is None: - return none_sentinel.encode("utf-8") - - # Strings/ints are common; fall back to repr for others - if isinstance(tok, (str, int, float)): - return str(tok).encode("utf-8") - - return repr(tok).encode("utf-8") - - -def _family_token(fam: str) -> str: - """ - Generate a family token. - - :param fam: family name - :return: family token string - """ - fam = fam or "" - return f"NF:{blake64_hex(f'FAM:{fam.lower()}')}" - - -def _pair_token(a: str, b: str) -> str: - """ - Generate a pairwise token for two names. 
- - :param a: first name - :param b: second name - :return: pairwise token string - """ - a, b = sorted([(a or "").lower(), (b or "").lower()]) - return f"NS:{blake64_hex(f'PAIR:{a}|{b}')}" - - -def _hash_kmer_tokens( - tokens: Sequence[bytes], - n_bits: int, - n_hashes: int, - seed: int = 0, - k_salt: int = 0, -) -> list[int]: - """ - Map a tokenized k-mer (as bytes) to n_hashes bit indices in [0, n_bits). - - :param tokens: sequence of bytes tokens (e.g. from _norm_token) - :param n_bits: number of bits in the fingerprint - :param n_hashes: number of hash indices to produce - :param seed: global seed for hashing - :param k_salt: salt value specific to the k-mer length (to decorrelate lengths) - :return: list of bit indices - - .. note:: Deterministic across runs/machines. Different k values get a salt. - """ - data = b"\x1f".join(tokens) # unit separator - - idxs: list[int] = [] - for i in range(n_hashes): - # Include both global seed and per-hash index, plus a per-k salt - salted = data + struct.pack(">III", seed, i, k_salt) - digest = hashlib.blake2b(salted, digest_size=8).digest() - val = int.from_bytes(digest, "big") % n_bits - idxs.append(val) - - return idxs - - -def kmers_to_fingerprint( - kmers: Iterable[Sequence[Any]], - n_bits: int = 2048, - n_hashes_per_kmer: int | Callable[[int], int] = 2, - seed: int = 42, - none_policy: str = "keep", - counted: bool = False, - count_dtype: Any = np.uint32, -) -> NDArray[np.generic]: - """ - Build a hashed fingerprint from an iterable of tokenized k-mers. - - :param kmers: iterable of k-mers, where each k-mer is a sequence of tokens (str, int, float, or None) - :param n_bits: number of bits in the fingerprint - :param n_hashes_per_kmer: number of hash indices to produce per k-mer (int or callable that takes k-mer length - as input and returns the number of hashes). - :param seed: global seed for hashing. 
- :param none_policy: policy for handling None tokens: "keep" (treat as a special token), "skip-token" - (omit the token), or "drop-kmer" (skip the entire k-mer). - :param counted: if True, produce a count vector instead of a binary vector - :param count_dtype: data type for counts (if counted is True) - :return: fingerprint as a numpy array of shape (n_bits,) - """ - if n_bits <= 0: - raise ValueError("n_bits must be positive") - - # Normalize n_hashes_per_kmer to callable - if isinstance(n_hashes_per_kmer, int): - if n_hashes_per_kmer <= 0: - raise ValueError("n_hashes_per_kmer must be positive") - - def _nh(_: int) -> int: - return n_hashes_per_kmer - else: - _nh: Callable[[int], int] = n_hashes_per_kmer - - # Allocate output - if counted: - vec = np.zeros(n_bits, dtype=count_dtype) - else: - vec = np.zeros(n_bits, dtype=np.uint8) - - # Main loop - for kmer in kmers: - if none_policy == "drop-kmer" and any(t is None for t in kmer): - continue - - # Normalize per token - normd: list[bytes] = [] - for t in kmer: - if t is None: - if none_policy == "skip-token": - continue - normd.append(_norm_token(None)) - else: - normd.append(_norm_token(t)) - if not normd: - continue - - n_hashes = _nh(len(kmer)) - if n_hashes <= 0: - continue - - # Simple salt tied to (normalized) k-mer length - k_salt = len(normd) - - idxs = _hash_kmer_tokens( - normd, - n_bits=n_bits, - n_hashes=n_hashes, - seed=seed, - k_salt=k_salt, - ) - - if counted: - # Increment counts; duplicates in idxs will accumulate - vec[idxs] += 1 - else: - # Set bits to 1 (binary) - vec[idxs] = 1 - - return vec - - -def cosine_similarity(fp1: NDArray[np.int8], fp2: NDArray[np.int8]) -> float: - """ - Cosine similarity for fingerprints. 
- - :param fp1: first fingerprint (1D array) - :param fp2: second fingerprint (1D array) - :return: cosine similarity in [0, 1] - """ - a = np.asarray(fp1) - b = np.asarray(fp2) - - # Ensure 1D - a = a.ravel() - b = b.ravel() - if a.shape != b.shape: - raise ValueError(f"Different lengths: {a.shape} vs {b.shape}") - - # Upcast to float to avoid integer overflow and match sklearn - a = a.astype(np.float64, copy=False) - b = b.astype(np.float64, copy=False) - - # Compute cosine - dot = float(np.dot(a, b)) - na = float(np.linalg.norm(a)) - nb = float(np.linalg.norm(b)) - - if na == 0.0 or nb == 0.0: - return 0.0 - - return dot / (na * nb) - - -def tanimoto_similarity(fp1: NDArray[np.int8], fp2: NDArray[np.int8]) -> float: - """ - Tanimoto similarity for molecular fingerprints (binary or count-based). - - :param fp1: first fingerprint (1D array) - :param fp2: second fingerprint (1D array) - :return: Tanimoto similarity in [0, 1] - """ - a = np.asarray(fp1) - b = np.asarray(fp2) - - # Ensure 1D - a = a.ravel() - b = b.ravel() - if a.shape != b.shape: - raise ValueError(f"Different lengths: {a.shape} vs {b.shape}") - - # Upcast to float to prevent overflow and ensure precision - a = a.astype(np.float64, copy=False) - b = b.astype(np.float64, copy=False) - - # Dot product = intersection term - ab = float(np.dot(a, b)) - aa = float(np.dot(a, a)) - bb = float(np.dot(b, b)) - - denom = aa + bb - ab - if denom == 0.0: - return 0.0 - return ab / denom - - -def get_kmers(seq: tuple[T, ...], k: int) -> list[tuple[T, ...]]: - """ - Return all contiguous, bidirectional k-mers (subtuples of length k) from a tuple. 
- - :param seq: input sequence as a tuple - :param k: k-mer length - :return: list of k-mers (as tuples) - """ - if k <= 0: - return [] - n = len(seq) - if k > n: - return [] - forward_kmers = [seq[i : i + k] for i in range(n - k + 1)] - backward_kmers = [tuple(reversed(kmer)) for kmer in forward_kmers] - return forward_kmers + backward_kmers - - -class FingerprintGenerator: - """Class to generate fingerprints based on monomer collapse groups.""" - - def __init__( - self, - matching_rules_yaml: str | None, - keep_stereo: bool = False, - tanimoto_threshold: float = 0.85, - collapse_by_name: list[str] | None = None, - name_similarity: NameSimilarityConfig | None = None, - ) -> None: - """ - Initialize FingerprintGenerator. - - :param matching_rules_yaml: path to matching rules YAML file - :param keep_stereo: whether to retain stereochemistry during standardization - :param tanimoto_threshold: Tanimoto similarity threshold for structural grouping - :param collapse_by_name: optional list of names to always collapse by name - :param name_similarity: optional configuration for name similarity - :raises FileNotFoundError: if the matching rules YAML file does not exist - """ - if not os.path.exists(matching_rules_yaml): - raise FileNotFoundError(f"Matching rules YAML file not found: {matching_rules_yaml}") - - # Load matching rules and compute SHA256 hash for provenance - with open(matching_rules_yaml) as f: - matching_rules_src = f.read() - sha256_matching_rules = sha256_hex(matching_rules_src) - self.sha256_matching_rules = sha256_matching_rules - - # Parse out the matching rules, and turn into records - matching_rules_data = yaml.safe_load(matching_rules_src) - records = [] - for i, rule_data in enumerate(matching_rules_data): - matching_rule = MatchingRule.from_json_serializable_dict(i, rule_data) - records.append((matching_rule.rid, matching_rule.smiles)) - - # Collapse monomers into groups - groups, monomers = collapse_monomers_order_invariant( - records, - 
keep_stereo=keep_stereo, - tanimoto_thresh=tanimoto_threshold, - collapse_by_name=collapse_by_name, - ) - self.collapse_by_name = collapse_by_name or [] - self.groups = groups - self.monomers = monomers - self.name_similarity = name_similarity - - self.keep_stereo = keep_stereo - self.tanimoto_threshold = tanimoto_threshold - - # For speedup - self._assign_cache: dict[tuple[str | None, str], Group | None] = {} - self._token_bytes_cache: dict[object, bytes] = {} - - def __repr__(self) -> str: - """ - String representation of the FingerprintGenerator. - - :return: string representation - """ - return ( - f"FingerprintGenerator(num_groups={len(self.groups)}, " - f"num_monomers={len(self.monomers)}, " - f"keep_stereo={self.keep_stereo}, " - f"tanimoto_threshold={self.tanimoto_threshold}, " - f"collapse_by_name={self.collapse_by_name}, " - f"name_similarity={self.name_similarity})" - ) - - def assign_to_group(self, smi: str, name: str | None = None) -> Group: - """ - Assign a new monomer to an existing group based on its SMILES. 
- - :param name: name of the monomer - :param smi: SMILES string of the monomer - :return: group ID if assigned, None otherwise - """ - # Cache key: only use name when we're collapsing by that name - key = (name if (name is not None and name in self.collapse_by_name) else None, smi) - - # Return from cache (including cached None) if present - g = self._assign_cache.get(key) - if g is not None or key in self._assign_cache: - return g # may be None - - # Name branch: cache the hit when found - if key[0] is not None: - # Deterministically scan existing roups - for gg in self.groups: - if gg.kind == "name" and gg.name_key == key[0]: - self._assign_cache[key] = gg - return gg - # If we intended to collapse by name but no such group exists, that's an error - raise ValueError(f"No existing name-based group found for name: {name}") - - # Structure branch: assign based on Tanimoto similarity - gid = assign_to_existing_groups( - smi=smi, - groups=self.groups, - monomers=self.monomers, - keep_stereo=self.keep_stereo, - tanimoto_thresh=self.tanimoto_threshold, - ) - - # gid can be 0; only None means "not assigned" - g = self.groups[gid] if gid is not None else None - - # Cache result (including None) so we don't recompute on repeats - self._assign_cache[key] = g - return g - - def fingerprint_from_result( - self, - result: Result, - num_bits: int = 2048, - kmer_sizes: list[int] | None = None, - kmer_weights: dict[int, int] | None = None, - strict: bool = True, - counted: bool = False, - ) -> NDArray[np.int8] | None: - """ - Generate a fingerprint from a RetroMolResult. - - :param result: RetroMol Result object - :param num_bits: number of bits in the fingerprint - :param kmer_sizes: list of k-mer sizes to consider - :param kmer_weights: weights for each k-mer size. Determines how many bits each k-mer sets. - :param strict: if True, verify that the matching rules SHA256 matches. - :param counted: if True, count the number of times each k-mer appears. 
- :return: fingerprint as a numpy array, or None if no monomers found. - """ - if strict: - sha256_self = self.sha256_matching_rules - sha256_result = result.sha256_matching_rules - if sha256_self != sha256_result: - raise ValueError( - "Mismatch in matching rules SHA256: FingerprintGenerator was " - "created with different matching rules than the Result" - ) - - # Defaults - if kmer_sizes is None: - kmer_sizes = [1, 2, 3] - if kmer_weights is None: - kmer_weights = {1: 16, 2: 4, 3: 2} - - # Resolve similarity config - cfg = self.name_similarity - family_of = cfg.family_of if cfg and cfg.family_of is not None else (lambda n: None) - pairwise = (cfg.pairwise if cfg else {}) or {} - symmetric = bool(cfg.symmetric) if cfg else True - fam_rep = max(0, int(cfg.family_repeat_scale)) if cfg else 0 - pair_rep = max(0, int(cfg.pair_repeat_scale)) if cfg else 0 - ancestors_of = getattr(cfg, "ancestors_of", None) if cfg else None - anc_rep = max(0, int(getattr(cfg, "ancestor_repeat_scale", 0))) if cfg else 0 - - # Gather optimal mappings - oms = [om for om in optimal_mappings(result)] - - # Get tagged SMILES from result - tagged_smiles = result.get_input_smiles(remove_tags=False) - - fps = [] - for om in oms: - om_graph = mapping_to_graph(tagged_smiles, om) - - token_kmers: list[tuple[str, ...]] = [] - names_per_kmer: list[list[str]] = [] - sizes_per_kmer: list[int] = [] - - for kmer_size in kmer_sizes: - for kmer in iter_kmers(om_graph, kmer_size): - tokenized_kmer = [] - names_in_kmer = [] - - for node in kmer: - node_data = om_graph.nodes[node] - node_id = node_data.get("identity") - node_smiles = node_data.get("smiles") - if node_smiles is None: - raise ValueError("Node in mapping graph missing 'smiles' attribute") - - group = self.assign_to_group(node_smiles, name=node_id) - token = group.token_fine if group is not None else None - tokenized_kmer.append(token) - - if node_id is not None: - names_in_kmer.append(node_id) - - token_kmers.append(tuple(tokenized_kmer)) - 
names_per_kmer.append(names_in_kmer) - sizes_per_kmer.append(kmer_size) - - # Inject similarity "virtual 1-mers" (families and pairwise), per k-mer - if cfg and (fam_rep > 0 or pair_rep > 0): - # Family tokens: once per name in the k-mer, repeated fam_rep times - if fam_rep > 0: - for names in names_per_kmer: - for nm in sorted(set(n for n in names if n)): - fam_val = family_of(nm) - # Accept str, iterable of str, or None - # This allows for multiple families per name - if fam_val is None: - families = [] - elif isinstance(fam_val, (list, tuple, set)): - families = list(fam_val) - else: - families = [fam_val] - - for fam in sorted({f for f in families if f}): - ftok = _family_token(fam) - if not ftok: - continue - for _ in range(fam_rep): - token_kmers.append((ftok,)) - - # Pairwise tokens: for unordered name pairs in the same k-mer - if pair_rep > 0 and pairwise: - for names in names_per_kmer: - uniq = sorted(set(n for n in names if n)) - if len(uniq) < 2: - continue - for a, b in combinations(uniq, 2): - s = float(pairwise.get(a, {}).get(b, 0.0)) - if symmetric: - s = max(s, float(pairwise.get(b, {}).get(a, 0.0))) - if s <= 0.0: - continue - reps = int(round(s * pair_rep)) - if reps <= 0: - continue - ptoken = _pair_token(a, b) - for _ in range(reps): - token_kmers.append((ptoken,)) - - # Ancestor supertokens - if ancestors_of and anc_rep > 0: - # Small local helper: stable ancestor token w/ level namespace - def _anc_tok(level: int, anc: str) -> str: - anc = (anc or "").lower() - return f"AN:{level}:{blake64_hex(f'ANC:{level}:{anc}')}" - - # Ancestor 1-mers: for each name, emit all ancestors in its path - for names in names_per_kmer: - for nm in set(n for n in names if n): - path = ancestors_of(nm) or [] # e.g., ["polyketide", "polyketide_type_A", "A1"] - for lvl, anc in enumerate(path): - tok = _anc_tok(lvl, anc) - for _ in range(anc_rep): - token_kmers.append((tok,)) - - # Ancestor k-mers: for each window, for each ancestor level present at all positions - for 
names, ksize in zip(names_per_kmer, sizes_per_kmer, strict=True): - if ksize <= 1: - continue - pos_paths = [(ancestors_of(nm) or []) if nm else [] for nm in names] - if not pos_paths or any(len(p) == 0 for p in pos_paths): - # Require every position to have at least one ancestor (root level) - continue - max_depth = min(len(p) for p in pos_paths) # only levels common to all positions - for lvl in range(max_depth): - # Form one ancestor k-mer at this level by taking the ancestor token per position - kmer_tok = tuple(_anc_tok(lvl, pos_paths[i][lvl]) for i in range(ksize)) - for _ in range(anc_rep): - token_kmers.append(kmer_tok) - - # Hash default + virtual kmers - fp = kmers_to_fingerprint( - token_kmers, - n_bits=num_bits, - n_hashes_per_kmer=lambda k: kmer_weights.get(k, 1), - seed=42, - none_policy="skip-token", - counted=counted, - ) - fps.append(fp) - - # Stack fingerprints from all optimal mappings as rows - if not fps: - return None - - stacked_fps = np.stack(fps, axis=0) # shape (num_mappings, num_bits) - return stacked_fps - - def fingerprint_from_kmers( - self, - kmers: Iterable[Sequence[tuple[str | None, str | None]]], - num_bits: int = 2048, - kmer_weights: dict[int, int] | None = None, - *, - counted: bool = False, - none_policy: str = "skip-token", - allow_raw_name_token: bool = True, - raise_on_unknown_named_group: bool = True, - ) -> NDArray[np.int8]: - """ - Build a fingerprint directly from user-provided k-mers of (name, smiles) tuples. - - Each item in a k-mer must be a 2-tuple: (name|None, smiles|None). - At least one element of the tuple must be non-None. - - Name-only tokens (e.g. ('chlorination', None)): - - If `name` is present and in `self.collapse_by_name`, we resolve to the - existing name-based group token (same as used elsewhere). - If such a group doesn't exist and `raise_on_unknown_named_group` is True, - a ValueError is raised; otherwise we fall back to a stable name token - (when `allow_raw_name_token` is True). 
- - If `name` is present but NOT in `collapse_by_name`, we optionally - use a stable name token if `allow_raw_name_token` is True. - - Structure-only tokens (e.g. (None, 'CCO')): - - Assigned structurally via `assign_to_group(smiles, name=None)`; - if no structural group is found, the item contributes no token. - - Mixed tokens (e.g. ('A12', 'CCO')): - - If `name` is in `collapse_by_name`, we resolve via the name group. - Otherwise we assign structurally; `name` is still passed to - `assign_to_group` (it can help for caches, but does not force name grouping). - - :param kmers: iterable of k-mers; each k-mer is a sequence of (name, smiles) tuples - :param num_bits: size of the fingerprint - :param kmer_weights: hash multiplicity per k (default {1:16, 2:4, 3:2}) - :param counted: if True, produce a count vector; otherwise binary - :param none_policy: how to handle None tokens at the token level - ('keep', 'skip-token', 'drop-kmer'). Defaults to 'skip-token' - :param allow_raw_name_token: if True, when a name isn't in `collapse_by_name` - (or no name-group exists), use a stable name token so it still hashes - :param raise_on_unknown_named_group: if True and a name is in `collapse_by_name` - but no name-based group exists, raise ValueError - :return: 1D numpy array (num_bits,) with the fingerprint - """ - if kmer_weights is None: - kmer_weights = {1: 16, 2: 4, 3: 2} - - def _stable_name_token(nm: str) -> str: - # Deterministic, case-insensitive token for raw names - return f"NM:{blake64_hex('NAME:' + (nm or '').lower())}" - - token_kmers: list[tuple[str | None, ...]] = [] - names_per_kmer: list[list[str]] = [] - sizes_per_kmer: list[int] = [] - - for kmer in kmers: - if not kmer: - continue - - toks: list[str | None] = [] - names_here: list[str] = [] - - for item in kmer: - if not isinstance(item, tuple) or len(item) != 2: - raise TypeError("Each k-mer item must be a (name|None, smiles|None) tuple") - name, smi = item - if name is None and smi is None: - # Explicitly 
ignore completely empty items - continue - - tok: str | None = None - - # Priority 1: explicit collapse-by-name - if name is not None and name in (self.collapse_by_name or []): - try: - # assign_to_group will find the existing name-group without needing a real SMILES - g = self.assign_to_group(smi="", name=name) - except ValueError: - # No such name group exists - if raise_on_unknown_named_group: - raise - g = None - if g is not None: - tok = g.token_fine - elif allow_raw_name_token and name is not None: - tok = _stable_name_token(name) - - # Priority 2: structure-based assignment - elif smi: - g = self.assign_to_group(smi=smi, name=name) - tok = g.token_fine if g is not None else None - - # Priority 3: raw name token - elif allow_raw_name_token and name is not None: - tok = _stable_name_token(name) - - # If tok remains None, the item contributes nothing - toks.append(tok) - - if name: - names_here.append(name) - - # Emit this k-mer (even if some items were None; none_policy will handle) - if toks: - token_kmers.append(tuple(toks)) - names_per_kmer.append(names_here) - sizes_per_kmer.append(len(toks)) # logical k size - - # Inject similarity virtual tokens - cfg = self.name_similarity - if cfg: - family_of = cfg.family_of if cfg.family_of is not None else (lambda n: None) - pairwise = cfg.pairwise or {} - symmetric = bool(cfg.symmetric) if cfg.symmetric is not None else True - fam_rep = max(0, int(cfg.family_repeat_scale or 0)) - pair_rep = max(0, int(cfg.pair_repeat_scale or 0)) - ancestors_of = getattr(cfg, "ancestors_of", None) - anc_rep = max(0, int(getattr(cfg, "ancestor_repeat_scale", 0))) - - # helper for stable ancestor token - def _anc_tok(level: int, anc: str) -> str: - anc = (anc or "").lower() - return f"AN:{level}:{blake64_hex(f'ANC:{level}:{anc}')}" - - # Families - if fam_rep > 0: - for names in names_per_kmer: - for nm in sorted(set(n for n in names if n)): - fam_val = family_of(nm) - families = ( - [] - if fam_val is None - else (list(fam_val) if 
isinstance(fam_val, (list, tuple, set)) else [fam_val]) - ) - for fam in sorted({f for f in families if f}): - ftok = _family_token(fam) - for _ in range(fam_rep): - token_kmers.append((ftok,)) - - # Pairwise - if pair_rep > 0 and pairwise: - for names in names_per_kmer: - uniq = sorted(set(n for n in names if n)) - if len(uniq) < 2: - continue - for a, b in combinations(uniq, 2): - s = float(pairwise.get(a, {}).get(b, 0.0)) - if symmetric: - s = max(s, float(pairwise.get(b, {}).get(a, 0.0))) - reps = int(round(max(0.0, s) * pair_rep)) - if reps > 0: - ptoken = _pair_token(a, b) - for _ in range(reps): - token_kmers.append((ptoken,)) - - # Ancestors (1-mers and aligned k-mers) - if ancestors_of and anc_rep > 0: - # 1-mers - for names in names_per_kmer: - for nm in set(n for n in names if n): - path = ancestors_of(nm) or [] - for lvl, anc in enumerate(path): - tok = _anc_tok(lvl, anc) - for _ in range(anc_rep): - token_kmers.append((tok,)) - # aligned k-mers by ancestor level - for names, ksize in zip(names_per_kmer, sizes_per_kmer, strict=True): - if ksize <= 1: - continue - pos_paths = [(ancestors_of(nm) or []) if nm else [] for nm in names] - if not pos_paths or any(len(p) == 0 for p in pos_paths): - continue - max_depth = min(len(p) for p in pos_paths) - for lvl in range(max_depth): - kmer_tok = tuple(_anc_tok(lvl, pos_paths[i][lvl]) for i in range(ksize)) - for _ in range(anc_rep): - token_kmers.append(kmer_tok) - - if not token_kmers: - # Return an all-zero vector of the requested type/shape to keep behavior predictable - return kmers_to_fingerprint([], n_bits=num_bits, n_hashes_per_kmer=1, counted=counted) - - fp = kmers_to_fingerprint( - token_kmers, - n_bits=num_bits, - n_hashes_per_kmer=lambda k: kmer_weights.get(k, 1), - seed=42, - none_policy=none_policy, - counted=counted, - ) - return fp - - def save(self, path: str) -> None: - """ - Serialize this FingerprintGenerator to a binary file. - - :param path: output file path - .. 
note:: uses pickle; not secure against untrusted sources - """ - payload = { - "__format__": "retromol.FingerprintGenerator", - "__version__": 1, - "__created__": datetime.utcnow().isoformat() + "Z", - # Core config/state - "keep_stereo": self.keep_stereo, - "tanimoto_threshold": self.tanimoto_threshold, - "collapse_by_name": self.collapse_by_name, - "name_similarity": self.name_similarity, - "sha256_matching_rules": self.sha256_matching_rules, - # Precomputed data - "groups": self.groups, - "monomers": self.monomers, - } - with open(path, "wb") as f: - pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL) - - @classmethod - def load(cls, path: str) -> "FingerprintGenerator": - """ - Load a FingerprintGenerator previously saved with .save(). - - :param path: input file path - :return: FingerprintGenerator instance - .. note:: uses pickle; not secure against untrusted sources - """ - with open(path, "rb") as f: - payload = pickle.load(f) - - # Basic validation - fmt = payload.get("__format__") - ver = int(payload.get("__version__", -1)) - if fmt != "retromol.FingerprintGenerator" or ver != 1: - raise ValueError(f"Unrecognized FingerprintGenerator save format/version: {fmt} v{ver}") - - # Build instance without invoking __init__ - self = cls.__new__(cls) - - # Restore core config/state - self.keep_stereo = bool(payload["keep_stereo"]) - self.tanimoto_threshold = float(payload["tanimoto_threshold"]) - self.collapse_by_name = list(payload["collapse_by_name"]) if payload["collapse_by_name"] else [] - self.name_similarity = payload["name_similarity"] - self.sha256_matching_rules = payload["sha256_matching_rules"] - - # Restore precomputed data - self.groups = payload["groups"] - self.monomers = payload["monomers"] - - # Recreate transient caches - self._assign_cache = {} - self._token_bytes_cache = {} - - return self - - -def polyketide_family_of(name: str) -> list[str] | None: - """ - Simple polyketide family extractor based on name pattern. 
- - :param name: monomer name - :return: [family, subfamily] or None if not a polyketide - """ - n = (name or "").strip() - if not n: - return None - is_polyketide = re.match(r"^[ABCD]\d+$", n) is not None - if is_polyketide: - family = "polyketide" - subfamily = n[0] - return [family, subfamily] - return None - - -def polyketide_ancestors_of(name: str) -> list[str]: - """ - Simple polyketide ancestor extractor based on name pattern. - - :param name: monomer name - :return: list of ancestors (e.g., ["polyketide", "polyketide_type_A", "A1"]) - """ - n = (name or "").strip().upper() - if re.match(r"^[ABCD]$", n): - return ["polyketide", f"polyketide_type_{n}"] - if re.match(r"^[ABCD]\d+$", n): - return ["polyketide", f"polyketide_type_{n[0]}", n] - return [] diff --git a/src/retromol/fingerprint/__init__.py b/src/retromol/fingerprint/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/retromol/fingerprint/fingerprint.py b/src/retromol/fingerprint/fingerprint.py new file mode 100644 index 0000000..5df12fa --- /dev/null +++ b/src/retromol/fingerprint/fingerprint.py @@ -0,0 +1,333 @@ +"""This module contains functions for generating hashed fingerprints from k-mers.""" + +import hashlib +import struct +from collections.abc import Callable, Iterable, Sequence +from typing import Any, Literal + +import numpy as np +from numpy.typing import NDArray + +from retromol.model.result import Result +from retromol.model.rules import MatchingRule +from retromol.model.assembly_graph import AssemblyGraph +from retromol.model.reaction_graph import MolNode +from retromol.utils.hashing import blake64_hex + +from retromol.fingerprint.monomer_collapse import Group, collapse_monomers, assign_to_group + + +NonePolicy = Literal["keep", "skip-token", "drop-kmer"] + + +def encode_family_token(fam: str) -> str: + """ + Generate a family token. 
def kmers_to_fingerprint(
    kmers: Iterable[Sequence[Any]],
    num_bits: int = 2048,
    num_hashes_per_kmer: int | Callable[[int], int] = 2,
    seed: int = 42,
    none_policy: NonePolicy = "keep",
    counted: bool = False,
) -> NDArray[np.generic]:
    """
    Build a hashed fingerprint from an iterable of tokenized k-mers.

    :param kmers: iterable of k-mers, where each k-mer is a sequence of tokens (str, int, float, or None)
    :param num_bits: number of bits in the fingerprint
    :param num_hashes_per_kmer: number of hash indices to produce per k-mer (int or callable that takes k-mer length
        as input and returns the number of hashes).
    :param seed: global seed for hashing.
    :param none_policy: policy for handling None tokens: "keep" (treat as a special token), "skip-token"
        (omit the token), or "drop-kmer" (skip the entire k-mer).
    :param counted: if True, produce a count vector (uint32) instead of a binary vector (uint8)
    :return: fingerprint as a numpy array of shape (num_bits,)
    :raises ValueError: if num_bits is not positive, or if num_hashes_per_kmer is an int that is not positive
    """
    if num_bits <= 0:
        raise ValueError("num_bits must be positive")

    # Normalize num_hashes_per_kmer to a callable taking the k-mer length
    if isinstance(num_hashes_per_kmer, int):
        if num_hashes_per_kmer <= 0:
            raise ValueError("num_hashes_per_kmer must be positive")

        def _nh(_: int) -> int:
            return num_hashes_per_kmer
    else:
        _nh = num_hashes_per_kmer

    # Counts need headroom; the binary variant only ever stores 0/1
    fp = np.zeros(num_bits, dtype=np.uint32 if counted else np.uint8)

    for kmer in kmers:
        if none_policy == "drop-kmer" and any(t is None for t in kmer):
            continue

        # "skip-token" drops None tokens; every other policy normalizes them like
        # any other token (normalize_token maps None to its sentinel)
        normd: list[bytes] = [
            normalize_token(t)
            for t in kmer
            if not (t is None and none_policy == "skip-token")
        ]
        if not normd:
            continue

        # The hash multiplicity is looked up by the ORIGINAL k-mer length, while the
        # salt below uses the length after skipped tokens were removed
        n_hashes = _nh(len(kmer))
        if n_hashes <= 0:
            continue

        idxs = hash_kmer_tokens(
            normd,
            n_bits=num_bits,
            n_hashes=n_hashes,
            seed=seed,
            k_salt=len(normd),
        )

        if counted:
            # `fp[idxs] += 1` is buffered: an index that occurs twice in idxs would be
            # incremented only once. np.add.at is unbuffered, so colliding hash indices
            # of the same k-mer really accumulate.
            np.add.at(fp, idxs, 1)
        else:
            # Set bits to 1 (binary)
            fp[idxs] = 1

    return fp
def assign_to_group(self, smiles: str) -> "Group | None":
    """
    Assign a monomer to an existing group based on its SMILES, memoizing the outcome.

    Both outcomes are cached: a matching Group and ``None`` (no group matched), so a
    SMILES that was looked up before is never evaluated again.

    :param smiles: SMILES string of the monomer
    :return: assigned Group or None if no match
    """
    # Membership test instead of `.get(...) is not None`: a cached None is a valid
    # answer and must short-circuit too, otherwise every unmatched SMILES is
    # re-evaluated against all groups on each call
    if smiles in self._assign_cache:
        return self._assign_cache[smiles]

    # Structure branch: assign based on Tanimoto similarity.
    # Inside the class body this name resolves to the module-level assign_to_group
    # imported from monomer_collapse, not to this method.
    group = assign_to_group(
        smiles=smiles,
        groups=self.groups,
        monomers=self.monomers,
        keep_stereo=self.keep_stereo,
        tanimoto_threshold=self.tanimoto_threshold,
        morgan_radius=self.morgan_radius,
        morgan_num_bits=self.morgan_num_bits,
    )

    # Cache result (including None) so we don't recompute on repeats
    self._assign_cache[smiles] = group

    return group
+ :return: fingerprint as a numpy array + """ + # Default kmer_sizes + if kmer_sizes is None: + kmer_sizes = [1, 2] + + # Default kmer_weights + if kmer_weights is None: + kmer_weights = {1: 1, 2: 1} + + # Create assembly graph of monomers; first collect nodes to include + root = result.submission.mol + collected = result.reaction_graph.get_leaf_nodes(identified_only=False) + a = AssemblyGraph.build(root_mol=root, monomers=collected, include_unassigned=True) + + # Calculate kmers from AssemblyGraph + tokenized_kmers: list[tuple[str | None, ...]] = [] + + for kmer_size in kmer_sizes: + for kmer in a.iter_kmers(k=kmer_size): + + per_node_ancestors: list[list[str | None]] = [] + max_depth = 0 + + for node in kmer: + anc = self.ancestor_list_for_node(node) + per_node_ancestors.append(anc) + max_depth = max(max_depth, len(anc)) + + # Emit ancestor-aligned kmers + for level in range(max_depth): + tokenized_kmers.append(tuple( + anc[level] if level < len(anc) else None + for anc in per_node_ancestors + )) + + # Emite structural kmer separately (structure only) + tokenized_kmers.append(tuple( + (self.assign_to_group(node.smiles).token + if self.assign_to_group(node.smiles) is not None + else None) + for node in kmer + )) + + # Gather additional 1-mer virtual family tokens (defined in matching rules); only once per found monomer + for node in a.monomer_nodes(): + ident = node.identity if node.is_identified else None + if ident is not None: + for fam_tok in ident.matched_rule.family_tokens: + tokenized_kmers.append((encode_family_token(fam_tok),)) + + # Hash kmers + fp = kmers_to_fingerprint( + tokenized_kmers, + num_bits=num_bits, + num_hashes_per_kmer=lambda k: kmer_weights.get(k, 1), + seed=42, + none_policy="keep", + counted=counted, + ) + + return fp diff --git a/src/retromol/fingerprint/monomer_collapse.py b/src/retromol/fingerprint/monomer_collapse.py new file mode 100644 index 0000000..ac7ca26 --- /dev/null +++ b/src/retromol/fingerprint/monomer_collapse.py @@ -0,0 
+1,234 @@ +"""Collapse monomers into structural (and optionally name-based) groups, deterministically.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Iterable + +from rdkit.Chem.rdchem import Mol +from rdkit.DataStructs.cDataStructs import ExplicitBitVect + +from retromol.chem.mol import mol_to_smiles, standardize_from_smiles +from retromol.chem.fingerprint import mol_to_morgan_fingerprint, calculate_tanimoto_similarity +from retromol.model.rules import MatchingRule +from retromol.utils.hashing import blake64_hex + + +@dataclass(frozen=True, slots=True) +class Monomer: + """ + Normalized monomer view derived from a MatchingRule. Only keep what we need for determinisic grouping. + + :var rid: rule ID (index in matching rules) + :var name: monomer name + :var mol: RDKit Mol object + :var can_smi: canonical SMILES (isomeric if specified) + :var fp: ECFP4 fingerprint + """ + + rid: int + name: str + mol: Mol + can_smi: str + fp: ExplicitBitVect + + @classmethod + def from_matching_rule( + cls, + matching_rule: MatchingRule, + keep_stereo: bool = False, + morgan_radius: int = 2, + morgan_num_bits: int = 2048, + ) -> "Monomer": + """ + Create Monomer from MatchingRule. + + :param matching_rule: MatchingRule object + :return: Monomer instance + """ + canonical_smiles = mol_to_smiles(matching_rule.mol, isomeric=keep_stereo, canonical=True) + morgan_fingerprint = mol_to_morgan_fingerprint(matching_rule.mol, radius=morgan_radius, num_bits=morgan_num_bits) + + return cls( + rid=matching_rule.id, + name=matching_rule.name, + mol=matching_rule.mol, + can_smi=canonical_smiles, + fp=morgan_fingerprint, + ) + + +@dataclass +class Group: + """ + Deterministic structural group. 
+ + :var gid: group ID + :var rep_rid: representative Monomer.rid + :var members: tuple of member Monomer.rid values (sorted) + :var token: stable token based on rep_can_smi or name + :var rep_can_smi: representative canonical SMILES + """ + + gid: int + rep_rid: int + members: tuple[int, ...] + token: str + rep_can_smi: str + + +def collapse_monomers( + matching_rules: Iterable[MatchingRule], + keep_stereo: bool = False, + tanimoto_threshold: float = 0.6, + morgan_radius: int = 2, + morgan_num_bits: int = 2048, +) -> tuple[list[Group], list[Monomer]]: + """ + Deterministic grouping independent of input order (but still RDKit/version dependent). + + :param records: iterable of (name, SMILES) tuples for monomers + :param keep_stereo: whether to retain stereochemistry during standardization + :param tanimoto_threshold: Tanimoto similarity threshold for structural grouping + :param morgan_radius: radius for Morgan fingerprint + :param morgan_num_bits: number of bits for Morgan fingerprint + :return: tuple of (list of Groups, list of Monomers) + """ + # Build Monomer table from matching rules + monomers: list[Monomer] = [ + Monomer.from_matching_rule( + rl, + keep_stereo=keep_stereo, + morgan_radius=morgan_radius, + morgan_num_bits=morgan_num_bits, + ) + for rl in matching_rules + ] + + # Stable order independent of input order + monomers.sort(key=lambda m: (m.can_smi, m.rid)) + + # Excat groups by canonical smiles + # key: can_smi -> list of monomer rids + by_smi: dict[str, list[Monomer]] = {} + for m in monomers: + by_smi.setdefault(m.can_smi, []).append(m) + + # Sort exact groups by (can_smi) so deterministic + exact_reps: list[Monomer] = [] + exact_members: list[list[Monomer]] = [] + for can_smi in sorted(by_smi.keys()): + ms = sorted(by_smi[can_smi], key=lambda x: x.rid) + exact_reps.append(ms[0]) # deterministic representative + exact_members.append(ms) + + # Similarity collapse across exact groups + # We collapse exact groups into "structural families" 
deterministically + rep_indices: list[int] = [] # indices into exact_reps that are final representatives + assigned_to: list[int] = [-1] * len(exact_reps) # exact-group i -> rep index (index in rep_indices list) + + for i, rep_i in enumerate(exact_reps): + # Find first earlier representative that matches by similarity + found_rep_slot = None + for slot, rep_group_idx in enumerate(rep_indices): + rep_j = exact_reps[rep_group_idx] + sim = calculate_tanimoto_similarity(rep_i.fp, rep_j.fp) + if sim >= tanimoto_threshold: + found_rep_slot = slot + break + + if found_rep_slot is None: + # New representative + rep_indices.append(i) + assigned_to[i] = len(rep_indices) - 1 + else: + # Assign to existing representative + assigned_to[i] = found_rep_slot + + # Emit final groups deterministically + groups: list[Group] = [] + for gid, rep_group_idx in enumerate(rep_indices): + rep = exact_reps[rep_group_idx] + + # Gather members from all exact-groups assigned to this rep slot + member_rids: list[int] = [] + for i in range(len(exact_reps)): + if assigned_to[i] != gid: + continue + member_rids.extend(m.rid for m in exact_members[i]) + + member_rids = sorted(set(member_rids)) # unique and sorted + groups.append(Group( + gid=gid, + rep_rid=rep.rid, + members=tuple(member_rids), + token=blake64_hex(rep.can_smi), + rep_can_smi=rep.can_smi, + )) + + return groups, monomers + + +def assign_to_group( + smiles: str, + groups: list[Group], + monomers: list[Monomer], + keep_stereo: bool = False, + tanimoto_threshold: float = 0.6, + morgan_radius: int = 2, + morgan_num_bits: int = 2048, +) -> Group | None: + """ + Assign a new monomer (by SMILES) to an existing group if similar enough. 
+ + :param smiles: SMILES string of new monomer + :param groups: existing groups + :param monomers: existing monomers + :param keep_stereo: whether to retain stereochemistry during standardization + :param tanimoto_threshold: Tanimoto similarity threshold for structural grouping + :param morgan_radius: radius for Morgan fingerprint + :param morgan_num_bits: number of bits for Morgan fingerprint + :return: assigned Group or None if no match + """ + mol = standardize_from_smiles(smiles, keep_stereo=keep_stereo) + if mol is None: + raise ValueError(f"could not standardize SMILES {smiles} for group assignment") + + # Create gid -> group mapping + gid_to_group = {g.gid: g for g in groups} + + # Get canonical SMILES and fingerprint for SMILES-to-assign + can_smi = mol_to_smiles(mol, include_tags=False, canonical=True, isomeric=keep_stereo) + fp_new = mol_to_morgan_fingerprint(mol, radius=morgan_radius, num_bits=morgan_num_bits) + + # Exact canonical SMILES -> group (fast path) + rep_smi_to_gid = {g.rep_can_smi: g.gid for g in groups} + gid = rep_smi_to_gid.get(can_smi) + if gid is not None: + return gid_to_group[gid] + + # Similarity fallback vs. 
def _as_float_vectors(
    fp1: NDArray[np.int8],
    fp2: NDArray[np.int8],
) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
    """
    Flatten two fingerprints to 1D float64 vectors and check that their lengths agree.

    :param fp1: first fingerprint (any array-like; flattened to 1D)
    :param fp2: second fingerprint (any array-like; flattened to 1D)
    :return: tuple of the two flattened float64 arrays
    :raises ValueError: if the flattened fingerprints differ in length
    """
    a = np.asarray(fp1).ravel()
    b = np.asarray(fp2).ravel()
    if a.shape != b.shape:
        raise ValueError(f"Different lengths: {a.shape} vs {b.shape}")

    # Upcast to float so that dot products of small integer dtypes (e.g. int8) cannot overflow
    return a.astype(np.float64, copy=False), b.astype(np.float64, copy=False)


def calculate_cosine_similarity(fp1: NDArray[np.int8], fp2: NDArray[np.int8]) -> float:
    """
    Cosine similarity for fingerprints.

    :param fp1: first fingerprint (flattened to 1D)
    :param fp2: second fingerprint (flattened to 1D)
    :return: cosine similarity; in [0, 1] for non-negative fingerprints, 0.0 if either is all zeros
    :raises ValueError: if the fingerprints differ in length
    """
    a, b = _as_float_vectors(fp1, fp2)

    dot = float(np.dot(a, b))
    na = float(np.linalg.norm(a))
    nb = float(np.linalg.norm(b))

    # An all-zero fingerprint has no direction; define the similarity as 0 instead of dividing by 0
    if na == 0.0 or nb == 0.0:
        return 0.0

    return dot / (na * nb)


def calculate_tanimoto_similarity(fp1: NDArray[np.int8], fp2: NDArray[np.int8]) -> float:
    """
    Tanimoto similarity for molecular fingerprints (binary or count-based).

    Computed as ``a.b / (a.a + b.b - a.b)``, which equals the Jaccard index for binary vectors.

    :param fp1: first fingerprint (flattened to 1D)
    :param fp2: second fingerprint (flattened to 1D)
    :return: Tanimoto similarity; in [0, 1] for non-negative fingerprints, 0.0 if both are all zeros
    :raises ValueError: if the fingerprints differ in length
    """
    a, b = _as_float_vectors(fp1, fp2)

    # Dot product = intersection term
    ab = float(np.dot(a, b))
    aa = float(np.dot(a, a))
    bb = float(np.dot(b, b))

    # The denominator is only zero when both fingerprints are all zeros
    denom = aa + bb - ab
    if denom == 0.0:
        return 0.0

    return ab / denom
note:: Nodes are atom indices (or atom tags if `use_tags` is True) - """ - smiles = chem.mol_to_smiles(mol) - graph: Graph[int | str] = Graph( - smiles=smiles, - smiles_no_tags=chem.mol_to_smiles(deepcopy(mol), remove_tags=True), - ) - - # If use_tags is True, we will use atom isotopes as tags for nodes - for atom in mol.GetAtoms(): - if use_tags: - atom_tag: int = atom.GetIsotope() - if atom_tag == 0: - atom_idx: int = atom.GetIdx() - atom_tag = -1 * atom_idx - graph.add_node(atom_tag) - else: - graph.add_node(atom.GetIdx()) - - # Add edges between atoms based on bonds - # If use_tags is True, we will use atom isotopes as tags for edges - for bond in mol.GetBonds(): - if use_tags: - begin_atom_idx = bond.GetBeginAtomIdx() - begin_atom_tag = bond.GetBeginAtom().GetIsotope() - if begin_atom_tag == 0: - begin_atom_tag = -1 * begin_atom_idx - end_atom_idx = bond.GetEndAtomIdx() - end_atom_tag = bond.GetEndAtom().GetIsotope() - if end_atom_tag == 0: - end_atom_tag = -1 * end_atom_idx - graph.add_edge(begin_atom_tag, end_atom_tag) - else: - graph.add_edge(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) - - return graph - - -def merge_nodes( - graph: "Graph[int | str]", - merged_node_id: int | str, - nodes: list[int], - props: dict[str, Any] | None = None, -) -> None: - """ - Merge `nodes` into a single node `merged_node_id`. - - - Internal edges among `nodes` are removed. - - Edges from `nodes` to outside nodes are rewired to `merged_node_id`. - - If `merged_node_id` already exists and is in `nodes`, it is kept; - otherwise a new node is created. - - If `props` is provided, these attributes are set on the merged node - (overwriting any existing attributes with the same keys). - - :param graph: the graph to modify - :param merged_node_id: ID to keep/create for the merged node - :param nodes: node IDs to merge (must exist in `graph`) - :param props: attributes to assign to the merged node after merge - .. 
note:: this function modifies `graph` in place and returns None - """ - if not nodes: - return - - # Validate nodes exist - missing = [n for n in nodes if n not in graph] - if missing: - raise ValueError(f"Cannot merge {missing} - they are not in the graph.") - - nodes_set: set[int | str] = set(nodes) - - # Collect outside neighbors - outside_neighbors: set[int | str] = set() - for n in nodes: - for nbr in graph.neighbors(n): - if nbr not in nodes_set: - outside_neighbors.add(nbr) - - # Merge paths - if merged_node_id in nodes_set: - # Case A: keep existing merged_node_id; remove others - to_remove = nodes_set - {merged_node_id} - graph.remove_nodes_from(to_remove) - - # Reattach edges to outside neighbors - for nbr in outside_neighbors: - graph.add_edge(merged_node_id, nbr) - - else: - # Case B: create a brand-new merged_node_id - if merged_node_id in graph: - raise ValueError( - f"Cannot create merged node {merged_node_id} because it already exists " - "in the graph (and was not part of the nodes list)." - ) - - # Remove all original nodes, then add merged node - graph.remove_nodes_from(nodes) - graph.add_node(merged_node_id) - - # Reattach edges to outside neighbors - for nbr in outside_neighbors: - graph.add_edge(merged_node_id, nbr) - - # Apply/override attributes on the merged node - if props: - graph.nodes[merged_node_id].update(props) - - -def is_linear_graph(g: "Graph[int | str]") -> bool: - """ - Return True if `g` is a single path (a “string” of nodes): - - connected, - - acyclic, - - exactly two endpoints of degree 1 (or a single node), - - all other nodes of degree 2. 
- - :param g: the graph to check - :return: True if `g` is a linear graph, False otherwise - """ - n = g.number_of_nodes() - # Empty graph -> not a chain - if n == 0: - return False - - # Single node -> trivially linear - if n == 1: - return True - - # Must be connected - if not nx.is_connected(g): - return False - - # Tree check: exactly n-1 edges for acyclic connected graph - if g.number_of_edges() != n - 1: - return False - - # Count degrees - degrees: dict[int | str, int] = dict(g.degree()) - degs: list[int] = list(degrees.values()) - num_deg1 = sum(1 for d in degs if d == 1) - num_deg2 = sum(1 for d in degs if d == 2) - - # Exactly two endpoints (degree 1) and the rest degree 2 - return (num_deg1 == 2) and (num_deg2 == n - 2) - - -def get_linear_path(g: "Graph[int | str]") -> list[int | str] | None: - """ - If `g` is a linear graph (as per is_linear_graph), return the list - of nodes in path order; otherwise return None. - - :param g: the graph to extract the path from - :return: list of nodes in path order, or None if not linear - """ - # Single node case - if g.number_of_nodes() == 1: - return list(g.nodes()) - - if is_linear_graph(g): - # Find the two endpoints (degree == 1) - endpoints = [node for node, deg in g.degree() if deg == 1] - start, end = endpoints - - # The graph is a tree, so shortest_path is the unique path - return nx.shortest_path(g, source=start, target=end) - - -def iter_kmers(G: "Graph[int | str]", k: int) -> Generator[tuple[int | str, ...], None, None]: - """ - Generate all length-k node walks (k-mers) from graph G, returning node identifiers. - - Each yielded k-mer is a tuple of node identifiers (length == k). - - :param G: the graph to traverse - :param k: the length of the k-mers to generate - :return: a generator yielding k-mers as tuples of node identifiers - .. note:: enumerates walks (nodes may repeat), not simple paths - .. 
note:: for k == 1, yields one k-mer per node - """ - if k < 1: - raise ValueError("k must be >= 1") - - # k == 1: one k-mer per node - if k == 1: - for n in G.nodes(data=False): - yield (n,) - return - - # Build paths as lists of NodeT; convert to tuple only when yielding. - stack: list[tuple[int | str, list[int | str]]] = [(start, [start]) for start in G.nodes(data=False)] - - while stack: - node, path = stack.pop() - if len(path) == k: - yield tuple(path) - continue - for nbr in G.neighbors(node): - # Append neighbor to the current path (nodes may repeat) - stack.append((nbr, path + [nbr])) diff --git a/src/retromol/io.py b/src/retromol/io.py deleted file mode 100644 index b1ff9b5..0000000 --- a/src/retromol/io.py +++ /dev/null @@ -1,467 +0,0 @@ -"""This module describes the basic output class for RetroMol.""" - -import json -from collections import Counter, defaultdict -from dataclasses import dataclass -from typing import Any - -from networkx import Graph, node_link_data, node_link_graph - -from retromol.chem import ( - Mol, - get_tags_mol, - mol_to_inchikey, - mol_to_smiles, - neutralize_mol, - smiles_to_mol, -) - - -class Input: - """ - This class describes the input for a RetroMol run. - """ - - def __init__( - self, - cid: str, - repr: Mol | str, - props: dict[str, Any] | None = None, - tag_compound: bool = True, - reserved_tags: set[int] | None = None, - ) -> None: - """ - Initialize the input compound. 
- - :param cid: compound identifier - :param mol: RDKit molecule or SMILES string - :param props: additional properties - :param tag_compound: whether to tag the compound's atoms, existing tags will be preserved - """ - self.cid = cid - self.props = props - - if isinstance(repr, Mol): - self.mol = repr - if tag_compound: - reserved = reserved_tags or set() - for atom in self.mol.GetAtoms(): - tag = atom.GetIsotope() - if tag in reserved: - continue - if tag == 0: - idx = atom.GetIdx() - tag = idx + 1 - while tag in reserved: - tag += 1 - atom.SetIsotope(tag) - reserved.add(tag) - - # Store SMILES representation with tags - self.smi = mol_to_smiles(self.mol) - - else: - smi = repr - - # Sanitize SMILES - smi = smi.replace("[N]", "N") # avoid parsing issues with RDKit - - # Convert SMILES into RDKit molecule - self.mol = smiles_to_mol(smi, retain_largest_fragment=True) - neutralize_mol(self.mol) - - # Store original atom indices as isotope number - if tag_compound: - reserved = reserved_tags or set() - for atom in self.mol.GetAtoms(): - tag = atom.GetIsotope() - if tag in reserved: - continue - if tag == 0: - idx = atom.GetIdx() - tag = idx + 1 - while tag in reserved: - tag += 1 - atom.SetIsotope(tag) - reserved.add(tag) - - # Store SMILES representation with tags - self.smi = mol_to_smiles(self.mol) - - def get_tags(self) -> list[int]: - """ - Get the atom tags. - - :return: atom tags - """ - return get_tags_mol(self.mol) - - -@dataclass -class Result: - """ - A class representing the result of a RetroMol operation. - - :param input_id: a unique identifier for the input molecule - :param graph: a networkx Graph representing the motif graph of the input molecule. Nodes of this graph may have - a “graph” attribute which is itself another nx.Graph (or None), arbitrarily nested - :param props: additional properties associated with the input molecule. 
- :param sha256_reaction_rules: SHA256 hash of the reaction rules used (optional) - :param sha256_matching_rules: SHA256 hash of the matching rules used (optional) - """ - - input_id: str - graph: "Graph[int | str]" - props: dict[str, Any] | None - sha256_reaction_rules: str | None - sha256_matching_rules: str | None - - def serialize(self) -> dict[str, Any]: - """ - Serialize this Result to a JSON-friendly dict, including any nested graphs. - - :return: serialized representation of the Result - """ - return { - "input_id": self.input_id, - "graph": self._serialize_graph(self.graph), - "props": self.props, - "sha256_reaction_rules": self.sha256_reaction_rules, - "sha256_matching_rules": self.sha256_matching_rules, - } - - def get_input_smiles(self, remove_tags: bool = False) -> str: - """ - Get the SMILES representation of the input molecule. - - This is a convenience method to access the input's SMILES from the Result. - - :param remove_tags: if True, removes the atom tags from the SMILES. - :return: SMILES string of the input molecule. - """ - if remove_tags: - smiles = self.graph.graph.get("smiles_no_tags", None) - else: - smiles = self.graph.graph.get("smiles", None) - - if smiles is None: - raise ValueError("SMILES not found in graph attributes.") - - # Bit unsafe, but we trust our own data structure here - return smiles - - def get_props(self) -> dict[str, Any]: - """ - Get the additional properties associated with the input molecule. 
- - :return: dictionary of additional properties - """ - return self.props if self.props is not None else {} - - @staticmethod - def _serialize_graph(g: "Graph[int | str]") -> dict[str, Any]: - # First use node_link_data to turn the graph structure into primitives + attrs - data = node_link_data(g) - - # Now look for any node-attribute called "graph" that is itself an nx.Graph, and recurse - for node in data["nodes"]: - node["id"] = str(node["id"]) # Ensure node IDs are strings - sub = node.get("graph") - if isinstance(sub, Graph): - node["graph"] = Result._serialize_graph(sub) - # If it's None (or already a dict), leave it as is - - # Make sure to also stringify the link endpoints - for link in data["links"]: - link["source"] = str(link["source"]) - link["target"] = str(link["target"]) - - return data - - @staticmethod - def from_serialized(data: dict[str, Any]) -> "Result": - """ - Reconstruct a Result from the dict form produced by serialize(). - - :param data: serialized representation of the Result - :return: reconstructed Result object - """ - nested = data["graph"] - g = Result._deserialize_graph(nested) - props = data.get("props", None) - sha256_reaction_rules = data.get("sha256_reaction_rules", None) - sha256_matching_rules = data.get("sha256_matching_rules", None) - return Result( - input_id=data["input_id"], - graph=g, - props=props, - sha256_reaction_rules=sha256_reaction_rules, - sha256_matching_rules=sha256_matching_rules, - ) - - @staticmethod - def _deserialize_graph(data: dict[str, Any]) -> "Graph[int | str]": - # Convert string IDs back to integers - for node in data["nodes"]: - node["id"] = node["id"] - - # Convert string source/target IDs back to integers in links - for link in data["links"]: - link["source"] = link["source"] - link["target"] = link["target"] - - # Build the Graph object from its node-link dict - g = node_link_graph(data) - - # Walk each node, and if it has a "graph" attribute that is a dict, recurse - for _, attrs in 
g.nodes(data=True): - nested = attrs.get("graph") - if isinstance(nested, dict): - attrs["graph"] = Result._deserialize_graph(nested) - - return g - - def to_json(self) -> str: - """ - Dump the fully serialized form to a JSON string. - - :return: JSON string representation of the Result - """ - return json.dumps(self.serialize(), indent=2) - - @staticmethod - def from_json(s: str) -> "Result": - """ - Load a Result back from a JSON string. - - :param s: JSON string representation of the Result - :return: reconstructed Result object - """ - data = json.loads(s) - return Result.from_serialized(data) - - def summarize_by_depth(self, propagate_through_identified: bool = True) -> dict[int, dict[str, Any]]: - """ - For each depth d (root children are depth 1), accumulate UNIQUE input tags - from all identified nodes seen at depths <= d. - - :param propagate_through_identified: - if True, continue traversing into subgraphs of identified nodes - if False, only traverse into unidentified nodes' subgraphs - - Returns { depth: { - "covered_tag_count": int, # cumulative unique tags - "coverage": float, # covered_tag_count / |input_tags| - "n_nodes": int, # per-depth - "n_identified": int, # per-depth - "n_unidentified": int, # per-depth - "monomer_counts": Counter, # per-depth (identity -> count) - "wave_name": Optional[str], # first seen at that depth - }, ...} - """ - # Input tag universe - root_tags = self.graph.graph.get("tags") - if isinstance(root_tags, (list, tuple, set)): - T0 = {int(t) for t in root_tags} - else: - root_smi = self.graph.graph.get("smiles") - if not root_smi: - raise ValueError("Root graph missing 'tags' and 'smiles'.") - T0 = set(get_tags_mol(smiles_to_mol(root_smi))) - denom = len(T0) if T0 else 0 - - # Per-depth accumulators - tags_at_depth: dict[int, set[int]] = defaultdict(set) # ONLY from identified nodes - monomers_by_depth: dict[int, Counter[str]] = defaultdict(Counter) - nodes_by_depth: dict[int, int] = defaultdict(int) - identified_by_depth: 
dict[int, int] = defaultdict(int) - unidentified_by_depth: dict[int, int] = defaultdict(int) - wave_name_by_depth: dict[int, str] = {} - - def _walk(g, depth: int): - nd = depth + 1 - for _, attrs in g.nodes(data=True): - nodes_by_depth[nd] += 1 - - identity = attrs.get("identity") - node_tags_raw = attrs.get("tags") - - # Only identified nodes contribute tags - if identity is not None: - identified_by_depth[nd] += 1 - monomers_by_depth[nd][identity] += 1 - - if node_tags_raw is not None: - try: - node_tags = {int(t) for t in node_tags_raw} - except Exception: - node_tags = set() - # Only count tags that belong to the input - tags_at_depth[nd].update(node_tags & T0) - else: - unidentified_by_depth[nd] += 1 - - if nd not in wave_name_by_depth: - wn = attrs.get("wave_name") - if wn is not None: - wave_name_by_depth[nd] = wn - - sub = attrs.get("graph") - if isinstance(sub, Graph): - if propagate_through_identified or identity is None: - _walk(sub, nd) - - _walk(self.graph, depth=0) - - # Cumulative union across depths - depths: list[int] = sorted(set(nodes_by_depth) | set(tags_at_depth)) - cumulative: set[int] = set() - summary: dict[int, dict[str, Any]] = {} - for d in depths: - cumulative |= tags_at_depth.get(d, set()) - covered = len(cumulative) - cov = round((covered / denom), 4) if denom else 0.0 - summary[d] = { - "wave_name": wave_name_by_depth.get(d, None), - "monomer_counts": monomers_by_depth.get(d, Counter()), - "covered_tag_count": covered, - "coverage": cov, - "n_nodes": nodes_by_depth.get(d, 0), - "n_identified": identified_by_depth.get(d, 0), - "n_unidentified": unidentified_by_depth.get(d, 0), - } - return summary - - def max_depth(self) -> int: - """ - Return the deepest nesting level (wave index). 
Root is 0 - - :return: Maximum depth of the nested motif graph - """ - max_d = 0 - - def _recurse(g: "Graph[int | str]", d: int): - nonlocal max_d - max_d = max(max_d, d) - for _, attrs in g.nodes(data=True): - sub = attrs.get("graph") - if isinstance(sub, Graph): - _recurse(sub, d + 1) - - _recurse(self.graph, 0) - return max_d - - def best_total_coverage(self, round_to: int = 2) -> float: - """ - Best total coverage across all depths. - - :param round_to: number of decimal places to round the coverage to - :return: best total coverage as a float rounded to 'round_to' decimal places - """ - summary = self.summarize_by_depth(propagate_through_identified=False) - total = max(info.get("coverage", 0.0) for info in summary.values()) - # clamp & round like calculate_coverage() - total = max(0.0, min(1.0, total)) - return round(total, round_to) - - def get_unidentified_nodes(self) -> list[tuple[str, str]]: - """ - Return all unidentified nodes as a list of (SMILES_without_tags, InChIKey). - - - Traverses the full nested motif graph - - A node is considered unidentified if attrs.get("identity") is None - - If a node already has "smiles_no_tags", uses it; otherwise removes - isotopes/atom-maps from its "smiles" on the fly - - Deduplicates and returns results sorted by SMILES, then InChIKey - - :return: list of tuples (SMILES_without_tags, InChIKey) - """ - results: list[tuple[str, str]] = [] - seen: set[tuple[str, str]] = set() - - def _untag_smiles(tagged_smi: str) -> str: - # Convert to RDKit mol, strip isotopes & atom map numbers, then re-SMILES - mol = smiles_to_mol(tagged_smi) - for atom in mol.GetAtoms(): - # Remove our tag carriers (we use isotopes for tags; atom-maps just in case) - atom.SetIsotope(0) - try: - atom.SetAtomMapNum(0) # no-op if not set / older RDKit - except Exception: - pass - # Normalize charges - neutralize_mol(mol) - return mol_to_smiles(mol) - - def _inchikey_from_smiles(smi: str) -> str: - """ - Get InChIKey from SMILES string. 
- - :param smi: SMILES string - """ - mol = smiles_to_mol(smi) - return mol_to_inchikey(mol) # assumes this exists in retromol.chem - - def _walk(g: "Graph[int | str]") -> None: - """ - Recursive walker to find unidentified nodes. - - :param g: current graph to walk - """ - for _, attrs in g.nodes(data=True): - # Collect if unidentified - if attrs.get("identity") is None: - tagged = attrs.get("smiles") - if tagged is None: - continue - smi_no_tags = attrs.get("smiles_no_tags") or _untag_smiles(tagged) - try: - ik = _inchikey_from_smiles(smi_no_tags) - except Exception: - # Fall back to empty key if InChIKey generation fails - ik = "" - pair = (smi_no_tags, ik) - if pair not in seen: - seen.add(pair) - results.append(pair) - - # Recurse into subgraphs regardless of parent identification - sub = attrs.get("graph") - if isinstance(sub, Graph): - _walk(sub) - - _walk(self.graph) - results.sort(key=lambda x: (x[0], x[1])) - return results - - def get_identified_nodes(self) -> set[tuple[str, str, tuple[int, ...]]]: - """ - Traverse the nested motif graph and return all identified nodes. - - :return: each entry is (identity, tags) - """ - results: dict[tuple[str, tuple[int, ...]], str] = {} # SMILES might vary for same identity+tags - - def _walk(g: "Graph[int | str]") -> None: - for _, attrs in g.nodes(data=True): - identity = attrs.get("identity") - if identity is not None: - # Get untagged SMILES - smiles = attrs.get("smiles", "") - mol = smiles_to_mol(smiles) - smiles = mol_to_smiles(mol, remove_tags=True) - - tags_list: list[int] = attrs.get("tags", []) - tags_list.sort() - tags: tuple[int, ...] 
def iter_json(path: str, jsonl: bool = False) -> Generator[Any, None, None]:
    """
    Stream items from a JSON array or a JSON Lines (JSONL) file.

    :param path: path to the JSON or JSONL file
    :param jsonl: if True, treat the file as JSONL (one JSON object per line). If False, assume a single JSON array
    :yield: parsed JSON objects
    """
    with open(path, "rb") as f:
        if jsonl:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines between records
                yield json.loads(line)
        else:
            # Incremental parse of the top-level array, one element at a time.
            # NOTE(review): ijson is believed to return non-integer numbers as Decimal by
            # default, unlike json.loads in the JSONL branch - confirm this is acceptable downstream.
            yield from ijson.items(f, "item")


# Rule set shared by all tasks of one worker process; set once by _init_worker
_G_RULESET = None


def _init_worker(ruleset: RuleSet) -> None:
    """
    Initialize worker process with necessary global variables.

    :param ruleset: reaction/matching rule set
    """
    global _G_RULESET
    _G_RULESET = ruleset


def _process_compound(args_tuple: tuple[str, dict[str, Any]]) -> tuple[dict[str, Any] | None, str | None]:
    """
    Process a single compound in a worker process.

    :param args_tuple: (smiles, props)
    :return: (serialized_result or None on error, error message or None on success)
    """
    smiles, props = args_tuple
    try:
        # Fail fast: without a rule set there is no point in building the submission
        if _G_RULESET is None:
            raise RuntimeError("worker not properly initialized with rule set")
        submission = Submission(smiles, props=props)
        result_obj = run_retromol_with_timeout(submission, _G_RULESET)
        return result_obj.to_dict(), None
    except Exception as e:
        # Traceback not returned here to keep workers light-weight; caller can log
        return None, str(e)


@dataclass
class ResultEvent:
    """
    Represents the result of processing a single compound.

    :param result: serialized result dict or None if there was an error
    :param error: error message string or None if processing was successful
    """

    result: dict[str, Any] | None  # serialized result or None on error
    error: str | None  # error message or None on success


def _task_buffered_iterator(
    source_iter: Iterable[dict[str, Any]],
    smiles_col: str,
    batch_size: int,
) -> Iterator[list[tuple[str, dict[str, Any]]]]:
    """
    Convert row dicts into (smiles, props) tuples and yield in batches.

    Rows without the SMILES column are skipped. The full row dict (including the SMILES
    column itself) is passed along as props.

    :param source_iter: iterable of row dicts
    :param smiles_col: name of column containing SMILES
    :param batch_size: number of compounds per batch
    :return: iterator over lists of (smiles, props) tuples
    """
    buf: list[tuple[str, dict[str, Any]]] = []
    for rec in source_iter:
        if smiles_col not in rec:
            continue
        buf.append((str(rec[smiles_col]), rec))
        if len(buf) >= batch_size:
            yield buf
            buf = []

    # Flush the final, possibly partial batch
    if buf:
        yield buf


def run_retromol_stream(
    ruleset: RuleSet,
    row_iter: Iterable[dict[str, Any]],
    smiles_col: str = "smiles",
    workers: int = 1,
    batch_size: int = 2000,
    pool_chunksize: int = 50,
    maxtasksperchild: int = 2000,
    on_result: Callable[[ResultEvent], None] | None = None,
) -> Iterator[ResultEvent]:
    """
    Stream RetroMol results with multiprocessing, yielding ResultEvent as soon as
    each compound finishes. No files/logs are written here; callers are free to do so.

    Results within a batch arrive in completion order, not input order.

    :param ruleset: pre-loaded reaction/matching rule set
    :param row_iter: iterable of row dicts; rows lacking smiles_col are skipped
    :param smiles_col: name of column containing SMILES (default: "smiles")
    :param workers: number of worker processes (default: 1)
    :param batch_size: number of compounds submitted to the pool at once (default: 2000)
    :param pool_chunksize: chunksize for imap_unordered (default: 50)
    :param maxtasksperchild: max tasks per worker before restart (default: 2000)
    :param on_result: optional callback receiving each ResultEvent as it arrives
    :return: iterator over ResultEvent objects
    """
    # Each worker receives the rule set once through the initializer instead of per task
    with Pool(
        processes=workers,
        initializer=_init_worker,
        initargs=(ruleset,),
        maxtasksperchild=maxtasksperchild,
    ) as pool:
        for task_batch in _task_buffered_iterator(row_iter, smiles_col=smiles_col, batch_size=batch_size):
            for serialized, err in pool.imap_unordered(_process_compound, task_batch, chunksize=pool_chunksize):
                evt = ResultEvent(serialized, err)
                if on_result is not None:
                    on_result(evt)
                yield evt


def stream_table_rows(path: str, sep: str = ",", chunksize: int = 20_000) -> Iterator[dict[str, Any]]:
    """
    Stream CSV/TSV rows as dicts. Keeps memory usage low (chunked).

    All values are read as strings and empty cells stay empty strings (no NaN conversion).

    :param path: path to CSV/TSV file
    :param sep: field separator (default: ",")
    :param chunksize: number of rows to read per chunk (default: 20,000)
    :return: iterator over row dicts
    """
    chunks: Iterator[DataFrame] = read_csv(
        path,
        sep=sep,
        chunksize=chunksize,
        dtype=str,
        keep_default_na=False,
    )

    for chunk in chunks:
        # One bulk conversion per chunk; avoids building a Series per row as iterrows() does
        yield from chunk.to_dict(orient="records")


def stream_sdf_records(sdf_path: str, fast: bool = False) -> Iterator[dict[str, Any]]:
    """
    Stream SDF as dict rows: {'smiles': <SMILES>, ...props}.

    Molecules that cannot be read or converted to SMILES are skipped silently.

    :param sdf_path: path to SDF file
    :param fast: if True, skips sanitization and H removal (default: False)
    :return: iterator over record dicts
    """
    # Both steps are performed unless the caller asked for the fast path
    sanitize = not fast
    remove_hs = not fast
    suppl = SDMolSupplier(sdf_path, sanitize=sanitize, removeHs=remove_hs)
    for mol in suppl:
        if mol is None:
            continue
        try:
            try:
                smi = mol_to_smiles(mol)
            except Exception:
                # Retry once after sanitizing; an unsanitized mol may not be writable as SMILES
                sanitize_mol(mol, fix_hydrogens=False)
                smi = mol_to_smiles(mol)
            rec: dict[str, Any] = {"smiles": smi}
            # NOTE(review): an SDF property that is itself named "smiles" overwrites the
            # computed SMILES here - confirm this is intended.
            for pname in mol.GetPropNames():
                rec[pname] = mol.GetProp(pname)
            yield rec
        except Exception:
            # Deliberate best-effort: one bad record must not abort the whole stream
            continue


def stream_json_records(path: str, jsonl: bool = False) -> Iterator[dict[str, Any]]:
    """
    Stream JSON or JSONL records as dicts. Items that are not JSON objects are skipped.

    :param path: path to JSON or JSONL file
    :param jsonl: if True, treat as JSONL (one JSON object per line)
    :return: iterator over record dicts
    """
    for rec in iter_json(path, jsonl=jsonl):
        if isinstance(rec, dict):
            yield rec
- selected_subsets: list[int] = [] - covered_elements: set[int] = set() - for node, subset in sorted_subsets: - uncovered_elements = subset - covered_elements - - # Make sure that only a subset is selected if all elements are uncovered. - if uncovered_elements != subset: - continue - - if uncovered_elements: - selected_subsets.append(node) - covered_elements.update(uncovered_elements) - - return selected_subsets - - -def solve_exact_cover_with_priority( - enc_to_mol: dict[str, "chem.Mol"], - nodes_A: list[str], - nodes_B: list[str], - required_tags: Iterable[int], -) -> tuple[list[int], list[int]]: - """ - Partition `required_tags` into disjoint node-tag sets drawn from nodes_A ∪ nodes_B. - - Objective (lexicographic): - 1) Maximize the number of required tags covered by A (identified coverage) - 2) Among those, minimize the number of A nodes (prefer single big A over multiple small A) - 3) Then minimize total nodes - - :param enc_to_mol: mapping of encoding to RDKit molecule - :param nodes_A: list of node encodings in set A (identified) - :param nodes_B: list of node encodings in set B (unidentified) - :param required_tags: iterable of required tags to cover - :return: tuple of selected nodes from A and B - :raises ValueError: if no exact cover exists - """ - logger = logging.getLogger(config.LOGGER_NAME) - - req: set[int] = set(required_tags) - - def node_tags(node: str) -> set[int]: - mol = enc_to_mol[node] - return set(chem.get_tags_mol(mol)) - - # Build candidate subsets (filter out nodes that contain tags outside the required set) - candA = [(n, node_tags(n)) for n in nodes_A] - candB = [(n, node_tags(n)) for n in nodes_B] - candA = [(n, ts) for (n, ts) in candA if ts and ts.issubset(req)] - candB = [(n, ts) for (n, ts) in candB if ts and ts.issubset(req)] - - # Quick impossibility check: every required tag must appear in at least one candidate - tag_to_candidates: dict[int, list[tuple[str, str, set[int]]]] = {t: [] for t in req} - for src, pool in (("A", 
candA), ("B", candB)): - for n, ts in pool: - for t in ts: - tag_to_candidates[t].append((src, n, ts)) - - for t in req: - if not tag_to_candidates[t]: - error_msg = f"No node covers required tag {t}; exact cover impossible." - logger.error(error_msg) - - logger.error(f"Tags to cover: {req}") - - for node in nodes_A: - mol = enc_to_mol[node] - smiles_no_tags = chem.mol_to_smiles(deepcopy(mol), remove_tags=True) - logger.debug(f"Node A {node}: SMILES {smiles_no_tags}") - - for node in nodes_B: - mol = enc_to_mol[node] - smiles_no_tags = chem.mol_to_smiles(deepcopy(mol), remove_tags=True) - logger.debug(f"Node B {node}: SMILES {smiles_no_tags}") - - raise ValueError(error_msg) - - # Order candidates within each tag: A-first, larger sets first (helps reduce branching) - for t in req: - tag_to_candidates[t].sort( - key=lambda x: (x[0] != "A", -len(x[2])) # A before B; then bigger sets - ) - - # Greedy optimistic bound: how many tags can A at most still claim disjointly? - def optimistic_A_tag_gain(remaining_tags: set[int], used_tags: set[int]) -> int: - compat: list[tuple[str, set[int]]] = [] - for n, ts in candA: - if ts and ts <= remaining_tags and ts.isdisjoint(used_tags): - compat.append((n, ts)) - compat.sort(key=lambda x: -len(x[1])) # larger tag-sets first - covered: set[int] = set() - for _, ts in compat: - if ts.isdisjoint(covered): - covered |= ts - if covered >= remaining_tags: - break - return len(covered) - - # Choose the "most constrained" remaining tag (fewest compatible candidates) - def pick_most_constrained_tag(remaining: set[int], used: set[int]) -> int: - best_t = None - best_count = None - for t in remaining: - # Count only candidates compatible with used tags - opts = 0 - for _, _, ts in tag_to_candidates[t]: - if ts.isdisjoint(used) and ts <= remaining: - opts += 1 - if best_count is None or opts < best_count: - best_t, best_count = t, opts - if best_count == 1: - break # can't get more constrained than 1 - return best_t - - best_solution: 
tuple[list[str], list[str]] | None = None - best_A_tags = -1 # maximize - best_A_nodes = 10**9 # minimize - best_total = 10**9 # minimize - - def recurse( - used_tags: set[int], - used_A_tags: set[int], - chosen_A: list[str], - chosen_B: list[str], - ): - nonlocal best_solution, best_A_tags, best_A_nodes, best_total - - remaining = req - used_tags - if not remaining: - # Complete exact cover: compare by (|A-tags| desc, |A-nodes| asc, total asc) - A_tags_cnt = len(used_A_tags) - A_nodes_cnt = len(chosen_A) - total = A_nodes_cnt + len(chosen_B) - if ( - (A_tags_cnt > best_A_tags) - or (A_tags_cnt == best_A_tags and A_nodes_cnt < best_A_nodes) - or (A_tags_cnt == best_A_tags and A_nodes_cnt == best_A_nodes and total < best_total) - ): - best_solution = (chosen_A.copy(), chosen_B.copy()) - best_A_tags = A_tags_cnt - best_A_nodes = A_nodes_cnt - best_total = total - return - - # Branch-and-bound: can we still beat the best A-tag count? - # (Current |A-tags| + optimistic future A-tags) must exceed current best_A_tags. 
- cur_A_tags = len(used_A_tags) - if cur_A_tags + optimistic_A_tag_gain(remaining, used_tags) < best_A_tags: - return - - # Pick the hardest tag next - t = pick_most_constrained_tag(remaining, used_tags) - - # Try candidates that cover t, A-first (already ordered in tag_to_candidates) - for src, n, ts in tag_to_candidates[t]: - if not ts.isdisjoint(used_tags): - continue # not compatible (overlap) - if not ts <= remaining: - continue - - # Choose - used_tags.update(ts) - if src == "A": - chosen_A.append(n) - added_A = ts - used_A_tags.update(added_A) - else: - chosen_B.append(n) - added_A = None - - recurse(used_tags, used_A_tags, chosen_A, chosen_B) - - # Backtrack - if src == "A": - if added_A is None: - raise RuntimeError("Internal error: added_A should not be None for A nodes.") - chosen_A.pop() - used_A_tags.difference_update(added_A) - else: - chosen_B.pop() - used_tags.difference_update(ts) - - recurse(set(), set(), [], []) - - if best_solution is None: - raise ValueError("No exact cover found with the given A/B nodes for the required tags.") - - selA, selB = best_solution - return selA, selB - - -def identify_nodes( - encoding_to_mol: dict[str, chem.Mol], - matching_rules: list[rules.MatchingRule], - match_stereochemistry: bool = False, -) -> dict[str, dict[str, Any]]: - """ - Identify nodes in a reaction graph that match given motifs. 
- - :param encoding_to_mol: mapping of encoding to RDKit molecule - :param matching_rules: list of matching rules to apply - :param match_stereochemistry: whether to match stereochemistry during identification - :return: a dictionary mapping node encodings to motif IDs - """ - # Try to identify nodes, keep those that match - identity_mapping: dict[str, dict[str, Any]] = {} - for node in encoding_to_mol.keys(): - mol = encoding_to_mol[node] - matched = match_mol_greedily(mol, matching_rules, match_stereochemistry) - if matched: - rid, props = matched - identity_mapping[node] = {"identity": rid, "props": props} - - return identity_mapping diff --git a/src/retromol/model/__init__.py b/src/retromol/model/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/retromol/model/assembly_graph.py b/src/retromol/model/assembly_graph.py new file mode 100644 index 0000000..1ea7784 --- /dev/null +++ b/src/retromol/model/assembly_graph.py @@ -0,0 +1,616 @@ +"""Module contains utilities for defining and working with assembly graphs.""" + +from dataclasses import dataclass +from typing import Any, Iterable, Iterable, Iterator, Generator + +from rdkit.Chem.rdchem import Mol +import matplotlib.pyplot as plt +import networkx as nx + +from retromol.model.reaction_graph import MolNode +from retromol.chem.tagging import get_tags_mol + + +@dataclass(frozen=True) +class RootBondLink: + """ + One root bond connects two monomer tag-sets. 
+ + :var a1_idx: index of the first atom in the root bond + :var a2_idx: index of the second atom in the root bond + :var a1_tag: tag of the first atom in the root bond + :var a2_tag: tag of the second atom in the root bond + :var a1_symbol: element symbol of the first atom + :var a2_symbol: element symbol of the second atom + :var bond_type: stringified version of RDKit BondType + :var bond_order: bond order if available + """ + + a1_idx: int + a2_idx: int + a1_tag: int + a2_tag: int + + a1_symbol: str + a2_symbol: str + + bond_type: str # stringified version of RDKit BondType + bond_order: float | int | None # include if available + + +def build_assembly_graph( + root_mol: Mol, + monomers: Iterable[MolNode], + allow_overlaps: bool = False, + include_unassigned: bool =False, +) -> nx.Graph: + """ + Build an assembly graph from the given root molecule and monomers. + + :param root_mol: RDKit Mol representing the root molecule + :param monomers: iterable of MolNode representing the monomers + :param allow_overlaps: whether to allow overlapping monomers (default: False) + :param include_unassigned: whether to include unassigned regions as a node (default: False) + :return: NetworkX graph representing the assembly graph + """ + monomers = list(monomers) + + tag_to_monomer: dict[int, str] = {} + monomer_to_tags: dict[str, set[int]] = {} + + # Map root tags -> monomers + for m in monomers: + tags = get_tags_mol(m.mol) + monomer_to_tags[m.enc] = tags + + for t in tags: + if t in tag_to_monomer and not allow_overlaps: + raise ValueError(f"root tag {t} appears in multiple monomersL {tag_to_monomer[t]} and {m.enc}") + tag_to_monomer[t] = m.enc + + # Initialize empty graph + g = nx.Graph() + + # Add monomer nodes + for m in monomers: + tags = monomer_to_tags[m.enc] + identity = m.identity + g.add_node(m.enc, molnode=m, tags=tags, identity=identity) + + UNASSIGNED = "unassigned" + if include_unassigned: + g.add_node(UNASSIGNED, molnode=None, tags=set(), identity=None) + + # 
Scan root bonds + for b in root_mol.GetBonds(): + a1 = b.GetBeginAtom() + a2 = b.GetEndAtom() + + t1 = int(a1.GetIsotope()) + t2 = int(a2.GetIsotope()) + + if t1 == 0 or t2 == 0: + continue # skip non-tagged atoms + + m1 = tag_to_monomer.get(t1) + m2 = tag_to_monomer.get(t2) + + if m1 is None or m2 is None: + if not include_unassigned: + continue # skip bonds to unassigned regions + m1 = m1 if m1 is not None else UNASSIGNED + m2 = m2 if m2 is not None else UNASSIGNED + + if m1 == m2: + continue # skip intra-monomer bonds + + link = RootBondLink( + a1_idx=a1.GetIdx(), + a2_idx=a2.GetIdx(), + a1_tag=t1, + a2_tag=t2, + a1_symbol=a1.GetSymbol(), + a2_symbol=a2.GetSymbol(), + bond_type=str(b.GetBondType()), + bond_order=float(b.GetBondTypeAsDouble()) if hasattr(b, "GetBondTypeAsDouble") else None, + ) + + if g.has_edge(m1, m2): + g[m1][m2]["bonds"].append(link) + g[m1][m2]["n_bonds"] += 1 + else: + g.add_edge(m1, m2, bonds=[link], n_bonds=1) + + return g + + +@dataclass(frozen=True, slots=True) +class AssemblyGraph: + """ + Assembly graph representing monomer connectivity in a molecule. + + :var g: NetworkX graph representing the assembly graph + :var unassigned: name of the unassigned node + :var validate: validate graph structure upon initialization + """ + + g: nx.Graph + unassigned: str = "unassigned" + validate_upon_initialization: bool = False + + def __post_init__(self) -> None: + """ + Post-initialization to validate the graph if requested. + """ + if self.validate_upon_initialization: + self.validate() + + def __str__(self) -> str: + """ + String representation of the AssemblyGraph. + + :return: string representation + """ + return f"AssemblyGraph(num_nodes={self.g.number_of_nodes()}, num_edges={self.g.number_of_edges()})" + + def monomer_ids(self) -> list[str]: + """ + Get the list of monomer IDs in the assembly graph. 
+ + :return: list of monomer IDs + """ + return [n for n in self.g.nodes if n != self.unassigned] + + def monomer_nodes(self) -> list[MolNode]: + """ + Get the list of monomer MolNodes in the assembly graph. + + :return: list of MolNode instances + """ + out: list[MolNode] = [] + for n in self.monomer_ids(): + mn = self.g.nodes[n]["molnode"] + if mn is None: + raise ValueError(f"AssemblyGraph node {n!r} has None molnode") + out.append(mn) + + return out + + def edges_with_bonds(self) -> Iterator[tuple[str, str, list[RootBondLink]]]: + """ + Iterate over edges with their associated root bonds. + + :return: iterator of tuples (node1, node2, list of RootBondLink) + """ + for u, v, data in self.g.edges(data=True): + yield u, v, data["bonds"] + + def drop_unassigned(self) -> "AssemblyGraph": + """ + Drop the unassigned node from the assembly graph. + + :return: AssemblyGraph without the unassigned node + """ + h = self.g.copy() + + if h.has_node(self.unassigned): + h.remove_node(self.unassigned) + + return AssemblyGraph(g=h, unassigned=self.unassigned, validate_upon_initialization=True) + + def drop_singletons(self) -> "AssemblyGraph": + """ + Drop singleton nodes (nodes with degree 0) from the assembly graph. + + :return: AssemblyGraph with singleton nodes removed + """ + h = self.g.copy() + + singletons = [n for n, d in h.degree() if d == 0] + h.remove_nodes_from(singletons) + + return AssemblyGraph(g=h, unassigned=self.unassigned, validate_upon_initialization=True) + + def filtered_by_root_bond_elements( + self, + allow_pairs: set[frozenset[str]] | None = None, + drop_isolated: bool = True, + ) -> "AssemblyGraph": + """ + Filter the assembly graph by allowed root bond element pairs. 
+ + :param allow_pairs: set of allowed element symbol pairs (as frozensets) + :param drop_isolated: whether to drop isolated nodes after filtering (default: True) + :return: filtered AssemblyGraph + """ + h = self.g.copy() + + if allow_pairs is None: + return AssemblyGraph(g=h, unassigned=self.unassigned) + + to_remove: list[tuple[str, str]] = [] + for u, v, data in list(h.edges(data=True)): + bonds = data.get("bonds", []) + kept = [] + for link in bonds: + pair = frozenset((link.a1_symbol, link.a2_symbol)) + if pair in allow_pairs: + kept.append(link) + + if not kept: + to_remove.append((u, v)) + else: + data["bonds"] = kept + data["n_bonds"] = len(kept) + + h.remove_edges_from(to_remove) + + if drop_isolated: + iso = [n for n in h.nodes if h.degree(n) == 0 and n != self.unassigned] + h.remove_nodes_from(iso) + + return AssemblyGraph(g=h, unassigned=self.unassigned, validate_upon_initialization=True) + + def connected_components(self, keep_unassigned: bool = False) -> list["AssemblyGraph"]: + """ + Get the connected components of the assembly graph. + + :param keep_unassigned: whether to keep the unassigned node in components (default: False) + :return: list of AssemblyGraph instances representing connected components + """ + h = self.g.copy() + + if not keep_unassigned and h.has_node(self.unassigned): + h.remove_node(self.unassigned) + + comps: list["AssemblyGraph"] = [] + for nodes in nx.connected_components(h): + sub = h.subgraph(nodes).copy() + comps.append(AssemblyGraph(g=sub, unassigned=self.unassigned, validate_upon_initialization=True)) + + return comps + + def longest_path(self, keep_unassigned: bool = False, max_starts: int = 25) -> list[MolNode]: + """ + Find the longest path of monomer nodes in assembly graph. 
+ + :param keep_unassigned: whether to keep the unassigned node in the graph (default: False) + :param max_starts: maximum number of starting nodes for greedy search (default: 25) + :return: list of MolNode instances representing the longest path + """ + g = self.g + h = g.copy() + + if not keep_unassigned and hasattr(self, "unassigned") and h.has_node(self.unassigned): + h.remove_node(self.unassigned) + + # Double check that every node has a MolNode attached + for n in h.nodes: + if "molnode" not in h.nodes[n] or h.nodes[n]["molnode"] is None: + raise ValueError(f"AssemblyGraph node {n!r} has no valid 'molnode' attached") + + # Empty graph case + if h.number_of_nodes() == 0: + return [] + + def node_to_molnode(node_id: Any) -> MolNode: + """ + Convert a graph node ID to its corresponding MolNode. + + :param node_id: node ID in the graph + :return: corresponding MolNode + """ + mn = h.nodes[node_id].get("molnode", None) + if mn is None: + raise ValueError(f"Node {node_id!r} has no 'molnode' attached") + + return mn + + def to_molnodes(path_nodes: list[Any]) -> list[MolNode]: + """ + Convert a list of graph node IDs to their corresponding MolNodes. 
+ + :param path_nodes: list of node IDs in the graph + :return: list of corresponding MolNodes + """ + return [node_to_molnode(n) for n in path_nodes] + + + for comp_nodes in nx.connected_components(h): + # Work per connected component; pick the longest result + best_path_nodes: list[Any] = [] + + c = h.subgraph(comp_nodes).copy() + if c.number_of_nodes() == 0: + continue + + # Monomer graph is a tree: use diameter via two BFS + is_tree = nx.is_connected(c) and (c.number_of_edges() == c.number_of_nodes() - 1) + if is_tree: + # Diameter of a tree via two BFS + start = next(iter(c.nodes)) + dist1 = nx.single_source_shortest_path_length(c, start) + u = max(dist1, key=dist1.get) + + dist2 = nx.single_source_shortest_path_length(c, u) + v = max(dist2, key=dist2.get) + + path_uv = nx.shortest_path(c, u, v) + if len(path_uv) > len(best_path_nodes): + best_path_nodes = path_uv + continue + + # General/cyclic case: multi-start + 1-step lookahead + def greedy_walk(start: Any) -> list[Any]: + """ + Perform a greedy walk starting from the given node. + + :param start: starting node ID + :return: list of node IDs in the greedy path + """ + used = {start} + path = [start] + cur = start + + while True: + options = [nb for nb in c.neighbors(cur) if nb not in used] + if not options: + break + + best_nb = None + best_score = None + + # Precompute allowed nodes list once per step + # (unvisited nodes plus candidate neighbor) + all_nodes = list(c.nodes) + + for nb in options: + allowed = [x for x in all_nodes if x not in used] + [nb] + sub = c.subgraph(allowed) + + # How many nodes remain reachable if we go to nb? 
+ reachable = nx.single_source_shortest_path_length(sub, nb) + score = (len(reachable), -c.degree(nb)) # prefer more reachable, tie-break lower degree + + if best_score is None or score > best_score: + best_score = score + best_nb = nb + + if best_nb is None: + break + + used.add(best_nb) + path.append(best_nb) + cur = best_nb + + return path + + # Try low-degree starts first (often good for long simple paths) + nodes_sorted = sorted(c.nodes, key=lambda n: c.degree(n)) + n_starts = min(max_starts, len(nodes_sorted)) + starts = nodes_sorted[:n_starts] + + best_comp_path: list[Any] = [] + for s in starts: + cand = greedy_walk(s) + if len(cand) > len(best_comp_path): + best_comp_path = cand + + # Try starting from the found endpoints (sometimes extends) + if best_comp_path: + for endpoint in (best_comp_path[0], best_comp_path[-1]): + cand = greedy_walk(endpoint) + if len(cand) > len(best_comp_path): + best_comp_path = cand + + if len(best_comp_path) > len(best_path_nodes): + best_path_nodes = best_comp_path + + return to_molnodes(best_path_nodes) + + + def iter_kmers( + self, + k: int, + include_unassigned: bool = False, + identified_only: bool = False + ) -> Generator[tuple[MolNode, ...], None, None]: + """ + Iterate over all k-mers (paths of length k) in the assembly graph. 
+ + :param k: length of the k-mers (must be at least 1) + :param include_unassigned: whether to include the unassigned node in paths (default: False) + :param identified_only: whether to yield only k-mers with all identified monomers (default: False) + :yield: tuples of MolNode instances representing k-mers + """ + if k < 1: + raise ValueError("k must be at least 1") + + g = self.g + + def usable_node_ids() -> list[str]: + ids = [] + for n in g.nodes: + if (not include_unassigned) and (n == self.unassigned): + continue + ids.append(n) + return ids + + def node_to_molnode(node_id: str) -> MolNode: + mn = g.nodes[node_id].get("molnode", None) + if mn is None: + raise ValueError(f"AssemblyGraph node {node_id!r} has no 'molnode' attached") + return mn + + node_ids = usable_node_ids() + + # k == 1: one k-mer per node + if k == 1: + for nid in node_ids: + mn = node_to_molnode(nid) + if identified_only and not mn.is_identified: + continue + yield (mn,) + return + + # Stack items are (current_node_id, path_node_ids) + stack: list[tuple[str, list[str]]] = [(start, [start]) for start in node_ids] + + while stack: + cur, path = stack.pop() + + if len(path) == k: + km = tuple(node_to_molnode(pid) for pid in path) + if identified_only and any(not n.is_identified for n in km): + continue + yield km + continue + + for nbr in g.neighbors(cur): + if (not include_unassigned) and (nbr == self.unassigned): + continue + stack.append((nbr, path + [nbr])) + + + def validate(self) -> None: + """ + Validate the assembly graph structure. 
+ """ + for n, data in self.g.nodes(data=True): + for k in ("molnode", "tags", "identity"): + if k not in data: + raise ValueError(f"AssemblyGraph node {n!r} missing required attribute {k!r}") + + if not isinstance(data["tags"], set): + raise ValueError(f"AssemblyGraph node {n!r} tags must be set[int]") + + if data["molnode"] is None and n != self.unassigned: + raise ValueError(f"AssemblyGraph node {n!r} has None molnode but is not unassigned node") + + for u, v, data in self.g.edges(data=True): + for k in ("bonds", "n_bonds"): + if k not in data: + raise ValueError(f"AssemblyGraph edge {u!r}-{v!r} missing required attribute {k!r}") + + if not isinstance(data["bonds"], list): + raise ValueError(f"AssemblyGraph edge {u!r}-{v!r} bonds must be list[RootBondLink]") + + if not isinstance(data["n_bonds"], int): + raise ValueError(f"AssemblyGraph edge {u!r}-{v!r} n_bonds must be int") + + @classmethod + def build( + cls, + root_mol: Mol, + monomers: Iterable[MolNode], + allow_overlaps: bool = False, + include_unassigned: bool = False, + unassigned: str = "unassigned", + validate: bool = True, + ) -> "AssemblyGraph": + """ + Build an AssemblyGraph from the given root molecule and monomers. 
+ + :param root_mol: RDKit Mol representing the root molecule + :param monomers: iterable of MolNode representing the monomers + :param allow_overlaps: whether to allow overlapping monomers (default: False) + :param include_unassigned: whether to include unassigned regions as a node (default: False) + :param unassigned: name of the unassigned node (default: "unassigned") + :param validate: whether to validate the graph after building (default: True) + :return: AssemblyGraph instance + """ + g = build_assembly_graph( + root_mol=root_mol, + monomers=monomers, + allow_overlaps=allow_overlaps, + include_unassigned=include_unassigned, + ) + ag = cls(g=g, unassigned=unassigned) + + if validate: + ag.validate() + + return ag + + def draw( + self, + with_labels: bool = True, + show_unassigned: bool = False, + node_size: int = 1600, + font_size: int = 9, + edge_with_scale: float = 1.0, + savepath: str | None = None, + ) -> None: + """ + Visualize the assembly graph using Matplotlib. + + :param with_labels: whether to show node labels (default: True) + :param show_unassinged: whether to show the unassigned node (default: False) + :param node_size: size of the nodes (default: 1600) + :param font_size: font size for labels (default: 9) + :param edge_with_scale: scale factor for edge widths (default: 1.0) + :param savepath: optional path to save the figure (default: None) + """ + # Hide unassigned if requested + if not show_unassigned: + g = self.drop_unassigned().g + else: + g = self.g + + if g.number_of_nodes() == 0: + raise ValueError("AssemblyGraph has no nodes to show") + + # Layout + pos = nx.spring_layout(g, seed=42) + + # Node colors + node_colors = [] + labels = {} + + for n, data in g.nodes(data=True): + if n == self.unassigned: + node_colors.append("lightgray") + labels[n] = "unassigned" + continue + + ident = data.get("identity") + if ident is None: + node_colors.append("lightblue") + labels[n] = n[:8] + else: + node_colors.append("lightgreen") + labels[n] = 
getattr(ident, "name", n[:8]) + + # Edge widths from number of root bonds + widths = [edge_with_scale * max(1, data.get("n_bonds", 1)) for _, _, data in g.edges(data=True)] + + plt.figure(figsize=(8, 8)) + nx.draw_networkx_nodes( + g, + pos, + node_color=node_colors, + node_size=node_size, + edgecolors="black", + ) + nx.draw_networkx_edges( + g, + pos, + width=widths, + alpha=0.8, + ) + + if with_labels: + nx.draw_networkx_labels( + g, + pos, + labels=labels, + font_size=font_size, + ) + + plt.axis("off") + plt.tight_layout() + + if savepath is not None: + plt.savefig(savepath, dpi=300) + else: + plt.show() diff --git a/src/retromol/model/identity.py b/src/retromol/model/identity.py new file mode 100644 index 0000000..27b9b00 --- /dev/null +++ b/src/retromol/model/identity.py @@ -0,0 +1,57 @@ +"""Data structure for representing a molecular identity.""" + +from dataclasses import dataclass + +from retromol.model.rules import MatchingRule + + +@dataclass(frozen=True) +class MolIdentity: + """ + Represents the identity of a molecule based on matched rules. + + :var matched_rules: list[str]: List of matched rule identifiers + """ + + matched_rule: MatchingRule + + @property + def name(self) -> str: + """ + Get the name of the matched rule. + + :return: name of the matched rule + """ + return self.matched_rule.name + + @property + def terminal(self) -> bool: + """ + Check if the matched rule indicates a terminal identity. + + :return: True if the matched rule is terminal, False otherwise + """ + return self.matched_rule.terminal + + def to_dict(self) -> dict: + """ + Serialize the MolIdentity to a dictionary. + + :return: dictionary representation of the MolIdentity + """ + return { + "matched_rule": self.matched_rule.to_dict(), + } + + @classmethod + def from_dict(cls, data: dict) -> "MolIdentity": + """ + Deserialize a MolIdentity from a dictionary. 
+ + :param data: dictionary representation of the MolIdentity + :return: MolIdentity object + """ + matched_rule = MatchingRule.from_dict(data["matched_rule"]) + return cls( + matched_rule=matched_rule, + ) diff --git a/src/retromol/model/reaction_graph.py b/src/retromol/model/reaction_graph.py new file mode 100644 index 0000000..d289799 --- /dev/null +++ b/src/retromol/model/reaction_graph.py @@ -0,0 +1,305 @@ +"""Data structures for representing reaction application graphs.""" + +import logging +from dataclasses import dataclass, field +from typing import Any, Iterable, Literal + +from retromol.chem.mol import Mol, encode_mol, mol_to_smiles, smiles_to_mol +from retromol.model.identity import MolIdentity +from retromol.model.rules import MatchingRule +from retromol.chem.matching import match_mol + +log = logging.getLogger(__name__) + + +StepKind = Literal["uncontested", "contested"] + + +@dataclass(frozen=True) +class MolNode: + """ + A molecule node in the processing graph. + + :var enc: str: unique encoding of the molecule + :var mol: Mol: the molecule object + :var smiles: str: SMILES representation of the molecule + :var identity: MolIdentity | None: identification information if identified + :var identified: bool | None: whether the node has been checked for identification + """ + + enc: str + mol: Mol + smiles: str + identity: MolIdentity | None = None + identified: bool | None = None # None=unknown, False=checked-no, True=checked-yes + + @property + def is_checked(self) -> bool: + return self.identified is not None + + @property + def is_identified(self) -> bool: + return self.identified is True + + @property + def is_unidentified_checked(self) -> bool: + return self.identified is False + + def __str__(self) -> str: + """ + Return a string representation of the MolNode. 
+ + :return: str: string representation + """ + id_name = self.identity.name if self.identity else None + return f"MolNode(enc={self.enc}, id={id_name})" + + def identify(self, rules: list[MatchingRule], match_stereochemistry: bool = False) -> MolIdentity | None: + """ + Identify the molecule node using the provided matching rules. + + :param rules: list[MatchingRule]: the matching rules to apply + :param match_stereochemistry: bool: whether to consider stereochemistry in matching + :return: MolIdentity | None: the identity if matched, else None + """ + if self.is_checked: + return self.identity # identity is present only if identified=True + + if identity := match_mol(self.mol, rules, match_stereochemistry): + object.__setattr__(self, "identity", identity) + object.__setattr__(self, "identified", True) + return identity + + object.__setattr__(self, "identity", None) + object.__setattr__(self, "identified", False) + return None + + def to_dict(self) -> dict[str, Any]: + """ + Serialize the MolNode to a dictionary. + + :return: dictionary representation of the MolNode + """ + return { + "enc": self.enc, + "tagged_smiles": mol_to_smiles(self.mol, include_tags=True), + "smiles": self.smiles, + "identity": self.identity.to_dict() if self.identity else None, + "identified": self.identified, + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "MolNode": + """ + Deserialize a MolNode from a dictionary. + + :param data: dictionary representation of the MolNode + :return: MolNode object + """ + identity = MolIdentity.from_dict(data["identity"]) if data["identity"] else None + + node = cls( + enc=data["enc"], + mol=smiles_to_mol(data["tagged_smiles"]), + smiles=data["smiles"], + identity=identity, + identified=data["identified"], + ) + return node + + +@dataclass(frozen=True) +class ReactionStep: + """ + Edge payload: desribes one application event. 
+ - uncontested: multiple rules applied as one step + - contested: exactly one rule applied + + :var kind: StepKind: 'uncontested' or 'contested' + :var names: tuple[str, ...]: reaction rule IDs (human-facing) + :var rule_ids: tuple[str, ...]: optional numeric IDs (stable internal) + """ + + kind: StepKind + names: tuple[str, ...] # reaction rule IDs (human-facing) + rule_ids: tuple[str, ...] = () # optional numeric IDs (stable internal) + + def to_dict(self) -> dict[str, Any]: + """ + Serialize the ReactionStep to a dictionary. + + :return: dictionary representation of the ReactionStep + """ + return { + "kind": self.kind, + "names": self.names, + "rule_ids": self.rule_ids, + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "ReactionStep": + """ + Deserialize a ReactionStep from a dictionary. + + :param data: dictionary representation of the ReactionStep + :return: ReactionStep object + """ + step = cls( + kind=data["kind"], + names=tuple(data["names"]), + rule_ids=tuple(data.get("rule_ids", ())), + ) + return step + + +@dataclass +class RxnEdge: + """ + Directed hyper-edge parent -> children, labeled by ReactionStep. + + :var src: int: encoding of source molecule node + :var dsts: tuple[int, ...]: encodings of child molecule nodes + :var step: ReactionStep: details of the reaction application + """ + + src: int + dsts: tuple[int, ...] + step: ReactionStep + + def to_dict(self) -> dict[str, Any]: + """ + Serialize the RxnEdge to a dictionary. + + :return: dictionary representation of the RxnEdge + """ + return { + "src": self.src, + "dsts": self.dsts, + "step": self.step.to_dict(), + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "RxnEdge": + """ + Deserialize a RxnEdge from a dictionary. 
+ + :param data: dictionary representation of the RxnEdge + :return: RxnEdge object + """ + edge = cls( + src=data["src"], + dsts=tuple(data["dsts"]), + step=ReactionStep.from_dict(data["step"]), + ) + return edge + + +@dataclass +class ReactionGraph: + """ + Simple directed hypergraph: + - nodes: enc -> MolNode + - edges: list of RxnEdge + - out_edges: adjacency index for fast traversal + """ + + nodes: dict[str, MolNode] = field(default_factory=dict) + edges: list[RxnEdge] = field(default_factory=list) + out_edges: dict[str, list[int]] = field(default_factory=dict) # enc -> indices into edges + + @property + def identified_nodes(self) -> dict[str, MolNode]: + """ + Return only identified nodes. + + :return: dict[str, MolNode]: mapping of encodings to identified MolNodes + """ + return {enc: node for enc, node in self.nodes.items() if node.is_identified} + + def __str__(self) -> str: + """ + Return a string representation of the ReactionGraph. + + :return: str: string representation + """ + return f"ReactionGraph(num_nodes={len(self.nodes)}, num_edges={len(self.edges)})" + + def add_node(self, mol: Mol) -> int: + """ + Add a molecule node to the graph if not already present. + + :param mol: molecule to add + :param keep_stereo_smiles: whether to keep stereochemistry in SMILES + :return: encoding of the molecule node + """ + enc = encode_mol(mol) + if enc not in self.nodes: + self.nodes[enc] = MolNode(enc=enc, mol=Mol(mol), smiles=mol_to_smiles(mol, include_tags=False)) + self.out_edges.setdefault(enc, []) + + return enc + + def add_edge(self, src_enc: str, child_mols: Iterable[Mol], step: ReactionStep) -> tuple[str, ...]: + """ + Add a reaction edge to the graph. 
+ + :param src_enc: encoding of the source molecule node + :param child_mols: iterable of child molecule nodes + :param step: ReactionStep describing the reaction + :return: tuple of encodings of the child molecule nodes + """ + dst_encs: list[str] = [] + for m in child_mols: + dst_encs.append(self.add_node(m)) + + edge = RxnEdge(src=src_enc, dsts=tuple(dst_encs), step=step) + self.edges.append(edge) + self.out_edges.setdefault(src_enc, []).append(len(self.edges) - 1) + + return tuple(dst_encs) + + def get_leaf_nodes(self, identified_only: bool = True) -> list[MolNode]: + """ + Get all leaf nodes (nodes with no outgoing edges). + + :param identified_only: whether to include only identified nodes + :return: list of MolNode objects that are leaves + """ + leaves: list[MolNode] = [] + + for enc, node in self.nodes.items(): + # No outgoing edges -> leaf + if not self.out_edges.get(enc): + if identified_only and not node.is_identified: + continue + leaves.append(node) + + return leaves + + + def to_dict(self) -> dict[str, Any]: + """ + Serialize the ReactionGraph to a dictionary. + + :return: dictionary representation of the ReactionGraph + """ + return { + "nodes": {enc: node.to_dict() for enc, node in self.nodes.items()}, + "edges": [edge.to_dict() for edge in self.edges], + "out_edges": {enc: indices for enc, indices in self.out_edges.items()}, + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "ReactionGraph": + """ + Deserialize a ReactionGraph from a dictionary. 
+ + :param data: dictionary representation of the ReactionGraph + :return: ReactionGraph object + """ + reaction_graph = cls( + nodes={int(enc): MolNode.from_dict(node_data) for enc, node_data in data["nodes"].items()}, + edges=[RxnEdge.from_dict(edge_data) for edge_data in data["edges"]], + out_edges={int(enc): indices for enc, indices in data["out_edges"].items()}, + ) + return reaction_graph diff --git a/src/retromol/model/readout.py b/src/retromol/model/readout.py new file mode 100644 index 0000000..2260e1e --- /dev/null +++ b/src/retromol/model/readout.py @@ -0,0 +1,85 @@ +"""Data structures for representing readouts from RetroMol parsing results.""" + +from dataclasses import dataclass +from typing import Literal + +from retromol.model.reaction_graph import MolNode +from retromol.model.assembly_graph import AssemblyGraph +from retromol.model.result import Result +from retromol.model.rules import MatchingRule +from retromol.chem.mol import encode_mol +from retromol.chem.tagging import get_tags_mol + + +ReadoutMode = Literal["leaf_identified", "first_identified"] + + +@dataclass(frozen=True) +class LinearReadout: + """ + A linear readout representation of a RetroMol parsing result. + """ + + assembly_graph: AssemblyGraph + paths: list[list[MolNode]] + + def __str__(self) -> str: + """ + Return a string representation of the LinearReadout. + + :return: str: string representation + """ + return f"LinearReadout(assembly_graph_nodes={self.assembly_graph.g.number_of_nodes()}; assembly_graph_edges={self.assembly_graph.g.number_of_edges()}; num_paths={len(self.paths)})" + + @classmethod + def from_result( + cls, + result: Result, + root_enc: str | None = None, + exclude_identities: list[MatchingRule] | None = None, + include_identities: list[MatchingRule] | None = None, + ) -> "LinearReadout": + """ + Create a LinearReadout from a Result object. 
+ + :param result: RetroMol parsing result + :param root_enc: optional root molecule encoding; if None, use submission molecule + :param exclude_identities: list of matching rules to exclude identities (not used here) + :param include_identities: list of matching rules to include identities (not used here) + :return: LinearReadout instance + :raises ValueError: if root_enc not found in reaction graph nodes + """ + exclude_identities = exclude_identities or [] + include_identities = include_identities # keep None meaning "no whitelist" + + # Convert identities to their IDs for easier checking + exclude_identities = set([r.id for r in exclude_identities]) + if include_identities is not None: + include_identities = set([r.id for r in include_identities]) + + g = result.reaction_graph + if root_enc is None: + root_enc = encode_mol(result.submission.mol) + + if root_enc not in g.nodes: + raise ValueError(f"root_enc {root_enc} not found in reaction graph nodes") + + # Use root_enc to get root mol + root = g.nodes[root_enc].mol + + # Create assembly graph of monomers; first collect nodes to include + collected = g.get_leaf_nodes(identified_only=False) + a = AssemblyGraph.build(root_mol=root, monomers=collected, include_unassigned=True) + + # Break bonds between monomers that are not backbone-related bonds (i.e., keep C-C and C-N bonds only) + f = a.filtered_by_root_bond_elements(allow_pairs={frozenset(("C", "C")), frozenset(("C", "N"))}, drop_isolated=False) + + # Get individual connected components from the assembly graph and extract longest paths (allow to visit each edge only once) + hs = f.connected_components() + + paths: list[list[MolNode]] = [] + for h in hs: + path = h.longest_path() + paths.append(path) + + return cls(assembly_graph=a, paths=paths) diff --git a/src/retromol/model/result.py b/src/retromol/model/result.py new file mode 100644 index 0000000..2a79211 --- /dev/null +++ b/src/retromol/model/result.py @@ -0,0 +1,74 @@ +"""Module defining the Result data 
class.""" + +from dataclasses import dataclass, asdict +from typing import Any + +from retromol.model.submission import Submission +from retromol.model.reaction_graph import ReactionGraph +from retromol.chem.tagging import get_tags_mol + + +@dataclass(frozen=True) +class Result: + """ + Represents a RetroMol parsing result. + """ + + submission: Submission + reaction_graph: ReactionGraph + + def __str__(self) -> str: + """ + String representation of the Result. + + :return: string representation of the Result + """ + return f"Result(submission={self.submission}, reaction_graph={self.reaction_graph})" + + def calculate_coverage(self) -> float: + """ + Calculate coverage score for result. + + :return: coverage score as a float + """ + # Collect all unique tags from identified nodes + identified_tags = set() + for node in self.reaction_graph.identified_nodes.values(): + identified_tags.update(get_tags_mol(node.mol)) + + # Get all unique tags from the root + root_tags = set(get_tags_mol(self.submission.mol)) + + # Calculate coverage: proportion of root tags identified + if root_tags: + coverage = len(identified_tags.intersection(root_tags)) / len(root_tags) + return coverage + + return 0.0 + + def to_dict(self) -> dict[str, Any]: + """ + Serialize the Result to a dictionary. + + :return: dictionary representation of the Result + """ + return { + "submission": self.submission.to_dict(), + "reaction_graph": self.reaction_graph.to_dict(), + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "Result": + """ + Deserialize a Result from a dictionary. 
+ + :param data: dictionary representation of the Result + :return: Result object + """ + submission = Submission.from_dict(data["submission"]) + reaction_graph = ReactionGraph.from_dict(data["reaction_graph"]) + + return cls( + submission=submission, + reaction_graph=reaction_graph, + ) diff --git a/src/retromol/model/rules.py b/src/retromol/model/rules.py new file mode 100644 index 0000000..38f306d --- /dev/null +++ b/src/retromol/model/rules.py @@ -0,0 +1,475 @@ +"""Module defining reaction and matching rules.""" + +import logging +import itertools +import hashlib +from collections import Counter +from dataclasses import dataclass, field +from importlib.resources import files +from pathlib import Path +from typing import Any + +import yaml +from rdkit.Chem.rdchem import Mol +from rdkit.Chem.rdChemReactions import ChemicalReaction + +import retromol.data +from retromol.chem.mol import ( + mol_to_smiles, + smiles_to_mol, + count_fragments, + sanitize_mol, + reassign_stereochemistry, +) +from retromol.chem.reaction import smarts_to_reaction, reactive_template_atoms +from retromol.chem.tagging import get_tags_mol +from retromol.chem.masking import is_masked_preserved + + +log = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class ReactionRule: + """ + Represents a chemical reaction rule defined by a SMARTS pattern. + + :var name: str: name of the reaction rule + :var smarts: str: SMARTS pattern defining the reaction + :var props: dict[str, Any]: additional properties associated with the rule + :var allowed_in_bulk: bool: whether this rule is allowed to be applied in bulk preprocessing + """ + + name: str + smarts: str + props: dict[str, Any] + allowed_in_bulk: bool = False + + rxn: ChemicalReaction = field(init=False, repr=False) + + def __post_init__(self) -> None: + """ + Initialize the ChemicalReaction from the SMARTS pattern. 
+ """ + rxn = smarts_to_reaction(self.smarts) + object.__setattr__(self, "rxn", rxn) + + @property + def id(self) -> str: + """ + Unique identifier for the reaction rule based on its SMARTS pattern. + + :return: str: unique identifier + """ + return hashlib.sha256(self.smarts.encode("utf-8")).hexdigest() + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "ReactionRule": + """ + Create a ReactionRule instance from a dictionary. + + :param data: dict[str, Any]: dictionary containing rule data + :return: ReactionRule: the created ReactionRule instance + """ + reaction_rule = cls( + name=data["name"], + smarts=data["smarts"], + props=data.get("props", {}), + allowed_in_bulk=data.get("allowed_in_bulk", False), + ) + return reaction_rule + + def apply(self, reactant: Mol, mask_tags: set[int] | None = None) -> list[list[Mol]]: + """ + Apply the reaction to the given reactant molecule, optionally enforcing a mask on atom tags. + + :param reactant: Mol: the reactant molecule + :param mask_tags: set[int] | None: set of atom tags (isotope-based tags) that are allowed to change + :return: list[list[Mol]]: list of unique product tuples (each tuple as a list[Mol]) + """ + log.debug(f"applying reaction rule '{self.name}'") + + results = self.rxn.RunReactants([reactant]) + if not results: + log.debug("no products generated for reactant") + return [] + log.debug(f"generated {len(results)} raw product tuple(s)") + + # Sanitize and filter + kept: list[list[Mol]] = [] + for tup in results: + products: list[Mol] = [] + + # Quick shape check, and sanitize + atom_tag_sets: list[set[int]] = [] + ok_tuple = True + for prod in tup: + + # Check if product is single component + if not count_fragments(prod) == 1: + log.debug("product has multiple components, skipping") + ok_tuple = False + break + + # Sanitize in place + if not sanitize_mol(prod, fix_hydrogens=True): + log.debug("product sanitization failed, skipping") + ok_tuple = False + break + + # Reassign stereo on the 
sanitized product + prod = reassign_stereochemistry(prod) + + products.append(prod) + atom_tag_sets.append(get_tags_mol(prod)) + + if not ok_tuple: + log.debug("product tuple failed validation, skipping") + continue + + # Disallow overlapping tag sets across products + total_tags = sum(len(s) for s in atom_tag_sets) + union_tags = len(set().union(*atom_tag_sets)) if atom_tag_sets else 0 + if atom_tag_sets and total_tags != union_tags: + log.debug("products share atom tags, skipping") + continue + + # Mask check + if mask_tags is not None and not is_masked_preserved(reactant, products, mask_tags): + log.debug("products modify tags outside mask, skipping") + continue + + kept.append(products) + + if len(kept) <= 1: + return kept + + # Stereo-aware dereplication (order-insensitive, multiplicity-preserving) + seen: dict[tuple[tuple[str, int], ...], int] = {} + unique: list[list[Mol]] = [] + for res in kept: + + # Create keys based on the SMILES of products without tags + c = Counter(mol_to_smiles(p, include_tags=False, isomeric=True, canonical=True) for p in res) + key = tuple(sorted(c.items(), key=lambda kv: kv[0])) + + if key in seen: + continue + + seen[key] = 1 + unique.append(res) + + return unique + + +def index_uncontested( + mol: Mol, + rules: list[ReactionRule], + failed_combos: set[tuple[int, frozenset[int]]], +) -> list[tuple[ReactionRule, set[int]]]: + """ + Index uncontested reactions for applying preprocessing rules in bulk. 
+ + :param mol: RDKit molecule + :param rules: List of preprocessing rules + :param failed_combos: Set of failed combinations to avoid infinite loops + :return: Uncontested reactions + """ + up_for_election: list[tuple[ReactionRule, set[int], set[int]]] = [] + for rl in rules: + if not rl.rxn: + continue # skip rules without a reaction template + + reactive_inds = reactive_template_atoms(rl.rxn)[0] + all_reactant_matches: list[tuple[tuple[int, ...], ...]] = [] + all_reactant_matches_reactive_items: list[list[list[int]]] = [] + for template_idx in range(rl.rxn.GetNumReactantTemplates()): + reactant_template = rl.rxn.GetReactantTemplate(template_idx) + reactant_matches: tuple[tuple[int, ...], ...] = mol.GetSubstructMatches(reactant_template) + all_reactant_matches.append(reactant_matches) + new_reactant_matches: list[list[int]] = [] + for reactant_match in reactant_matches: + new_reactant_matches.append([reactant_match[idx] for idx in reactive_inds]) + all_reactant_matches_reactive_items.append(new_reactant_matches) + + # Generate all possible match sets, for when reaction template matches multiple sites + match_sets = list(itertools.product(*all_reactant_matches)) + match_sets_reactive_items = list(itertools.product(*all_reactant_matches_reactive_items)) + match_sets = [set(itertools.chain(*match_set)) for match_set in match_sets] + match_sets_reactive_items = [set(itertools.chain(*match_set)) for match_set in match_sets_reactive_items] + for match_set, match_set_reactive_items in zip(match_sets, match_sets_reactive_items, strict=True): + up_for_election.append((rl, match_set, match_set_reactive_items)) + + # Check which reactions with matched templates are uncontested and which are contested + uncontested: list[tuple[ReactionRule, set[int]]] = [] + for i, (rl, match_set, match_set_reactive_items) in enumerate(up_for_election): + # TODO: Rules with ring matching conditions are always contested + # if rl.has_ring_matching_condition(): + # continue + + # Check if 
match set has overlap with any other match set + # has_overlap = any(match_set.intersection(o) for j, (_, o, o_r) in enumerate(up_for_election) if i != j) + has_overlap = any(match_set_reactive_items.intersection(o_r) for j, (_, _, o_r) in enumerate(up_for_election) if i != j) + if not has_overlap: + uncontested.append((rl, match_set)) + + # Filter out failed combinations to avoid infinite loops + uncontested = [(rl, match_set) for rl, match_set in uncontested if (rl.id, frozenset(match_set)) not in failed_combos] + + return uncontested + + +def apply_uncontested( + parent: Mol, + uncontested: list[tuple[ReactionRule, set[int]]], + original_taken_tags: set[int], +) -> tuple[list[Mol], list[tuple[ReactionRule, set[int]]], set[tuple[int, frozenset[int]]]]: + """ + Apply uncontested reactions in bulk. + + :param parent: RDKit molecule + :param uncontested: List of uncontested reactions + :param original_taken_tags: List of atom tags from original reactant + :return: list of true products, a list of applied ReactionRules with their masks, and a set of failed combinations + """ + applied_reactions: list[tuple[ReactionRule, set[int]]] = [] + + tags_in_parent: set[int] = set(get_tags_mol(parent)) + + # We make sure all atoms, even the ones not from original reactant, have a + # unique isotope number, so we can track them through consecutive reactions + temp_taken_tags = get_tags_mol(parent) + for atom in parent.GetAtoms(): + if atom.GetIsotope() == 0: + tag = 1 + while tag in original_taken_tags or tag in temp_taken_tags: + tag += 1 + atom.SetIsotope(tag) + temp_taken_tags.add(tag) + + # Validate that all atoms have a unique tag + num_tagged_atoms = len(set(get_tags_mol(parent))) + if num_tagged_atoms != len(parent.GetAtoms()): + raise ValueError("Not all atoms have a unique tag before applying uncontested reactions") + + # Map tags to atomic nums so we can create masks and reassign atomic nums later on + idx_to_tag = {a.GetIdx(): a.GetIsotope() for a in 
parent.GetAtoms()} + + # All uncontested reactions become a single node in the reaction_graph + products: list[Mol] = [] + failed_combos: set[tuple[int, frozenset[int]]] = set() # keep track of failed combinations to avoid infinite loops + + for rl, match_set in uncontested: + msk = set([idx_to_tag[idx] for idx in match_set]) # create mask for reaction + + # We use the input parent if there are no products, otherwise we have to find out + # which product now contains the mask (i.e., the reaction template for this reaction) + if len(products) != 0: + new_parent: Mol | None = None + for product in products: + product_tags = set(get_tags_mol(product)) + if msk.issubset(product_tags): + new_parent = product + products = [p for p in products if p != product] + break + + if new_parent is None: + # raise ValueError("no product found that contains the mask") + # If no product is found, we continue with the next uncontested reaction + continue + + parent = new_parent + + # Register all tags currently taken by atoms in parent + temp_taken_tags_uncontested = get_tags_mol(parent) + + # Newly introduced atoms by one of the uncontested reactions need a unique tag + for atom in parent.GetAtoms(): + if atom.GetIsotope() == 0: # newly introduced atom has tag 0 + # Loop until we find a tag that is not already taken + tag = 1 + while tag in (temp_taken_tags_uncontested | original_taken_tags | temp_taken_tags): + tag += 1 + atom.SetIsotope(tag) + temp_taken_tags_uncontested.add(tag) + + unmasked_parent = Mol(parent) # keep original parent for later + results = rl.apply(parent, msk) # apply reaction rule + + try: + if len(results) == 0: + raise ValueError(f"No products from uncontested reaction {rl.name}") + + if len(results) > 1: + raise ValueError(f"More than one product from uncontested reaction {rl.name}") + + result = results[0] + applied_reactions.append((rl, match_set)) # keep track of successfully applied reactions + + # Reset atom tags in products for atoms not in original 
reactant + for product in result: + for atom in product.GetAtoms(): + if atom.GetIsotope() not in original_taken_tags and atom.GetIsotope() != 0: + atom.SetIsotope(0) + products.append(product) + + except Exception: + # Start function again with the next uncontested reaction + for atom in parent.GetAtoms(): + if atom.GetIsotope() not in original_taken_tags and atom.GetIsotope() != 0: + atom.SetIsotope(0) + products.append(unmasked_parent) + failed_combos.add((rl.id, frozenset(match_set))) + + for product in products: + # Any tag in product that is not in parent should be 0; otherwise we run into issues with + # the set cover algorithm + for atom in product.GetAtoms(): + if atom.GetIsotope() not in tags_in_parent and atom.GetIsotope() != 0: + atom.SetIsotope(0) + + return products, applied_reactions, failed_combos + + +@dataclass(frozen=True) +class MatchingRule: + """ + Represents a molecular matching rule defined by a SMILES pattern. + + :var name: str: name of the matching rule + :var smiles: str: SMILES pattern defining the motif + :var props: dict[str, Any]: additional properties associated with the rule + :var terminal: bool: whether this rule is terminal (i.e., should not be expanded further) + :var family_tokens: tuple[str, ...]: tokens representing the family of the matching rule + :var ancestor_tokens: tuple[tuple[str, ...]]: tokens representing the ancestors of the matching rule + """ + + name: str + smiles: str + props: dict[str, Any] + terminal: bool = True + + family_tokens: set[str] = field(default_factory=set) + ancestor_tokens: list[str] = field(default_factory=list) + + mol: Mol = field(init=False, repr=False) + + def __post_init__(self) -> None: + """ + Initialize the Mol from the SMILES pattern. + """ + mol = smiles_to_mol(self.smiles) + object.__setattr__(self, "mol", mol) + + @property + def id(self) -> str: + """ + Unique identifier for the matching rule based on its SMILES pattern. 
+ + :return: str: unique identifier + """ + return hashlib.sha256(self.smiles.encode("utf-8")).hexdigest() + + def to_dict(self) -> dict[str, Any]: + """ + Serialize the MatchingRule to a dictionary. + + :return: dictionary representation of the MatchingRule + """ + return { + "name": self.name, + "smiles": self.smiles, + "props": self.props, + "terminal": self.terminal, + "family_tokens": list(self.family_tokens), + "ancestor_tokens": list(self.ancestor_tokens), + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "MatchingRule": + """ + Create a MatchingRule instance from a dictionary. + + :param data: dict[str, Any]: dictionary containing rule data + :return: MatchingRule: the created MatchingRule instance + """ + matching_rule = cls( + name=data["name"], + smiles=data["smiles"], + props=data.get("props", {}), + terminal=data.get("terminal", True), + family_tokens=set(data.get("family_tokens", [])), + ancestor_tokens=list(data.get("ancestor_tokens", [])), + ) + return matching_rule + + def is_match(self, mol: Mol, match_stereochemistry: bool = False) -> bool: + """ + Check if the given molecule matches this rule. + + :param mol: Mol: molecule to check + :param match_stereochemistry: bool: whether to consider stereochemistry in matching + :return: bool: True if the molecule matches the rule, False otherwise + """ + has_substruct_match = mol.HasSubstructMatch(self.mol, useChirality=match_stereochemistry) + has_equal_num_atoms = mol.GetNumAtoms() == self.mol.GetNumAtoms() + has_equal_num_bonds = mol.GetNumBonds() == self.mol.GetNumBonds() + + if has_substruct_match and has_equal_num_atoms and has_equal_num_bonds: + return True + + return False + + +@dataclass(frozen=True) +class RuleSet: + """ + Represents a set of reaction and matching rules. 
+ + :var match_stereochemistry: bool: whether to consider stereochemistry in matching rules + :var reaction_rules: list[ReactionRule]: list of reaction rules + :var matching_rules: list[MatchingRule]: list of matching rules + """ + + match_stereochemistry: bool + reaction_rules: list[ReactionRule] + matching_rules: list[MatchingRule] + + def __str__(self) -> str: + """ + String representation of the RuleSet. + + :return: str: string representation + """ + return f"RuleSet({len(self.reaction_rules)} reaction rules; {len(self.matching_rules)} matching rules; match_stereochemistry={self.match_stereochemistry})" + + @classmethod + def load_default(cls, match_stereochemistry: bool = False) -> "RuleSet": + """ + Load the default set of reaction and matching rules. + + :return: RuleSet: the default rule set + """ + path_reaction_rules = Path(files(retromol.data).joinpath("rxn.yml")) + path_matching_rules_other = Path(files(retromol.data).joinpath("mxn_other.yml")) + + if match_stereochemistry: + path_matching_rules_polyketide = Path(files(retromol.data).joinpath("mxn_pks_chiral.yml")) + else: + path_matching_rules_polyketide = Path(files(retromol.data).joinpath("mxn_pks.yml")) + + with open(path_reaction_rules, "r") as fo: + reaction_rules_data = yaml.safe_load(fo) + + with open(path_matching_rules_other, "r") as fo: + matching_rules_other_data = yaml.safe_load(fo) + + with open(path_matching_rules_polyketide, "r") as fo: + matching_rules_polyketide_data = yaml.safe_load(fo) + + matching_rules_data = matching_rules_other_data + matching_rules_polyketide_data + + reaction_rules = [ReactionRule.from_dict(d) for d in reaction_rules_data] + matching_rules = [MatchingRule.from_dict(d) for d in matching_rules_data] + + return RuleSet(match_stereochemistry, reaction_rules, matching_rules) diff --git a/src/retromol/model/submission.py b/src/retromol/model/submission.py new file mode 100644 index 0000000..68a093c --- /dev/null +++ b/src/retromol/model/submission.py @@ -0,0 +1,86 
@@ +"""Module defining the Submission data class.""" + +from dataclasses import dataclass, field +from typing import Any + +from rdkit.Chem.rdchem import Mol + +from retromol.chem.mol import standardize_from_smiles, mol_to_inchikey +from retromol.chem.tagging import tag_mol + + +@dataclass(frozen=True) +class Submission: + """ + Represents a submission of a molecule for retrosynthetic analysis. + + :var smiles: str: SMILES representation of the submitted molecule + :var name: str | None: optional name of the submitted molecule + :var props: dict[str, Any] | None: optional additional properties associated with the submission + """ + + smiles: str + name: str | None = None + props: dict[str, Any] | None = None + + mol: Mol = field(init=False, repr=False) + inchikey: str = field(init=False, repr=False) + + def __post_init__(self) -> None: + """ + Post-initialization processing to generate standardized molecule and InChIKey. + """ + # Sanitize SMILES + smiles = self.smiles.replace("[N]", "N") # avoid parsing issues with RDKit + + # Generate standardized molecule + mol = standardize_from_smiles( + smiles, + keep_stereo=True, + neutralize=True, + tautomer_canon=True, + ) + + # Generate InChIKey + inchikey = mol_to_inchikey(mol) + + # Tag molecule + tag_mol(mol) + + object.__setattr__(self, "smiles", smiles) + object.__setattr__(self, "mol", mol) + object.__setattr__(self, "inchikey", inchikey) + + def __str__(self) -> str: + """ + String representation of the Submission. + + :return: string representation of the Submission + """ + return f"Submission(name={self.name})" + + def to_dict(self) -> dict[str, Any]: + """ + Serialize the Submission to a dictionary. + + :return: dictionary representation of the Submission + """ + return { + "smiles": self.smiles, + "name": self.name, + "props": self.props, + } + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "Submission": + """ + Deserialize a Submission from a dictionary. 
+ + :param data: dictionary representation of the Submission + :return: Submission object + """ + return cls( + smiles=data["smiles"], + name=data.get("name"), + props=data.get("props"), + ) diff --git a/src/retromol/model/synthesis.py b/src/retromol/model/synthesis.py new file mode 100644 index 0000000..04d64fe --- /dev/null +++ b/src/retromol/model/synthesis.py @@ -0,0 +1,24 @@ +"""Data structures for representing synthesis extraction results.""" + +import logging +from dataclasses import dataclass + +from retromol.model.reaction_graph import ReactionGraph + + +log = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class SynthesisExtractResult: + """ + Result of synthesis subgraph extraction. + + :var graph: ReactionGraph: the extracted synthesis subgraph + :var solved: bool: whether the root was successfully solved + :var total_cost: float: total cost of the extracted subgraph + """ + + graph: ReactionGraph + solved: bool + total_cost: float diff --git a/src/retromol/monomer_collapse.py b/src/retromol/monomer_collapse.py deleted file mode 100644 index 9ab8116..0000000 --- a/src/retromol/monomer_collapse.py +++ /dev/null @@ -1,413 +0,0 @@ -"""Collapse monomers into structural (and optionally name-based) groups, deterministically.""" - -from __future__ import annotations - -from collections import defaultdict -from collections.abc import Callable, Iterable, Mapping -from dataclasses import dataclass, field - -from retromol.chem import ( - ExplicitBitVect, - Mol, - calc_tanimoto_similarity_rdkit, - ecfp4, - mol_to_inchikey, - mol_to_smiles, - standardize_from_smiles, -) -from retromol.helpers import blake64_hex - - -def inchikeys(mol: Mol) -> tuple[str, str]: - """ - Get the InChIKeys for a molecule. 
- - :param mol: input molecule - :return: tuple of (full InChIKey, connectivity InChIKey) - """ - ik_full = mol_to_inchikey(mol) - ik_conn = ik_full.split("-")[0] - return ik_full, ik_conn - - -@dataclass -class Monomer: - """ - Normalized view of each input record, kept for deterministic sorting and lookup. - - :param idx: original index in input list - :param name: monomer name - :param input_smiles: original input SMILES string - :param mol: standardized RDKit Mol object (or None if standardization failed) - :param can_smi: canonical SMILES string (or None if mol is None) - :param ik_full: full InChIKey (or None if mol is None) - :param ik_conn: connectivity InChIKey (or None if mol is None) - :param fp: ECFP4 fingerprint (or None if mol is None) - """ - - idx: int - name: str - input_smiles: str - mol: Mol | None = None - can_smi: str | None = None - ik_full: str | None = None - ik_conn: str | None = None - fp: ExplicitBitVect | None = None - - -@dataclass -class Group: - """ - A group of monomers collapsed either by structure or by explicit name. 
- - :param gid: group ID - :param rep_idx: index of the representative monomer - :param members: list of member monomer indices - :param token_fine: 64-bit hex over canonical SMILES (no stereo) OR name (for name-groups) - :param rep_can_smi: canonical SMILES for the representative - :param kind: "struct" or "name" - :param name_key: the name string used for name-based groups - """ - - gid: int - rep_idx: int # index of the representative monomer - members: list[int] = field(default_factory=list) - kind: str = "struct" # "struct" or "name" - token_fine: str = "" # 64-bit hex over canonical SMILES (struct) OR name (for name-groups) - rep_can_smi: str = "" # canonical SMILES for the representative - name_key: str | None = None # the name string used for name-based groups - - token_coarse: str = "" # e.g., scaffold hash or family hash (placeholder for now) - rep_scaffold_smi: str = "" # if needed later for computing scaffolds, safe default - - -@dataclass -class NameSimilarityConfig: - """ - Configure similarity bits among name-collapsed groups. - - :param family_of: maps a group name -> family string (None to skip family) - :param family_weight: weight for each family token (per distinct member name) - :param pair_weight: base weight multiplied by pairwise similarity in [0, 1] - :param pairwise: sparse matrix (dict of dict) of explicit similarities. 
Example: {'serine': {'homoserine': 0.85}} - :param symmetric: if True, treat pairwise as symmetric (use max(a->b, b->a)) - :param family_repeat_scale: integer scaling factor for family tokens - :param pair_repeat_scale: integer scaling factor for pairwise tokens - :param ancestors_of: maps a group name -> ancestor string (None to skip ancestor) - :param ancestor_repeat_scale: integer scaling factor for ancestor tokens - """ - - family_of: Callable[[str], str | None] | None = None - family_weight: float = 0.30 - pair_weight: float = 0.60 - pairwise: Mapping[str, Mapping[str, float]] = field(default_factory=dict) - symmetric: bool = True - family_repeat_scale: int = 2 - pair_repeat_scale: int = 2 - ancestors_of: Callable[[str], str | None] | None = None - ancestor_repeat_scale: int = 0 - - -class DSU: - """Disjoint Set Union (Union-Find) data structure for efficient component merging.""" - - def __init__(self, n: int) -> None: - """ - Initialize DSU with n elements (0 to n-1). - - :param n: number of elements - """ - self.p = list(range(n)) - self.r = [0] * n - - def find(self, x: int) -> int: - """ - Find the representative of the set containing x, with path compression. - - :param x: element to find - :return: representative element of the set - """ - while self.p[x] != x: - self.p[x] = self.p[self.p[x]] # path halving - x = self.p[x] - return x - - def union(self, a: int, b: int) -> None: - """ - Union the sets containing elements a and b. 
- - :param a: first element - :param b: second element - """ - ra, rb = self.find(a), self.find(b) - if ra == rb: - return - if self.r[ra] < self.r[rb]: - self.p[ra] = rb - elif self.r[ra] > self.r[rb]: - self.p[rb] = ra - else: - self.p[rb] = ra - self.r[ra] += 1 - - -def collapse_monomers_order_invariant( - records: Iterable[tuple[str, str]], - keep_stereo: bool = False, - tanimoto_thresh: float = 0.85, - collapse_by_name: Iterable[str] | None = None, -) -> tuple[list[Group], list[Monomer]]: - """ - Deterministic grouping independent of input order (but still RDKit/version dependent). - - Algorithm: - 1) Normalize each (name, SMILES) -> Monomer (std Mol, can_smi, InChIKeys, ECFP4) - 2) Split indices into name-driven vs structure-driven pools - 3) Structural pool: - a) Exact union by full-IK and by connectivity-IK - b) Block by rough size (bitcount bucker) and union pairs with Tanimoto >= threshold - 4) Emit groups deterministically: - a) Name groups in sorted (name) with members sorted by stable key - b) Structural components by representative stable key - 5) Return monomers sorted by stable key for reproducibility - - :param records: iterable of (name, SMILES) tuples for monomers - :param keep_stereo: whether to retain stereochemistry during standardization - :param tanimoto_thresh: Tanimoto similarity threshold for structural grouping - :param collapse_by_name: optional iterable of names to always collapse by name - :return: tuple of (list of Groups, list of Monomers) - """ - collapse_set = set(collapse_by_name or []) - - # Build Monomer table (skip invalid SMILES unless in name-collapsed) - monomers: list[Monomer] = [] - for i, (name, smi) in enumerate(records): - mol = standardize_from_smiles(smi, keep_stereo=keep_stereo) if smi else None - if mol is None and name not in collapse_set: - continue - can_smi = mol_to_smiles(mol, isomeric=keep_stereo, canonical=True) if mol is not None else None - ik_full = ik_conn = None - fp = None - if mol is not None: - 
ik_full, ik_conn = inchikeys(mol) - fp = ecfp4(mol) - monomers.append( - Monomer( - idx=i, - name=name, - input_smiles=smi or "", - mol=mol, - can_smi=can_smi, - ik_full=ik_full, - ik_conn=ik_conn, - fp=fp, - ) - ) - - # Stable key used globally to kill order effects - def mkey(m: Monomer) -> tuple[str, str, str, int]: - """ - Stable key for monomer sorting and representative selection. - - :param m: Monomer object - :return: tuple key - """ - return (m.can_smi or "", m.name or "", m.input_smiles or "", m.idx) - - # Helper: get Monomer by original idx quickly - by_idx: dict[int, Monomer] = {m.idx: m for m in monomers} - - # Partition into name vs structural pools - name_idxs = [m.idx for m in monomers if m.name in collapse_set] - struct_idxs = [m.idx for m in monomers if m.name not in collapse_set] - - # Dense indexing for DSU only over structural pool - struct_pos = {idx: pos for pos, idx in enumerate(sorted(struct_idxs))} - dsu = DSU(len(struct_pos)) - - # Tier 1: exact unions by InChIKeys (full, then connectivity) - by_full: dict[str, list[int]] = defaultdict(list) - by_conn: dict[str, list[int]] = defaultdict(list) - for i in struct_idxs: - m = by_idx[i] - if m.ik_full: - by_full[m.ik_full].append(i) - if m.ik_conn: - by_conn[m.ik_conn].append(i) - - for bucket in list(by_full.values()) + list(by_conn.values()): - # Deterministic chaining unions after sorting - if len(bucket) >= 2: - sb = sorted(bucket) - for a, b in zip(sb[:-1], sb[1:], strict=True): - dsu.union(struct_pos[a], struct_pos[b]) - - # Tier 2: similarity unions inside coarse bitcount blocks - def bitcount(fp: ExplicitBitVect | None) -> int: - """ - Count the number of bits set in an RDKit ExplicitBitVect fingerprint. 
- - :param fp: RDKit ExplicitBitVect fingerprint - :return: Number of bits set - """ - return int(fp.GetNumOnBits()) if fp is not None else 0 - - blocks: dict[tuple[str, int], list[int]] = defaultdict(list) - for i in struct_idxs: - m: Monomer = by_idx[i] - # Single "no scaffold" channel: bucket by rough size (per 16 bits set) - blocks[("", bitcount(m.fp) // 16)].append(i) - - # Deterministic within each block: sort by mkey, then scan upper triangle - for _, bucket in sorted(blocks.items(), key=lambda kv: kv[0]): - if len(bucket) < 2: - continue - bucket = sorted(bucket, key=lambda i: mkey(by_idx[i])) - for ai in range(len(bucket)): - ma = by_idx[bucket[ai]] - if ma.fp is None: - continue - for bi in range(ai + 1, len(bucket)): - mb = by_idx[bucket[bi]] - if mb.fp is None: - continue - if calc_tanimoto_similarity_rdkit(ma.fp, mb.fp) >= tanimoto_thresh: - dsu.union(struct_pos[ma.idx], struct_pos[mb.idx]) - - # Emit groups deterministically - groups: list[Group] = [] - - # Name groups: emit in sorted (name) order; members sorted by mkey - names_sorted = sorted({by_idx[i].name for i in name_idxs}) - for nm in names_sorted: - mems = [i for i in name_idxs if by_idx[i].name == nm] - mems_sorted = sorted(mems, key=lambda i: mkey(by_idx[i])) - rep = by_idx[mems_sorted[0]] - groups.append( - Group( - gid=len(groups), - rep_idx=rep.idx, - members=mems_sorted, - token_fine=blake64_hex(f"NAME:{nm}"), - rep_can_smi=rep.can_smi or "", - kind="name", - name_key=nm, - ) - ) - - # Structural components: gather, choose representative by mkey, and sort components by rep key - comps: dict[int, list[int]] = defaultdict(list) - for i in struct_idxs: - root = dsu.find(struct_pos[i]) - comps[root].append(i) - - comp_infos: list[tuple[tuple[str, str, str, int], list[int]]] = [] - for comp in comps.values(): - comp_sorted = sorted(comp, key=lambda i: mkey(by_idx[i])) - rep = by_idx[comp_sorted[0]] - rep_key = (rep.can_smi or "", rep.name or "", rep.idx) - comp_infos.append((rep_key, 
comp_sorted)) - - for _, comp_sorted in sorted(comp_infos, key=lambda t: t[0]): - rep = by_idx[comp_sorted[0]] - groups.append( - Group( - gid=len(groups), - rep_idx=rep.idx, - members=comp_sorted, - token_fine=blake64_hex(rep.can_smi or ""), - rep_can_smi=rep.can_smi or "", - kind="struct", - ) - ) - - # Return monomers in a determinisitic order as well - monomers_sorted = sorted(monomers, key=mkey) - return groups, monomers_sorted - - -def tokens_for_groups( - groups: list[Group], - weight_fine: float = 1.0, - weight_coarse: float = 0.4, - extra_bags: Iterable[dict[str, float]] | None = None, -) -> dict[str, float]: - """ - Build a determinisitc token bag (fine/coarse), sorting to keep stable accumulation order. - - :param groups: list of Group objects - :param weight_fine: weight for fine tokens (name or canonical SMILES) - :param weight_coarse: weight for coarse tokens (scaffold-level, not implemented here) - :param extra_bags: optional coarse tokens (e.g., scaffold/family) if present in Group - :return: dictionary mapping tokens to their accumulated weights - """ - bag: dict[str, float] = {} - - # Stable accumulation order - for g in sorted(groups, key=lambda g: (g.kind, g.rep_can_smi, g.gid)): - if g.token_fine: - prefix = "N:" if g.kind == "name" else "F:" - key = f"{prefix}{g.token_fine}" - bag[key] = bag.get(key, 0.0) + weight_fine - - if g.token_coarse: - keyc = f"C:{g.token_coarse}" - bag[keyc] = bag.get(keyc, 0.0) + weight_coarse - - if extra_bags: - for eb in extra_bags: - for k, v in eb.items(): - bag[k] = bag.get(k, 0.0) + float(v) - - return bag - - -def assign_to_existing_groups( - smi: str, - groups: list[Group], - monomers: list[Monomer], - keep_stereo: bool = False, - tanimoto_thresh: float = 0.85, -) -> int | None: - """ - Deterministically assign SMILES into existing structural groups. - - :param smi: input SMILES string. - :param groups: list of existing Group objects. - :param monomers: list of Monomer objects corresponding to the groups. 
- :param keep_stereo: whether to retain stereochemistry during standardization. - :param tanimoto_thresh: Tanimoto similarity threshold for structural grouping. - :return: group ID if assigned, else None. - """ - mol = standardize_from_smiles(smi, keep_stereo=keep_stereo) - if mol is None: - return None - - ik_full, ik_conn = inchikeys(mol) - fp_new = ecfp4(mol) - - # Representatives, determinisitically sorted - reps = [(g.gid, monomers[g.rep_idx]) for g in groups if g.kind == "struct"] - reps.sort(key=lambda t: (t[1].can_smi or "", t[0])) - - fullIK_to_gid = {m.ik_full: gid for gid, m in reps if m.ik_full} - connIK_to_gid = {m.ik_conn: gid for gid, m in reps if m.ik_conn} - - # Exact InChIKey - if ik_full in fullIK_to_gid: - return fullIK_to_gid[ik_full] - - # Connectivity InChIKey - if ik_conn in connIK_to_gid: - return connIK_to_gid[ik_conn] - - # Tanimoto similarity fallback - best_gid, best_sim = None, 0.0 - for gid, rep in reps: - if rep.fp is None: - continue - sim = calc_tanimoto_similarity_rdkit(fp_new, rep.fp) - if sim > best_sim: - best_gid, best_sim = gid, sim - - return best_gid if best_sim >= tanimoto_thresh else None diff --git a/src/retromol/pipelines/__init__.py b/src/retromol/pipelines/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/retromol/pipelines/parsing.py b/src/retromol/pipelines/parsing.py new file mode 100644 index 0000000..d616c9e --- /dev/null +++ b/src/retromol/pipelines/parsing.py @@ -0,0 +1,315 @@ +"""Module for applying reaction rules to molecules using a reaction graph approach.""" + +import logging +import os +from math import inf +from collections import deque, defaultdict +from typing import Optional + +from retromol.utils.timeout import timeout_decorator +from retromol.model.submission import Submission +from retromol.model.rules import RuleSet, index_uncontested, apply_uncontested +from retromol.model.result import Result +from retromol.model.reaction_graph import ReactionGraph, ReactionStep, RxnEdge 
+from retromol.model.synthesis import SynthesisExtractResult +from retromol.chem.mol import Mol, encode_mol, mol_to_smiles +from retromol.chem.tagging import get_tags_mol + + +log = logging.getLogger(__name__) + + +def process_mol(submission: Submission, ruleset: RuleSet) -> ReactionGraph: + """ + Process a molecule by applying reaction rules and constructing a reaction graph. + + :param submission: Submission: the input molecule and associated data + :param ruleset: RuleSet: the set of reaction and matching rules to apply + :return: ReactionGraph: the resulting reaction graph after processing + """ + reaction_rules = ruleset.reaction_rules + matching_rules = ruleset.matching_rules + + g = ReactionGraph() + + original_taken_tags = get_tags_mol(submission.mol) + failed_combos: set[tuple[int, frozenset[int]]] = set() + + # Track queue/expansion status by encoding to avoid duplicate work + enqueued: set[str] = set() + expanded: set[str] = set() + + q: deque[Mol] = deque() + q.append(Mol(submission.mol)) + enqueued.add(encode_mol(submission.mol)) + + while q: + parent = q.popleft() + parent_enc = g.add_node(parent) + + log.debug(f"expanding node {mol_to_smiles(parent, include_tags=False)}") + + # If we've already expanded this encoding, skip + if parent_enc in expanded: + continue + expanded.add(parent_enc) + + # Identity gating: only gate (stop expanding) if identified AND terminal=True + node = g.nodes[parent_enc] + ident = node.identify(matching_rules, match_stereochemistry=ruleset.match_stereochemistry) + if ident and bool(getattr(ident, "terminal", True)): + log.debug(f"node (identity={ident.name}) identified as terminal; stopping expansion") + continue + + # Uncontested in bulk (combined step) + allowed_in_bulk = [rl for rl in reaction_rules if rl.allowed_in_bulk] + uncontested = index_uncontested(parent, allowed_in_bulk, failed_combos) + if uncontested: + log.debug(f"applying {len(uncontested)} uncontested rule(s) in bulk") + + products, applied_in_bulk, 
new_failed = apply_uncontested(parent, uncontested, original_taken_tags) + failed_combos.update(new_failed) + + # If uncontested existed but none succeed, fall through to contested + if applied_in_bulk: + step = ReactionStep( + kind="uncontested", + names=tuple(rl.name for rl, _ in applied_in_bulk), + rule_ids=tuple(rl.id for rl, _ in applied_in_bulk), + ) + g.add_edge(parent_enc, products, step) + + # Enqueue newly discovered products (by encoding) + for m in products: + enc = encode_mol(m) + if enc not in expanded and enc not in enqueued: + q.append(Mol(m)) + enqueued.add(enc) + + continue + + # Contested exhaustive + for rl in reaction_rules: + + results = rl.apply(parent, None) + if not results: + continue + + for result_set in results: + # result_set is an iterable of product mols + step = ReactionStep( + kind="contested", + names=(rl.name,), + rule_ids=(rl.id,), + ) + g.add_edge(parent_enc, result_set, step) + + for m in result_set: + enc = encode_mol(m) + if enc not in expanded and enc not in enqueued: + q.append(Mol(m)) + enqueued.add(enc) + + return g + + +def extract_min_edge_synthesis_subgraph( + g: ReactionGraph, + root_enc: str, + prefer_kind: tuple[str, ...] = ("uncontested", "contested"), + edge_base_cost: float = 1.0, + nonterminal_leaf_penalty: float = 0.25, + unsolved_leaf_penalty: float = 5.0, +) -> SynthesisExtractResult: + """ + Extract a minimum-edge synthesis subgraph from a retrosynthesis ReactionGraph. + + Interprets the graph as an AND/OR graph: + - molecule node: OR (choose one outgoing reaction edge) + - reaction edge: AND (must solve all dst precursor nodes) + - identified molecule nodes are terminal solved leaves (cost=0) + + The extracted subgraph contains at most one chosen outgoing edge per expanded node + and includes all required precursor branches for that choice. 
+ + :param g: ReactionGraph: the full retrosynthesis reaction graph + :param root_enc: encoding of the root molecule to extract from + :param prefer_kind: tuple[str, ...]: preference order for reaction kinds when costs are equal + :param edge_base_cost: float: base cost per reaction edge + :param nonterminal_leaf_penalty: float: penalty for identified leaves that are non-terminal + :param unsolved_leaf_penalty: float: penalty for unsolved leaves (i.e., "give up" cost) + :return: SynthesisExtractResult: the extracted synthesis subgraph and status + :raises: ValueError: if root_enc is not in the graph + """ + if root_enc not in g.nodes: + raise ValueError(f"root encoding {root_enc} not found in reaction graph nodes") + + # Adjacency list of outgoing edges for quick access + out_edges: dict[int, list[int]] = defaultdict(list) + for ei, e in enumerate(g.edges): + out_edges[e.src].append(ei) + + kind_rank = {k: i for i, k in enumerate(prefer_kind)} + + def edge_cost(e: RxnEdge) -> float: + # Primary objective: fewer edges + # Secondary tiebreakers: prefer uncontested; optionally penalize many precursors slightly + kind_penalty = 0.001 * kind_rank.get(e.step.kind, 999) + branch_penalty = 0.0001 * len(e.dsts) + return edge_base_cost + kind_penalty + branch_penalty + + # DP memo: cost to "solve" a node into identified leaves + memo_cost: dict[int, float] = {} + memo_choice: dict[int, Optional[int]] = {} # node -> chosen edge index + visiting: set[int] = set() + + def is_terminal(enc: str) -> bool: + n = g.nodes.get(enc) + if not n or not n.is_identified: + return False + ident = n.identity + return bool(getattr(ident, "terminal", True)) + + def solve_cost(enc: str) -> float: + n = g.nodes.get(enc) + + # Hard leaves: identified + terminal=True + if n and n.is_identified and is_terminal(enc): + memo_choice[enc] = None + return 0.0 + + # If nothing to expand, treat as "frontier leaf" (unsolved remainder) + if not out_edges.get(enc): + memo_choice[enc] = None + # Identified 
nonterminal with no edges is still fine as 0, else unsolved penalty + if n and n.is_identified: + return 0.0 + return unsolved_leaf_penalty + + # Soft leaves: identified + terminal=False + leaf_cost = inf + if n and n.is_identified and not is_terminal(enc): + # If no outgoing edges exist, must stop here (fallback leaf) + if not out_edges.get(enc): + memo_choice[enc] = None + return 0.0 + # Otherwise, allow stopping, but discourage it + leaf_cost = nonterminal_leaf_penalty + + if enc in memo_cost: + return memo_cost[enc] + + if enc in visiting: + # Cycle guard: treat as unsolvable in this simple DP + return inf + + visiting.add(enc) + + best = leaf_cost + best_ei: Optional[int] = None if best < inf else None + + for ei in out_edges.get(enc, []): + e = g.edges[ei] + # AND: all children must be solvable + c = edge_cost(e) + for d in e.dsts: + dc = solve_cost(d) + if dc == inf: + c = inf + break + c += dc + + if c < best: + best = c + best_ei = ei + + visiting.remove(enc) + + memo_cost[enc] = best + memo_choice[enc] = best_ei + return best + + total = solve_cost(root_enc) + # solved = no unsolved frontier was needed + solved = total < inf and total < unsolved_leaf_penalty + if total == inf: + return SynthesisExtractResult(graph=ReactionGraph(), solved=False, total_cost=inf) + + # Extract chosen policy edges into a new small graph + new_g = ReactionGraph() + + kept_nodes: set[int] = set() + kept_edge_indices: set[int] = set() + + def extract(enc: str) -> None: + if enc in kept_nodes: + return + kept_nodes.add(enc) + + # Always keep the node + if enc in g.nodes: + new_g.nodes[enc] = g.nodes[enc] + new_g.out_edges.setdefault(enc, []) + + # Stop at identified leaves + if is_terminal(enc): + return + + ei = memo_choice.get(enc) + if ei is None: + return + + kept_edge_indices.add(ei) + e = g.edges[ei] + + # Keep all dsts (AND) + for d in e.dsts: + extract(d) + + extract(root_enc) + + # Add edges (after nodes exist) + for ei in kept_edge_indices: + e = g.edges[ei] + if e.src 
not in new_g.nodes: + continue + dsts = tuple(d for d in e.dsts if d in new_g.nodes) + if not dsts: + continue + new_edge = RxnEdge(src=e.src, dsts=dsts, step=e.step) + new_g.edges.append(new_edge) + new_g.out_edges.setdefault(e.src, []).append(len(new_g.edges) - 1) + + return SynthesisExtractResult(graph=new_g, solved=solved, total_cost=total) + + +def run_retromol(submission: Submission, rules: RuleSet) -> Result: + """ + Run RetroMol retrosynthesis on the given input molecule using the specified reaction rules. + + :param submission: Submission object containing the input molecule and data + :param rules: Rules object containing the reaction rules to apply + :return: Result object containing the retrosynthesis results + """ + g = process_mol(submission, rules) + log.debug(f"retrosynthesis graph has {len(g.nodes)} ({len(g.identified_nodes)} identified) nodes and {len(g.edges)} edges") + + root = encode_mol(submission.mol) + r = extract_min_edge_synthesis_subgraph( + g, + root_enc=root, + edge_base_cost=0.25, # low base cost encourages longer syntheses + nonterminal_leaf_penalty=100.0, # high penalty forces expansion of non-terminal leaves (e.g., fatty acids) + ) + log.debug(f"extracted synthesis subgraph has {len(r.graph.nodes)} ({len(r.graph.identified_nodes)} identified) nodes and {len(r.graph.edges)} edges") + + if not r.solved: + log.debug("retrosynthesis extraction failed to find a solution") + + return Result( + submission=submission, + reaction_graph=r.graph, + ) + + +run_retromol_with_timeout = timeout_decorator(seconds=int(os.getenv("TIMEOUT_RUN_RETROMOL", "60")))(run_retromol) diff --git a/src/retromol/readout.py b/src/retromol/readout.py deleted file mode 100644 index 2a6be63..0000000 --- a/src/retromol/readout.py +++ /dev/null @@ -1,553 +0,0 @@ -"""Module for RetroMol results readout.""" - -from typing import Any - -from networkx import ( - Graph, - all_pairs_shortest_path_length, - connected_components, - is_connected, - shortest_path, -) - -from 
retromol.api import timeout_decorator -from retromol.chem import Mol, smiles_to_mol -from retromol.config import TIMEOUT_LINEAR_READOUT, TIMEOUT_OPTIMAL_MAPPINGS -from retromol.graph import merge_nodes, mol_to_graph -from retromol.io import Result - - -def optimal_mappings(result: Result) -> list[dict[str, Any]]: - """ - Return one mapping per nesting level. - At each level, nodes in the new level take precedence. - Previous level nodes are added only if they do not overlap - and are not submappings of the new level nodes. - - :param result: RetroMol Result object - :return: list of mapping dictionaries, each with keys: - - "nodes": list of nodes, each with "identity", "smiles", and "tags" - - "covered_tags": list of all tags covered by the selected nodes - - "n_nodes": number of nodes in the mapping - - "n_tags": number of unique tags covered - """ - identified: set[tuple[str, str, tuple[int, ...]]] = result.get_identified_nodes() - - # Normalize and filter empties - items: list[dict[str, Any]] = [] - for identity, smiles, tags in identified: - tag_set = frozenset(int(t) for t in tags) - if not tag_set: - continue - items.append({"identity": identity, "smiles": smiles, "tags": tag_set}) - - if not items: - return [] - - # Sort: largest pieces first - items.sort(key=lambda m: (-len(m["tags"]), str(m["smiles"]), str(m["identity"]))) - - # Partition into levels (graph coloring by overlap) - levels: list[list[dict[str, Any]]] = [] - for item in items: - placed = False - for lvl in levels: - if any(item["tags"] & other["tags"] for other in lvl): - continue - lvl.append(item) - placed = True - break - if not placed: - levels.append([item]) - - results: list[dict[str, Any]] = [] - - # Process levels with precedence for newer levels - for lvl_idx, lvl in enumerate(levels): - used_tags: set[int] = set() - chosen: list[dict[str, Any]] = [] - - # Step 1: take all new-level items - for m in lvl: - chosen.append(m) - used_tags |= m["tags"] - - # Step 2: consider all previous 
levels - for prev_idx in range(lvl_idx): - for m in levels[prev_idx]: - # skip if overlaps with any new-level item - if m["tags"] & used_tags: - continue - # skip if fully contained in a new-level item (submapping) - if any(m["tags"] <= new_m["tags"] for new_m in lvl): - continue - chosen.append(m) - used_tags |= m["tags"] - - # Build solution dict - covered: set[int] = set().union(*(n["tags"] for n in chosen)) - nodes = [{"identity": n["identity"], "smiles": n["smiles"], "tags": sorted(n["tags"])} for n in chosen] - results.append( - { - "nodes": nodes, - "covered_tags": sorted(covered), - "n_nodes": len(nodes), - "n_tags": len(covered), - } - ) - - return results - - -def mapping_to_graph(tagged_smi: str, mapping: dict[str, Any]) -> "Graph[str | int]": - """ - Convert a mapping dictionary to a NetworkX graph. - - :param tagged_smi: the SMILES representation of the molecule - :param mapping: a dictionary representing a mapping with keys: - - "nodes": list of nodes, each with "identity", "smiles", and "tags" - - "covered_tags": list of all tags covered by the selected nodes - - "n_nodes": number of nodes in the mapping - - "n_tags": number of unique tags covered - :return: a NetworkX graph representing the mapping - """ - tagged_mol: Mol = smiles_to_mol(tagged_smi) - G = mol_to_graph(tagged_mol, use_tags=True) - G.graph["tagged_smiles"] = tagged_smi - - # Merge nodes in graph - for node_idx, node in enumerate(mapping["nodes"]): - node_identity = node["identity"] - node_smiles = node["smiles"] - node_tags = node["tags"] - merge_nodes( - G, - merged_node_id=f"node_{node_idx}_{node_identity.replace(' ', '_')}", - nodes=node_tags, - props={"identity": node_identity, "smiles": node_smiles, "tags": node_tags}, - ) - - # Delete unmerged nodes - to_remove = [n for n, d in G.nodes(data=True) if "identity" not in d] - G.remove_nodes_from(to_remove) - - return G - - -def _dfs_graphs(root: "Graph[int | str]") -> "list[Graph[int | str]]": - """ - Return all graphs in DFS 
discovery order via node attr 'graph'. - - :param root: root graph - :return: list of graphs in DFS order - """ - out: list[Graph[int | str]] = [] - - def dfs(G: "Graph[int | str]"): - out.append(G) - # Stable order over nodes for determinism - for _, d in list(G.nodes(data=True)): - sub = d.get("graph") - if isinstance(sub, Graph): - dfs(sub) - - dfs(root) - return out - - -def _bfs_graphs_with_depth(root: "Graph[int | str]") -> "list[tuple[Graph[int | str], int]]": - """ - Return all graphs with their true nesting depth (root depth=0) using BFS. - - :param root: root graph - :return: list of tuples (graph, depth) - """ - out: list[tuple[Graph[int | str], int]] = [] - seen: set[int] = set() - q: list[tuple[Graph[int | str], int]] = [(root, 0)] - while q: - G, depth = q.pop(0) - if id(G) in seen: - continue - seen.add(id(G)) - out.append((G, depth)) - for _, d in G.nodes(data=True): - sub = d.get("graph") - if isinstance(sub, Graph): - q.append((sub, depth + 1)) - return out - - -def _graphs_with_metadata(root: "Graph[int | str] | None") -> list[dict[str, Any]]: - """ - Combine DFS order (legacy 'level_index' semantics) with true depth. - - :param root: root graph - :return: a list of dicts: {"graph": G, "dfs_index": i, "depth": d} - """ - if root is None: - return [] - dfs_list = _dfs_graphs(root) - bfs_list = _bfs_graphs_with_depth(root) # (G, depth) - depth_by_id = {id(G): depth for (G, depth) in bfs_list} - meta: list[dict[str, Any]] = [] - for i, G in enumerate(dfs_list): - meta.append({"graph": G, "dfs_index": i, "depth": depth_by_id.get(id(G), 0)}) - return meta - - -def list_levels_summary(result: Result) -> list[dict[str, Any]]: - """ - Legacy-style summary (DFS order), now also shows true nesting depth per entry. - - :param result: RetroMol Result object. 
- :return: ;ist of dicts, each with keys: - - "dfs_index": DFS discovery index - - "depth": True nesting depth (root=0) - - "n_nodes": number of nodes in the graph at this level - - "n_identified": number of identified nodes (with 'identity' attribute) - """ - # NOTE: should never happen, but be defensive - if result.graph is None: - return [] - summary: list[dict[str, Any]] = [] - for m in _graphs_with_metadata(result.graph): - G = m["graph"] - summary.append( - { - "dfs_index": m["dfs_index"], - "depth": m["depth"], - "n_nodes": G.number_of_nodes(), - "n_identified": sum(1 for _, d in G.nodes(data=True) if d.get("identity") is not None), - "wave_names": sorted({d.get("wave_name") for _, d in G.nodes(data=True) if "wave_name" in d}), - } - ) - return summary - - -def find_depth_by_wave_name(result: Result, wave_name: str) -> int | None: - """ - Return the smallest nesting depth that contains any node with the given wave_name. - - :param result: RetroMol Result object - :param wave_name: the wave_name to search for - :return: the smallest nesting depth containing the wave_name, or None if not found - """ - # NOTE: should never happen, but be defensive - if result.graph is None: - return None - depths: list[int] = [] - for m in _graphs_with_metadata(result.graph): - G = m["graph"] - if any(d.get("wave_name") == wave_name for _, d in G.nodes(data=True)): - depths.append(m["depth"]) - return min(depths) if depths else None - - -def _monomer_nodes_at_level(G: "Graph[int | str]", require_identified: bool) -> list[Any]: - """ - Return monomer nodes at the given graph level. 
- - :param G: the graph at the current level - :param require_identified: if True, only include nodes with an 'identity' attribute - :return: list of monomer node identifiers - """ - nodes: list[Any] = [] - for n, d in G.nodes(data=True): - if require_identified: - if d.get("identity") is not None and "tags" in d: - nodes.append(n) - else: - if "tags" in d: - nodes.append(n) - return nodes - - -def _is_path_component(H: "Graph[int | str]", nodes: list[Any]) -> bool: - """ - Check if the subgraph induced by 'nodes' in H is a path. - - :param H: the host graph - :param nodes: list of nodes to check - :return: True if the subgraph is a path, False otherwise - """ - if not nodes: - return False - C = H.subgraph(nodes) - if not is_connected(C): - return False - degs: dict[str | int, int] = dict(C.degree()) - if any(d > 2 for d in degs.values()): - return False - if len(C) == 1: - return True - ones = sum(1 for d in degs.values() if d == 1) - return (C.number_of_edges() == len(C) - 1) and (ones == 2) - - -def _order_nodes_along_path(H: "Graph[int | str]", nodes: list[Any]) -> list[Any]: - """ - Order nodes along a path component. - - :param H: the host graph - :param nodes: list of nodes in the path component - :return: ordered list of nodes along the path - """ - C = H.subgraph(nodes).copy() - if len(C) == 1: - return list(C.nodes()) - degs: dict[str | int, int] = dict(C.degree()) - start = [n for n, d in degs.items() if d == 1][0] - order = [start] - prev = None - cur = start - while True: - nbrs = [v for v in C.neighbors(cur) if v != prev] - if not nbrs: - break - nxt = nbrs[0] - order.append(nxt) - prev, cur = cur, nxt - return order - - -def _longest_path_approx(H: "Graph[int | str]", nodes: list[Any]) -> list[Any]: - """ - Approximate longest path in the subgraph induced by 'nodes' in H. 
- - :param H: the host graph - :param nodes: list of nodes to consider - :return: list of nodes along the approximate longest path - """ - C: Graph[int | str] = H.subgraph(nodes).copy() - if len(C) <= 1: - return list(C.nodes()) - lengths = dict(all_pairs_shortest_path_length(C)) - max_d = -1 - pair: tuple[Any, Any] | None = None - for u, dmap in lengths.items(): - for v, d in dmap.items(): - if d > max_d: - max_d = d - pair = (u, v) - assert pair is not None, "At least one pair should exist in non-empty connected graph" - return shortest_path(C, source=pair[0], target=pair[1]) - - -def _payload_from_order(G_src: "Graph[int | str]", order: list[Any]) -> dict[str, Any]: - """ - Create payload dictionary from ordered nodes. - - :param G_src: source graph - :param order: ordered list of nodes - :return: payload dictionary - """ - items: list[dict[str, Any]] = [] - for n in order: - d = G_src.nodes[n] - items.append( - { - "node": n, - "identity": d.get("identity"), - "smiles": d.get("smiles"), - "tags": sorted(d.get("tags", [])), - } - ) - return { - "n_monomers": len(items), - "ordered_monomers": items, - } - - -def _score_payload(pl: dict[str, Any]) -> tuple[int, int, tuple[str, ...]]: - """ - Score payload for comparison: (n_monomers, n_identified, node_key). - - :param pl: payload dictionary - :return: scoring tuple - """ - n = pl["n_monomers"] - n_ident = sum(1 for it in pl["ordered_monomers"] if it.get("identity") is not None) - node_key = tuple(str(it["node"]) for it in pl["ordered_monomers"]) - return (n, n_ident, node_key) - - -def linear_readout( - result: Result, - require_identified: bool = True, - mode: str = "all", # "all" | "best_per_level" | "global_best" - nesting_depth: int | None = None, -) -> dict[str, Any]: - """ - Linear backbone readouts. - - :param result: RetroMol Result object - :param require_identified: if ``True``, only consider monomer nodes with an - assigned identity. 
If ``False``, consider all monomer nodes - :param nesting_depth: maximum nesting level to analyze. If ``None``, all depths are included - - when ``nesting_depth`` is ``None``: iterate all graphs in DFS order and return - a structure identical to the previous version (keys and shapes), but each - entry now also includes a ``depth`` field for clarity. - - set ``nesting_depth = k`` to restrict analysis to graphs at that **true** - nesting level (root = 0, its children = 1, etc.). - :param mode: determines the aggregation mode of readouts - Supported values: - - ``"all"``: return all depth levels and paths. - - ``"best_per_level"``: return the best backbone per depth level. - - ``"global_best"``: return the globally best backbone only. - - :returns: - Depending on the selected ``mode``: - - **mode = "all"** - Returns: - ``{"levels": [ - {"dfs_index": int, "depth": int, - "strict_paths": [payload, ...], - "fallback": payload_or_None}, - ... - ]}`` - - **mode = "best_per_level"** - Returns: - ``{"levels": [ - {"dfs_index": int, "depth": int, - "strict_path": bool, - "backbone": payload, - "notes": str}, - ... - ]}`` - - **mode = "global_best"** - Returns: - ``{"dfs_index": int, "depth": int, - "strict_path": bool, - "backbone": payload, - "notes": str}`` - :rtype: - Dict[str, Any] - """ - metas = _graphs_with_metadata(result.graph) - if nesting_depth is not None: - metas = [m for m in metas if m["depth"] == nesting_depth] - if not metas: - msg = f"No graphs at nesting_depth={nesting_depth}." 
- if mode == "global_best": - return { - "dfs_index": -1, - "depth": nesting_depth, - "strict_path": False, - "backbone": {"n_monomers": 0, "ordered_monomers": []}, - "notes": msg, - } - else: - return {"levels": [], "notes": msg} - - # Per-graph analysis - entries: list[dict[str, Any]] = [] - for m in metas: - G = m["graph"] - dfs_idx = m["dfs_index"] - depth = m["depth"] - - monomer_nodes = _monomer_nodes_at_level(G, require_identified) - if not monomer_nodes: - entries.append( - { - "dfs_index": dfs_idx, - "depth": depth, - "strict_paths": [], - "fallback": None, - } - ) - continue - - MG = G.subgraph(monomer_nodes).copy() - comps = list(connected_components(MG)) - - strict_payloads: list[dict[str, Any]] = [] - for comp in comps: - nodes = list(comp) - if _is_path_component(MG, nodes): - order = _order_nodes_along_path(MG, nodes) - strict_payloads.append(_payload_from_order(G, order)) - - fallback_payload = None - if not strict_payloads and comps: - largest = max(comps, key=len) - approx_order = _longest_path_approx(MG, list(largest)) - fallback_payload = _payload_from_order(G, approx_order) - - entries.append( - { - "dfs_index": dfs_idx, - "depth": depth, - "strict_paths": strict_payloads, - "fallback": fallback_payload, - } - ) - - # Assemble per mode - if mode == "all": - # Preserve old shape (list under "levels"), but each entry includes depth now. - # Keep DFS ordering for stability. 
- entries.sort(key=lambda e: e["dfs_index"]) - return {"levels": entries} - - if mode == "best_per_level": - best_levels: list[dict[str, Any]] = [] - for e in sorted(entries, key=lambda x: x["dfs_index"]): - candidates: list[tuple[bool, dict[str, Any]]] = [(True, pl) for pl in e["strict_paths"]] - if not candidates and e["fallback"] is not None: - candidates.append((False, e["fallback"])) - if not candidates: - continue - best_idx = max(range(len(candidates)), key=lambda i: _score_payload(candidates[i][1])) - is_strict, payload = candidates[best_idx] - best_levels.append( - { - "dfs_index": e["dfs_index"], - "depth": e["depth"], - "strict_path": is_strict, - "backbone": payload, - "notes": "Strict path" if is_strict else "Fallback to longest-path approximation", - } - ) - return {"levels": best_levels} - - if mode == "global_best": - best: tuple[bool, dict[str, Any], int, int] | None = None # (is_strict, payload, dfs_index, depth) - for e in entries: - for pl in e["strict_paths"]: - if best is None or _score_payload(pl) > _score_payload(best[1]): - best = (True, pl, e["dfs_index"], e["depth"]) - if not e["strict_paths"] and e["fallback"] is not None: - pl = e["fallback"] - if best is None or _score_payload(pl) > _score_payload(best[1]): - best = (False, pl, e["dfs_index"], e["depth"]) - if best is None: - return { - "dfs_index": -1, - "depth": nesting_depth if nesting_depth is not None else -1, - "strict_path": False, - "backbone": {"n_monomers": 0, "ordered_monomers": []}, - "notes": "No monomer backbones found.", - } - is_strict, pl, dfs_idx, depth = best - return { - "dfs_index": dfs_idx, - "depth": depth, - "strict_path": is_strict, - "backbone": pl, - "notes": "Strict path" if is_strict else "Fallback to longest-path approximation", - } - - raise ValueError(f"Unknown mode: {mode!r}") - - -# Decorate the optimal_mappings function with a timeout -optimal_mappings_with_timeout = timeout_decorator(seconds=TIMEOUT_OPTIMAL_MAPPINGS)(optimal_mappings) - - -# Decorate 
the linear_readout function with a timeout -linear_readout_with_timeout = timeout_decorator(seconds=TIMEOUT_LINEAR_READOUT)(linear_readout) diff --git a/src/retromol/rules.py b/src/retromol/rules.py deleted file mode 100644 index 9f4312c..0000000 --- a/src/retromol/rules.py +++ /dev/null @@ -1,949 +0,0 @@ -"""This module contains functions for parsing RetroMol rules.""" - -import logging -from collections import defaultdict -from copy import deepcopy -from dataclasses import dataclass -from importlib.resources import files -from typing import Any - -import yaml -from rdkit.Chem.rdchem import PeriodicTable -from rdkit.Chem.rdMolDescriptors import CalcNumRings -from rdkit.Chem.rdmolops import ( - AssignAtomChiralTagsFromStructure, - AssignStereochemistry, - SetBondStereoFromDirections, -) -from tqdm import tqdm - -import retromol.data -from retromol.chem import ( - ChemicalReaction, - Mol, - count_fragments, - get_default_valence, - get_periodic_table, - get_tags_mol, - mol_to_inchikey, - mol_to_smiles, - sanitize_mol, - smarts_to_mol, - smarts_to_reaction, - smiles_to_mol, - stereo_summary, -) -from retromol.config import LOGGER_NAME -from retromol.helpers import sha256_hex - - -def check_tags_are_nonzero(mol: Mol) -> None: - """ - Check if all atom tags are nonzero. - - :param rea: molecule - - :raises ValueError: if any atom tag is 0 - """ - curr_tags = [a.GetIsotope() for a in mol.GetAtoms()] - - if any([tag == 0 for tag in curr_tags]): - raise ValueError("molecule contains atom tag 0") - - -def apply_mask(mol: Mol, msk: set[int]) -> dict[int, int]: - """ - Set atom numbers of atoms not in mask to 0 based on atom tags. - - :param mol: molecule - :param msk: mask of atom tags - :return: mapping of atom tags to atomic numbers - .. 
note:: this function modifies the reactant molecule in place - """ - # Check if all atom tags are nonzero - check_tags_are_nonzero(mol) - - # Get original tag to atomic num mapping - tag_to_anr = {a.GetIsotope(): a.GetAtomicNum() for a in mol.GetAtoms()} - - # Now we can apply the mask, set atomic num to 0 for atoms not in mask - for atom in mol.GetAtoms(): - if atom.GetIsotope() not in msk: - atom.SetAtomicNum(0) - - return tag_to_anr - - -def check_atomic_nums_are_nonzero(mol: Mol) -> None: - """ - Check if all atomic numbers are nonzero. - - :param rea: molecule - :raises ValueError: if any atomic number is 0 - """ - curr_anrs = [a.GetAtomicNum() for a in mol.GetAtoms()] - - if any([anr == 0 for anr in curr_anrs]): - raise ValueError("molecule contains atomic number 0") - - -def reset_atomic_nums(mol: Mol, tag_to_anr: dict[int, int]) -> None: - """ - Reset atomic numbers of atoms based on atom tags. - - :param mol: molecule - :param tag_to_anr: mapping of atom tags to atomic numbers - .. note:: this function modifies the reactant molecule in place - """ - for atom in mol.GetAtoms(): - if atom.GetIsotope() != 0: # newly added atoms in reaction have isotope 0 - original_anr = tag_to_anr.get(atom.GetIsotope(), None) - if original_anr is None: - raise ValueError(f"no atomic num found for atom tag {atom.GetIsotope()}") - atom.SetAtomicNum(original_anr) - - # Make sure that no atomic num is 0 - check_atomic_nums_are_nonzero(mol) - - -def correct_hydrogens(mol: Mol) -> None: - """ - Correct explicit hydrogens on atoms based on valence rules. - - :param mol: molecule - .. 
note:: this function modifies the reactant molecule in place - """ - for atom in mol.GetAtoms(): - # Skip aromatic and charged atoms - if atom.GetIsAromatic() or atom.GetFormalCharge() != 0: - continue - - # Skip phosphorus and sulfur (can have expanded valence) - if atom.GetAtomicNum() in (15, 16): - continue - - # Check if atom complies with valence rules, otherwise adjust explicit Hs - valence_bonds = int(sum([bond.GetValenceContrib(atom) for bond in atom.GetBonds()])) - default_valence = get_default_valence(atom.GetAtomicNum()) - num_hs = atom.GetNumExplicitHs() - - if default_valence - valence_bonds < num_hs: - new_valence = default_valence - valence_bonds - - if new_valence < 0: - raise ValueError("new atom valence is negative") - - atom.SetNumExplicitHs(new_valence) - - -def _collect_map_to_atomicnums(mols: list[Mol]) -> dict[int, list[int]]: - """ - Return map_number -> list of atomic numbers (with multiplicity). - - :param mols: list of molecules - :return: mapping of map numbers to list of atomic numbers - """ - out: dict[int, list[int]] = defaultdict(list) - for mol in mols: - for atom in mol.GetAtoms(): - m = atom.GetAtomMapNum() - if m > 0: - out[m].append(atom.GetAtomicNum()) - return out - - -def summarize_atomicnums(nums: list[int]) -> str: - """ - Compact helper for error messages, e.g., [6,6,8] -> Cx2,O. - - :param nums: list of atomic numbers - :return: compact string representation - """ - if not nums: - return "[]" - syms = [PeriodicTable.GetElementSymbol(get_periodic_table(), z) for z in nums] - # Build counts - from collections import Counter - - c = Counter(syms) - parts = [f"{el}x{cnt}" if cnt > 1 else el for el, cnt in sorted(c.items())] - return "[" + ",".join(parts) + "]" - - -@dataclass(frozen=True) -class _CompiledConds: - """ - Compiled conditions for reaction rules. 
- - :param requires_any: list of required substructures (any) - :param requires_all: list of required substructures (all) - :param forbids_any: list of forbidden substructures (any) - :param min_counts: list of (substructure, min count) tuples - :param max_counts: list of (substructure, max count) tuples - :param ring_min: minimum number of rings - :param ring_max: maximum number of rings - :param atom_min: minimum number of atoms - :param atom_max: maximum number of atoms - :param charge_min: minimum total charge - :param charge_max: maximum total charge - :param has_metal: whether the molecule must contain a metal - :param is_macrocycle: whether the molecule must be a macrocycle - """ - - requires_any: list[Mol] - requires_all: list[Mol] - forbids_any: list[Mol] - min_counts: list[tuple[Mol, int]] - max_counts: list[tuple[Mol, int]] - ring_min: int | None - ring_max: int | None - atom_min: int | None - atom_max: int | None - charge_min: int | None - charge_max: int | None - has_metal: bool | None - is_macrocycle: bool | None - - -def _compile_smarts_list(lst: list[str] | None) -> list[Mol]: - """ - Compile a list of SMARTS strings into RDKit Mol objects. - - :param lst: list of SMARTS strings - :return: list of RDKit Mol objects - """ - if not lst: - return [] - out: list[Mol] = [] - for s in lst: - m = smarts_to_mol(s) - out.append(m) - return out - - -def _compile_counts(d: dict[str, int] | None) -> list[tuple[Mol, int]]: - """ - Compile a dictionary of SMARTS strings to counts into a list of (Mol, count) tuples. 
- - :param d: dictionary of SMARTS strings to counts - :return: list of (Mol, count) tuples - """ - if not d: - return [] - return [(smarts_to_mol(k), v) for k, v in d.items()] - - -# Set of atomic numbers considered as metals -_METALS = { - 3, - 11, - 19, - 37, - 55, - 87, - 4, - 12, - 20, - 38, - 56, - 88, - *range(21, 31), - *range(39, 49), - *range(72, 81), - *range(57, 72), - *range(89, 104), - 13, - 31, - 49, - 50, - 81, - 82, - 83, -} - - -def _has_metal(mol: Mol) -> bool: - """ - Check if the molecule contains any metal atoms. - - :param mol: molecule - :return: True if the molecule contains metal atoms, otherwise False - """ - return any(a.GetAtomicNum() in _METALS for a in mol.GetAtoms()) - - -def _has_macrocycle(mol: Mol, min_size: int = 12) -> bool: - """ - Check if the molecule contains a macrocycle (ring of at least min_size). - - :param mol: molecule - :param min_size: minimum size of the ring to be considered a macrocycle - :return: True if the molecule contains a macrocycle, otherwise False - """ - for ring in mol.GetRingInfo().AtomRings(): - if len(ring) >= min_size: - return True - return False - - -def _passes_global(mol: Mol, C: _CompiledConds) -> bool: - """ - Check if a molecule passes the global conditions. 
- - :param mol: molecule - :param C: compiled conditions - :return: True if the molecule passes the conditions, otherwise False - """ - # SMARTS presence/absence - if C.requires_any and not any(mol.HasSubstructMatch(q) for q in C.requires_any): - return False - if any(not mol.HasSubstructMatch(q) for q in C.requires_all): - return False - if any(mol.HasSubstructMatch(q) for q in C.forbids_any): - return False - - # Count thresholds - for q, n in C.min_counts: - if len(mol.GetSubstructMatches(q)) < n: - return False - for q, n in C.max_counts: - if len(mol.GetSubstructMatches(q)) > n: - return False - - # Simple numeric props - n_ring = CalcNumRings(mol) - if C.ring_min is not None and n_ring < C.ring_min: - return False - if C.ring_max is not None and n_ring > C.ring_max: - return False - - n_atoms = mol.GetNumAtoms() - if C.atom_min is not None and n_atoms < C.atom_min: - return False - if C.atom_max is not None and n_atoms > C.atom_max: - return False - - charge = sum(a.GetFormalCharge() for a in mol.GetAtoms()) - if C.charge_min is not None and charge < C.charge_min: - return False - if C.charge_max is not None and charge > C.charge_max: - return False - - if C.has_metal is not None and _has_metal(mol) != C.has_metal: - return False - if C.is_macrocycle is not None and _has_macrocycle(mol) != C.is_macrocycle: - return False - - return True - - -@dataclass(frozen=True) -class ReactionRule: - """ - Preprocessing rule for a chemical reaction. 
- - :param id: internal identifier - :param rid: rule identifier - :param rxn: RDKit ChemicalReaction (optional if `smarts` is provided) - :param smarts: reaction SMARTS string - :param groups: groups the reaction rule belongs to - :param props: properties - """ - - id: int - rid: str - rxn: ChemicalReaction | None - smarts: str - groups: list[str] - props: dict[str, Any] - - def __post_init__(self) -> None: - # Allow constructing rxn from smarts in a frozen class - rxn = self.rxn - if rxn is None: - if not self.smarts: - raise ValueError(f"[{self.rid}] Either rxn or smarts must be provided.") - try: - rxn = smarts_to_reaction(self.smarts, use_smiles=False) - except Exception as e: - raise ValueError(f"[{self.rid}] Invalid reaction SMARTS: {e}") from e - object.__setattr__(self, "rxn", rxn) - - # Basic sanity: at least one reactant and one product - if rxn.GetNumReactantTemplates() == 0 or rxn.GetNumProductTemplates() == 0: - raise ValueError(f"[{self.rid}] Reaction must have >=1 reactant and >=1 product.") - - # Collect mapped atoms (molAtomMapNumber) from both sides - reactant_maps = _collect_map_to_atomicnums(list(rxn.GetReactants())) - product_maps = _collect_map_to_atomicnums(list(rxn.GetProducts())) - - # Keys (map numbers) must match exactly - maps_react = set(reactant_maps.keys()) - maps_prod = set(product_maps.keys()) - if maps_react != maps_prod: - missing_in_prod = sorted(maps_react - maps_prod) - missing_in_reac = sorted(maps_prod - maps_react) - msgs: list[str] = [] - if missing_in_prod: - msgs.append(f"map nums only on reactant side: {missing_in_prod}") - if missing_in_reac: - msgs.append(f"map nums only on product side: {missing_in_reac}") - raise ValueError(f"[{self.rid}] Mapped atoms don't add up: " + "; ".join(msgs)) - - # For every map number, multiplicity and atomic numbers must match - errors: list[str] = [] - for m in sorted(maps_react): - r_list = sorted(reactant_maps[m]) - p_list = sorted(product_maps[m]) - if r_list != p_list: - # This 
captures both multiplicity and element changes. - errors.append( - f"map {m}: reactants {summarize_atomicnums(r_list)} != products {summarize_atomicnums(p_list)}" - ) - - if errors: - raise ValueError( - f"[{self.rid}] Mapped atom consistency failed (no element changes allowed): " + "; ".join(errors) - ) - - # Read and compile conditions - reactant_conds = self.props.get("conditions", {}).get("reactant", {}) - product_conds = self.props.get("conditions", {}).get("product", {}) - - object.__setattr__( - self, - "_reactant_conds", - _CompiledConds( - requires_any=_compile_smarts_list(reactant_conds.get("requires_any")), - requires_all=_compile_smarts_list(reactant_conds.get("requires_all")), - forbids_any=_compile_smarts_list(reactant_conds.get("forbids_any")), - min_counts=_compile_counts(reactant_conds.get("min_counts")), - max_counts=_compile_counts(reactant_conds.get("max_counts")), - ring_min=(reactant_conds.get("ring_count") or {}).get("min"), - ring_max=(reactant_conds.get("ring_count") or {}).get("max"), - atom_min=(reactant_conds.get("atom_count") or {}).get("min"), - atom_max=(reactant_conds.get("atom_count") or {}).get("max"), - charge_min=(reactant_conds.get("total_charge") or {}).get("min"), - charge_max=(reactant_conds.get("total_charge") or {}).get("max"), - has_metal=reactant_conds.get("custom_props", {}).get("has_metal"), - is_macrocycle=reactant_conds.get("custom_props", {}).get("is_macrocycle"), - ), - ) - object.__setattr__( - self, - "_product_conds", - _CompiledConds( - requires_any=_compile_smarts_list(product_conds.get("requires_any")), - requires_all=_compile_smarts_list(product_conds.get("requires_all")), - forbids_any=_compile_smarts_list(product_conds.get("forbids_any")), - min_counts=_compile_counts(product_conds.get("min_counts")), - max_counts=_compile_counts(product_conds.get("max_counts")), - ring_min=(product_conds.get("ring_count") or {}).get("min"), - ring_max=(product_conds.get("ring_count") or {}).get("max"), - 
atom_min=(product_conds.get("atom_count") or {}).get("min"), - atom_max=(product_conds.get("atom_count") or {}).get("max"), - charge_min=(product_conds.get("total_charge") or {}).get("min"), - charge_max=(product_conds.get("total_charge") or {}).get("max"), - has_metal=product_conds.get("custom_props", {}).get("has_metal"), - is_macrocycle=product_conds.get("custom_props", {}).get("is_macrocycle"), - ), - ) - - def to_json_serializable_dict(self) -> dict[str, Any]: - """ - Convert the reaction rule to a JSON serializable dictionary. - - :return: JSON serializable dictionary - """ - return { - "rid": self.rid, - "smarts": self.smarts, - "groups": self.groups, - "props": self.props, - } - - @classmethod - def from_json_serializable_dict(cls, internal_identifier: int, d: dict[str, Any]) -> "ReactionRule": - """ - Convert a JSON serializable dictionary to a ReactionRule. - - :param internal_identifier: internal identifier for the reaction rule - :param d: JSON serializable dictionary - :return: ReactionRule - """ - return ReactionRule( - id=internal_identifier, - rid=d["rid"], - rxn=smarts_to_reaction(d["smarts"]), - smarts=d["smarts"], - groups=d["groups"], - props=d.get("props", {}), - ) - - def has_ring_matching_condition(self) -> bool: - """ - Check if the reaction has a ring matching condition. - - :return: True if the reaction has a ring matching condition, otherwise False - """ - return any([self.smarts.find(f";{rc}") != -1 for rc in ["R", "!R"]]) - - def expected_num_products(self) -> int: - """ - Get the expected number of products for the reaction. 
- - :return: expected number of products - """ - if self.rxn is None: - raise ValueError("reaction is not initialized") - - return self.rxn.GetNumProductTemplates() - - def __call__(self, rea: Mol, msk: set[int] | None = None) -> list[list[Mol]]: - """ - Apply the reaction, sanitize, preserve/reassign stereochemistry, - enforce mask WITHOUT mutating atomic numbers, and dereplicate results - in a stereo-aware, multiplicity-preserving, order-insensitive way. - - :param rea: reactant molecule - :param msk: set of atom tags (isotope-based tags) that are allowed to change - :return: list of unique product tuples (each tuple as a list[Mol]) - """ - logger = logging.getLogger(LOGGER_NAME) - - if self.rxn is None: - raise ValueError("reaction is not initialized") - - def _prepare_stereo(m: Mol) -> Mol: - # Reassign stereo cleanly without changing identity - mm = Mol(m) - SetBondStereoFromDirections(mm) - if mm.GetNumConformers() > 0: - AssignAtomChiralTagsFromStructure(mm, replaceExistingTags=True) - AssignStereochemistry(mm, cleanIt=True, force=True, flagPossibleStereoCenters=True) - return mm - - def _single_component(m: Mol) -> bool: - return count_fragments(m) == 1 - - def _sanitize_in_place(m: Mol) -> bool: - try: - correct_hydrogens(m) - sanitize_mol(m) - return True - except ValueError: - return False - - def _tag_to_idx(m: Mol) -> dict[int, int]: - # "atom tags" live in Isotope; ignore zeros - d: dict[int, int] = {} - for a in m.GetAtoms(): - t = a.GetIsotope() - if t: - d[t] = a.GetIdx() - return d - - def _neighbor_sig(m: Mol, ai: int) -> list[tuple[int, float]]: - # Neighbor signature by (neighbor tag or neighbor atomicnum if untagged, bond order) - out: list[tuple[int, float]] = [] - a = m.GetAtomWithIdx(ai) - for b in a.GetBonds(): - nb = b.GetOtherAtomIdx(ai) - na = m.GetAtomWithIdx(nb) - ntag = na.GetIsotope() - key = ntag if ntag else -na.GetAtomicNum() - out.append((key, float(b.GetBondTypeAsDouble()))) - out.sort() - return out - - def 
_mapped_tags_changed(r: Mol, p: Mol) -> set[int]: - """ - Heuristic diff between reactant and product by tags. - A tag is considered 'changed' if: - - the atom with that tag changes atomic number, OR - - its neighbor signature (by tagged IDs / atom types + bond order) changes. - - :param r: reactant molecule - :param p: product molecule - :return: set of changed atom tags - """ - changed: set[int] = set() - rmap = _tag_to_idx(r) - pmap = _tag_to_idx(p) - for t in set(rmap).intersection(pmap): - ra = r.GetAtomWithIdx(rmap[t]) - pa = p.GetAtomWithIdx(pmap[t]) - if ra.GetAtomicNum() != pa.GetAtomicNum(): - changed.add(t) - continue - if _neighbor_sig(r, rmap[t]) != _neighbor_sig(p, pmap[t]): - changed.add(t) - return changed - - def _preserves_mask(reactant: Mol, products: list[Mol], allowed: set[int]) -> bool: - """ - Check that only tags in `allowed` are changed across all products. - - :param reactant: reactant molecule - :param products: list of product molecules - :param allowed: set of allowed changed tags - :return: True if only allowed tags are changed, otherwise False - """ - if not allowed: - return True - changed: set[int] = set() - for pr in products: - changed |= _mapped_tags_changed(reactant, pr) - return changed.issubset(allowed) - - def _product_key(m: Mol) -> str: - """ - Stereo-aware, mapping-/tag-invariant canonical key for a product. - - Clears molAtomMapNumber props (if present) and strips isotope tags ONLY on the copy - so symmetric mappings don't duplicate results. - - Uses isomeric canonical SMILES to preserve R/S and E/Z. 
- - :param m: product molecule - :return: product key string - """ - mm = Mol(m) - for a in mm.GetAtoms(): - if a.HasProp("molAtomMapNumber"): - a.ClearProp("molAtomMapNumber") - # Strip isotope-based tags for the KEY only: - if a.GetIsotope(): - a.SetIsotope(0) - # Important: isomericSmiles preserves stereo - return mol_to_smiles(mm, isomeric=True, canonical=True) - - def _result_key(products: list[Mol]) -> tuple[tuple[str, int], ...]: - """ - Stereo-aware, order-insensitive, - - multiplicity-preserving key for a product tuple. - :param products: list of product molecules - :return: result key tuple - """ - # multiplicity-preserving multiset key (Counter of product keys) - from collections import Counter - - c = Counter(_product_key(p) for p in products) - return tuple(sorted(c.items(), key=lambda kv: kv[0])) - - logger.debug(f"({self.rid}) applying reaction rule to... {mol_to_smiles(deepcopy(rea), remove_tags=True)}") - - # Pre-filter on reactant - if not _passes_global(rea, self._reactant_conds): - logger.debug(f"({self.rid}) reactant fails global conditions") - return [] - - # Run reaction - results = self.rxn.RunReactants([rea]) - if not results: - logger.debug(f"({self.rid}) no valid products found after applying RDKit reaction") - return [] - - # Sanitize, filter invalids, product-conditions - kept: list[list[Mol]] = [] - for tup in results: - products: list[Mol] = [] - - # Quick shape check + sanitize - atom_tag_sets: list[set[int]] = [] - ok_tuple = True - for prod in tup: - if not _single_component(prod): - logger.debug( - f"({self.rid}) product not single component: {mol_to_smiles(deepcopy(prod), remove_tags=True)}" - ) - ok_tuple = False - break - - if not _sanitize_in_place(prod): - logger.debug( - f"({self.rid}) product failed sanitization: {mol_to_smiles(deepcopy(prod), remove_tags=True)}" - ) - ok_tuple = False - break - - # Reassign stereo on the sanitized product - prod = _prepare_stereo(prod) - - products.append(prod) - 
atom_tag_sets.append(set(get_tags_mol(prod))) - - if not ok_tuple: - logger.debug(f"({self.rid}) product failed validation") - continue - - # Disallow overlapping tag sets across products - total_tags = sum(len(s) for s in atom_tag_sets) - union_tags = len(set().union(*atom_tag_sets)) if atom_tag_sets else 0 - if atom_tag_sets and total_tags != union_tags: - logger.debug(f"({self.rid}) products share atom tags: {[mol_to_smiles(p) for p in products]}") - continue - - # Product-side global conditions - if not all(_passes_global(p, self._product_conds) for p in products): - logger.debug(f"({self.rid}) products fail global conditions") - continue - - # Mask check - if msk is not None and not _preserves_mask(rea, products, msk): - logger.debug(f"({self.rid}) products modify tags outside mask") - # Skip results that modify tags outside the mask - continue - - kept.append(products) - - if len(kept) <= 1: - return kept - - # Stereo-aware derep (order-insensitive, multiplicity-preserving) - seen: dict[tuple[tuple[str, int], ...], int] = {} - unique: list[list[Mol]] = [] - for res in kept: - key = _result_key(res) - if key in seen: - continue - seen[key] = 1 - unique.append(res) - - return unique - - -def DummyReactionRule(rid: str) -> ReactionRule: - """ - Create a dummy reaction rule for testing purposes. - - :param rid: rule identifier - :return: ReactionRule - """ - return ReactionRule( - id=0, - rid=rid, - rxn=smarts_to_reaction("[C:1]>>[C:1]"), - smarts="[C:1]>>[C:1]", - groups=[], - props={}, - ) - - -@dataclass(frozen=True) -class MatchingRule: - """ - Matching rule for a chemical compound. 
- - :param id: internal identifier - :param rid: rule identifier - :param smiles: SMILES string of the molecule - :param mol: molecule to match against - :param groups: groups the matching rule belongs to - :param props: properties of the rule - """ - - id: int - rid: str - smiles: str - mol: Mol - groups: list[str] - props: dict[str, Any] - - def to_json_serializable_dict(self) -> dict[str, Any]: - """Convert the matching rule to a JSON serializable dictionary. - - :return: JSON serializable dictionary - """ - return { - "rid": self.rid, - "mol": mol_to_smiles(self.mol), - "groups": self.groups, - "props": self.props, - } - - @classmethod - def from_json_serializable_dict(cls, internal_identifier: int, d: dict[str, Any]) -> "MatchingRule": - """ - Convert a JSON serializable dictionary to a MatchingRule. - - :param internal_identifier: internal identifier for the matching rule - :param d: JSON serializable dictionary - :return: MatchingRule - """ - return MatchingRule( - id=internal_identifier, - rid=d["rid"], - smiles=d["mol"], - mol=smiles_to_mol(d["mol"]), - groups=d.get("groups", []), - props=d.get("props", {}), - ) - - def is_match(self, mol: Mol, sch: bool = False) -> str | None: - """ - Check if the molecule matches the rule. 
- - :param mol: molecule - :param sch: flag to enable/disable stereochemistry matching - :return: name of the rule if the molecule matches, otherwise None - """ - # No stereochemistry matching, check for substructure match - has_substruct_match = mol.HasSubstructMatch(self.mol, useChirality=sch) - has_equal_num_atoms = mol.GetNumAtoms() == self.mol.GetNumAtoms() - has_equal_num_bonds = mol.GetNumBonds() == self.mol.GetNumBonds() - if has_substruct_match and has_equal_num_atoms and has_equal_num_bonds: - if sch: - tag = stereo_summary(mol) - return f"{self.rid}[{tag}]" if tag != "none" else self.rid - return self.rid - else: - return None - - -class Rules: - def __init__( - self, - reaction_rules: list[ReactionRule], - matching_rules: list[MatchingRule], - sha256_reaction_rules: str | None = None, - sha256_matching_rules: str | None = None, - ) -> None: - """ - Initialize the rules for processing compounds. - - :param reaction_rules: list of reaction rules - :param matching_rules: list of matching rules - :param sha256_reaction_rules: SHA256 hash of the reaction rules (optional) - :param sha256_matching_rules: SHA256 hash of the matching rules (optional) - """ - self._logger = logging.getLogger(LOGGER_NAME) - self._reaction_rules = reaction_rules - self._matching_rules = matching_rules - self.sha256_reaction_rules = sha256_reaction_rules - self.sha256_matching_rules = sha256_matching_rules - - def __repr__(self) -> str: - return f"" - - def get_reaction_rules(self, group_names: list[str] | None = None) -> list[ReactionRule]: - """ - Get the reaction rules. 
- - :return: list of reaction rules - """ - if group_names is not None: - # Filter reaction rules by group name - self._logger.debug(f"retrieving reaction rules for groups: {group_names}") - - return [r for r in self._reaction_rules if any(g in r.groups for g in group_names)] - - return self._reaction_rules - - def get_matching_rules(self) -> list[MatchingRule]: - """ - Get the matching rules sorted by priori - - :return: sorted list of matching rules - """ - matching_rules = self._matching_rules - - return matching_rules - - def check_for_duplicates(self) -> None: - """ - Checks for duplicate items in matching rules. - - :raises ValueError: if duplicate items are found - """ - logger = logging.getLogger(LOGGER_NAME) - - errors_seen = 0 - - seen: dict[str, MatchingRule] = {} - for rule in tqdm(self._matching_rules, desc="checking for duplicate matching rules"): - curr_rid = rule.rid - curr_mol = rule.mol - - try: - inchikey = mol_to_inchikey(curr_mol) - except Exception as e: - raise RuntimeError(f"Failed to make InChIKey for rule {curr_rid}") from e - - if inchikey in seen: - prev: MatchingRule = seen[inchikey] - prev_rid = prev.rid - - logger.warning( - "Duplicate matching rule detected:\n" - f"- First: {prev_rid} ({mol_to_smiles(deepcopy(prev.mol), remove_tags=True)})\n" - f"- Second: {curr_rid} ({mol_to_smiles(deepcopy(curr_mol), remove_tags=True)})\n" - f"- InChIKey: {inchikey}" - ) - errors_seen += 1 - - seen[inchikey] = rule - - if errors_seen > 0: - # Ask user input to contine or not; don't throw error - logger.warning(f"Found {errors_seen} duplicate matching rules.") - user_input = input("Do you want to continue? 
(y/n): ") - if user_input.lower() != "y": - raise ValueError("Duplicate matching rules found, exiting.") - else: - logger.warning("Continuing despite duplicate matching rules.") - else: - logger.info("No duplicate matching rules found.") - - -def get_path_default_reaction_rules() -> str: - """ - Get the default path to the reaction rules JSON file. - - :return: path to the default reaction rules JSON file - """ - return files(retromol.data).joinpath("default_reaction_rules.yml") - - -def get_path_default_matching_rules() -> str: - """ - Get the default path to the matching rules JSON file. - - :return: path to the default matching rules JSON file - """ - return files(retromol.data).joinpath("default_matching_rules.yml") - - -def get_path_default_wave_config() -> str: - """ - Get the default path to the wave configuration JSON file. - - :return: path to the default wave configuration JSON file - """ - return files(retromol.data).joinpath("default_wave_config.yml") - - -def load_rules_from_files( - path_reaction_rules: str, - path_matching_rules: str, -) -> Rules: - """ - Load rules from JSON files. 
- - :param path_reaction_rules: path to the reaction rules JSON file - :param path_matching_rules: path to the matching rules JSON file - :return: Rules object containing the loaded rules - """ - reaction_rules_src = open(path_reaction_rules).read() - matching_rules_src = open(path_matching_rules).read() - sha256_reaction_rules = sha256_hex(reaction_rules_src) - sha256_matching_rules = sha256_hex(matching_rules_src) - - with open(path_reaction_rules) as fo: - reaction_rules_data = yaml.safe_load(fo) - reaction_rules = [ReactionRule.from_json_serializable_dict(i, r) for i, r in enumerate(reaction_rules_data)] - - with open(path_matching_rules) as fo: - matching_rules_data = yaml.safe_load(fo) - matching_rules = [MatchingRule.from_json_serializable_dict(i, r) for i, r in enumerate(matching_rules_data)] - - return Rules(reaction_rules, matching_rules, sha256_reaction_rules, sha256_matching_rules) diff --git a/src/retromol/streaming.py b/src/retromol/streaming.py deleted file mode 100644 index 1c2cfff..0000000 --- a/src/retromol/streaming.py +++ /dev/null @@ -1,259 +0,0 @@ -"""Streaming RetroMol runs with multiprocessing.""" - -from __future__ import annotations - -from collections.abc import Callable, Iterable, Iterator -from dataclasses import dataclass -from multiprocessing import Pool -from typing import Any - -import yaml -from pandas import DataFrame, read_csv -from rdkit.Chem.rdmolfiles import SDMolSupplier - -from retromol import api, io, rules -from retromol.chem import mol_to_inchikey, mol_to_smiles, sanitize_mol -from retromol.helpers import iter_json -from retromol.rules import Rules - -_G_RULE_SET = None -_G_WAVE_CONFIGS = None -_G_MATCH_STEREO = None - - -def _init_worker(rule_set: Rules, wave_configs: list[dict[str, Any]] | None, match_stereo: bool) -> None: - """ - Initialize worker process with necessary global variables. 
- - :param rule_set: reaction/matching rule set - :param wave_configs: wave configuration dicts - :param match_stereo: whether to match stereo in RetroMol runs - """ - global _G_RULE_SET, _G_WAVE_CONFIGS, _G_MATCH_STEREO - _G_RULE_SET = rule_set - _G_WAVE_CONFIGS = wave_configs - _G_MATCH_STEREO = match_stereo - - -def _process_compound( - args_tuple: tuple[str, str, dict[str, Any]], -) -> tuple[str, dict[str, Any] | None, str | None]: - """ - Process a single compound in a worker process. - - :param args_tuple: (inchikey, smiles, props) - :return: (inchikey, serialized_result or None on error, error message or None on success) - """ - inchikey, smiles, props = args_tuple - try: - mol = io.Input(inchikey, smiles, props=props or {}) - if _G_RULE_SET is None: - raise RuntimeError("Worker not properly initialized with rule set.") - if _G_WAVE_CONFIGS is None: - raise RuntimeError("Worker not properly initialized with wave configs.") - result_obj = api.run_retromol_with_timeout( - mol, - _G_RULE_SET, - _G_WAVE_CONFIGS, - _G_MATCH_STEREO if _G_MATCH_STEREO is not None else False, - ) - return inchikey, result_obj.serialize(), None - except Exception as e: - # traceback not returned here to keep workers light-weight; caller can log - return inchikey, None, str(e) - - -@dataclass -class ResultEvent: - """ - Represents the result of processing a single compound. - - :param inchikey: InChIKey of the processed compound - :param result: serialized result dict or None if there was an error - :param error: error message string or None if processing was successful - """ - - inchikey: str - result: dict[str, Any] | None # serialized result or None on error - error: str | None # error message or None on success - - -def _task_buffered_iterator( - source_iter: Iterable[dict[str, Any]], - *, - id_col: str, - smiles_col: str, - batch_size: int, -) -> Iterator[list[tuple[str, str, dict[str, Any]]]]: - """ - Convert row dicts into (inchikey, smiles, props) tuples and yield in batches. 
- - :param source_iter: iterable of row dicts - :param id_col: name of column containing InChIKey - :param smiles_col: name of column containing SMILES - :param batch_size: number of compounds per batch - :return: iterator over lists of (inchikey, smiles, props) tuples - """ - buf: list[tuple[str, str, dict[str, Any]]] = [] - for rec in source_iter: - if id_col not in rec or smiles_col not in rec: - continue - ik = str(rec[id_col]) - smi = str(rec[smiles_col]) - buf.append((ik, smi, rec)) - if len(buf) >= batch_size: - yield buf - buf = [] - if buf: - yield buf - - -def run_retromol_stream( - *, - # Either provide loaded objects... - rule_set: Rules | None = None, - wave_configs: list[dict[str, Any]] | None = None, - # ...or point to files (if provided, they override the loaded objects) - reaction_rules_path: str | None = None, - matching_rules_path: str | None = None, - wave_config_path: str | None = None, - match_stereo: bool = False, - # Data source: an iterable of row dicts containing id_col and smiles_col - row_iter: Iterable[dict[str, Any]], - id_col: str = "inchikey", - smiles_col: str = "smiles", - # Concurrency knobs (match CLI defaults) - workers: int = 1, - batch_size: int = 2000, - pool_chunksize: int = 50, - maxtasksperchild: int = 2000, - # Optional sink callback - on_result: Callable[[ResultEvent], None] | None = None, -) -> Iterator[ResultEvent]: - """ - Stream RetroMol results with multiprocessing, yielding ResultEvent as soon as - each compound finishes. No files/logs are written here—callers are free to do so. 
- - :param rule_set: pre-loaded reaction/matching rule set - :param wave_configs: pre-loaded wave configuration dicts - :param reaction_rules_path: path to reaction rules file (YAML) - :param matching_rules_path: path to matching rules file (YAML) - :param wave_config_path: path to wave configuration file (YAML) - :param match_stereo: whether to match stereo in RetroMol runs - :param row_iter: iterable of row dicts containing at least id_col and smiles_col - :param id_col: name of column containing InChIKey (default: "inchikey") - :param smiles_col: name of column containing SMILES (default: "smiles") - :param workers: number of worker processes (default: 1) - :param batch_size: number of compounds to send to each worker at once (default: 2000) - :param pool_chunksize: chunksize for imap_unordered (default: 50) - :param maxtasksperchild: max tasks per worker before restart (default: 2000) - :param on_result: optional callback receiving each ResultEvent as it arrives - :return: iterator over ResultEvent objects - """ - # Load/prepare config exactly once (like CLI) - if reaction_rules_path and matching_rules_path: - rule_set = rules.load_rules_from_files(reaction_rules_path, matching_rules_path) - elif rule_set is None: - raise ValueError("Provide either (reaction_rules_path & matching_rules_path) or an already loaded rule_set.") - - if wave_config_path: - with open(wave_config_path) as f: - wave_configs = yaml.safe_load(f) - if wave_configs is None: - raise ValueError("Provide wave_configs dict or wave_config_path.") - - # Start worker pool with same init pattern - with Pool( - processes=workers, - initializer=_init_worker, - initargs=(rule_set, wave_configs, match_stereo), - maxtasksperchild=maxtasksperchild, - ) as pool: - for task_batch in _task_buffered_iterator( - row_iter, id_col=id_col, smiles_col=smiles_col, batch_size=batch_size - ): - for ik, serialized, err in pool.imap_unordered(_process_compound, task_batch, chunksize=pool_chunksize): - evt = 
ResultEvent(ik, serialized, err) - if on_result is not None: - on_result(evt) - yield evt - - -def stream_table_rows( - path: str, - *, - sep: str = ",", - chunksize: int = 20_000, -) -> Iterator[dict[str, Any]]: - """ - Stream CSV/TSV rows as dicts. Keeps memory usage low (chunked). - - :param path: path to CSV/TSV file - :param sep: field separator (default: ",") - :param chunksize: number of rows to read per chunk (default: 20,000) - :return: iterator over row dicts - """ - chunks: Iterator[DataFrame] = read_csv( - path, - sep=sep, - chunksize=chunksize, - dtype=str, - keep_default_na=False, - ) - - for chunk in chunks: - # iterrows() -> Iterator[Tuple[int, Series]] - for _, row in chunk.iterrows(): - yield row.to_dict() - - -def stream_sdf_records( - sdf_path: str, - *, - fast: bool = False, -) -> Iterator[dict[str, Any]]: - """ - Stream SDF as dict rows: {'inchikey': , 'smiles': , ...props} - Matches CLI behavior including opportunistic sanitize for IK/SMILES. - - :param sdf_path: path to SDF file - :param fast: if True, skips sanitization and H removal (default: False) - :return: iterator over record dicts - """ - sanitize = not fast - removeHs = fast - suppl = SDMolSupplier(sdf_path, sanitize=sanitize, removeHs=removeHs) - for mol in suppl: - if mol is None: - continue - try: - try: - ik = mol_to_inchikey(mol) - smi = mol_to_smiles(mol) - except Exception: - sanitize_mol(mol) - ik = mol_to_inchikey(mol) - smi = mol_to_smiles(mol) - rec = {"inchikey": ik, "smiles": smi} - for pname in mol.GetPropNames(): - rec[pname] = mol.GetProp(pname) - yield rec - except Exception: - continue - - -def stream_json_records( - path: str, - *, - jsonl: bool = False, -) -> Iterator[dict[str, Any]]: - """ - Stream JSON or JSONL records as dicts. 
- - :param path: path to JSON or JSONL file - :param jsonl: if True, treat as JSONL (one JSON object per line) - :return: iterator over record dicts - """ - for rec in iter_json(path, jsonl=jsonl): - if isinstance(rec, dict): - yield rec diff --git a/src/retromol/utils/__init__.py b/src/retromol/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/retromol/utils/hashing.py b/src/retromol/utils/hashing.py new file mode 100644 index 0000000..1cea932 --- /dev/null +++ b/src/retromol/utils/hashing.py @@ -0,0 +1,25 @@ +"""Module for hashing utilities in RetroMol.""" + +import hashlib + + +def sha256_hex(s: str) -> str: + """ + Compute the SHA-256 hash of string and return its hexadecimal representation. + + :param s: input string to hash + :return: hexadecimal representation of the SHA-256 hash + .. note:: None is treated as an empty string + """ + return hashlib.sha256((s or "").encode("utf-8")).hexdigest() + + +def blake64_hex(s: str) -> str: + """ + Compute the BLAKE2b hash of string and return the first 16 characters. + + :param s: input string to hash + :return: first 16 characters of the BLAKE2b hash in hexadecimal representation + .. note:: None is treated as an empty string + """ + return hashlib.blake2b((s or "").encode("utf-8"), digest_size=8).hexdigest() diff --git a/src/retromol/utils/logging.py b/src/retromol/utils/logging.py new file mode 100644 index 0000000..10c8e58 --- /dev/null +++ b/src/retromol/utils/logging.py @@ -0,0 +1,82 @@ +"""Utility functions for logging.""" + +from __future__ import annotations + +import logging +import sys + + +PACKAGE_LOGGER = "retromol" + +STANDARD_FMT = "%(asctime)s | %(levelname)s | %(name)s | %(message)s" +STANDARD_DATEFMT = "%Y-%m-%d %H:%M:%S" + + +def setup_logging( + level: str | int = "INFO", + *, + fmt: str = STANDARD_FMT, + datefmt: str = STANDARD_DATEFMT, + stream: None | int | str | object = None, +) -> None: + """ + Set up logging for the retromol package. 
+ + :param level: log level for console output + :param fmt: log message format + :param datefmt: date format for log messages + :param stream: output stream for console logs; defaults to sys.stderr + .. note:: safe to call multiple times; library code should not call this function; + it is intended for use by applications using the library + """ + if stream is None: + stream = sys.stderr + + if isinstance(level, str): + level = level.upper() + + root = logging.getLogger() + root.setLevel(level) + + handler = logging.StreamHandler(stream) + handler.setLevel(level) + handler.setFormatter(logging.Formatter(fmt=fmt, datefmt=datefmt)) + + # Avoid duplicate handlers if called repeatedly (common in notebooks) + # Keep it simple: remove existing handlers created by previous setup calls. + root.handlers = [handler] + + # Make sure package logger propagates to root + logging.getLogger(PACKAGE_LOGGER).propagate = True + + +def add_file_handler( + logfile: str, + *, + level: str | int = "DEBUG", + fmt: str = STANDARD_FMT, + datefmt: str = STANDARD_DATEFMT, +) -> None: + """ + Add a file handler to the root logger. + + :param logfile: path to log file + :param level: log level for file output + .. 
note:: intended to be called after setup_logginer(); safe to call multiple times + for the same logfile + """ + if isinstance(level, str): + level = level.upper() + + root = logging.getLogger() + + # Prevent duplicate file handlers for the same path + for h in root.handlers: + if isinstance(h, logging.FileHandler) and h.baseFilename == logfile: + return + + fh = logging.FileHandler(logfile) + fh.setLevel(level) + fh.setFormatter(logging.Formatter(fmt=fmt, datefmt=datefmt)) + + root.addHandler(fh) diff --git a/src/retromol/helpers.py b/src/retromol/utils/timeout.py similarity index 61% rename from src/retromol/helpers.py rename to src/retromol/utils/timeout.py index 7b97c95..3bcf3f2 100644 --- a/src/retromol/helpers.py +++ b/src/retromol/utils/timeout.py @@ -1,38 +1,20 @@ -"""This module provides helper functions for the RetroMol package.""" +"""Utilities for function timeouts using SIGALRM.""" -import hashlib -import json import signal from collections.abc import Callable, Generator from typing import Any, Generic, ParamSpec, TypeVar -import ijson - -from retromol.errors import FunctionTimeoutError P = ParamSpec("P") T = TypeVar("T") -def sha256_hex(s: str) -> str: - """ - Compute the SHA-256 hash of the input string `s` and return its hexadecimal representation. - If `s` is None, treat it as an empty string. - - :param s: input string to hash - :return: hexadecimal representation of the SHA-256 hash +class FunctionTimeoutError(Exception): """ - return hashlib.sha256((s or "").encode("utf-8")).hexdigest() - - -def blake64_hex(s: str) -> str: + Custom exception for function timeout. 
""" - Compute the BLAKE2b hash of the input string `s` and return the first 16 characters - :param s: input string to hash - :return: 16-character hex string (64 bits) - """ - return hashlib.blake2b((s or "").encode("utf-8"), digest_size=8).hexdigest() + pass def _timeout_handler(signum: int, frame: Any) -> None: @@ -97,22 +79,3 @@ def decorate(func: Callable[P, T]) -> _TimeoutWrapper[P, T]: return _TimeoutWrapper(func, seconds) return decorate - - -def iter_json(path: str, jsonl: bool = False) -> Generator[Any, None, None]: - """ - Stream items from a JSON array or a JSON Lines (JSONL) file. - - :param path: path to the JSON or JSONL file - :param jsonl: if True, treat the file as JSONL (one JSON object per line). If False, assume a single JSON array - :yield: parsed JSON objects - """ - with open(path, "rb") as f: - if jsonl: - for line in f: - line = line.strip() - if not line: - continue - yield json.loads(line) - else: - yield from ijson.items(f, "item") diff --git a/src/retromol/visualization/__init__.py b/src/retromol/visualization/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/retromol/visualization/reaction_graph.py b/src/retromol/visualization/reaction_graph.py new file mode 100644 index 0000000..db7f9c8 --- /dev/null +++ b/src/retromol/visualization/reaction_graph.py @@ -0,0 +1,90 @@ +"""Visualization utilities for ReactionGraph.""" + +from retromol.model.reaction_graph import ReactionGraph +from retromol.chem.mol import mol_to_smiles + + +def visualize_reaction_graph(g: ReactionGraph, html_path: str, root_enc: str | None = None) -> None: + """ + Visualize ReactionGraph. + + :param g: ReactionGraph to visualize + :param html_path: path to save the HTML visualization + :param root_enc: optional root molecule encoding to highlight + .. note:: requires pyvis package + """ + + try: + from pyvis.network import Network + except ImportError as e: + raise ImportError("Requires pyvis. 
Install with: pip install pyvis") from e + + # Build identified map from your graph (as in your code) + identified = {} + for enc, node in getattr(g, "identified_nodes", {}).items(): + identified[enc] = node.identity + + net = Network(height="800px", width="100%", directed=True, notebook=False) + net.toggle_physics(True) + + # Use prefixed IDs to avoid collisions with reaction node IDs + def mol_vid(enc: str) -> str: + """ + Generate a unique molecule node ID. + + :param enc: molecule encoding + :return: str: unique molecule node ID + """ + return f"m:{enc}" + + def rxn_vid(i: int) -> str: + """ + Generate a unique reaction node ID. + + :param i: reaction index + :return: str: unique reaction node ID + """ + return f"r:{i}" + + # Add molecule nodes + for enc, node in g.nodes.items(): + + color = "lightgreen" if root_enc is not None and enc == root_enc else "lightblue" + + identity = None + if enc not in identified and enc == root_enc: + identity = "root" + elif enc in identified and identified[enc]: + identity = identified[enc].name + + label = str(identity) if identity else "mol" + net.add_node(mol_vid(enc), label=label, title=label, shape="ellipse", color=color, smiles=mol_to_smiles(node.mol, include_tags=False)) + + # Add reaction nodes, and edges between molecules and reactions + for i, e in enumerate(g.edges): + title = ", ".join(e.step.names) if getattr(e.step, "names", None) else "" + + net.add_node(rxn_vid(i), label="rxn", title=title, shape="box") + + # src mol -> reaction + if e.src in g.nodes: + net.add_edge(mol_vid(e.src), rxn_vid(i), title="reactant", arrows="to") + + # reaction -> dst mol(s) + for dst in e.dsts: + if dst not in g.nodes: + continue + net.add_edge(rxn_vid(i), mol_vid(dst), title="product", arrows="to") + + # Options + net.set_options( + """ + var options = { + "edges": {"smooth": false}, + "interaction": {"hover": true, "tooltipDelay": 80}, + "physics": {"stabilization": true} + } + """ + ) + + net.write_html(html_path, notebook=False) 
diff --git a/tests/conftest.py b/tests/conftest.py index a450819..51e8cf8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,19 +1,17 @@ -# -*- coding: utf-8 -*- - """Pytest configuration for loading rule sets and wave configurations.""" import pytest -from .helpers import load_rule_set, load_wave_config - +from retromol.model.rules import RuleSet -@pytest.fixture(scope="session") -def rule_set(): - """Load rule set once per test session.""" - return load_rule_set() +from .helpers import load_rule_set @pytest.fixture(scope="session") -def wave_config(): - """Load a simple wave configuration once per test session.""" - return load_wave_config() +def ruleset() -> RuleSet: + """ + Load rule set once per test session. + + :return: the loaded RuleSet object + """ + return load_rule_set() diff --git a/tests/data/integration_demo_set.py b/tests/data/integration_demo_set.py index f8eb1e0..247706c 100644 --- a/tests/data/integration_demo_set.py +++ b/tests/data/integration_demo_set.py @@ -2,1294 +2,550 @@ """Integration test cases for molecule coverage scoring and optimal mapping identification.""" -from typing import List, Tuple - - # Cases are formatted as: # # ( # name, # smiles, -# expected__coverage_score, -# [ list_of_expected_optimal_mappings, ... 
] +# expected coverage_score, +# list of found monomers # ), # -CASES: List[Tuple[str, str, float, List[List[str]]]] = [ +CASES: list[tuple[str, str, float, list[str]]] = [ ( "10-deoxymethynolide", r"CC[C@@H]1[C@@H](/C=C/C(=O)[C@@H](C[C@@H]([C@@H]([C@H](C(=O)O1)C)O)C)C)C", 1.0, - [["A2", "B2", "B2", "C1", "D2", "propanoic acid"]], + ["A2", "B2", "B2", "C1", "D2", "propanoic acid"], ), ( "13-deoxytedanolide", r"C/C=C\[C@H](C)[C@@H]1[C@](O1)(C)[C@H]([C@H]2COC(=O)[C@@H]([C@H]([C@@H](C(=O)[C@@H]([C@H](/C(=C/[C@@H](C(=O)CC[C@H](C2=O)C)C)/C)O)C)C)OC)O)O", 1.0, - [ - [ - "A2", - "A2", - "A2", - "B2", - "B5", - "B7", - "C2", - "C2", - "C2", - "D1", - "methylation", - "oxidation", - "propanoic acid", - ] - ], - ), - ("2-deoxystreptamine", r"C1C(C(C(C(C1N)O)O)O)N", 1.0, [["glycosylation"], ["streptamine"]]), + ["A2", "A2", "A2", "B2", "B5", "B7", "C2", "C2", "C2", "D1", "methylation", "oxidation", "propanoic acid"], + ), + ( + "2-deoxystreptamine", + r"C1C(C(C(C(C1N)O)O)O)N", + 1.0, + ["4,6-diaminocyclohexane-1,2,3-triol"], + ), ( "6-deoxyerythronolide B", r"CC[C@@H]1[C@@H]([C@@H]([C@H](C(=O)[C@@H](C[C@@H]([C@@H]([C@H]([C@@H]([C@H](C(=O)O1)C)O)C)O)C)C)C)O)C", 1.0, - [["A2", "B2", "B2", "B2", "B2", "D2", "propanoic acid"]], + ["A2", "B2", "B2", "B2", "B2", "D2", "propanoic acid"], ), ( "AF-toxin", r"CCC(C)C(C(=O)OC(/C=C/C=C/C=C/C(=O)O)C1(CO1)C)OC(=O)C(C(C)(C)O)O", 1.0, - [ - [ - "2-hydroxy-3-methylpentanoic acid", - "A10", - "B10", - "C1", - "C1", - "C1", - "acetic acid", - "acetic acid", - "oxidation", - ] - ], + ["acetic acid", "B10", "acetic acid", "D17", "2-hydroxy-3-methylpentanoic acid", "C1", "C1", "C1", "oxidation"] ), ( "abyssomicin C", r"C[C@@H]1C[C@]23OC(=O)C4=C2OC1[C@H](O)C3\C=C\C(=O)[C@@H](C)C[C@@H](C)C4=O", - 0.6, - [["A2", "C1", "C1", "C1", "D2", "acetic acid", "oxidation"]], + 0.04, + ["oxidation"], ), ( "atrop-abyssocymin C", r"CC1CC23OC(=O)C4=C2OC1C(O)C3\C=C/C(=O)C(C)CC(C)C4=O", - 0.6, - [["A2", "C1", "C1", "C1", "D2", "acetic acid", "oxidation"]], + 0.04, + 
["oxidation"], ), ( "aculeximycin", r"CCCC(O[C@H]1C[C@](C)(N)[C@H](O)[C@H](C)O1)C(C)C(O)C(CC)\C=C\C(O)C(C)C1C\C=C(C)\C(O)C(C)C(CC(O)C(C)C(O)CC2CC(O)C(O)C(O)(CC(O[C@@H]3O[C@H](C)[C@@H](O)[C@H](O[C@H]4C[C@@H](N)[C@H](O)[C@@H](C)O4)[C@H]3O[C@@H]3O[C@H](C)[C@@H](O)[C@H](O)[C@H]3O)C(C)CCC(O)CC(O)C\C=C(CC)\C(=O)O1)O2)O[C@@H]1O[C@H](CO)[C@@H](O)[C@H](O)[C@@H]1O", 1.0, - [ - [ - "A1", - "B1", - "B1", - "B1", - "B1", - "B1", - "B1", - "B2", - "B2", - "B2", - "B2", - "B2", - "B4", - "B5", - "C1", - "C2", - "C4", - "D1", - "D1", - "acetic acid", - "glycosylation", - "glycosylation", - "glycosylation", - "glycosylation", - "glycosylation", - ] - ], + ["4-amino-4,6-dimethyloxane-2,5-diol", "A1", "B1", "B1", "B1", "B1", "B1", "B1", "B2", "B2", "B2", "B2", "B2", "B4", "B5", "C1", "C2", "C4", "D1", "D1", "acetic acid", "butanoic acid", "glucose", "rhamnose", "rhamnose", "sugar"], ), ( "acutiphycin", r"CCCCC[C@@H]1C/C=C(\[C@H](C(C(=O)[C@H](/C=C(\[C@@H]2C[C@@H](C[C@@](O2)(CC(=O)O1)O)O)/C)C)(C)C)O)/C", 1.0, - [["A1", "A2", "B1", "B1", "B1", "B3", "C2", "C2", "D1", "D1", "acetic acid"]], + ["A1", "A2", "B1", "B1", "B1", "B3", "C2", "C2", "D1", "D1", "acetic acid", "butanoic acid", "hexanoic acid"], ), ( "aflatoxin G1", r"COC1=C2C3=C(C(=O)OCC3)C(=O)OC2=C4[C@H]5C=CO[C@H]5OC4=C1", 0.04, - [["methylation"]], + ["methylation"], ), ( "alternapyrone", r"CCC(C)/C=C(\C)/CCCC(C)/C=C(\C)/C=C(\C)/CC(C)C1=C(C(=C(C(=O)O1)C)O)C", 1.0, - [["A2", "A2", "C2", "C2", "C2", "D1", "D2", "D2", "D2", "acetic acid"]], + ["A2", "A2", "C2", "C2", "C2", "D1", "D2", "D2", "D2", "acetic acid"], ), ( "amicoumacin", r"CC(C)C[C@@H]([C@@H]1CC2=C(C(=CC=C2)O)C(=O)O1)NC(=O)[C@H]([C@H]([C@H](CC(=O)N)N)O)O", 1.0, - [["A1", "B1", "B5", "C1", "C1", "asparagine", "leucine"]], + ["A1", "B1", "B5", "C1", "C1", "asparagine", "leucine"], ), ( "amphidinolide J", r"CCC/C=C/[C@@H](C)[C@H]1C(/C=C\C([C@H](C=CCCC(=C)[C@H](CC(=O)O1)C)O)C)O", 1.0, - [["B1", "B1", "C1", "C1", "C2", "D1", "D13", "D8", "D9", "acetic acid"]], + ["B1", "B1", 
"C1", "C1", "C2", "D1", "D10", "D15", "D8", "acetic acid", "trans-2-hexanoic acid", "butanoic acid"], ), ( "amphidinolide P", r"C[C@@H]1C(=C)C[C@H]2[C@H]3[C@@H](O3)CC(=C)/C=C/[C@H](OC(=O)C[C@@]1(O2)O)[C@H](C)C(=C)C", 1.0, - [["2-methylprop-2-enoic acid", "A1", "A8", "A9", "B1", "C1", "C1", "D9", "oxidation"]], + ["2-methylprop-2-enoic acid", "A1", "A8", "A9", "B1", "C1", "C1", "D10", "oxidation"], ), ( "ansamitocin P-3 ", r"C[C@@H]1[C@@H]2C[C@]([C@@H](/C=C/C=C(/CC3=CC(=C(C(=C3)OC)Cl)N(C(=O)C[C@@H]([C@]4([C@H]1O4)C)OC(=O)C(C)C)C)\C)OC)(NC(=O)O2)O", 1.0, - [ - [ - "3-amino-5-hydroxybenzoic acid", - "A1", - "B1", - "B2", - "C1", - "C2", - "D11", - "D2", - "carbamic acid", - "chlorination", - "isobutyric acid", - "methylation", - "methylation", - "methylation", - "oxidation", - ] - ], + ["3-amino-5-hydroxybenzoic acid", "A1", "B1", "B2", "C1", "C2", "D11", "D2", "carbamic acid", "chlorination", "isobutyric acid", "methylation", "methylation", "methylation", "oxidation"], ), ( "anthracimycin", r"C[C@@H]1/C=C\C=C\[C@H](OC(=O)[C@@H](C(=O)/C=C(/[C@H]2[C@@H]1C=C[C@@H]3[C@@H]2CC=C(C3)C)\O)C)C", 1.0, - [["A1", "A2", "B1", "C1", "C1", "C1", "C1", "C1", "C2", "D2", "acetic acid"]], + ["A1", "A2", "B1", "C1", "C1", "C1", "C1", "C1", "C2", "D2", "acetic acid"], ), ( "apoptolidin", r"COC[C@@H](C[C@H]1O[C@@](O)([C@H](O)[C@@H]2C[C@H](OC)[C@@H](O)CC\C=C(/C)\C=C\[C@@H](O[C@@H]3O[C@@H](C)[C@H](OC)[C@@H](O)[C@@H]3O)[C@H](C)\C=C(/C)\C=C(/C)\C=C(C)\C(=O)O2)[C@H](C)[C@@H](O)[C@H]1C)O[C@H]1C[C@](C)(O)[C@@H](O[C@H]2C[C@@H](OC)[C@H](O)[C@@H](C)O2)[C@H](C)O1", 1.0, - [ - [ - "A5", - "B1", - "B1", - "B2", - "B2", - "B2", - "B5", - "C1", - "C2", - "C2", - "C2", - "C2", - "D1", - "glycolic acid", - "glycosylation", - "glycosylation", - "glycosylation", - "methylation", - "methylation", - "methylation", - "methylation", - ] - ], + ["4,6-dimethyloxane-2,4,5-triol", "6-methyloxane-2,4,5-triol", "A2", "B1", "B1", "B2", "B2", "B5", "C1", "C2", "C2", "C2", "C2", "D5", "malonic acid", "methanol", 
"methylation", "methylation", "methylation", "methylation", "rhamnose"], ), ( "avilamycin A", r"C[C@@H]1[C@H]([C@@H](C[C@@H](O1)O[C@@H]2[C@H](OC3(C[C@@H]2O)O[C@@H]4[C@H](O[C@H](C[C@]4(O3)C)O[C@@H]5[C@H]([C@@H](O[C@@H]([C@@H]5OC)C)O[C@@H]6[C@H](O[C@H]([C@H]([C@H]6O)OC)O[C@H]7[C@@H]([C@H]8[C@H](CO7)O[C@@]9(O8)C1C([C@@]([C@H](O9)C)(C(=O)C)O)OCO1)OC(=O)C(C)C)COC)O)C)C)O)OC(=O)C1=C(C(=C(C(=C1OC)Cl)O)Cl)C", 1.0, - [ - [ - "A1", - "A1", - "C1", - "acetic acid", - "chlorination", - "chlorination", - "glycosylation", - "glycosylation", - "glycosylation", - "glycosylation", - "glycosylation", - "glycosylation", - "glycosylation", - "isobutyric acid", - "methanol", - "methylation", - "methylation", - "methylation", - "methylation", - ] - ], + ["4,6-dimethyloxane-2,4,5-triol", "6-methyloxane-2,4,5-triol", "6-methyloxane-2,4,5-triol", "A1", "A1", "C1", "acetic acid", "arabinose", "chlorination", "chlorination", "glucose", "isobutyric acid", "methanol", "methylation", "methylation", "methylation", "methylation", "orsellinic acid", "rhamnose", "sugar"], ), ( "avilamycin C", r"C[C@@H]1[C@H]([C@@H](C[C@@H](O1)O[C@@H]2[C@H](OC3(C[C@H]2O)O[C@@H]4[C@H](O[C@H](C[C@]4(O3)C)O[C@@H]5[C@H]([C@@H](O[C@@H]([C@@H]5OC)C)O[C@@H]6[C@H](O[C@H]([C@H]([C@H]6O)OC)O[C@H]7[C@@H]([C@H]8[C@H](CO7)O[C@@]9(O8)[C@H]1[C@H]([C@@]([C@H](O9)C)(C(C)O)O)OCO1)OC(=O)C(C)C)COC)O)C)C)O)OC(=O)C1=C(C(=C(C(=C1OC)Cl)O)Cl)C", 1.0, - [ - [ - "A1", - "A1", - "C1", - "acetic acid", - "chlorination", - "chlorination", - "glycosylation", - "glycosylation", - "glycosylation", - "glycosylation", - "glycosylation", - "glycosylation", - "glycosylation", - "isobutyric acid", - "methanol", - "methylation", - "methylation", - "methylation", - "methylation", - ] - ], + ["4,6-dimethyloxane-2,4,5-triol", "6-methyloxane-2,4,5-triol", "6-methyloxane-2,4,5-triol", "A1", "A1", "C1", "acetic acid", "arabinose", "chlorination", "chlorination", "glucose", "isobutyric acid", "methanol", "methylation", "methylation", "methylation", 
"methylation", "orsellinic acid", "rhamnose", "sugar"], ), ( "bitungolide F", r"CC[C@@H]1C=CC(=O)O[C@@H]1[C@H](C)CC[C@H](C[C@@H](/C=C/C=C/C2=CC=CC=C2)O)O", 1.0, - [["B1", "B1", "B4", "C1", "C1", "D2", "cinnamic acid"]], + ["B1", "B1", "B4", "C1", "C1", "D2", "cinnamic acid"], ), ( "borrelidin", r"C[C@H]1C[C@H](C[C@@H]([C@H](/C(=C\C=C\C[C@H](OC(=O)C[C@@H]([C@H](C1)C)O)[C@@H]2CCC[C@H]2C(=O)O)/C#N)O)C)C", 1.0, - [ - [ - "B1", - "B1", - "B2", - "C1", - "C1", - "D2", - "D2", - "D2", - "cyanide", - "cyclopentane-1,2-dicarboxylic acid", - ] - ], + ["B1", "B1", "B2", "C1", "C1", "D2", "D2", "D2", "cyanide", "cyclopentane-1,2-dicarboxylic acid"], ), ( "butirosin A", r"C1[C@@H]([C@H]([C@@H]([C@H]([C@@H]1NC(=O)[C@H](CCN)O)O)O[C@H]2[C@@H]([C@H]([C@H](O2)CO)O)O)O[C@@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CN)O)O)N)N", 1.0, - [ - ["4-amino-2-hydroxybutanoic acid", "glycosylation", "glycosylation", "streptamine"], - ["D5", "glycine", "glycosylation", "glycosylation", "streptamine"], - ], + ["3-amino-6-(aminomethyl)oxane-2,4,5-triol", "4,6-diaminocyclohexane-1,2,3-triol", "4-amino-2-hydroxybutanoic acid", "D5", "glycine", "ribose"], ), ( "butirosin B", r"C1[C@@H]([C@H]([C@@H]([C@H]([C@@H]1NC(=O)[C@H](CCN)O)O)O[C@H]2[C@@H]([C@@H]([C@H](O2)CO)O)O)O[C@@H]3[C@@H]([C@H]([C@@H]([C@H](O3)CN)O)O)N)N", 1.0, - [ - ["4-amino-2-hydroxybutanoic acid", "glycosylation", "glycosylation", "streptamine"], - ["D5", "glycine", "glycosylation", "glycosylation", "streptamine"], - ], + ["3-amino-6-(aminomethyl)oxane-2,4,5-triol", "4,6-diaminocyclohexane-1,2,3-triol", "4-amino-2-hydroxybutanoic acid", "D5", "glycine", "ribose"], ), ( "calicheamicin", r"CCN[C@H]1CO[C@H](C[C@@H]1OC)O[C@@H]2[C@H]([C@@H]([C@H](OC2O[C@H]3C#C/C=C\C#C[C@]\4(CC(=O)C(=C3/C4=C\CSSSC)NC(=O)OC)O)C)NO[C@H]5C[C@@H]([C@@H]([C@H](O5)C)SC(=O)C6=C(C(=C(C(=C6OC)OC)O[C@H]7[C@@H]([C@@H]([C@H]([C@@H](O7)C)O)OC)O)I)C)O)O", - 0.74, + 0.70, # Default rule set is not able to parse the enediyne core, but should at least identify the sugar parts - [ - [ - 
"A1", - "A5", - "C1", - "acetic acid", - "carbonic acid", - "ethanol", - "glycosylation", - "glycosylation", - "glycosylation", - "glycosylation", - "iodination", - "methylation", - "methylation", - "methylation", - "methylation", - "methylation", - "methylation", - ] - ], + ["5-amino-6-methyloxane-2,3,4-triol", "6-methyl-5-sulfanyloxane-2,4-diol", "A1", "A5", "C1", "acetic acid", "ethanol", "iodination", "methylation", "methylation", "methylation", "methylation", "methylation", "methylation", "rhamnose", "sugar"], ), ( "callystatin", r"CC[C@H](C)[C@H]([C@H](C)C(=O)[C@H](C)/C=C(\C)/C=C/C[C@@H](C)/C=C(/CC)\C=C\[C@H]1CC=CC(=O)O1)O", 1.0, - [["A2", "B1", "B2", "C1", "C1", "C1", "C2", "C4", "D2", "D2", "acetic acid"]], + ["A2", "B1", "B2", "C1", "C1", "C1", "C2", "C4", "D2", "D2", "acetic acid"], ), ( "carolacton", r"C[C@@H]\1CCC[C@@H]([C@H](OC(=O)[C@@H]([C@@H](/C=C1)O)O)/C(=C/[C@@H](C)C(=O)[C@H](C)[C@@H](CC(=O)O)OC)/C)C", - 0.97, + 1.0, # This parsing is ambiguous... can parse from two sides and neither is correct/wrong - [["A2", "B1", "B2", "B5", "C1", "C2", "D2", "D2", "methylation"]], + ["A2", "B11", "B2", "B2", "C2", "D1", "D1", "D2", "malonic acid", "methylation"] ), ( "chaetoglobosin A", r"C[C@H]\1C/C=C/[C@H]2[C@H]3[C@](O3)([C@H]([C@@H]4[C@@]2(C(=O)/C=C/C(=O)[C@@H](/C(=C1)/C)O)C(=O)N[C@H]4CC5=CNC6=CC=CC=C65)C)C", - 1.0, - [ - [ - "A1", - "B11", - "C1", - "C1", - "C1", - "C2", - "C2", - "D2", - "acetic acid", - "oxidation", - "tryptophan", - ] - ], + 0.9, + ["A1", "C1", "C1", "C2", "C2", "D1", "D2", "acetic acid", "oxidation", "tryptophan"] ), ( "chichorine", r"CC1=C(O)C=C2C(CNC2=O)=C1OC", 1.0, - [["C1", "D11", "D15", "glycine", "methylation"]], + ["C1", "D11", "D17", "glycine", "methylation"], ), ( "chlorotonil A", r"C[C@@H]1/C=C\C=C\[C@@H](OC(=O)[C@H](C(=O)C(C(=O)[C@@H]2[C@H]1C=C[C@H]3[C@H]2[C@@H](C=C(C3)C)C)(Cl)Cl)C)C", 1.0, - [ - [ - "A1", - "A2", - "B1", - "C1", - "C1", - "C1", - "C1", - "C2", - "C2", - "D2", - "acetic acid", - "chlorination", - 
"chlorination", - ] - ], + ["A1", "A2", "B1", "C1", "C1", "C1", "C1", "C2", "C2", "D2", "acetic acid", "chlorination", "chlorination"], ), ( "chlorothricin", r"C[C@@H](C(C(O)=O)=C1)C[C@@]2(C(O)=C(O3)C(O2)=O)[C@@H]1C=CCCCC[C@@H]4C=C[C@@]([C@@H](O[C@@H]5C[C@H](O)[C@@H](O[C@H]6O[C@@H](C)[C@H](O)[C@@H](OC(C7=C(C)C(Cl)=CC=C7OC)=O)C6)[C@@H](C)O5)CCC8)([H])[C@]8([H])[C@@H]4C3=O", 1.0, - [ - [ - "A1", - "B1", - "C1", - "C1", - "C1", - "C1", - "C1", - "C1", - "C1", - "C5", - "D1", - "D1", - "D1", - "acetic acid", - "acetic acid", - "chlorination", - "glyceric acid", - "glycolic acid", - "glycosylation", - "glycosylation", - "methylation", - ] - ], + ["6-methyloxane-2,4,5-triol", "6-methyloxane-2,4,5-triol", "6-methylsalicylic acid", "A1", "A5", "B1", "C1", "C1", "C1", "C1", "C1", "C1", "C1", "C13", "D1", "D1", "D1", "acetic acid", "acetic acid", "chlorination", "glyceric acid", "methylation"] ), ( "coelimycin P1", r"C/C=C/C(=O)/C/1=C/C(=C\2/C=CCCN2)/SC[CH](C(=O)O1)NC(=O)C", 0.38, - [["acetic acid", "cysteine"]], + ["acetic acid", "cysteine"], ), ( "compactin", r"CC[C@H](C)C(=O)O[C@H]1CCC=C2C=C[C@H](C)[C@H](CC[C@@H](O)C[C@@H](O)CC(O)=O)[C@@H]12", 1.0, - [["B1", "B1", "C1", "C1", "C1", "C1", "D1", "D2", "D5", "acetic acid", "acetic acid"]], + ["B1", "B1", "C1", "C1", "C1", "C1", "D1", "D2", "D5", "acetic acid", "acetic acid"], ), ( "cremimycin", r"CCCCCCC1CC(CCCC(C/C(O)=C2C(/C=C(/C=C\C=C/C(N1)=O)C)CC(C\2=O)O[C@H]3C[C@@H]([C@@H]([C@H](O3)C)O)OC)O)=O", 1.0, - [ - [ - "A1", - "A1", - "B1", - "B1", - "C1", - "C1", - "C1", - "C2", - "D1", - "D1", - "D1", - "E1", - "glycosylation", - "methylation", - "oxidation", - "propanoic acid", - ] - ], + ["6-methyloxane-2,4,5-triol", "A", "A1", "A1", "B1", "B1", "C1", "C1", "C1", "C2", "D1", "D1", "D1", "heptanoic acid", "methylation", "oxidation", "pentanoic acid", "propanoic acid"], ), ( "deschlorothricin", 
r"C[C@@H](C(C(O)=O)=C1)C[C@@]2(C(O)=C(O3)C(O2)=O)[C@@H]1C=CCCCC[C@@H]4C=C[C@@]([C@@H](O[C@@H]5C[C@H](O)[C@@H](O[C@H]6O[C@@H](C)[C@H](O)[C@@H](OC(C7=C(C)C=CC=C7OC)=O)C6)[C@@H](C)O5)CCC8)([H])[C@]8([H])[C@@H]4C3=O", 1.0, - [ - [ - "A1", - "B1", - "C1", - "C1", - "C1", - "C1", - "C1", - "C1", - "C1", - "C5", - "D1", - "D1", - "D1", - "acetic acid", - "acetic acid", - "glyceric acid", - "glycolic acid", - "glycosylation", - "glycosylation", - "methylation", - ] - ], + ["6-methyloxane-2,4,5-triol", "6-methyloxane-2,4,5-triol", "6-methylsalicylic acid", "A1", "A5", "B1", "C1", "C1", "C1", "C1", "C1", "C1", "C1", "C13", "D1", "D1", "D1", "acetic acid", "acetic acid", "glyceric acid", "methylation"] ), ( "daptomycin", r"CCCCCCCCCC(=O)N[C@@H](CC1=CNC2=CC=CC=C21)C(=O)N[C@H](CC(=O)N)C(=O)N[C@@H](CC(=O)O)C(=O)N[C@H]3[C@H](OC(=O)[C@@H](NC(=O)[C@@H](NC(=O)[C@H](NC(=O)CNC(=O)[C@@H](NC(=O)[C@H](NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)CNC3=O)CCCN)CC(=O)O)C)CC(=O)O)CO)[C@H](C)CC(=O)O)CC(=O)C4=CC=CC=C4N)C", 1.0, - [ - [ - "3-methylglutamic acid", - "alanine", - "asparagine", - "aspartic acid", - "aspartic acid", - "aspartic acid", - "decanoic acid", - "glycine", - "glycine", - "kynurenine", - "ornithine", - "serine", - "threonine", - "tryptophan", - ], - [ - "3-methylglutamic acid", - "D1", - "D1", - "D1", - "D1", - "acetic acid", - "alanine", - "asparagine", - "aspartic acid", - "aspartic acid", - "aspartic acid", - "glycine", - "glycine", - "kynurenine", - "ornithine", - "serine", - "threonine", - "tryptophan", - ], - ], + ["3-methylglutamic acid", "D1", "D1", "D1", "D1", "acetic acid", "alanine", "asparagine", "aspartic acid", "aspartic acid", "aspartic acid", "butanoic acid", "decanoic acid", "glycine", "glycine", "hexanoic acid", "kynurenine", "octanoic acid", "ornithine", "serine", "threonine", "tryptophan"] ), ( "dictyostatin", r"C[C@H]1CC[C@H]([C@@H]([C@@H](OC(=O)/C=C\C=C\[C@H]([C@H](C[C@@H](/C=C\[C@@H]([C@@H]([C@H](C1)C)O)C)O)O)C)[C@@H](C)/C=C\C=C)C)O", 1.0, - [["B1", "B1", "B2", 
"B2", "B2", "C1", "C1", "C1", "C1", "C2", "D2", "D2", "acetic acid"]], + ["B1", "B1", "B2", "B2", "B2", "C1", "C1", "C1", "C1", "C2", "D2", "D2", "acetic acid"], ), ( "discodermolide", r"C[C@H]1[C@@H](OC(=O)[C@@H]([C@H]1O)C)C[C@@H](/C=C\[C@H](C)[C@@H]([C@@H](C)/C=C(/C)\C[C@H](C)[C@H]([C@H](C)[C@H]([C@@H](C)/C=C\C=C)OC(=O)N)O)O)O", 1.0, - [ - [ - "B1", - "B2", - "B2", - "B2", - "B2", - "B2", - "C1", - "C1", - "C2", - "C2", - "D2", - "acetic acid", - "carbamic acid", - ] - ], + ["B1", "B2", "B2", "B2", "B2", "B2", "C1", "C1", "C2", "C2", "D2", "acetic acid", "carbamic acid"], ), ( "epothilone", r"C[C@H]1CCC[C@@H]2[C@@H](O2)C[C@H](OC(=O)C[C@H](C(C(=O)[C@@H]([C@H]1O)C)(C)C)O)/C(=C/C3=CSC(=N3)C)/C", 1.0, - [["A3", "B1", "B1", "B2", "C1", "C2", "D1", "D2", "acetic acid", "cysteine", "oxidation"]], + ["A3", "B1", "B1", "B2", "C1", "C2", "D1", "D2", "acetic acid", "cysteine", "oxidation"], ), ( "erythromycin", r"CC[C@@H]1[C@@]([C@@H]([C@H](C(=O)[C@@H](C[C@@]([C@@H]([C@H]([C@@H]([C@H](C(=O)O1)C)O[C@H]2C[C@@]([C@H]([C@@H](O2)C)O)(C)OC)C)O[C@H]3[C@@H]([C@H](C[C@H](O3)C)N(C)C)O)(C)O)C)C)O)(C)O", 1.0, - [ - [ - "A2", - "B2", - "B2", - "B2", - "B6", - "D6", - "glycosylation", - "glycosylation", - "methylation", - "methylation", - "methylation", - "propanoic acid", - ] - ], + ["4,6-dimethyloxane-2,4,5-triol", "4-dimethylamino-6-methyloxane-2,3-diol", "A2", "B2", "B2", "B2", "B6", "D6", "methylation", "propanoic acid"] ), ( "georatusin", r"CC[C@@H](C)C[C@H](C)[C@@H]1[C@H](C[C@@H]([C@@H]2[C@H](C[C@@H]([C@@](O2)([C@H](C(=O)N[C@@H](C(=O)O1)CC3=CNC4=CC=CC=C43)C)O)C)C)C)C", 1.0, - [["A2", "B2", "B2", "D2", "D2", "D2", "D2", "acetic acid", "tryptophan"]], + ["A2", "B2", "B2", "D2", "D2", "D2", "D2", "acetic acid", "tryptophan"], ), ( "gephyronic acid", r"C[C@@H]1[C@@H](O[C@@](C([C@H]1OC)(C)C)([C@@H](C)C[C@H](C)[C@@H]([C@@]2([C@H](O2)[C@@H](C)C=C(C)C)C)O)O)CC(=O)O", 1.0, - [["A3", "B1", "B2", "B2", "C2", "C2", "D2", "isobutyric acid", "methylation", "oxidation"]], + ["A3", "B1", "B2", 
"B2", "C2", "C2", "D2", "isobutyric acid", "methylation", "oxidation"], ), ( "harzianic acid", r"CCC/C=C/C=C/C(=C\1/C(=O)C(N(C1=O)C)CC(C(C)C)(C(=O)O)O)/O", 1.0, - [ - [ - "A1", - "C1", - "C1", - "D1", - "acetic acid", - "artificial amino acid harzianic acid", - "methylation", - ] - ], + ["A1", "C1", "C1", "D1", "acetic acid", "artificial amino acid harzianic acid", "butanoic acid", "methylation"], ), ( "herboxidiene", r"C[C@H]1CC[C@@H](O[C@@H]1/C(=C/C=C/[C@@H](C)C[C@@]2([C@H](O2)[C@H](C)[C@H]([C@@H](C)O)OC)C)/C)CC(=O)O", 1.0, - [ - [ - "B2", - "B2", - "C1", - "C1", - "C2", - "C2", - "D1", - "D2", - "lactic acid", - "methylation", - "oxidation", - ] - ], + ["B2", "B2", "C1", "C1", "C2", "C2", "D1", "D2", "lactic acid", "methylation", "oxidation"], ), ( "hydroxystreptomycin", r"CN[C@H]1[C@H](O)[C@@H](O)[C@H](CO)O[C@H]1O[C@H]1[C@H](O[C@H]2[C@H](O)[C@@H](O)[C@H](NC(N)=N)[C@@H](O)[C@@H]2NC(N)=N)O[C@@H](CO)[C@]1(O)C=O", 1.0, - [["glycosylation", "glycosylation", "glycosylation", "methylation"]], + ["2-[3-(diaminomethylideneamino)-2,4,5,6-tetrahydroxycyclohexyl]guanidine", "glucosamine", "methylation", "sugar"], ), ( "hymenosetin", r"C/C=C/[C@@H]1C(=C[C@@H]2C[C@@H](CC[C@H]2[C@]1(C)/C(=C\3/C(=O)[C@@H](NC3=O)[C@@H](C)O)/O)C)C", 1.0, - [["A1", "C1", "C1", "C2", "C2", "D1", "D2", "acetic acid", "threonine"]], + ["A1", "C1", "C1", "C2", "C2", "D1", "D2", "acetic acid", "threonine"], ), ( "indanomycin", r"CC[C@H]1CC[C@@H]2[C@@H]1C=C[C@H]([C@H]2C(=O)C3=CC=CN3)/C=C/C=C(\CC)/[C@H]4[C@H](CC[C@@H](O4)[C@@H](C)C(=O)O)C", 1.0, - [["A1", "B2", "C1", "C1", "C1", "C1", "C2", "C4", "D1", "D4", "pyrrole-2-carboxylic acid"]], + ["A1", "B2", "C1", "C1", "C1", "C1", "C2", "C4", "D1", "D4", "pyrrole-2-carboxylic acid"], ), ( "ircinianin", r"C[C@H]1CCC2[C@@H]1C3(C(C=C2C)/C=C(\C)/CCCC4=COC=C4)C(=C(C(=O)O3)C)O", 1.0, - [["A2", "C1", "C1", "C2", "D1", "D11", "D2", "D2", "furan-3-carboxylic acid"]], + ["A2", "C1", "C1", "C2", "D1", "D11", "D2", "D2", "furan-3-carboxylic acid"], ), ( "iriomoteolide 
1a", r"C[C@H]1C/C=C/[C@@]([C@@]2(CC(=C)C[C@@H](O2)C/C=C/[C@@H]([C@@H](/C(=C\C(=O)O[C@@H]1C[C@H](C)[C@H](C)O)/C)C)O)O)(C)O", 1.0, - [["A1", "A8", "A8", "B1", "B2", "B2", "C1", "C1", "D6", "D8", "lactic acid"]], + ["A1", "A8", "A8", "B1", "B2", "B2", "C1", "C1", "D6", "D8", "lactic acid"], ), ( "iriomoteolide 3a", r"C[C@H]1C/C=C/[C@@]([C@@]2(CC(=C)C[C@@H](O2)C/C=C/[C@@H]([C@@H](/C(=C\C(=O)O[C@@H]1C[C@H](C)[C@H](C)O)/C)C)O)O)(C)O", 1.0, - [["A1", "A8", "A8", "B1", "B2", "B2", "C1", "C1", "D6", "D8", "lactic acid"]], + ["A1", "A8", "A8", "B1", "B2", "B2", "C1", "C1", "D6", "D8", "lactic acid"], ), ( "jerangolid A", r"CC[C@@H]1C(=CC[C@@H](O1)/C(=C/[C@H](C)/C=C/[C@H]2CC(=C(C(=O)O2)CO)OC)/C)C", 1.0, - [["A7", "B1", "B2", "C1", "C2", "C2", "D1", "methylation", "propanoic acid"]], + ["A7", "B1", "B2", "C1", "C2", "C2", "D1", "methylation", "propanoic acid"], ), ( "kirromycin", r"CC[C@H](C(=O)NC/C=C/C=C(\C)/[C@H]([C@@H](C)[C@H]1[C@H]([C@H]([C@H](O1)/C=C/C=C/C=C(\C)/C(=O)C2=C(C=CNC2=O)O)O)O)OC)[C@@]3([C@@H]([C@@H](C([C@@H](O3)/C=C/C=C/C)(C)C)O)O)O", 1.0, - [ - [ - "A1", - "A4", - "B2", - "B3", - "B5", - "B5", - "C1", - "C1", - "C1", - "C1", - "C1", - "C2", - "C2", - "D11", - "acetic acid", - "beta-alanine", - "glycine", - "methylation", - ] - ], + ["A1","A4","B2","B3","B5","B5","C1","C1","C1","C1","C1","C2","C2","D11","acetic acid","beta-alanine","glycine","methylation"], ), ( "lactimidomycin", r"C[C@H]1/C=C\C=C\CC/C=C/C(=O)O[C@H]1/C(=C/[C@H](C)C(=O)C[C@@H](CC2CC(=O)NC(=O)C2)O)/C", 1.0, - [ - [ - "2-(2,6-dioxopiperidin-4-yl)acetic acid", - "A2", - "B1", - "B2", - "C1", - "C1", - "C1", - "C2", - "D1", - ] - ], + ["2-(2,6-dioxopiperidin-4-yl)acetic acid", "A2", "B1", "B2", "C1", "C1", "C1", "C2", "D1"], ), ( "lankamycin", r"C[C@@H]1C[C@@H]([C@H]([C@@H](O1)O[C@H]2[C@H](C[C@](C(=O)[C@@H]([C@H]([C@H]([C@H](OC(=O)[C@@H]([C@H]([C@@H]2C)O[C@H]3C[C@@]([C@@H]([C@@H](O3)C)OC(=O)C)(C)OC)C)[C@@H](C)[C@H](C)O)C)OC(=O)C)C)(C)O)C)O)OC", 1.0, - [ - [ - "A6", - "B2", - "B2", - "B2", - "B2", - 
"B2", - "D2", - "acetic acid", - "acetic acid", - "acetic acid", - "glycosylation", - "glycosylation", - "methylation", - "methylation", - ] - ], + ["4,6-dimethyloxane-2,4,5-triol", "6-methyloxane-2,3,4-triol", "A6", "B2", "B2", "B2", "B2", "B2", "D2", "acetic acid", "acetic acid", "acetic acid", "methylation", "methylation"], ), ( "latrunculin", r"C[C@H]1CC[C@@H]2C[C@H](C[C@@](O2)([C@@H]3CSC(=O)N3)O)OC(=O)C=C(CCC=CC=C1)C", 1.0, - [["A1", "A8", "B1", "B1", "C1", "C1", "D1", "D2", "carbonic acid", "cysteine"]], + ["A1", "A8", "B1", "B1", "C1", "C1", "D1", "D2", "carbonic acid", "cysteine"], ), ( - # TODO: find fix for cases like this where it is unclear what the start of polyketide parsing is... "leiodermatolide", r"CC[C@H]1[C@@H]([C@@](CC(=O)O1)(C/C=C/C=C(\C)/[C@@H]2[C@H](/C=C\C=C/[C@@H]([C@H]([C@H]([C@@H](/C(=C/CCC(=O)O2)/C)C)O)C)OC(=O)N)C)O)C", - 0.86, - [ - [ - "A8", - "B2", - "B2", - "B2", - "B2", - "C1", - "C1", - "C1", - "C2", - "D1", - "carbamic acid", - "propanoic acid", - ] - ], + 0.88, + ["A8", "B2", "B2", "B2", "B2", "C1", "C1", "C1", "C2", "D1", "carbamic acid", "malonic acid", "propanoic acid"], ), ( "lovastatin", r"CC[C@H](C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C@H](C=C2)C)CC[C@@H]3C[C@H](CC(=O)O3)O)C", 1.0, - [["B1", "B1", "C1", "C1", "C1", "C2", "D1", "D2", "D5", "acetic acid", "acetic acid"]], + ["B1", "B1", "C1", "C1", "C1", "C2", "D1", "D2", "D5", "acetic acid", "acetic acid"], ), ( "macrolactin A", r"CC1CCCC=CC=CC(CC(CC=CC=CC(CC=CC=CC(=O)O1)O)O)O", 1.0, - [["B1", "B1", "B1", "B1", "C1", "C1", "C1", "C1", "C1", "C1", "D1", "acetic acid"]], + ["B1", "B1", "B1", "B1", "C1", "C1", "C1", "C1", "C1", "C1", "D1", "acetic acid"], ), ( "maytansine", r"C[C@@H]1[C@@H]2C[C@]([C@@H](/C=C/C=C(/CC3=CC(=C(C(=C3)OC)Cl)N(C(=O)C[C@@H]([C@]4([C@H]1O4)C)OC(=O)[C@H](C)N(C)C(=O)C)C)\C)OC)(NC(=O)O2)O", 1.0, - [ - [ - "3-amino-5-hydroxybenzoic acid", - "A1", - "B1", - "B2", - "C1", - "C2", - "D11", - "D2", - "acetic acid", - "alanine", - "carbamic acid", - 
"chlorination", - "methylation", - "methylation", - "methylation", - "methylation", - "oxidation", - ] - ], + ["3-amino-5-hydroxybenzoic acid", "A1", "B1", "B2", "C1", "C2", "D11", "D2", "acetic acid", "alanine", "carbamic acid", "chlorination", "methylation", "methylation", "methylation", "methylation", "oxidation"], ), ( "megalomycin A", r"CC[C@@H]1[C@@]([C@@H]([C@H](C(=O)[C@@H](C[C@@]([C@@H]([C@H]([C@@H]([C@H](C(=O)O1)C)O[C@H]2C[C@@]([C@H]([C@@H](O2)C)O)(C)O)C)O[C@H]3[C@@H]([C@H](C[C@H](O3)C)N(C)C)O)(C)O[C@H]4C[C@H]([C@H]([C@@H](O4)C)O)N(C)C)C)C)O)(C)O", 1.0, - [ - [ - "A2", - "B2", - "B2", - "B2", - "B6", - "D6", - "glycosylation", - "glycosylation", - "glycosylation", - "methylation", - "methylation", - "methylation", - "methylation", - "propanoic acid", - ] - ], + ["4,6-dimethyloxane-2,4,5-triol", "4-dimethylamino-6-methyloxane-2,3-diol", "A2", "B2", "B2", "B2", "B6", "D6", "methylation", "methylation", "propanoic acid", "sugar"], ), ( "micacocidin A", r"CCCCCC1=C(C(=CC=C1)[O-])C2=N[C@H](CS2)[C@@H]3N([C@@H](CS3)[C@@H](C(C)(C)C4=N[C@@](CS4)(C)C(=O)[O-])O)C", 1.0, - [ - [ - "A1", - "B3", - "C1", - "C1", - "D1", - "D1", - "acetic acid", - "cysteine", - "cysteine", - "cysteine", - "methylation", - "methylation", - ] - ], + ["A1", "B3", "C1", "C1", "D1", "D1", "acetic acid", "cysteine", "cysteine", "cysteine", "methylation", "methylation", "butanoic acid", "hexanoic acid"], ), ( "migrastatin", r"C[C@@H]1/C=C(\[C@H](OC(=O)/C=C/CC/C=C/[C@@H]([C@H]1O)OC)[C@H](C)C(=O)CCCC2CC(=O)NC(=O)C2)/C", 1.00, - [ - [ - "2-(2,6-dioxopiperidin-4-yl)acetic acid", - "A2", - "B2", - "B5", - "C1", - "C1", - "C2", - "D1", - "D1", - "methylation", - ] - ], + ["2-(2,6-dioxopiperidin-4-yl)acetic acid", "A2", "B2", "B5", "C1", "C1", "C2", "D1", "D1", "methylation"], ), ( "narbonolide", r"CC[C@@H]1[C@@H](/C=C/C(=O)[C@@H](C[C@@H]([C@@H]([C@H](C(=O)[C@H](C(=O)O1)C)C)O)C)C)C", 1.0, - [["A2", "A2", "B2", "B2", "C1", "D2", "propanoic acid"]], + ["A2", "A2", "B2", "B2", "C1", "D2", "propanoic 
acid"], ), ( "pederin", r"C[C@H]1[C@H](O[C@](CC1=C)([C@@H](C(=O)N[C@H]([C@@H]2C[C@H](C([C@H](O2)C[C@@H](COC)OC)(C)C)O)OC)O)OC)C", 1.0, - [ - [ - "2-hydroxyglycine", - "A5", - "A8", - "B1", - "B2", - "B3", - "C1", - "acetic acid", - "methanol", - "methanol", - "methylation", - "methylation", - "methylation", - ] - ], + ["2-hydroxyglycine", "A5", "A8", "B1", "B2", "B3", "C1", "acetic acid", "methanol", "methylation", "methylation", "methylation", "methylation"], ), ( "peluriside A", r"CC[C@@H](CO)/C=C(/C)\[C@@H]1C[C@H](C[C@@H](C([C@@]2([C@@H]([C@@H](C[C@@H](O2)C[C@H]([C@@H](C(=O)O1)O)OC)OC)O)O)(C)C)O)OC", 1.0, - [ - [ - "A5", - "B1", - "B1", - "B1", - "B1", - "B3", - "B5", - "C2", - "D7", - "acetic acid", - "methylation", - "methylation", - "methylation", - ] - ], + ["A5", "B1", "B1", "B1", "B1", "B3", "B5", "C2", "D7", "acetic acid", "methylation", "methylation", "methylation"], ), ( "penicillin G", r"CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C", 1.0, - [["2-phenylacetic acid", "cysteine", "valine"]], + ["2-phenylacetic acid", "cysteine", "valine"], ), ( "periconiasin A", r"C[C@H]1[C@H]2[C@@H](NC(=O)[C@@]23[C@@H](CC(=CC[C@@H](CC3=O)O)C)C=C1C)CC(C)C", 1.0, - [["A1", "B1", "C1", "C1", "C2", "D2", "acetic acid", "leucine"]], + ["A1", "B1", "C1", "C1", "C2", "D2", "acetic acid", "leucine"], ), ( "periconiasin I", r"C/C/1=C/C[C@@H](CC(=O)[C@]23[C@@H](C1)[C@H](C(=C([C@H]2[C@@H](NC3=O)CC(C)C)C)C)O)O", 1.0, - [["A1", "B1", "C1", "C1", "C2", "D2", "acetic acid", "leucine", "oxidation"]], + ["A1", "B1", "C1", "C1", "C2", "D2", "acetic acid", "leucine", "oxidation"], ), ( "ratjadon", r"C/C=C/[C@H]1[C@H]([C@@H](C[C@H](O1)[C@@H](/C=C/C=C(\C)/C[C@@H](C)/C=C(/C)\C=C\[C@H]2CC=CC(=O)O2)O)O)C", 1.0, - [["B1", "B1", "B2", "C1", "C1", "C1", "C1", "C2", "C2", "D11", "D2", "acetic acid"]], + ["B1", "B1", "B2", "C1", "C1", "C1", "C1", "C2", "C2", "D11", "D2", "acetic acid"], ), ( "soraphen A", 
r"C[C@H]1/C=C/[C@H]([C@H](CCCC[C@H](OC(=O)[C@H]([C@@]2([C@@H]([C@H]([C@@H]([C@H]1O2)C)O)OC)O)C)C3=CC=CC=C3)OC)OC", 1.0, - [ - [ - "A2", - "B1", - "B1", - "B2", - "B5", - "C2", - "D1", - "D5", - "benzoic acid", - "methylation", - "methylation", - "methylation", - ] - ], + ["A2", "B1", "B1", "B2", "B5", "C2", "D1", "D5", "benzoic acid", "methylation", "methylation", "methylation"], ), ( "spiculoic acid A", r"CC[C@@H]1[C@@H]2[C@@H]([C@H](C1=O)C)C(=C[C@@]([C@@]2(CC)C(=O)O)(CC)/C=C/C3=CC=CC=C3)CC", 1.0, - [["2-phenylacetic acid", "A4", "C2", "C4", "C4", "C4"]], + ["2-phenylacetic acid", "A4", "C2", "C4", "C4", "C4"], ), ( "spongidepsin", r"C[C@H]1CCC(CC(OC(=O)[C@@H](N(C(=O)[C@H](C1)C)C)CC2=CC=CC=C2)CCCC#C)C", 1.0, - [["3-butynoic acid", "B1", "D1", "D2", "D2", "D8", "methylation", "phenylalanine"]], + ["3-butynoic acid", "B1", "D1", "D2", "D2", "D8", "methylation", "phenylalanine"], ), ( "thailanstatin A", r"C[C@H]1C[C@H]([C@H](O[C@H]1C/C=C(\C)/C=C/[C@@H]2[C@H]([C@@]3(C[C@H](O2)CC(=O)O)CO3)O)C)NC(=O)/C=C\[C@H](C)OC(=O)C", 1.0, - [ - [ - "A8", - "B5", - "C1", - "C1", - "C1", - "C1", - "C2", - "D2", - "acetic acid", - "lactic acid", - "oxidation", - "threonine", - ] - ], + ["A8", "B5", "C1", "C1", "C1", "C1", "C2", "D2", "acetic acid", "lactic acid", "oxidation", "threonine"], ), ( "theopederin A", r"C[C@H]1[C@H](O[C@](CC1=C)([C@H](C(=O)N[C@@H]2[C@@H]3[C@@H]([C@H](C([C@H](O3)C[C@H]4CCCC(O4)O)(C)C)OC)OCO2)O)OC)C", 1.0, - [ - [ - "2-hydroxyglycine", - "A5", - "A8", - "B1", - "B2", - "B3", - "B5", - "C1", - "D1", - "acetic acid", - "formaldehyde", - "methanol", - "methylation", - ] - ], + ["2-hydroxyglycine", "A5", "A8", "B1", "B2", "B3", "B5", "C1", "D1", "acetic acid", "formaldehyde", "methylation", "methylation"] ), ( "theopederin B", r"C[C@H]1[C@H](O[C@](CC1=C)([C@H](C(=O)N[C@@H]2[C@@H]3[C@@H]([C@H](C([C@H](O3)C[C@@H](CCCC(=O)OC)O)(C)C)OC)OCO2)O)OC)C", 1.0, - [ - [ - "2-hydroxyglycine", - "A5", - "A8", - "B1", - "B2", - "B3", - "B5", - "C1", - "D1", - "acetic acid", - 
"formaldehyde", - "methanol", - "methylation", - "methylation", - ] - ], + ["2-hydroxyglycine", "A5", "A8", "B1", "B2", "B3", "B5", "C1", "D1", "acetic acid", "formaldehyde", "methylation", "methylation", "methylation"] ), ( "thermolide A", r"C[C@@H]1C[C@H]([C@@H](OC(=O)[C@H](NC(=O)C[C@H](C[C@@H]1O)O)C)[C@@H](C)C[C@H](C)[C@@H]([C@H](C)[C@@H](C[C@H](C)O)OC(=O)C)O)C", 1.0, - [["B1", "B1", "B1", "B2", "B2", "B2", "D2", "D2", "acetic acid", "acetic acid", "alanine"]], + ["B1", "B1", "B1", "B2", "B2", "B2", "D2", "D2", "acetic acid", "acetic acid", "alanine"], ), ( "thiocoraline", r"CN1C2CSSCC(C(=O)N(C(C(=O)SCC(C(=O)NCC1=O)NC(=O)C3=NC4=CC=CC=C4C=C3O)CSC)C)N(C(=O)CNC(=O)C(CSC(=O)C(N(C2=O)C)CSC)NC(=O)C5=NC6=CC=CC=C6C=C5O)C", 1.0, - [ - [ - "3-hydroxyquinaldic acid", - "3-hydroxyquinaldic acid", - "cysteine", - "cysteine", - "cysteine", - "cysteine", - "cysteine", - "cysteine", - "glycine", - "glycine", - "methylation", - "methylation", - "methylation", - "methylation", - "methylation", - "methylation", - ] - ], + ["3-hydroxyquinaldic acid", "3-hydroxyquinaldic acid", "cysteine", "cysteine", "cysteine", "cysteine", "cysteine", "cysteine", "glycine", "glycine", "methylation", "methylation", "methylation", "methylation", "methylation", "methylation"], ), ( "zincophorin", r"CCC[C@@H](C)/C=C(\C)/[C@@H]([C@H](C)/C=C/CC[C@H]([C@H](C)[C@@H]([C@H](C)[C@@H]([C@H](C)[C@@H]1[C@H](CC[C@H](O1)[C@H](C)C(=O)O)C)O)O)O)O", 1.0, - [["B2", "B2", "B2", "B2", "B2", "C1", "C2", "C2", "D1", "D1", "D2", "propanoic acid"]], + ["B2", "B2", "B2", "B2", "B2", "C1", "C2", "C2", "D1", "D1", "D2", "propanoic acid"], ), ( "zwittermicin A", r"C([C@H]([C@H](CO)N)O)[C@H]([C@H]([C@H]([C@@H](C(=O)N[C@@H](CNC(=O)N)C(=O)N)O)O)N)O", 1.0, - [["2,3-diaminopropionate", "B1", "B12", "B5", "amination", "carbamic acid", "serine"]], + ["B1", "B12", "B5", "carbamic acid", "serine", "2,3-diaminopropionate", "amination"], ), ( "enterobactin", 
r"C1C(C(=O)OCC(C(=O)OCC(C(=O)O1)NC(=O)C2=C(C(=CC=C2)O)O)NC(=O)C3=C(C(=CC=C3)O)O)NC(=O)C4=C(C(=CC=C4)O)O", 1.0, - [ - [ - "2,3-dihydroxybenzoic acid", - "2,3-dihydroxybenzoic acid", - "2,3-dihydroxybenzoic acid", - "serine", - "serine", - "serine", - ] - ], + ["2,3-dihydroxybenzoic acid", "2,3-dihydroxybenzoic acid", "2,3-dihydroxybenzoic acid", "serine", "serine", "serine"], ), ( "curvularide C", r"CC[C@H](C)[C@@H](CO)NC(=O)/C=C/[C@](C)([C@H]([C@@H](C)C[C@@H](CC)O)O)OC", 1.0, - [["B6", "C1", "D2", "D5", "acetic acid", "isoleucinol", "methylation"]], + ["B6", "C1", "D2", "D5", "acetic acid", "isoleucinol", "methylation"], ), ( "neopeltolide", r"O=C(/C=C\CCC1=COC(/C=C\CNC(OC)=O)=N1)O[C@@H]2C[C@@H](C[C@@H](C)C[C@H](OC)C[C@H](CCC)OC(C3)=O)O[C@@H]3C2", 1.0, - [ - [ - "B1", - "B1", - "B1", - "B1", - "C1", - "C1", - "C1", - "D1", - "D1", - "D8", - "acetic acid", - "carbonic acid", - "glycine", - "methylation", - "methylation", - "serine", - ] - ], + ["B1", "B1", "B1", "B1", "C1", "C1", "C1", "D1", "D1", "D8", "acetic acid", "carbonic acid", "glycine", "methylation", "methylation", "serine", "butanoic acid"], ), ( "dihydroxydione", r"CCC(=O)[C@H](C)[C@H]([C@@H](C)C(=O)CC/C(=C\CC(/C(=C/C1=CSC(=N1)C)/C)O)/C)O", 1.0, - [["A2", "B1", "B2", "C2", "C2", "D1", "acetic acid", "cysteine", "propanoic acid"]], + ["A2", "B1", "B2", "C2", "C2", "D1", "acetic acid", "cysteine", "propanoic acid"], ), ( "amamistatin B", r"CCCCCCC[C@@H](C(C)(C)C(=O)N[C@H]1CCCCN(C1=O)O)OC(=O)[C@@H](CCCCN(C=O)O)NC(=O)C2=C(OC(=N2)C3=CC=CC=C3O)C", 1.0, - [ - [ - "3-hydroxy-2,2-dimethyldecanoic acid", - "N6-formyl-N6-hydroxylysine", - "N6-hydroxylysine", - "salicylic acid", - "threonine", - ], - [ - "B3", - "D1", - "D1", - "D1", - "N6-formyl-N6-hydroxylysine", - "N6-hydroxylysine", - "acetic acid", - "salicylic acid", - "threonine", - ], - ], + ["3-hydroxy-2,2-dimethyldecanoic acid", "B3", "D1", "D1", "D1", "N6-formyl-N6-hydroxylysine", "N6-hydroxylysine", "acetic acid", "butanoic acid", "hexanoic acid", 
"octanoic acid", "salicylic acid", "threonine"] ), ( "nocardichelin B", r"CCCCCCCCCCC/C=C\C(=O)N(CCCCCNC(=O)CCC(=O)N(CCCCCNC(=O)[C@@H]1COC(=N1)C2=CC=CC=C2O)O)O", 1.0, - [ - [ - "N-(5-aminopentyl)hydroxylamine", - "N-(5-aminopentyl)hydroxylamine", - "butanedioic acid", - "salicylic acid", - "serine", - "tetradec-2-enoic acid", - ], - [ - "C1", - "D1", - "D1", - "D1", - "D1", - "D1", - "N-(5-aminopentyl)hydroxylamine", - "N-(5-aminopentyl)hydroxylamine", - "acetic acid", - "butanedioic acid", - "salicylic acid", - "serine", - ], - ], + ["C1", "D1", "D1", "D1", "D1", "D1", "N-(5-aminopentyl)hydroxylamine", "N-(5-aminopentyl)hydroxylamine", "acetic acid", "butanedioic acid", "butanoic acid", "decanoic acid", "dodecanoic acid", "hexanoic acid", "octanoic acid", "salicylic acid", "serine", "tetradec-2-enoic acid"] ), ( "borophycin", r"[B-]123O[C@]45O[C@H](C(C(=O)C[C@H](CC/C=C\C[C@@H](OC(=O)C(O1)[C@]6(O2)O[C@H](C(C(=O)C[C@H](CC/C=C\C[C@@H](OC(=O)C4O3)CC)O)(C)C)CC[C@H]6C)CC)O)(C)C)CC[C@H]5C", 1.0, - [ - [ - "A3", - "A3", - "A5", - "A5", - "B1", - "B1", - "B1", - "B1", - "B1", - "B1", - "C1", - "C1", - "D1", - "D1", - "D2", - "D2", - "boronation", - "propanoic acid", - "propanoic acid", - ] - ], + ["A3", "A3", "A5", "A5", "B1", "B1", "B1", "B1", "B1", "B1", "C1", "C1", "D1", "D1", "D2", "D2", "boronation", "propanoic acid", "propanoic acid"], ), ( "aplasmomycin C", r"[B-]123O[C@]45O[C@H](C([C@H](C/C=C/[C@H]6O[C@@H]([C@H](C6)OC(=O)[C@@H](O1)[C@]7(O2)O[C@H](C([C@H](C/C=C/[C@H]8O[C@@H]([C@H](C8)OC(=O)[C@H]4O3)C)OC(=O)C)(C)C)CC[C@H]7C)C)OC(=O)C)(C)C)CC[C@H]5C", - 0.74, - [ - [ - "A5", - "A5", - "B1", - "B1", - "B3", - "B3", - "C1", - "C1", - "D2", - "D2", - "acetic acid", - "acetic acid", - "boronation", - ] - ], + 1.0, + ["A5", "A5", "B1", "B1", "B1", "B1", "B3", "B3", "C1", "C1", "C1", "C1", "D2", "D2", "acetic acid", "acetic acid", "boronation", "lactic acid", "lactic acid"], ), ( "chaetosemin G", r"Cc1c(O)c(C(O[C@H](C2)C)=O)c2c(Cl)c1O", 1.0, - [["A1", "A2", "B1", "C1", 
"acetic acid", "chlorination"]], + ["A1", "A2", "B1", "C1", "acetic acid", "chlorination"], ), ( "actinoquinoline B", r"CC(C)CC(=O)NC[C@H]1CC[C@@H]([C@@H](O1)O)NC(=O)C2=NC3=CC=CC=C3C=C2O", 1.0, - [["3-hydroxyquinaldic acid", "3-methylbutanoic acid", "5-hydroxylysine"]], + ["3-hydroxyquinaldic acid", "3-methylbutanoic acid", "5-hydroxylysine", "D8", "acetic acid"] ), ( - "NRP with intramolecular threoine-cysteine bridges", + "NRP with intramolecular threonine-cysteine bridges", r"CC[C@H](C)[C@@H]1NC(=O)[C@H]2NC(=O)[C@H](Cc3ccc(O)cc3)NC(=O)CNC(=O)[C@H](Cc3c[nH]c4ccccc34)NC(=O)[C@H](Cc3c[nH]cn3)NC(=O)[C@H](CS[C@@H]2C)NC(=O)[C@H](C(C)C)NC(=O)[C@H]2NC(=O)[C@@H](NC(=O)[C@@H](NC(=O)CN)[C@@H](C)O)CS[C@H](C)[C@@H](C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CO)C(=O)O)NC(=O)[C@H](CO)NC(=O)[C@H](Cc3ccccc3)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CC(=O)O)NC(=O)[C@H](CS[C@@H]2C)NC(=O)[C@H](CC(C)C)NC1=O", 1.0, - [ - [ - "arginine", - "aspartic acid", - "aspartic acid", - "cysteine", - "cysteine", - "cysteine", - "glycine", - "glycine", - "histidine", - "isoleucine", - "leucine", - "phenylalanine", - "serine", - "serine", - "threonine", - "threonine", - "threonine", - "threonine", - "tryptophan", - "tyrosine", - "valine", - ] - ], + ["arginine", "aspartic acid", "aspartic acid", "cysteine", "cysteine", "cysteine", "glycine", "glycine", "histidine", "isoleucine", "leucine", "phenylalanine", "serine", "serine", "threonine", "threonine", "threonine", "threonine", "tryptophan", "tyrosine", "valine"], ), ] diff --git a/tests/helpers.py b/tests/helpers.py index 27382cf..469e8bf 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -1,66 +1,73 @@ -# -*- coding: utf-8 -*- - """Shared helpers for RetroMol integration tests.""" -from __future__ import annotations - -from importlib.resources import files -from typing import Any, Dict, List - -import yaml - -import retromol.data -from retromol import api, io, readout, rules - - -def load_rule_set() -> rules.Rules: - """Load the default RetroMol rule set 
once.""" - path_reaction_rules = str(files(retromol.data).joinpath("default_reaction_rules.yml")) - path_matching_rules = str(files(retromol.data).joinpath("default_matching_rules.yml")) - return rules.load_rules_from_files(path_reaction_rules, path_matching_rules) - - -def load_wave_config() -> Dict[str, Any]: - """Load the default wave configuration once.""" - path_wave_config = str(files(retromol.data).joinpath("default_wave_config.yml")) - - with open(path_wave_config) as f: - return yaml.safe_load(f) - - -def parse_compound( - smiles: str, - rule_set: rules.Rules, - wave_config: Dict[str, Any], - *, - match_stereochemistry: bool = False, -) -> io.Result: - """Parse a compound SMILES string into an io.Result object.""" - mol = io.Input("test_compound", smiles) - return api.run_retromol_with_timeout(mol, rule_set, wave_config, match_stereochemistry=match_stereochemistry) - - -def compare_lists_of_lists(a: List[List[str]], b: List[List[str]]) -> bool: - # Convert each inner list to a frozenset (hashable, unordered) - set_a = {frozenset(inner) for inner in a} - set_b = {frozenset(inner) for inner in b} - return set_a == set_b - - -def assert_result(result: io.Result, expected_coverage: float, expected_mappings: List[List[str]]) -> None: - """Common assertion logic used by all integration tests.""" - best_total_coverage: float = result.best_total_coverage() - assert best_total_coverage == expected_coverage, f"Expected coverage {expected_coverage}, got {best_total_coverage}" - - mappings = readout.optimal_mappings_with_timeout(result) - parsed_mappings: List[List[str]] = [] - for mapping in mappings: - parsed_mapping: List[str] = [] - for node in mapping["nodes"]: - parsed_mapping.append(node["identity"]) - parsed_mapping.sort() - parsed_mappings.append(parsed_mapping) - - assert compare_lists_of_lists(expected_mappings, parsed_mappings), ( - f"Expected mappings {expected_mappings}, got {parsed_mappings}" - ) +from retromol.model.rules import RuleSet +from 
retromol.model.result import Result +from retromol.model.submission import Submission +from retromol.model.reaction_graph import MolNode +from retromol.pipelines.parsing import run_retromol_with_timeout + + +def load_rule_set() -> RuleSet: + """ + Load the default RetroMol rule set once. + + :return: the loaded RuleSet object + """ + return RuleSet.load_default(match_stereochemistry=False) + + +def parse_compound(smiles: str, ruleset: RuleSet) -> Result: + """ + Parse a compound SMILES string into an Result object. + + :param smiles: the SMILES string of the compound to parse + :param ruleset: the RuleSet to use for parsing + :return: the resulting Result object + """ + submission = Submission(smiles) + return run_retromol_with_timeout(submission, ruleset) + + +def compare_floats(a: float, b: float, tol: float = 1e-2) -> bool: + """ + Compare two floating-point numbers for equality within a tolerance. + + :param a: the first float to compare + :param b: the second float to compare + :param tol: the tolerance for comparison + :return: True if the numbers are equal within the tolerance, False otherwise + """ + return abs(a - b) <= tol + + +def compare_lists(list1: list[str], list2: list[str]) -> bool: + """ + Compare two lists of strings for equality, ignoring order. + + :param list1: the first list to compare + :param list2: the second list to compare + :return: True if the lists contain the same elements, False otherwise + """ + return sorted(list1) == sorted(list2) + + +def assert_result(result: Result, expected_coverage: float, expected_monomers: list[str]) -> None: + """ + Common assertion logic used by all integration tests. 
+ + :param result: the Result object to check + :param expected_coverage: the expected total coverage value + :param expected_monomers: the expected list of monomer identities + """ + coverage: float = result.calculate_coverage() + assert compare_floats(coverage, expected_coverage), f"expected coverage {expected_coverage}, got {coverage}" + + ident_nodes: list[MolNode] = list(result.reaction_graph.identified_nodes.values()) + assert all(n.is_identified for n in ident_nodes), "not all identified nodes are marked as identified" + found_monomers: list[str] = [n.identity.name for n in ident_nodes] + + # Sort monomers before comparison; easier to read in case of failure (sorted copy: the caller's list is not reordered) + found_monomers.sort() + expected_monomers = sorted(expected_monomers) + + assert compare_lists(found_monomers, expected_monomers), f"expected monomers {expected_monomers}, got {found_monomers}" diff --git a/tests/test_integration_demo_set.py b/tests/test_integration_demo_set.py index 06d569b..c5d439b 100644 --- a/tests/test_integration_demo_set.py +++ b/tests/test_integration_demo_set.py @@ -2,25 +2,30 @@ """Integration tests for the demo set of compounds.""" -from typing import Any, Dict, List - import pytest +from rdkit import RDLogger -from retromol import rules +from retromol.model.rules import RuleSet from .data.integration_demo_set import CASES from .helpers import assert_result, parse_compound -@pytest.mark.parametrize("identifier, smiles, expected_coverage, expected_mappings", CASES, ids=[c[0] for c in CASES]) -def test_integration_demo_set( - identifier: str, - smiles: str, - expected_coverage: float, - expected_mappings: List[List[str]], - rule_set: rules.Rules, - wave_config: Dict[str, Any], -) -> None: - print(f"Testing {identifier}...") - result = parse_compound(smiles, rule_set, wave_config, match_stereochemistry=False) - assert_result(result, expected_coverage, expected_mappings) +# Disable RDKit warnings for cleaner test output +RDLogger.DisableLog("rdApp.*") + + +@pytest.mark.parametrize("name, smiles, 
expected_coverage, expected_monomers", CASES, ids=[c[0] for c in CASES]) +def test_integration_demo_set(name: str, smiles: str, expected_coverage: float, expected_monomers: list[str], ruleset: RuleSet) -> None: + """ + Integration test for the demo set of compounds. + + :param name: the name of the test case + :param smiles: the SMILES string of the compound to test + :param expected_coverage: the expected total coverage value + :param expected_monomers: the expected list of monomer identities + :param ruleset: the RuleSet to use for parsing + """ + print(f"testing {name}...") + result = parse_compound(smiles, ruleset) + assert_result(result, expected_coverage, expected_monomers)