From a5898eb3b0e10e2ab017a30696b16096365cea1d Mon Sep 17 00:00:00 2001 From: David Meijer Date: Tue, 6 Jan 2026 16:01:20 +0100 Subject: [PATCH 1/4] FIX: node encodings are strings, not ints --- src/retromol/model/reaction_graph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/retromol/model/reaction_graph.py b/src/retromol/model/reaction_graph.py index d289799..3c694f8 100644 --- a/src/retromol/model/reaction_graph.py +++ b/src/retromol/model/reaction_graph.py @@ -298,8 +298,8 @@ def from_dict(cls, data: dict[str, Any]) -> "ReactionGraph": :return: ReactionGraph object """ reaction_graph = cls( - nodes={int(enc): MolNode.from_dict(node_data) for enc, node_data in data["nodes"].items()}, + nodes={enc: MolNode.from_dict(node_data) for enc, node_data in data["nodes"].items()}, edges=[RxnEdge.from_dict(edge_data) for edge_data in data["edges"]], - out_edges={int(enc): indices for enc, indices in data["out_edges"].items()}, + out_edges={enc: indices for enc, indices in data["out_edges"].items()}, ) return reaction_graph From abe7bee8299013929880ab43d8716d553bd38a83 Mon Sep 17 00:00:00 2001 From: David Meijer Date: Tue, 6 Jan 2026 16:08:32 +0100 Subject: [PATCH 2/4] FIX: default turn tautomer canonicalization off --- src/retromol/model/submission.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/retromol/model/submission.py b/src/retromol/model/submission.py index 68a093c..15ddd38 100644 --- a/src/retromol/model/submission.py +++ b/src/retromol/model/submission.py @@ -17,12 +17,19 @@ class Submission: :var smiles: str: SMILES representation of the submitted molecule :var name: str | None: optional name of the submitted molecule :var props: dict[str, Any] | None: optional additional properties associated with the submission + :var keep_stereo: bool: whether to keep stereochemistry during standardization + :var neutralize: bool: whether to neutralize the molecule during standardization + :var canonicalize_tautomer: bool: whether to canonicalize the tautomer during """ smiles: str name: str | None = None props: dict[str, Any] | None = None + keep_stereo: bool = True + neutralize: bool = True + canonicalize_tautomer: bool = False + mol: Mol = field(init=False, repr=False) inchikey: str = field(init=False, repr=False) @@ -36,9 +43,9 @@ def __post_init__(self) -> None: # Generate standardized molecule mol = standardize_from_smiles( smiles, - keep_stereo=True, - neutralize=True, - tautomer_canon=True, + keep_stereo=self.keep_stereo, + neutralize=self.neutralize, + tautomer_canon=self.canonicalize_tautomer, ) # Generate InChIKey From ac23c8627bd31e60b79e5111ebf543427e59cbee Mon Sep 17 00:00:00 2001 From: David Meijer Date: Tue, 6 Jan 2026 20:57:17 +0100 Subject: [PATCH 3/4] UPD: add linear readout in result object including assembly graph --- src/retromol/cli.py | 26 ++++--- src/retromol/model/assembly_graph.py | 108 ++++++++++++++++++++++++++- src/retromol/model/readout.py | 41 +++++++--- src/retromol/model/result.py | 11 ++- src/retromol/pipelines/parsing.py | 9 ++- tests/data/integration_demo_set.py | 24 +++--- 6 files changed, 182 insertions(+), 37 deletions(-) diff --git a/src/retromol/cli.py b/src/retromol/cli.py index b382185..5e24fd9 100644 --- a/src/retromol/cli.py +++ b/src/retromol/cli.py @@ -14,9 +14,8 @@ from retromol.version import __version__ from retromol.utils.logging import setup_logging, add_file_handler from retromol.model.rules import RuleSet -from retromol.model.submission import Submission from retromol.model.result import Result -from retromol.model.readout import LinearReadout +from retromol.model.submission import Submission from retromol.pipelines.parsing import run_retromol_with_timeout from retromol.io.streaming import run_retromol_stream, stream_sdf_records, stream_table_rows, stream_json_records from retromol.chem.mol import encode_mol @@ -129,28 +128,33 @@ def main() -> None: result: Result = run_retromol_with_timeout(submission, ruleset) log.info(f"result: {result}") + # Write out result to file and then read back in again for visualization (test I/O) + result_dict = result.to_dict() + with open(os.path.join(args.outdir, "result.json"), "w") as f: + json.dump(result_dict, f, indent=4) + + with open(os.path.join(args.outdir, "result.json"), "r") as f: + result_data = json.load(f) + result2 = Result.from_dict(result_data) + # Report on coverage as percentage of tags identified - coverage = result.calculate_coverage() + coverage = result2.calculate_coverage() log.info(f"coverage: {coverage:.2%}") - # Get linear readout; print summary - linear_readout = LinearReadout.from_result(result) + # Get linear readout; draw assembly graph + linear_readout = result2.linear_readout out_assembly_graph_fig = os.path.join(args.outdir, "assembly_graph.png") linear_readout.assembly_graph.draw(show_unassigned=True, savepath=out_assembly_graph_fig) log.info(f"linear readout: {linear_readout}") # Visualize reaction graph - root = encode_mol(result.submission.mol) + root = encode_mol(result2.submission.mol) visualize_reaction_graph( - result.reaction_graph, + result2.reaction_graph, html_path=os.path.join(args.outdir, "reaction_graph.html"), root_enc=root ) - result_dict = result.to_dict() - with open(os.path.join(args.outdir, "result.json"), "w") as f: - json.dump(result_dict, f, indent=4) - result_counts["successes"] += 1 # Batch mode diff --git a/src/retromol/model/assembly_graph.py b/src/retromol/model/assembly_graph.py index 1ea7784..827e03a 100644 --- a/src/retromol/model/assembly_graph.py +++ b/src/retromol/model/assembly_graph.py @@ -1,6 +1,6 @@ """Module contains utilities for defining and working with assembly graphs.""" -from dataclasses import dataclass +from dataclasses import dataclass, asdict from typing import Any, Iterable, Iterable, Iterator, Generator from rdkit.Chem.rdchem import Mol @@ -8,6 +8,7 @@ import networkx as nx from retromol.model.reaction_graph import MolNode +from retromol.model.identity import MolIdentity from retromol.chem.tagging import get_tags_mol @@ -37,6 +38,24 @@ class RootBondLink: bond_type: str # stringified version of RDKit BondType bond_order: float | int | None # include if available + def to_dict(self) -> dict[str, Any]: + """ + Convert the RootBondLink to a dictionary. + + :return: dictionary representation of the RootBondLink + """ + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "RootBondLink": + """ + Create a RootBondLink from a dictionary. + + :param data: dictionary representation of the RootBondLink + :return: RootBondLink instance + """ + return cls(**data) + def build_assembly_graph( root_mol: Mol, @@ -497,6 +516,93 @@ def validate(self) -> None: if not isinstance(data["n_bonds"], int): raise ValueError(f"AssemblyGraph edge {u!r}-{v!r} n_bonds must be int") + + def to_dict(self) -> dict[str, Any]: + """ + Convert the AssemblyGraph to a dictionary. + + :return: dictionary representation of the AssemblyGraph + """ + nodes_out: list[dict[str, Any]] = [] + for node_id, data in self.g.nodes(data=True): + tags = data.get("tags", set()) + tags_json = sorted(tags) # stable + JSON-friendly + + mn = data.get("molnode", None) + mn_json = None if mn is None else mn.to_dict() + + ident = data.get("identity", None) + ident_json = None if ident is None else ident.to_dict() + + nodes_out.append( + { + "id": node_id, + "tags": tags_json, + "identity": ident_json, + "molnode": mn_json, + } + ) + + edges_out: list[dict[str, Any]] = [] + for u, v, data in self.g.edges(data=True): + bonds = data.get("bonds", []) + edges_out.append( + { + "u": u, + "v": v, + "bonds": [b.to_dict() for b in bonds], + "n_bonds": int(data.get("n_bonds", len(bonds))), + } + ) + + return { + "unassigned": self.unassigned, + "nodes": nodes_out, + "edges": edges_out, + } + + @classmethod + def from_dict(cls, data: dict[str, Any], validate: bool = True) -> "AssemblyGraph": + """ + Create an AssemblyGraph from a dictionary. + + :param data: dictionary representation of the AssemblyGraph + :param validate: whether to validate the graph after creation (default: True) + :return: AssemblyGraph instance + """ + unassigned = data.get("unassigned", "unassigned") + g = nx.Graph() + + # Nodes + for nd in data.get("nodes", []): + node_id = nd["id"] + tags = set(nd.get("tags", [])) + + mn_payload = nd.get("molnode", None) + molnode = None if mn_payload is None else MolNode.from_dict(mn_payload) + + ident_payload = nd.get("identity", None) + identity = None if ident_payload is None else MolIdentity.from_dict(ident_payload) + + g.add_node(node_id, molnode=molnode, tags=tags, identity=identity) + + # Edges + for ed in data.get("edges", []): + u = ed["u"] + v = ed["v"] + + bonds_raw = ed.get("bonds", []) + bonds = [RootBondLink.from_dict(b) for b in bonds_raw] + + n_bonds = int(ed.get("n_bonds", len(bonds))) + g.add_edge(u, v, bonds=bonds, n_bonds=n_bonds) + + ag = cls(g=g, unassigned=unassigned, validate_upon_initialization=False) + + if validate: + ag.validate() + + return ag @classmethod def build( diff --git a/src/retromol/model/readout.py b/src/retromol/model/readout.py index 2260e1e..478dacb 100644 --- a/src/retromol/model/readout.py +++ b/src/retromol/model/readout.py @@ -3,9 +3,8 @@ from dataclasses import dataclass from typing import Literal -from retromol.model.reaction_graph import MolNode +from retromol.model.reaction_graph import MolNode, ReactionGraph from retromol.model.assembly_graph import AssemblyGraph -from retromol.model.result import Result from retromol.model.rules import MatchingRule from retromol.chem.mol import encode_mol from retromol.chem.tagging import get_tags_mol @@ -32,18 +31,18 @@ def __str__(self) -> str: return f"LinearReadout(assembly_graph_nodes={self.assembly_graph.g.number_of_nodes()}; assembly_graph_edges={self.assembly_graph.g.number_of_edges()}; num_paths={len(self.paths)})" @classmethod - def from_result( + def from_reaction_graph( cls, - result: Result, - root_enc: str | None = None, + root_enc: str, + reaction_graph: ReactionGraph, exclude_identities: list[MatchingRule] | None = None, include_identities: list[MatchingRule] | None = None, ) -> "LinearReadout": """ Create a LinearReadout from a Result object. - :param result: RetroMol parsing result - :param root_enc: optional root molecule encoding; if None, use submission molecule + :param root_enc: encoding of the root molecule + :param reaction_graph: ReactionGraph object :param exclude_identities: list of matching rules to exclude identities (not used here) :param include_identities: list of matching rules to include identities (not used here) :return: LinearReadout instance @@ -57,10 +56,7 @@ def from_result( if include_identities is not None: include_identities = set([r.id for r in include_identities]) - g = result.reaction_graph - if root_enc is None: - root_enc = encode_mol(result.submission.mol) - + g = reaction_graph if root_enc not in g.nodes: raise ValueError(f"root_enc {root_enc} not found in reaction graph nodes") @@ -83,3 +79,26 @@ def from_result( paths.append(path) return cls(assembly_graph=a, paths=paths) + + def to_dict(self) -> dict: + """ + Serialize the LinearReadout to a dictionary. + + :return: dict representation of LinearReadout + """ + return { + "assembly_graph": self.assembly_graph.to_dict(), + "paths": [[node.to_dict() for node in path] for path in self.paths], + } + + @classmethod + def from_dict(cls, data: dict) -> "LinearReadout": + """ + Deserialize a LinearReadout from a dictionary. + + :param data: dict representation of LinearReadout + :return: LinearReadout instance + """ + assembly_graph = AssemblyGraph.from_dict(data["assembly_graph"]) + paths = [[MolNode.from_dict(node_data) for node_data in path_data] for path_data in data["paths"]] + return cls(assembly_graph=assembly_graph, paths=paths) diff --git a/src/retromol/model/result.py b/src/retromol/model/result.py index 2a79211..65a9051 100644 --- a/src/retromol/model/result.py +++ b/src/retromol/model/result.py @@ -1,10 +1,11 @@ """Module defining the Result data class.""" -from dataclasses import dataclass, asdict +from dataclasses import dataclass from typing import Any from retromol.model.submission import Submission from retromol.model.reaction_graph import ReactionGraph +from retromol.model.readout import LinearReadout from retromol.chem.tagging import get_tags_mol @@ -12,10 +13,15 @@ class Result: """ Represents a RetroMol parsing result. + + :var submission: Submission: the original submission associated with this result + :var reaction_graph: ReactionGraph: the reaction graph generated from retrosynthetic analysis + :var linear_readout: LinearReadout: the linear readout representation of the reaction graph """ submission: Submission reaction_graph: ReactionGraph + linear_readout: LinearReadout def __str__(self) -> str: """ @@ -55,6 +61,7 @@ def to_dict(self) -> dict[str, Any]: return { "submission": self.submission.to_dict(), "reaction_graph": self.reaction_graph.to_dict(), + "linear_readout": self.linear_readout.to_dict(), } @classmethod @@ -67,8 +74,10 @@ def from_dict(cls, data: dict[str, Any]) -> "Result": """ submission = Submission.from_dict(data["submission"]) reaction_graph = ReactionGraph.from_dict(data["reaction_graph"]) + linear_readout = LinearReadout.from_dict(data["linear_readout"]) return cls( submission=submission, reaction_graph=reaction_graph, + linear_readout=linear_readout, ) diff --git a/src/retromol/pipelines/parsing.py b/src/retromol/pipelines/parsing.py index d616c9e..1756449 100644 --- a/src/retromol/pipelines/parsing.py +++ b/src/retromol/pipelines/parsing.py @@ -11,6 +11,7 @@ from retromol.model.rules import RuleSet, index_uncontested, apply_uncontested from retromol.model.result import Result from retromol.model.reaction_graph import ReactionGraph, ReactionStep, RxnEdge +from retromol.model.readout import LinearReadout from retromol.model.synthesis import SynthesisExtractResult from retromol.chem.mol import Mol, encode_mol, mol_to_smiles from retromol.chem.tagging import get_tags_mol @@ -290,10 +291,12 @@ def run_retromol(submission: Submission, rules: RuleSet) -> Result: :param submission: Submission object containing the input molecule and data :param rules: Rules object containing the reaction rules to apply :return: Result object containing the retrosynthesis results - """ + """ + # Parse compound into reaction graph g = process_mol(submission, rules) log.debug(f"retrosynthesis graph has {len(g.nodes)} ({len(g.identified_nodes)} identified) nodes and {len(g.edges)} edges") + # Extract minimum-edge synthesis subgraph root = encode_mol(submission.mol) r = extract_min_edge_synthesis_subgraph( g, @@ -306,9 +309,13 @@ def run_retromol(submission: Submission, rules: RuleSet) -> Result: if not r.solved: log.debug("retrosynthesis extraction failed to find a solution") + # Calculate the linear readouts for the synthesis graph + linear_readout = LinearReadout.from_reaction_graph(root, r.graph) + return Result( submission=submission, reaction_graph=r.graph, + linear_readout=linear_readout, ) diff --git a/tests/data/integration_demo_set.py b/tests/data/integration_demo_set.py index 247706c..2208cc5 100644 --- a/tests/data/integration_demo_set.py +++ b/tests/data/integration_demo_set.py @@ -45,14 +45,14 @@ ( "abyssomicin C", r"C[C@@H]1C[C@]23OC(=O)C4=C2OC1[C@H](O)C3\C=C\C(=O)[C@@H](C)C[C@@H](C)C4=O", - 0.04, - ["oxidation"], + 0.72, + ["A2", "C1", "C1", "C1", "D2", "acetic acid", "glyceric acid", "oxidation"], ), ( "atrop-abyssocymin C", r"CC1CC23OC(=O)C4=C2OC1C(O)C3\C=C/C(=O)C(C)CC(C)C4=O", - 0.04, - ["oxidation"], + 0.72, + ["A2", "C1", "C1", "C1", "D2", "acetic acid", "glyceric acid", "oxidation"], ), ( "aculeximycin", @@ -153,9 +153,9 @@ ( "calicheamicin", r"CCN[C@H]1CO[C@H](C[C@@H]1OC)O[C@@H]2[C@H]([C@@H]([C@H](OC2O[C@H]3C#C/C=C\C#C[C@]\4(CC(=O)C(=C3/C4=C\CSSSC)NC(=O)OC)O)C)NO[C@H]5C[C@@H]([C@@H]([C@H](O5)C)SC(=O)C6=C(C(=C(C(=C6OC)OC)O[C@H]7[C@@H]([C@@H]([C@H]([C@@H](O7)C)O)OC)O)I)C)O)O", - 0.70, + 0.74, # Default rule set is not able to parse the enediyne core, but should at least identify the sugar parts - ["5-amino-6-methyloxane-2,3,4-triol", "6-methyl-5-sulfanyloxane-2,4-diol", "A1", "A5", "C1", "acetic acid", "ethanol", "iodination", "methylation", "methylation", "methylation", "methylation", "methylation", "methylation", "rhamnose", "sugar"], + ["5-amino-6-methyloxane-2,3,4-triol", "6-methyl-5-sulfanyloxane-2,4-diol", "A1", "A5", "C1", "acetic acid", "carbonic acid", "ethanol", "iodination", "methylation", "methylation", "methylation", "methylation", "methylation", "methylation", "rhamnose", "sugar"] ), ( "callystatin", @@ -168,13 +168,13 @@ r"C[C@@H]\1CCC[C@@H]([C@H](OC(=O)[C@@H]([C@@H](/C=C1)O)O)/C(=C/[C@@H](C)C(=O)[C@H](C)[C@@H](CC(=O)O)OC)/C)C", 1.0, # This parsing is ambiguous... can parse from two sides and neither is correct/wrong - ["A2", "B11", "B2", "B2", "C2", "D1", "D1", "D2", "malonic acid", "methylation"] + ["A2", "B1", "B2", "B5", "C1", "C2", "D2", "D2", "malonic acid", "methylation"], ), ( "chaetoglobosin A", r"C[C@H]\1C/C=C/[C@H]2[C@H]3[C@](O3)([C@H]([C@@H]4[C@@]2(C(=O)/C=C/C(=O)[C@@H](/C(=C1)/C)O)C(=O)N[C@H]4CC5=CNC6=CC=CC=C65)C)C", - 0.9, - ["A1", "C1", "C1", "C2", "C2", "D1", "D2", "acetic acid", "oxidation", "tryptophan"] + 1.0, + ["A1", "B11", "C1", "C1", "C1", "C2", "C2", "D2", "acetic acid", "oxidation", "tryptophan"], ), ( "chichorine", @@ -192,7 +192,7 @@ "chlorothricin", r"C[C@@H](C(C(O)=O)=C1)C[C@@]2(C(O)=C(O3)C(O2)=O)[C@@H]1C=CCCCC[C@@H]4C=C[C@@]([C@@H](O[C@@H]5C[C@H](O)[C@@H](O[C@H]6O[C@@H](C)[C@H](O)[C@@H](OC(C7=C(C)C(Cl)=CC=C7OC)=O)C6)[C@@H](C)O5)CCC8)([H])[C@]8([H])[C@@H]4C3=O", 1.0, - ["6-methyloxane-2,4,5-triol", "6-methyloxane-2,4,5-triol", "6-methylsalicylic acid", "A1", "A5", "B1", "C1", "C1", "C1", "C1", "C1", "C1", "C1", "C13", "D1", "D1", "D1", "acetic acid", "acetic acid", "chlorination", "glyceric acid", "methylation"] + ["6-methyloxane-2,4,5-triol", "6-methyloxane-2,4,5-triol", "6-methylsalicylic acid", "A1", "B1", "C1", "C1", "C1", "C1", "C1", "C1", "C1", "C13", "D1", "D1", "D1", "acetic acid", "acetic acid", "chlorination", "glyceric acid", "glycolic acid", "methylation"], ), ( "coelimycin P1", @@ -216,7 +216,7 @@ "deschlorothricin", r"C[C@@H](C(C(O)=O)=C1)C[C@@]2(C(O)=C(O3)C(O2)=O)[C@@H]1C=CCCCC[C@@H]4C=C[C@@]([C@@H](O[C@@H]5C[C@H](O)[C@@H](O[C@H]6O[C@@H](C)[C@H](O)[C@@H](OC(C7=C(C)C=CC=C7OC)=O)C6)[C@@H](C)O5)CCC8)([H])[C@]8([H])[C@@H]4C3=O", 1.0, - ["6-methyloxane-2,4,5-triol", "6-methyloxane-2,4,5-triol", "6-methylsalicylic acid", "A1", "A5", "B1", "C1", "C1", "C1", "C1", "C1", "C1", "C1", "C13", "D1", "D1", "D1", "acetic acid", "acetic acid", "glyceric acid", "methylation"] + ["6-methyloxane-2,4,5-triol", "6-methyloxane-2,4,5-triol", "6-methylsalicylic acid", "A1", "B1", "C1", "C1", "C1", "C1", "C1", "C1", "C1", "C13", "D1", "D1", "D1", "acetic acid", "acetic acid", "glyceric acid", "glycolic acid", "methylation"], ), ( "daptomycin", @@ -264,7 +264,7 @@ "harzianic acid", r"CCC/C=C/C=C/C(=C\1/C(=O)C(N(C1=O)C)CC(C(C)C)(C(=O)O)O)/O", 1.0, - ["A1", "C1", "C1", "D1", "acetic acid", "artificial amino acid harzianic acid", "butanoic acid", "methylation"], + ["A1", "C1", "C1", "D1", "acetic acid", "artificial amino acid harzianic acid", "butanoic acid", "methylation", "trans-2-hexanoic acid"] ), ( "herboxidiene", From 6dc64d5b512a284f1ae299b99c20d448b806dd5c Mon Sep 17 00:00:00 2001 From: David Meijer Date: Tue, 6 Jan 2026 22:09:30 +0100 Subject: [PATCH 4/4] UPD --- pyproject.toml | 2 +- src/retromol/fingerprint/fingerprint.py | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 297d208..6bbf5bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "retromol" -version = "1.1.0" +version = "2.0.0" description = "RetroMol is retrosynthetic analysis tool for modular natural products" readme = "README.md" requires-python = ">=3.10" diff --git a/src/retromol/fingerprint/fingerprint.py b/src/retromol/fingerprint/fingerprint.py index 5df12fa..fd83388 100644 --- a/src/retromol/fingerprint/fingerprint.py +++ b/src/retromol/fingerprint/fingerprint.py @@ -175,8 +175,6 @@ def __init__( tanimoto_threshold: float = 0.6, morgan_radius: int = 2, morgan_num_bits: int = 2048, - family_token_weight: int = 1, - ancestor_token_weight: int = 1, ) -> None: """ Initialize FingerprintGenerator. @@ -186,8 +184,6 @@ def __init__( :param tanimoto_threshold: Tanimoto similarity threshold for collapsing monomers :param morgan_radius: radius for Morgan fingerprinting when collapsing monomers :param morgan_num_bits: number of bits for Morgan fingerprinting when collapsing monomers - :param family_token_weight: weight for family tokens in the fingerprint - :param ancestor_token_weight: weight for ancestor tokens in the fingerprint """ matching_rules = list(matching_rules) @@ -279,10 +275,8 @@ def fingerprint_from_result( if kmer_weights is None: kmer_weights = {1: 1, 2: 1} - # Create assembly graph of monomers; first collect nodes to include - root = result.submission.mol - collected = result.reaction_graph.get_leaf_nodes(identified_only=False) - a = AssemblyGraph.build(root_mol=root, monomers=collected, include_unassigned=True) + # Retrieve AssemblyGraph from Result + a = result.linear_readout.assembly_graph # Calculate kmers from AssemblyGraph tokenized_kmers: list[tuple[str | None, ...]] = []