From 94922e2c1668d1e1d1481d3cc9c0523dd9893643 Mon Sep 17 00:00:00 2001 From: jbutch Date: Sat, 4 Apr 2026 21:23:54 -0700 Subject: [PATCH 1/5] Fix ligand res_id offset to match AF3 output convention RFD3 was offsetting ligand res_id values from the protein max, causing (chain_id, res_id, atom_name) pairing to fail against AF3 predictions which always start ligand res_id at 1. Replace the offset with dense rank-based per-chain renumbering (1, 2, ...) and add a chain A validation with an override option (allow_ligand_on_chain_a). Co-Authored-By: Claude Opus 4.6 (1M context) --- .../rfd3/src/rfd3/inference/input_parsing.py | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/models/rfd3/src/rfd3/inference/input_parsing.py b/models/rfd3/src/rfd3/inference/input_parsing.py index d97b3be3..30ff294b 100644 --- a/models/rfd3/src/rfd3/inference/input_parsing.py +++ b/models/rfd3/src/rfd3/inference/input_parsing.py @@ -136,6 +136,7 @@ class DesignInputSpecification(BaseModel): # Extra args: length: Optional[str] = Field(None, description="Length range as 'min-max' or int. Constrains length of contig if provided") ligand: Optional[str] = Field(None, description="Ligand name or index to include in design.") + allow_ligand_on_chain_a: bool = Field(False, description="If True, suppress the error when a ligand is on chain A (the protein chain). Use with caution — chain ID is leaked to the model.") cif_parser_args: Optional[Dict[str, Any]] = Field(None, description="CIF parser arguments") extra: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Extra metadata to include in output (useful for logging additional info in metadata)") dialect: int = Field(2, description="RFdiffusion3 input dialect. 1: legacy, 2: release.") @@ -672,14 +673,26 @@ def _append_ligand(self, atom_array, atom_array_input_annotated): + list(atom_array_input_annotated.get_annotation_categories()) ), ) - # Offset ligand residue ids based on the original input to avoid clashes - # with any newly created residues (matches legacy behaviour). - ligand_array.res_id = ( - ligand_array.res_id - - np.min(ligand_array.res_id) - + np.max(atom_array.res_id) - + 1 - ) + # Error if any ligand sits on chain A (the protein chain) unless + # explicitly overridden — chain ID is leaked to the model so this + # is a significant difference from the expected convention. + ligand_chains = np.unique(ligand_array.chain_id) + if "A" in ligand_chains and not self.allow_ligand_on_chain_a: + raise ValueError( + f"Ligand found on chain A, which is reserved for the protein. " + f"Ligand chain(s): {ligand_chains.tolist()}. " + f"Place ligands on separate chains (B, C, D, ...) or set " + f"'allow_ligand_on_chain_a: true' to override this check." + ) + # Reset ligand res_id to start from 1 per chain, matching the + # convention AF3 uses in its output CIF files. Use dense + # rank-based renumbering so gaps in the original numbering + # (e.g. res_id 850, 900) become sequential (1, 2). + for chain in ligand_chains: + mask = ligand_array.chain_id == chain + chain_res_ids = ligand_array.res_id[mask] + _, inverse = np.unique(chain_res_ids, return_inverse=True) + ligand_array.res_id[mask] = inverse + 1 # Harmonize conditioning annotations before concatenation: biotite's # concatenate only preserves annotations present in ALL arrays (set # intersection), so mismatched optional conditioning annotations From 96b2e5f882fcac106ad5f3bfc29196cad6b301b8 Mon Sep 17 00:00:00 2001 From: jbutch Date: Sat, 4 Apr 2026 21:29:31 -0700 Subject: [PATCH 2/5] Generalize chain validation to all existing chains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename allow_ligand_on_chain_a → allow_ligand_on_existing_chain and check against all chains already present in the built atom array, not just chain A. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../rfd3/src/rfd3/inference/input_parsing.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/models/rfd3/src/rfd3/inference/input_parsing.py b/models/rfd3/src/rfd3/inference/input_parsing.py index 30ff294b..f237e026 100644 --- a/models/rfd3/src/rfd3/inference/input_parsing.py +++ b/models/rfd3/src/rfd3/inference/input_parsing.py @@ -136,7 +136,7 @@ class DesignInputSpecification(BaseModel): # Extra args: length: Optional[str] = Field(None, description="Length range as 'min-max' or int. Constrains length of contig if provided") ligand: Optional[str] = Field(None, description="Ligand name or index to include in design.") - allow_ligand_on_chain_a: bool = Field(False, description="If True, suppress the error when a ligand is on chain A (the protein chain). Use with caution — chain ID is leaked to the model.") + allow_ligand_on_existing_chain: bool = Field(False, description="If True, suppress the error when a ligand shares a chain ID with the built atom array. Use with caution — chain ID is leaked to the model.") cif_parser_args: Optional[Dict[str, Any]] = Field(None, description="CIF parser arguments") extra: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Extra metadata to include in output (useful for logging additional info in metadata)") dialect: int = Field(2, description="RFdiffusion3 input dialect. 1: legacy, 2: release.") @@ -673,16 +673,18 @@ def _append_ligand(self, atom_array, atom_array_input_annotated): + list(atom_array_input_annotated.get_annotation_categories()) ), ) - # Error if any ligand sits on chain A (the protein chain) unless - # explicitly overridden — chain ID is leaked to the model so this - # is a significant difference from the expected convention. + # Error if any ligand shares a chain ID with the already-built + # atom array — chain ID is leaked to the model so collisions + # represent a significant deviation from the expected convention. ligand_chains = np.unique(ligand_array.chain_id) - if "A" in ligand_chains and not self.allow_ligand_on_chain_a: + existing_chains = set(np.unique(atom_array.chain_id)) + overlapping = sorted(existing_chains & set(ligand_chains)) + if overlapping and not self.allow_ligand_on_existing_chain: raise ValueError( - f"Ligand found on chain A, which is reserved for the protein. " - f"Ligand chain(s): {ligand_chains.tolist()}. " - f"Place ligands on separate chains (B, C, D, ...) or set " - f"'allow_ligand_on_chain_a: true' to override this check." + f"Ligand chain(s) {overlapping} overlap with existing " + f"chain(s) {sorted(existing_chains)}. Place ligands on " + f"separate chains or set 'allow_ligand_on_existing_chain: " + f"true' to override this check." ) # Reset ligand res_id to start from 1 per chain, matching the # convention AF3 uses in its output CIF files. Use dense From 9d4ea2cca4156ee3563a880d08d03ea7a79fcee2 Mon Sep 17 00:00:00 2001 From: jbutch Date: Sat, 4 Apr 2026 21:46:02 -0700 Subject: [PATCH 3/5] Error on multiple ligands per chain; preserve gaps in override mode When allow_ligand_on_existing_chain is False, raise an error if multiple ligand residues share the same chain. Reset res_id min to 1 per chain, preserving relative gaps when ligands share a chain (override mode). Co-Authored-By: Claude Opus 4.6 (1M context) --- .../rfd3/src/rfd3/inference/input_parsing.py | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/models/rfd3/src/rfd3/inference/input_parsing.py b/models/rfd3/src/rfd3/inference/input_parsing.py index f237e026..9ad908cc 100644 --- a/models/rfd3/src/rfd3/inference/input_parsing.py +++ b/models/rfd3/src/rfd3/inference/input_parsing.py @@ -673,28 +673,36 @@ def _append_ligand(self, atom_array, atom_array_input_annotated): + list(atom_array_input_annotated.get_annotation_categories()) ), ) - # Error if any ligand shares a chain ID with the already-built - # atom array — chain ID is leaked to the model so collisions - # represent a significant deviation from the expected convention. + # Validate chain assignments — chain ID is leaked to the model + # so collisions are a significant deviation from convention. ligand_chains = np.unique(ligand_array.chain_id) existing_chains = set(np.unique(atom_array.chain_id)) overlapping = sorted(existing_chains & set(ligand_chains)) - if overlapping and not self.allow_ligand_on_existing_chain: - raise ValueError( - f"Ligand chain(s) {overlapping} overlap with existing " - f"chain(s) {sorted(existing_chains)}. Place ligands on " - f"separate chains or set 'allow_ligand_on_existing_chain: " - f"true' to override this check." - ) - # Reset ligand res_id to start from 1 per chain, matching the - # convention AF3 uses in its output CIF files. Use dense - # rank-based renumbering so gaps in the original numbering - # (e.g. res_id 850, 900) become sequential (1, 2). + if not self.allow_ligand_on_existing_chain: + if overlapping: + raise ValueError( + f"Ligand chain(s) {overlapping} overlap with existing " + f"chain(s) {sorted(existing_chains)}. Place ligands on " + f"separate chains or set 'allow_ligand_on_existing_chain: " + f"true' to override this check." + ) + # Multiple ligands must each be on their own chain. + for chain in ligand_chains: + n_residues = len( + np.unique(ligand_array.res_id[ligand_array.chain_id == chain]) + ) + if n_residues > 1: + raise ValueError( + f"Multiple ligand residues on chain {chain}. Each " + f"ligand must be on its own chain, or set " + f"'allow_ligand_on_existing_chain: true' to override." + ) + # Reset ligand res_id to start from 1 per chain. When ligands + # share a chain (override mode), preserve relative gaps. for chain in ligand_chains: mask = ligand_array.chain_id == chain chain_res_ids = ligand_array.res_id[mask] - _, inverse = np.unique(chain_res_ids, return_inverse=True) - ligand_array.res_id[mask] = inverse + 1 + ligand_array.res_id[mask] = chain_res_ids - np.min(chain_res_ids) + 1 # Harmonize conditioning annotations before concatenation: biotite's # concatenate only preserves annotations present in ALL arrays (set # intersection), so mismatched optional conditioning annotations From 01d92a950205a5fd01b1040a63e76770055919ef Mon Sep 17 00:00:00 2001 From: jbutch Date: Sat, 4 Apr 2026 21:48:06 -0700 Subject: [PATCH 4/5] Use legacy res_id offset when allow_ligand_on_existing_chain is True The override path now matches the old behaviour (offset from protein max res_id). The default path (separate chains) sets each ligand chain's res_id to 1. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../rfd3/src/rfd3/inference/input_parsing.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/models/rfd3/src/rfd3/inference/input_parsing.py b/models/rfd3/src/rfd3/inference/input_parsing.py index 9ad908cc..96952147 100644 --- a/models/rfd3/src/rfd3/inference/input_parsing.py +++ b/models/rfd3/src/rfd3/inference/input_parsing.py @@ -697,12 +697,20 @@ def _append_ligand(self, atom_array, atom_array_input_annotated): f"ligand must be on its own chain, or set " f"'allow_ligand_on_existing_chain: true' to override." ) - # Reset ligand res_id to start from 1 per chain. When ligands - # share a chain (override mode), preserve relative gaps. - for chain in ligand_chains: - mask = ligand_array.chain_id == chain - chain_res_ids = ligand_array.res_id[mask] - ligand_array.res_id[mask] = chain_res_ids - np.min(chain_res_ids) + 1 + if self.allow_ligand_on_existing_chain: + # Legacy behaviour: offset from protein max to avoid clashes. + ligand_array.res_id = ( + ligand_array.res_id + - np.min(ligand_array.res_id) + + np.max(atom_array.res_id) + + 1 + ) + else: + # Reset ligand res_id to start from 1 per chain, matching + # the convention AF3 uses in its output CIF files. + for chain in ligand_chains: + mask = ligand_array.chain_id == chain + ligand_array.res_id[mask] = 1 # Harmonize conditioning annotations before concatenation: biotite's # concatenate only preserves annotations present in ALL arrays (set # intersection), so mismatched optional conditioning annotations From 860b8b8de58291fa743433acc37089da4fb36af0 Mon Sep 17 00:00:00 2001 From: jbutch Date: Sat, 4 Apr 2026 21:49:07 -0700 Subject: [PATCH 5/5] Note in errors that override restores old behaviour Co-Authored-By: Claude Opus 4.6 (1M context) --- models/rfd3/src/rfd3/inference/input_parsing.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/models/rfd3/src/rfd3/inference/input_parsing.py b/models/rfd3/src/rfd3/inference/input_parsing.py index 96952147..3d922466 100644 --- a/models/rfd3/src/rfd3/inference/input_parsing.py +++ b/models/rfd3/src/rfd3/inference/input_parsing.py @@ -684,7 +684,7 @@ def _append_ligand(self, atom_array, atom_array_input_annotated): f"Ligand chain(s) {overlapping} overlap with existing " f"chain(s) {sorted(existing_chains)}. Place ligands on " f"separate chains or set 'allow_ligand_on_existing_chain: " - f"true' to override this check." + f"true' to restore the old behaviour." ) # Multiple ligands must each be on their own chain. for chain in ligand_chains: @@ -695,7 +695,8 @@ def _append_ligand(self, atom_array, atom_array_input_annotated): raise ValueError( f"Multiple ligand residues on chain {chain}. Each " f"ligand must be on its own chain, or set " - f"'allow_ligand_on_existing_chain: true' to override." + f"'allow_ligand_on_existing_chain: true' to restore " + f"the old behaviour." ) if self.allow_ligand_on_existing_chain: # Legacy behaviour: offset from protein max to avoid clashes.