StollLab · mtessmer · May 19, 2026 · May 19, 2026
diff --git a/scripts/rebuild_canonical_residue_pdbs_from_ccd.py b/scripts/rebuild_canonical_residue_pdbs_from_ccd.py
@@ -0,0 +1,287 @@
+#!/usr/bin/env python
+"""Rebuild residue PDB templates from RCSB CCD ideal coordinates.
+
+Uses ``chilife.bio_ccd`` ideal coordinates, preserves chiLife atom naming and
+backbone orientation from the existing residue templates, then regenerates
+``residue_internal_coords/*_ic.pkl`` files.
+
+Canonical amino acids use their own CCD entries. Protonation variants in
+``alt_prot_states`` use the parent amino-acid CCD geometry (e.g. ASH from ASP)
+with variant-specific atoms omitted when absent from the chiLife template.
+"""
+
+from __future__ import annotations
+
+import argparse
+import pickle
+import re
+from pathlib import Path
+
+import MDAnalysis as mda
+import numpy as np
+
+import chilife
+from chilife.globals import alt_prot_states, nataa_codes
+
+RL_DIR = chilife.RL_DIR
+PDB_DIR = RL_DIR / "residue_pdbs"
+IC_DIR = RL_DIR / "residue_internal_coords"
+
+SKIP_CCD_ATOMS = frozenset({"OXT", "HXT", "H2", "H3"})
+
+# chiLife name -> CCD atom_id where automatic mapping fails
+CHILIFE_TO_CCD_H = {
+    "GLY": {"HA": "HA2", "3HA": "HA3"},
+}
+
+# bio_ccd three-letter codes for alt states often name unrelated compounds; use parents.
+ALT_CCD_SOURCE = dict(alt_prot_states)
+
+# Parent CCD atom_ids to drop when building an alt-state template
+ALT_CCD_SKIP: dict[str, frozenset[str]] = {
+    "ASH": frozenset(),
+    "GLH": frozenset(),
+    "LYN": frozenset({"HZ3"}),
+    "CYM": frozenset({"HG"}),
+    "HID": frozenset({"HE2"}),
+    "HIE": frozenset({"HD1"}),
+    "HIP": frozenset(),
+    "TYM": frozenset({"HH"}),
+}
+
+
+def ccd_h_to_chilife(name: str) -> str:
+    """Map a CCD/PDB hydrogen name to chiLife's ``{n}H{anchor}`` style."""
+    if name in ("H", "HA", "H2", "H3", "HZ"):
+        return name
+    match = re.match(r"^H([A-Z]+?\d*)(\d+)$", name)
+    if match:
+        return f"{match.group(2)}H{match.group(1)}"
+    return name
+
+
+def kabsch(mobile: np.ndarray, target: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+    """Return ``R, t`` such that ``mobile @ R + t`` best matches ``target``."""
+    mobile = np.asarray(mobile, dtype=float)
+    target = np.asarray(target, dtype=float)
+    mob_cent = mobile.mean(axis=0)
+    tgt_cent = target.mean(axis=0)
+    mob = mobile - mob_cent
+    tgt = target - tgt_cent
+    H = mob.T @ tgt
+    U, _, Vt = np.linalg.svd(H)
+    R = Vt.T @ U.T
+    if np.linalg.det(R) < 0:
+        Vt = Vt.copy()
+        Vt[-1, :] *= -1
+        R = Vt.T @ U.T
+    t = tgt_cent - mob_cent @ R
+    return R, t
+
+
+def ccd_positions(
+    ccd_resname: str,
+    ccd: dict,
+    *,
+    skip_atoms: frozenset[str] = SKIP_CCD_ATOMS,
+) -> dict[str, np.ndarray]:
+    """Ideal Cartesian coordinates from ``bio_ccd`` (Angstrom)."""
+    if ccd_resname not in ccd:
+        raise KeyError(f"{ccd_resname} not in bio_ccd")
+
+    atoms = ccd[ccd_resname]["chem_comp_atom"]
+    out: dict[str, np.ndarray] = {}
+    for i, atom_id in enumerate(atoms["atom_id"]):
+        if atom_id in skip_atoms:
+            continue
+        out[atom_id] = np.array(
+            [
+                float(atoms["pdbx_model_Cartn_x_ideal"][i]),
+                float(atoms["pdbx_model_Cartn_y_ideal"][i]),
+                float(atoms["pdbx_model_Cartn_z_ideal"][i]),
+            ],
+            dtype=float,
+        )
+    return out
+
+
+def build_chilife_to_ccd_map(
+    resname: str, template_names: list[str], ccd_pos: dict[str, np.ndarray]
+) -> dict[str, str]:
+    """Map each chiLife atom name in the template to a CCD ``atom_id``."""
+    ccd_h_to_chi: dict[str, str] = {}
+    for ccd_name in ccd_pos:
+        if ccd_name[0] == "H" or ccd_name in ("H", "HA", "HZ"):
+            ccd_h_to_chi[ccd_name] = ccd_h_to_chilife(ccd_name)
+
+    chi_to_ccd: dict[str, str] = {}
+    for name in template_names:
+        if name in ccd_pos:
+            chi_to_ccd[name] = name
+            continue
+        if name in CHILIFE_TO_CCD_H.get(resname, {}):
+            chi_to_ccd[name] = CHILIFE_TO_CCD_H[resname][name]
+            continue
+        for ccd_name, chi_name in ccd_h_to_chi.items():
+            if chi_name == name:
+                chi_to_ccd[name] = ccd_name
+                break
+        else:
+            raise KeyError(f"{resname}: no CCD atom for chiLife name {name!r}")
+    return chi_to_ccd
+
+
+def rebuild_residue(
+    resname: str,
+    ccd: dict,
+    pdb_dir: Path,
+    ic_dir: Path,
+    dry_run: bool = False,
+    *,
+    ccd_source: str | None = None,
+    skip_ccd_atoms: frozenset[str] | None = None,
+) -> None:
+    res_lower = resname.lower()
+    template_path = pdb_dir / f"{res_lower}.pdb"
+    if not template_path.exists():
+        raise FileNotFoundError(template_path)
+
+    template = mda.Universe(str(template_path), in_memory=True)
+    template_names = [a.name.strip() for a in template.atoms]
+    ref_coords = {a.name.strip(): a.position.copy() for a in template.atoms}
+
+    ccd_resname = ccd_source or resname
+    skip = SKIP_CCD_ATOMS | (skip_ccd_atoms or frozenset())
+    ccd_pos = ccd_positions(ccd_resname, ccd, skip_atoms=skip)
+    chi_to_ccd = build_chilife_to_ccd_map(resname, template_names, ccd_pos)
+
+    mobile_bb = np.vstack([ccd_pos[chi_to_ccd[x]] for x in ("N", "CA", "C")])
+    target_bb = np.vstack([ref_coords[x] for x in ("N", "CA", "C")])
+    R, t = kabsch(mobile_bb, target_bb)
+
+    new_coords: dict[str, np.ndarray] = {}
+    for chi_name in template_names:
+        ccd_name = chi_to_ccd[chi_name]
+        new_coords[chi_name] = ccd_pos[ccd_name] @ R + t
+
+    lines = []
+    for i, name in enumerate(template_names):
+        atom = template.atoms[i]
+        coord = new_coords[name]
+        lines.append(
+            f"ATOM  {i + 1:5d} {name:^4s} {resname:3s} {atom.chainID:1s}{int(atom.resid):4d}    "
+            f"{coord[0]:8.3f}{coord[1]:8.3f}{coord[2]:8.3f}{1.0:6.2f}{1.0:6.2f}          {atom.element:>2s}  \n"
+        )
+
+    if dry_run:
+        print(f"{resname}: would write {template_path} ({len(lines)} atoms)")
+        return
+
+    try:
+        sorted_lines = chilife.sort_pdb(lines)
+    except (IndexError, RuntimeError):
+        # Template atom order is already valid; some small protonation variants
+        # fail bond guessing during sort (e.g. ASH).
+        sorted_lines = lines
+    template_path.write_text("".join(sorted_lines))
+
+    struct = mda.Universe(str(template_path), in_memory=True)
+    pref_d = chilife.dihedral_defs[resname]
+    ICs = chilife.MolSysIC.from_atoms(struct, preferred_dihedrals=pref_d)
+    ic_path = ic_dir / f"{res_lower}_ic.pkl"
+    with open(ic_path, "wb") as f:
+        pickle.dump(ICs, f)
+    print(f"{resname}: wrote {template_path.name} and {ic_path.name}")
+
+
+def _rebuild_targets(
+    resnames: list[str] | None,
+    *,
+    canonical: bool,
+    alt_states: bool,
+) -> list[tuple[str, str | None, frozenset[str] | None]]:
+    """Return ``(template, ccd_source, skip_ccd_atoms)`` jobs to run."""
+    jobs: list[tuple[str, str | None, frozenset[str] | None]] = []
+    allowed: dict[str, tuple[str | None, frozenset[str] | None]] = {}
+
+    if canonical:
+        for name in sorted(k for k in nataa_codes if len(k) == 3):
+            allowed[name] = (None, None)
+
+    if alt_states:
+        for name in sorted(alt_prot_states):
+            allowed[name] = (
+                ALT_CCD_SOURCE[name],
+                ALT_CCD_SKIP.get(name, frozenset()),
+            )
+
+    if not allowed:
+        raise SystemExit("Select --canonical, --alt-states, or --all")
+
+    if resnames:
+        targets = [r.upper() for r in resnames]
+        unknown = set(targets) - set(allowed)
+        if unknown:
+            raise SystemExit(f"Unknown residue names: {sorted(unknown)}")
+    else:
+        targets = sorted(allowed)
+
+    for name in targets:
+        source, skip = allowed[name]
+        jobs.append((name, source, skip))
+    return jobs
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--res",
+        nargs="*",
+        help="Residue names to rebuild (default: all selected groups)",
+    )
+    parser.add_argument(
+        "--canonical",
+        action="store_true",
+        help="Rebuild the 20 canonical amino acids",
+    )
+    parser.add_argument(
+        "--alt-states",
+        action="store_true",
+        help="Rebuild canonical protonation variants (ASH, GLH, HID, ...)",
+    )
+    parser.add_argument(
+        "--all",
+        action="store_true",
+        help="Rebuild canonical amino acids and protonation variants",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Print actions without writing files",
+    )
+    args = parser.parse_args()
+
+    canonical = args.canonical or args.all
+    alt_states = args.alt_states or args.all
+    if not (canonical or alt_states):
+        canonical = True
+
+    with open(chilife.DATA_DIR / "bio_ccd.pkl", "rb") as f:
+        ccd = pickle.load(f)
+
+    for resname, ccd_source, skip_ccd_atoms in _rebuild_targets(
+        args.res, canonical=canonical, alt_states=alt_states
+    ):
+        rebuild_residue(
+            resname,
+            ccd,
+            PDB_DIR,
+            IC_DIR,
+            dry_run=args.dry_run,
+            ccd_source=ccd_source,
+            skip_ccd_atoms=skip_ccd_atoms,
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/chilife/RotamerEnsemble.py b/src/chilife/RotamerEnsemble.py
@@ -253,7 +253,9 @@ def from_mda(cls, residue, **kwargs):
 
 
         if kwargs.get('use_H', False) and res in ralt_prot_states:
-            if res != 'HIS':
+            if res == 'TYR' and 'HH' not in residue.atoms.names:
+                res = ralt_prot_states[res].get(len(residue.atoms), res)
+            elif res != 'HIS':
                 res = ralt_prot_states[res].get(len(residue.atoms), res)
             elif len(residue.atoms) == 18:
                 res = 'HIP'

diff --git a/src/chilife/__init__.py b/src/chilife/__init__.py
@@ -44,4 +44,4 @@
 # SpinLabel = SpinLabel.SpinLabel
 # dSpinLabel = dSpinLabel.dSpinLabel
 
-__version__ = "1.2.3"
+__version__ = "1.2.4"
diff --git a/src/chilife/chilife.py b/src/chilife/chilife.py
@@ -36,7 +36,7 @@
 from .alignment_methods import local_mx
 from .Topology import get_min_topol, guess_bonds
 from .pdb_utils import sort_pdb, get_backbone_atoms, get_bb_candidates
-from .protein_utils import mutate, guess_mobile_dihedrals
+from .protein_utils import mutate, guess_mobile_dihedrals, get_missing_residues, _mutation_ignore_sites
 from .MolSysIC import MolSysIC
 from .scoring import GAS_CONST, reweight_rotamers, ljEnergyFunc
 from .numba_utils import get_delta_r, normdist
@@ -1705,15 +1705,15 @@ def repack(
 
     repack_res_kwargs["eval_clash"] = False
 
-    repack_residue_libraries = [
-        RotamerEnsemble.from_mda(res, **repack_res_kwargs)
-        for res in repack_residues
-        if res.resname not in ["GLY", "ALA"] and res.resname in SUPPORTED_RESIDUES
-    ]
-    repack_residue_libraries += list(ensembles)
-
-    # Create new labeled protein construct to fill in any missing atoms of repack residues
-    protein = mutate(protein, *repack_residue_libraries, **kwargs).atoms
+    fill_kwargs = {**kwargs, "add_missing_atoms": False, "rotamer_index": "closest"}
+    missing = get_missing_residues(
+        protein,
+        ignore=_mutation_ignore_sites(ensembles),
+        use_H=repack_res_kwargs.get("use_H", False),
+        ignore_waters=repack_res_kwargs.get("ignore_waters", True),
+    )
+    if missing:
+        protein = mutate(protein, *missing, **fill_kwargs).atoms
 
     repack_residues = protein.select_atoms(
         f"around {repack_radius} {sites_str}"

diff --git a/src/chilife/data/dihedral_defs.toml b/src/chilife/data/dihedral_defs.toml
@@ -22,6 +22,7 @@ SER = [["N", "CA", "CB", "OG"]]
 THR = [["N", "CA", "CB", "OG1"]]
 TRP = [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]]
 TYR = [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]]
+TYM = [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]]
 VAL = [["N", "CA", "CB", "CG1"]]
 R1C = [["N", "CA", "CB", "SG"], ["CA", "CB", "SG", "SD"], ["CB", "SG", "SD", "CE"], ["SG", "SD", "CE", "C3"], ["SD", "CE", "C3", "C2"]]
 CYR1 = [["N", "CA", "CB", "SG"], ["CA", "CB", "SG", "S1L"], ["CB", "SG", "S1L", "C1L"], ["SG", "S1L", "C1L", "C1R"], ["S1L", "C1L", "C1R", "C1"]]

diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/ala_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/ala_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/arg_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/arg_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/ash_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/ash_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/asn_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/asn_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/asp_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/asp_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/cym_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/cym_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/cys_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/cys_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/glh_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/glh_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/gln_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/gln_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/glu_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/glu_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/gly_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/gly_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/hid_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/hid_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/hie_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/hie_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/hip_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/hip_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/his_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/his_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/ile_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/ile_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/leu_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/leu_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/lyn_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/lyn_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/lys_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/lys_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/met_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/met_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/phe_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/phe_ic.pkl
diff --git a/src/chilife/data/rotamer_libraries/residue_internal_coords/pro_ic.pkl b/src/chilife/data/rotamer_libraries/residue_internal_coords/pro_ic.pkl