ciffy is a fast CIF file parser for molecular structures, with a C backend and Python interface. It supports both NumPy and PyTorch backends for array operations.
ciffy is roughly 40-55x faster than BioPython and Biotite for parsing CIF files. Even loading via ciffy and converting to a biotite AtomArray is ~10x faster than biotite's own loader:
| Structure | Atoms | ciffy | BioPython | Biotite | ciffy→biotite |
|---|---|---|---|---|---|
| 3SKW | 2,826 | 0.30 ms | 16 ms (53x) | 12 ms (39x) | 1 ms (10x) |
| 9GCM | 4,466 | 0.45 ms | 20 ms (45x) | 17 ms (38x) | 2 ms (10x) |
| 9MDS | 102,216 | 8.91 ms | 494 ms (55x) | 411 ms (46x) | 41 ms (10x) |
Run `python tests/profiling/profile_io.py` to reproduce.
pip install ciffy

git clone https://github.com/hmblair/ciffy.git
cd ciffy
pip install -e .

ciffy supports two array backends:
- NumPy: Lightweight, no additional dependencies required
- PyTorch: For GPU support (CUDA/MPS) and integration with deep learning workflows
Specify the backend when loading structures:
import ciffy
# Load with NumPy backend (default)
polymer = ciffy.load("structure.cif", backend="numpy")
# Load with PyTorch backend
polymer = ciffy.load("structure.cif", backend="torch")

Convert between backends:
torch_polymer = polymer.torch()
numpy_polymer = polymer.numpy()

Move tensors to GPU (PyTorch only):
polymer_gpu = polymer.torch().to("cuda")
polymer_mps = polymer.torch().to("mps")

import ciffy
# Load from CIF file
polymer = ciffy.load("structure.cif")
# Load specific chains
polymer = ciffy.load("structure.cif", chains=["A", "B"])
# Load specific molecule types
polymer = ciffy.load("structure.cif", molecule_types=ciffy.RNA)
# Print summary
print(polymer)

Example output:
Polymer 9GCM [2024-08-02]
─────────────────────────
Type Res Atoms
─────────────────────────
A RNA 135 1413
B PROTEIN 132 1032
C PROTEIN 246 1261
D PROTEIN 485 760
─────────────────────────
Σ 4 998 4466
─────────────────────────
polymer.coordinates # (N, 3) atom positions
polymer.atoms # (N,) atom type indices
polymer.elements # (N,) element indices
polymer.sequence # (R,) residue type indices
polymer.bonds # (B, 2) covalent bond pairs
polymer.molecule_types # (C,) molecule type per chain
polymer.names # Chain names ["A", "B", ...]
polymer.lengths # (C,) residues per chain
polymer.size() # Total atoms
polymer.size(ciffy.RESIDUE) # Total residues
polymer.size(ciffy.CHAIN) # Total chains
polymer.sequence_str() # "acgu..." sequence string

# Select by chain
chain_a = polymer.chain(0)
chains_ab = polymer.chain([0, 1])
# Select by residue
first_residue = polymer.residue(0)
some_residues = polymer.residue([0, 5, 10])
# Select by molecule type
rna_only = polymer.molecule_type(ciffy.RNA)
protein_only = polymer.molecule_type(ciffy.PROTEIN)
# Select by residue type
adenines = polymer.residue_type(ciffy.Residue.A)
# Structural selections
backbone = polymer.backbone() # Backbone atoms only
bases = polymer.nucleobase() # Nucleobase atoms (RNA/DNA)
sidechains = polymer.sidechain() # Sidechain atoms
heavy = polymer.heavy() # Heavy atoms (no hydrogens)
# Remove unresolved residues
resolved = polymer.strip()

# Iterate over all chains
for chain in polymer.chains():
    print(chain.sequence_str())
# Iterate over RNA chains only
for chain in polymer.molecule_type(ciffy.RNA).chains():
    print(chain.pdb_id, chain.sequence_str())

# Counts at different scales
atoms_per_residue = polymer.counts(ciffy.RESIDUE) # (R,)
residues_per_chain = polymer.counts(ciffy.CHAIN) # (C,)
# Membership indices
chain_per_atom = polymer.membership(ciffy.CHAIN) # (N,) chain index per atom
residue_per_atom = polymer.membership(ciffy.RESIDUE) # (N,) residue index per atom
# Reduce atom features to residue level (mean pooling)
residue_coords = polymer.reduce(polymer.coordinates, ciffy.RESIDUE) # (R, 3)
# Expand residue features to atom level
atom_features = polymer.expand(residue_features, ciffy.RESIDUE) # (N, ...)

# Center coordinates
centered, centroids = polymer.center(ciffy.MOLECULE)
centered, centroids = polymer.center(ciffy.CHAIN)
# PCA alignment
aligned, rotations = polymer.pca(ciffy.CHAIN)
# Pairwise distances
distances = polymer.pairwise_distances() # Atom-atom
distances = polymer.pairwise_distances(ciffy.RESIDUE) # Residue centroids
# K-nearest neighbors
neighbors = polymer.knn(k=16)

polymer.write("output.cif")

Create template polymers (no coordinates) from sequence strings:
import ciffy
# RNA (lowercase with u)
rna = ciffy.template("acguacgu")
# DNA (lowercase with t)
dna = ciffy.template("acgtacgt")
# Protein (uppercase)
protein = ciffy.template("MGKLF")
# Multi-chain
multi = ciffy.template(["acgu", "MGKLF"])

Build polymers incrementally using append():
from ciffy import Polymer, Residue
# Build a template (no coordinates)
p = Polymer()
for res in [Residue.A, Residue.C, Residue.G, Residue.U]:
    p = p.append(res)
# Build with coordinates
p = Polymer()
p = p.append(Residue.A, coords) # First residue with absolute coords

For autoregressive generation with relative positioning:
from ciffy import Polymer, Residue
from ciffy.geometry import LocalCoordinates
p = Polymer()
p = p.append(Residue.A, first_coords) # Absolute coordinates
# Subsequent residues use LocalCoordinates(coords, transform)
# where transform is an SE(3) transform [axis-angle (3), translation (3)]
p = p.append(Residue.C, LocalCoordinates(coords, transform))
p = p.append(Residue.G, LocalCoordinates(coords, transform))

import ciffy
# RMSD (Kabsch-aligned)
rmsd = ciffy.rmsd(polymer1, polymer2)
rmsd = ciffy.rmsd(polymer1, polymer2, scale=ciffy.CHAIN) # Per-chain
# Also works on raw coordinates
rmsd = ciffy.rmsd(coords1, coords2)
# TM-score
tm = ciffy.tm_score(pred, ref)
# lDDT
lddt = ciffy.lddt(pred, ref)
# Radius of gyration
rg = ciffy.rg(polymer)
# Clash detection
clashes = ciffy.clashes(polymer)

# View structure summary
ciffy info structure.cif
# Show sequences
ciffy info structure.cif --sequence
# Show entity descriptions
ciffy info structure.cif --desc
# Multiple files
ciffy info *.cif

pytest tests/

See CONTRIBUTING.md for development setup and guidelines.