-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathadvanced_filtering.py
More file actions
347 lines (271 loc) · 10.7 KB
/
advanced_filtering.py
File metadata and controls
347 lines (271 loc) · 10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
#!/usr/bin/env python3
"""
Advanced filtering and manipulation example for PDBRust Python bindings.
This example demonstrates:
- Method chaining for filtering operations
- Chain normalization and residue reindexing
- Atom renumbering
- Structure centering and translation
- Residue and sequence extraction
- Practical preprocessing workflows
"""
import pdbrust
# Sample PDB files
PDB_FILE = "../../examples/pdb_files/1UBQ.pdb"
MULTI_CHAIN_FILE = "../../examples/pdb_files/4INS.pdb"
def main():
print("=" * 60)
print("PDBRust Advanced Filtering Example")
print("=" * 60)
# Load structure
structure = pdbrust.parse_pdb_file(PDB_FILE)
print(f"Loaded structure: {structure.num_atoms} atoms, {structure.num_chains} chains")
# --- Method Chaining ---
print("\n1. METHOD CHAINING")
print("-" * 40)
# Single operation
ca_only = structure.keep_only_ca()
print(f"Original: {structure.num_atoms} atoms")
print(f"After keep_only_ca(): {ca_only.num_atoms} atoms")
# Chain multiple operations
print("\nChaining multiple operations:")
print(" structure.remove_ligands().keep_only_chain('A').remove_hydrogens()")
# Note: We reload to start fresh
s = pdbrust.parse_pdb_file(PDB_FILE)
cleaned = s.remove_ligands().keep_only_chain("A").remove_hydrogens()
print(f" Result: {cleaned.num_atoms} atoms")
# More complex chain
s = pdbrust.parse_pdb_file(PDB_FILE)
backbone_clean = s.remove_ligands().remove_hydrogens().keep_only_backbone()
print(f"\n remove_ligands().remove_hydrogens().keep_only_backbone()")
print(f" Result: {backbone_clean.num_atoms} atoms")
# --- Available Filter Methods ---
print("\n2. AVAILABLE FILTER METHODS")
print("-" * 40)
print("""
Filtering methods (return new structure):
remove_ligands() - Remove HETATM records (ligands, waters)
remove_hydrogens() - Remove hydrogen atoms
keep_only_chain(id) - Keep only specified chain
keep_only_ca() - Keep only CA (alpha carbon) atoms
keep_only_backbone() - Keep only backbone atoms (N, CA, C, O)
select(query) - PyMOL-style selection language
Comparison of approaches:
""")
s = pdbrust.parse_pdb_file(PDB_FILE)
print(f" Original: {s.num_atoms} atoms")
print(f" keep_only_ca(): {s.keep_only_ca().num_atoms} atoms")
print(f" keep_only_backbone(): {s.keep_only_backbone().num_atoms} atoms")
print(f" remove_ligands(): {s.remove_ligands().num_atoms} atoms")
print(f" remove_hydrogens(): {s.remove_hydrogens().num_atoms} atoms")
# --- In-Place Modifications ---
print("\n3. IN-PLACE MODIFICATIONS")
print("-" * 40)
# Load multi-chain structure
try:
multi = pdbrust.parse_pdb_file(MULTI_CHAIN_FILE)
print(f"Loaded multi-chain structure: {multi.num_atoms} atoms")
print(f"Original chains: {multi.get_chain_ids()}")
except Exception as e:
print(f"Could not load multi-chain file: {e}")
multi = None
# --- Chain Normalization ---
print("\n4. CHAIN NORMALIZATION")
print("-" * 40)
if multi:
# Normalize chain IDs to A, B, C, ...
print(f"Original chains: {multi.get_chain_ids()}")
multi.normalize_chain_ids()
print(f"After normalize_chain_ids(): {multi.get_chain_ids()}")
else:
# Use single-chain structure
s = pdbrust.parse_pdb_file(PDB_FILE)
print(f"Original chains: {s.get_chain_ids()}")
s.normalize_chain_ids()
print(f"After normalize_chain_ids(): {s.get_chain_ids()}")
# --- Residue Reindexing ---
print("\n5. RESIDUE REINDEXING")
print("-" * 40)
s = pdbrust.parse_pdb_file(PDB_FILE)
# Get residues before reindexing
residues_before = s.get_residues()[:5]
print(f"Before reindex_residues():")
for seq, name in residues_before:
print(f" {name} {seq}")
# Reindex residues starting from 1
s.reindex_residues()
residues_after = s.get_residues()[:5]
print(f"\nAfter reindex_residues():")
for seq, name in residues_after:
print(f" {name} {seq}")
# --- Atom Renumbering ---
print("\n6. ATOM RENUMBERING")
print("-" * 40)
s = pdbrust.parse_pdb_file(PDB_FILE)
# Get first few atom serials before
atoms_before = s.atoms[:5]
print(f"Before renumber_atoms():")
for a in atoms_before:
print(f" Atom {a.serial}: {a.name} {a.residue_name}")
# Renumber atoms sequentially from 1
s.renumber_atoms()
atoms_after = s.atoms[:5]
print(f"\nAfter renumber_atoms():")
for a in atoms_after:
print(f" Atom {a.serial}: {a.name} {a.residue_name}")
# --- Structure Centering ---
print("\n7. STRUCTURE CENTERING")
print("-" * 40)
s = pdbrust.parse_pdb_file(PDB_FILE)
# Get centroid before centering
centroid_before = s.get_centroid()
print(f"Centroid before: ({centroid_before[0]:.3f}, {centroid_before[1]:.3f}, {centroid_before[2]:.3f})")
# Center structure at origin
s.center_structure()
centroid_after = s.get_centroid()
print(f"Centroid after center_structure(): ({centroid_after[0]:.6f}, {centroid_after[1]:.6f}, {centroid_after[2]:.6f})")
# Also get CA centroid
ca_centroid = s.get_ca_centroid()
print(f"CA centroid: ({ca_centroid[0]:.6f}, {ca_centroid[1]:.6f}, {ca_centroid[2]:.6f})")
# --- Translation ---
print("\n8. STRUCTURE TRANSLATION")
print("-" * 40)
s = pdbrust.parse_pdb_file(PDB_FILE)
# Get centroid before
before = s.get_centroid()
print(f"Before translation: ({before[0]:.3f}, {before[1]:.3f}, {before[2]:.3f})")
# Translate by (10, 20, 30)
s.translate(10.0, 20.0, 30.0)
after = s.get_centroid()
print(f"After translate(10, 20, 30): ({after[0]:.3f}, {after[1]:.3f}, {after[2]:.3f})")
# Verify the difference
diff = (after[0] - before[0], after[1] - before[1], after[2] - before[2])
print(f"Difference: ({diff[0]:.3f}, {diff[1]:.3f}, {diff[2]:.3f})")
# --- Residue and Sequence Access ---
print("\n9. RESIDUE AND SEQUENCE ACCESS")
print("-" * 40)
s = pdbrust.parse_pdb_file(PDB_FILE)
# Get all residues
all_residues = s.get_residues()
print(f"Total residues: {len(all_residues)}")
print(f"First 5 residues: {all_residues[:5]}")
# Get residues for specific chain
chains = s.get_chain_ids()
if chains:
chain_residues = s.get_residues_for_chain(chains[0])
print(f"\nResidues in chain {chains[0]}: {len(chain_residues)}")
print(f"First 5: {chain_residues[:5]}")
# Get sequence from SEQRES records
if chains:
sequence = s.get_sequence(chains[0])
if sequence:
print(f"\nSequence for chain {chains[0]} (from SEQRES):")
seq_str = " ".join(sequence[:10])
print(f" {seq_str}... ({len(sequence)} residues)")
else:
print(f"\nNo SEQRES records for chain {chains[0]}")
# --- Practical Preprocessing Workflow ---
print("\n10. PRACTICAL PREPROCESSING WORKFLOW")
print("-" * 40)
print("Typical preprocessing for structure analysis:")
# Step 1: Load
s = pdbrust.parse_pdb_file(PDB_FILE)
print(f"\n1. Load: {s.num_atoms} atoms")
# Step 2: Clean
s = s.remove_ligands().remove_hydrogens()
print(f"2. Remove ligands & hydrogens: {s.num_atoms} atoms")
# Step 3: Select chain (if multi-chain)
chains = s.get_chain_ids()
if len(chains) > 1:
s = s.keep_only_chain(chains[0])
print(f"3. Keep chain {chains[0]}: {s.num_atoms} atoms")
else:
print(f"3. Single chain, no filtering needed")
# Step 4: Normalize numbering
s.normalize_chain_ids()
s.reindex_residues()
s.renumber_atoms()
print("4. Normalize chain IDs, reindex residues, renumber atoms")
# Step 5: Center
s.center_structure()
print(f"5. Center at origin")
# Final state
print(f"\nFinal structure:")
print(f" Atoms: {s.num_atoms}")
print(f" Chains: {s.get_chain_ids()}")
print(f" Residues: {len(s.get_residues())}")
print(f" Centroid: {s.get_centroid()}")
# --- Workflow for ML Preprocessing ---
print("\n11. ML PREPROCESSING WORKFLOW")
print("-" * 40)
print("""
Common preprocessing for machine learning:
1. Parse structure
structure = pdbrust.parse_pdb_file(path)
2. Clean and filter
cleaned = structure.remove_ligands().remove_hydrogens()
3. Select protein atoms only
protein = cleaned.select("protein")
4. Center at origin
protein.center_structure()
5. Extract coordinates
coords = protein.get_coords_array() # numpy array (N, 3)
ca_coords = protein.get_ca_coords_array() # CA only
6. Normalize (optional)
protein.normalize_chain_ids()
protein.reindex_residues()
7. Write processed structure
protein.to_file("processed.pdb")
""")
# --- Selection vs Method Chaining ---
print("\n12. SELECTION LANGUAGE VS METHOD CHAINING")
print("-" * 40)
s = pdbrust.parse_pdb_file(PDB_FILE)
# Method chaining
result1 = s.keep_only_chain("A").keep_only_backbone()
print(f"Method chaining: keep_only_chain('A').keep_only_backbone()")
print(f" Result: {result1.num_atoms} atoms")
# Selection language (more flexible)
result2 = s.select("chain A and backbone")
print(f"\nSelection: select('chain A and backbone')")
print(f" Result: {result2.num_atoms} atoms")
# Selection language advantages
print("\nSelection language advantages:")
print(" - More expressive: (chain A or chain B) and backbone")
print(" - Numeric filters: bfactor < 30.0")
print(" - Complex logic: protein and not hydrogen and bfactor < 40.0")
# Method chaining advantages
print("\nMethod chaining advantages:")
print(" - Simpler syntax for common operations")
print(" - In-place modifications available")
print(" - No query string parsing overhead")
# --- Summary ---
print("\n13. SUMMARY")
print("-" * 40)
print("""
Key filtering methods:
remove_ligands() - Remove non-protein atoms
remove_hydrogens() - Remove H atoms
keep_only_chain(id) - Single chain
keep_only_ca() - CA atoms only
keep_only_backbone() - Backbone atoms only
select(query) - PyMOL-style selections
Key manipulation methods:
normalize_chain_ids() - Rename chains to A, B, C...
reindex_residues() - Renumber from 1
renumber_atoms() - Sequential atom numbers
center_structure() - Center at origin
translate(dx, dy, dz) - Move structure
Access methods:
get_chain_ids() - List chain IDs
get_residues() - All residues
get_residues_for_chain() - Chain-specific
get_sequence() - From SEQRES
get_centroid() - Center of mass
get_ca_centroid() - CA center of mass
""")
print("\n" + "=" * 60)
print("Example completed successfully!")
print("=" * 60)
if __name__ == "__main__":
main()