# MDDB_Tutorials/template_inputs.yaml (PDNALab/MDDB_Tutorials)
---

# This file defines a set of fields which are then read and used by the workflow.
# Every field is explained and example values are provided.


#------------------------------------------------------------------------------------------------------------
# PROJECT METADATA
#------------------------------------------------------------------------------------------------------------
# The following metadata has no effect on the workflow itself, but it will be written to the output metadata file.
# These values will be uploaded to the database and then exposed in the project overview.
# They may be also useful to search this project using the browser.

# Write a brief description or title for this trajectory for the overview page.
# This name may be used by the client to search the trajectory in the database.
# The name is displayed in the overview page.

name: My project

# Write additional comments about the project.
# The description is displayed in the overview page.
#
# Example:
#
# description: My project description

description:

# Write author names.
# Authors are displayed in the overview page.
# IMPORTANT: Check already existing values for this field to avoid making duplicates.
#
# Example:
#
# authors:
#   - My name
#   - My partner's name
#

authors:

# Write author group name/s.
# The group is displayed in the overview page.
# IMPORTANT: Check already existing values for this field to avoid making duplicates.
#
# Example:
#
# groups:
#   - My group
#   - My partner's group
#

groups:

# How to contact the authors (e.g. a mail).
# The contact is displayed in the overview page.

contact:

# Name of the program (software) that produced the trajectory, and its version.
# Program and version are both displayed in the overview page.
# IMPORTANT: Check already existing values for this field to avoid making duplicates.
#
# Example:
#
# program: GROMACS


program:
# Example:
#
# version: 2024.2
version:

# Type of molecular dynamics.
# At this moment there are only two options in this field: `trajectory` and `ensemble`.
# NOTE: This field has an effect on the client: some time-dependent analysis will change the labels of their axes in order to make sense. For instance RMSD X axis will be labeled as `frames` instead of `time`.
#
# Example:
#
# type: trajectory

type:

# MD method is displayed in the overview.
# e.g. Classical MD, Targeted MD, Biased MD (Accelerated Weighted Ensemble), Enhanced sampling (Hamiltonian Replica Exchange) ...
#
# Example:
#
# method: Classical MD

method:

# License and link to the license web page.
# The license is displayed in the overview page.
# Under the license there is a `More information` button.
# The link is used to redirect the user when this button is clicked.
#
# Example:
#
# license: This trajectory dataset is released under a Creative Commons Attribution 4.0 International Public License


license:

# Example:
#
# linkcense: https://creativecommons.org/licenses/by/4.0/

linkcense:

# Citation for referring to this simulation or a related paper.
# The citation is displayed in the overview page.
# To set a citation use the following instructions:
# to add a line break type '(br)' inside the citation string,
# to add superscript text type '^' before each character.

citation:

# Acknowledgements to be shown in the overview page.
#
# Example:
# thanks: I would like to thank my funders...

thanks:

# OPTIONAL: Use this field to force a custom accession.
# Accession is a short code to refer to this project in your local database.
# If no accession is forced then a default formatted accession will be generated when loading the project.

accession:

#------------------------------------------------------------------------------------------------------------
# REFERENCES
#------------------------------------------------------------------------------------------------------------
# References to other databases to enrich our data.

# Links to somewhere related to the simulation.
# These links are displayed in the overview page.
# MolSSI uses this field to find simulations in our database and place the embed viewer in their website.
# You must fit to the standard when adding a new MolSSI simulation.
# NOTE: This field has no effect in our workflow BUT others may rely on it.
#
# Example:
# links:
#   - name: First data source
#     url: https://data.source.org/
#   - name: Second data source
#     url: https://mydata.com/

links:

# Set the source pdb ids of the trajectory structure
# Additional data from the pdb is harvested by the loader while uploading to the database
# This data is displayed in the overview page
#
# Example:
# pdb_ids:
#   - 6ACS
#   - 6M0J

pdb_ids:

# Set which reference sequences must be used in order to map residues in the structure of the simulation.
# UniProt accession ids are accepted.
# Forced references may be omitted, or they may cover the structure only partially.
# Then a blast will be run for each orphan chain sequence.
# In addition, UniProt accession ids may be guessed from the PDB ids, when provided.
#
# Forced references may be provided as a list.
# In this scenario UniProt sequences are aligned to chain sequences to guess which UniProt belongs to each chain.
# Forced references may be provided as a dictionary.
# Then the user specifies which reference belongs to each chain.
# Use the "noref" flag to mark a chain as not referable (e.g. antibodies, synthetic constructs).
#
# Example:
#
# forced_references:
#   - Q9BYF1
#   - P0DTC2
# forced_references:
#   A: Q9BYF1
#   B: P0DTC2
#   C: noref

forced_references:

# Set ligands in the simulation.
# The workflow identifies ligands by their pubchem accession.
# If a pubchem accession is passed then it is used.
# Otherwise, the pubchem accession is found using other database accessions.
# Each ligand must have at least one of the following attributes:
#   - pubchem: the PubChem accession
#   - drugbank: the DrugBank accession
#   - chembl: the ChEMBL accession
#
# Optionally, a list of vmd selections may be provided to force the mapping
#   - vmd_selection: a list of vmd selections (chain D)
#
# Ligands are mapped in the standard topology file.
# In addition, an RMSD analysis is run for every defined ligand.
#
#
# Example:
#
# ligands:
#   - pubchem: 1986
#   - drugbank: DB00945

ligands:

#------------------------------------------------------------------------------------------------------------
# SIMULATION METADATA
#------------------------------------------------------------------------------------------------------------
# Simulation parameters.
# DANI: Someday this will be automatically mined.

# Time framestep in nanoseconds (ns).
# Time between each snapshot/frame.
# May be None if this is not a trajectory, but an ensemble.
# Framestep is an important value since it is used in many graph axes in the web client.
#
# Example:
#
# framestep: 0.01 # ns

framestep:

# The remaining values are displayed in the web client as trajectory metadata.
# These values do not affect other outcomes in the workflow.
# Simulation timestep in femtoseconds (fs).
# Time step used for the numerical integration of the simulation.
#
# Example:
#
# timestep: 2 # fs

timestep:

# Temperature in Kelvin (K).
#
# Example:
#
# temp: 310 # K

temp:

# Ensemble
# e.g. NVT, NPT, etc.
# IMPORTANT: Check already existing values for this field to avoid making duplicates.
#
# Example:
#
# ensemble: NPT

ensemble:

# Force fields.
# IMPORTANT: Check already existing values for this field to avoid making duplicates.
#
# Example:
#
# ff:
#   - Amber ff14SB
#   - GLYCAM-06j

ff:

# Water force fields.
# IMPORTANT: Check already existing values for this field to avoid making duplicates.
#
# Example:
#
# wat: TIP3P

wat:

# Boxtype
# e.g. Triclinic, Cubic, Dodecahedron.
# IMPORTANT: Check already existing values for this field to avoid making duplicates.

boxtype:

#------------------------------------------------------------------------------------------------------------
# ANALYSIS PARAMETERS
#------------------------------------------------------------------------------------------------------------
# These fields have an impact in the analysis workflow.


# Set which are the interesting interactions to be analyzed.
# A bunch of interaction-specific analyses will be run for each interaction and displayed in the web client.
#
# Interactions are defined by the 'agents' which are meant to interact pairwise.
# An 'agent' may be anything, even a group of unrelated molecules.
# Atoms of different agents which are close enough will be considered as interface atoms.
# These atoms will be the ones considered in interface analyses.
# If no interface atoms are found then the interaction is considered not valid and the user is warned.
#
# Interactions are uploaded to the database as part of the project metadata and as an independent analysis.
# Project metadata includes the interaction name, agents name and agent atom selections (`VMD syntax <https://www.ks.uiuc.edu/Research/vmd/vmd-1.3/ug/node132.html>`_).
# Analysis data includes also every agent atom indices (both the whole agent and the interface only).
#
# Each interaction has the following attributes:
#    - name: a string tag used to relate interaction analyses data with their corresponding atoms. In addition, the name is used to label the corresponding analyses in the web client.
#    - agent_1: the name of the first agent in the interaction, which is used to label in the client.
#    - selection_1: the VMD selection of the first agent in the interaction.
#    - agent_2: the name of the second agent in the interaction, which is used to label in the client.
#    - selection_2: the VMD selection of the second agent in the interaction.
#    - distance_cutoff (optional): the distance used to determine which atoms are in the interface (in Å).
#
# The default value is intended for atomistic simulations.
# Thus coarse grain interactions may need manual input distance cutoff.
#
# Example:
#
# interactions:
#   - name: protein-ligand interaction
#     agent_1: protein
#     selection_1: not resname lig
#     agent_2: ligand
#     selection_2: resname lig
#   - name: domain-domain interaction
#     agent_1: domain 1
#     selection_1: resid 2 to 291
#     agent_2: domain 2
#     selection_2: resid 2 to 291
#   - name: dna-dna hybridization
#     agent_1: strain A
#     selection_1: chain A
#     agent_2: strain B
#     selection_2: chain B
#     distance_cutoff: 10

interactions:

# Set those residues which are under periodic boundary conditions (PBC).
# These residues are excluded from the imaging centering and fitting.
# These residues are excluded in the following analyses:
#    - RMSD: Sudden jumps in PBC residues result in non-sense high peaks
#    - RMSD per residue: Sudden jumps in PBC residues result in non-sense high peaks
#    - RMSD pairwise: Sudden jumps in PBC residues result in non-sense high peaks
#    - TM score: Sudden jumps in PBC residues result in non-sense high peaks
#    - RGYR: Sudden jumps in PBC residues result in non-sense high changes
#    - RMSF: Sudden jumps in PBC residues result in non-sense high peaks
#    - PCA: Sudden jumps make no sense in PCA and they eclipse non-PBC movements
#    - SASA: Residues close to the boundary will be considered exposed to solvent while they may be not
#    - Pockets: Residues close to the boundary may be considered to have pockets while they have not.
# DANI: This isn't really possible because fpocket doesn't allow you to intelligently "discard" atoms. If you remove atoms so it doesn't find pockets in them, pockets can appear in the sites occupied by those atoms. For now, we discard the entire analysis when there's something in PBC and that's it.
#    - Clusters: Since Clustering is RMSD-based it has the same limitations
#
# If this field is set to 'auto' then PBC residues are set automatically.
# Solvent, counter ions and membrane lipids are selected in this case.
# Note that these are the most typical residues under periodic boundary conditions.
#
# This field is also useful for those scenarios with several protein or nucleic acid molecules floating around.
# In this situation you cannot image and fit all molecules.
# You must focus on one molecule and let the others stay in periodic boundary conditions.
#
# These residues are defined using `VMD selection language <https://www.ks.uiuc.edu/Research/vmd/vmd-1.3/ug/node132.html>`_.
#
# Example:
#
# pbc_selection: water or ions

pbc_selection: auto

# Set those atoms which are not actual "atoms" but coarse grained (CG) beads.
# IMPORTANT: EXPERIMENTAL INPUT

cg_selection:

#------------------------------------------------------------------------------------------------------------
# REPRESENTATION PARAMETERS
#------------------------------------------------------------------------------------------------------------
# These fields have an impact in the display of the simulation once in the web client


# Set optional custom chain names which may be longer than a single letter.
# These names are used to label chains in the web client.
#
# Example:
#
# chainnames:
#   A: Protein
#   B: Ligand

chainnames:

# The web client sets some default representations (molecular viewer configurations).
# They highlight important features in the structure according to the topology reference or interactions.
# In addition, you may set extra customized representations which are interesting for you.
# These representations will be available in the web client.
# IMPORTANT: Make sure whatever you want to represent is not already represented by default or it will be duplicated.
#
# Example:
#
# customs:
#   - name: A custom view focusing on an interesting residue
#     representations:
#       - name: The very interesting residue
#         selection: VIR
#         type: ball+stick
#         color: element
#       - name: The resting boring molecule
#         selection: not VIR
#         type: cartoon
#         color: chainid

customs:

# Set a specific starting orientation for the web client viewer.
# Normally this is done once the simulation has been uploaded since there is no easy way to get the orientation before.
#
# Example:
#
# [
#     72.05997406618104,
#     21.871748915422142,
#     47.89720038949639,
#     0,
#     34.3234627961572,
#     42.053333152877315,
#     -70.84188126104011,
#     0,
#     -39.93012781662099,
#     75.61943426331311,
#     25.542927052994127,
#     0,
#     -63.015499114990234,
#     -33.07249975204468,
#     -39.439000606536865,
#     1
# ]

orientation:

#------------------------------------------------------------------------------------------------------------
# OTHERS
#------------------------------------------------------------------------------------------------------------
# Other metadata

# Set if we have any multimeric form:
# monomer, dimer, trimer...
# This field was requested by the referees.
# Its only use for now is as a parameter in project queries.
# DANI: This is temporary, it would be best to automate it.
#
# Example:
#
# multimeric:
#   - monomer
#   - trimer

multimeric:

#------------------------------------------------------------------------------------------------------------
# COLLECTIONS
#------------------------------------------------------------------------------------------------------------
# Set also additional collection related metadata values.

# Set which collection(s) this simulation belongs to.
# Currently supported collections:
#   - cv19
#   - mcns
#   - abc
#   - bigna
#   - model

collections:

# BioExcel-CV19 specific metadata fields.
# Set which family this trajectory belongs to.
# Supported units:
# - RBD-ACE2
# - RBD
# - ACE2
# - Spike
# - 3CLpro
# - PLpro
# - Polymerase
# - E protein
# - Exoribonuclease
# - Other

cv19_unit:

# Set some additional inputs requested by the referees.
#
# - Starting conformation of the spike (options: close, 1 open, 2 open, 3 open)
# - Are there antibodies? (e.g. true)
# - Are there nanobodies? (e.g. false)

cv19_startconf:
cv19_abs:
cv19_nanobs:

#------------------------------------------------------------------------------------------------------------
# INPUT FILES
#------------------------------------------------------------------------------------------------------------
# Directories for every MD in the project.
# Input file paths for topology, trajectory and structure.
# Note that all these values may be specified through command line as well.

# Each project may contain several Molecular Dynamics (MD).
# Each MD is to be stored in an independent folder when running the workflow.
# Each MD must have a different name, which will be used also to assign directories in the workflow.
#
# MDs may include additional metadata to overwrite the project metadata for a specific case.
#
# Example:
#
# mds:
#   - name: replica 310 K
#     mdir: replica_1
#     temp: 310
#   - name: replica 311 K
#     mdir: replica_2
#     temp: 311

mds:

# The reference MD must also be defined by providing its index in the MDs list.
# If no MD is more important than the others, simply set the first MD (0) as the reference.
#
# Example:
#
# mdref: 0

mdref:

# Input topology, trajectory and structure files.
#
# LORE: These files have been passed to the workflow through command line traditionally. Now the inputs file also provides this option.

input_topology_filepath:
input_structure_filepath:
input_trajectory_filepaths: