Sorin-Lab-Scripting-Repository/data_analysis_v3.py at master · xaviermartinez2012/Sorin-Lab-Scripting-Repository · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
#!/usr/bin/env python


from __future__ import print_function
import Queue
import threading
import mdtraj as md
import sys
import os
import argparse
import logging
import subprocess
from time import clock


logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s')


# Function to print to stderr
def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


# Function written by Dennis to parse the output of the rms, gyrate, and hbond Gromacs commands
def process_file(data_file):
    # <TIME, METRIC> hashtable in ps v. nm
    metric = {}

    with open(data_file, 'r') as data:
        for line in iter(data.readline, ''):
            # Skip Header Information
            if line[0] is '@' or line[0] is '#':
                continue
            # Grab significant data from line
            raw_time, raw_metric = line.split()[:2]
            time = int(float(raw_time))
            radius = 10 * float(raw_metric)
            metric[time] = radius

    return metric


def create_queue(working_directory):
    work_queue = Queue.Queue()
    listdir = os.listdir
    for project_number in listdir(working_directory):
        for run_number in listdir('%s/%s' % (working_directory, project_number)):
            for clone_number in listdir('%s/%s/%s' % (working_directory, project_number, run_number)):
                for xtc in os.listdir(
                                '%s/%s/%s/%s' % (working_directory, project_number, run_number, clone_number)):
                    # We assert that the file we are working with is in fact an xtc by checking its extension.
                    assert '.xtc' in xtc, "%s is not an xtc." % xtc
                    # We rebuild the directory structure of the xtc file
                    # to be able to run this program from any directory.
                    xtc_file = '%s/%s/%s/%s/%s' % (working_directory, project_number, run_number, clone_number, xtc)
                    work_queue.put(xtc_file)
    return work_queue


class DataProcessor (threading.Thread):
    def __init__(self, threadID, name, queue, lock, logfile):
        threading.Thread.__init__(self)
        self.tID = threadID
        self.n = name
        self.q = queue
        self.l = lock
        self.log = logfile

    def run(self):
        process_data(self.q, self.l, self.log)


def process_data(queue, lock, logfile):
    while not exitFlag:
        data = queue.get()
        basename = (data.split("/")[-1])[:-4]
        basename_split = basename.split("_")
        project_number = basename_split[0][1:]
        run_number = basename_split[1][1:]
        clone_number = basename_split[2][1:]
        rms_file = basename + '.rms.xvg'
        gyrate_file = basename + '.gyrate.xvg'
        hbond_file = basename + '.hbond.xvg'
        # Call the Gromacs rms, gyrate, hbond, and trjconv commands.
        # Exit the program if any one of the commands fails.
        try:
            check_call(
                rms_cmd.format(TPR, data, NDX, rms_file),
                shell=True,
                stderr=subprocess.STDOUT
            )
            check_call(
                gyrate_cmd.format(TPR, data, NDX, gyrate_file),
                shell=True,
                stderr=subprocess.STDOUT
            )
            check_call(
                hbond_cmd.format(TPR, data, NDX, hbond_file),
                shell=True,
                stderr=subprocess.STDOUT
            )
        except subprocess.CalledProcessError:
            logging.fatal('GROMACS ERROR')
            sys.exit()
        # Parse the output of the rms, gyrate, and hbond commands.
        # 'Unlink' (remove) these files when done.
        # If not done, we could double or triple the size of the dataset.
        rms = process(rms_file)
        gyrate = process(gyrate_file)
        hbond = process(hbond_file)
        unlink(rms_file)
        unlink(gyrate_file)
        unlink(hbond_file)
        traj = load(data, top=PDB)
        hydration_values = compute_neighbors(traj, ANGSTROM_CUTOFF, rna_atoms, haystack_indices=ow_atoms)
        ion_density_values = compute_neighbors(traj, ANGSTROM_CUTOFF, rna_atoms, haystack_indices=sodium_atoms)

        with lock:

            try:
                check_call(
                trjconv_cmd.format(data, TPR, NDX),
                shell=True,
                stderr=subprocess.STDOUT
                )
            except subprocess.CalledProcessError:
                logging.fatal('ERROR: GROMACS not Loaded')
                sys.exit()

            # Using the pdb's generated by trjconv we generate and parse a X3DNA string by (unique) frame.
            # If the frame is not unique (Gromacs randomnly generates doubles of frames...) it is skipped
            # Lastly we write into the datafile and unlink all temp files.
            frames = []
            append = frames.append
            for frame_index in range(traj.n_frames):
                frame = int(traj.time[frame_index])
                # Check statement for repeated frames
                if frame in frames:
                    unlink('frame_{}.pdb'.format(frame_index))
                    pass
                else:
                    append(frame)
                    # X3DNA calcualtion and parsing (using Perl script)
                    try:
                        check_call(
                            X3DNA_cmd.format(frame_index, frame_index),
                            shell=True,
                            stderr=subprocess.STDOUT
                        )
                        check_call(
                            fix_X3DNA_output_cmd.format(frame_index, NBP),
                            shell=True,
                            stderr=subprocess.STDOUT
                        )
                    except subprocess.CalledProcessError:
                        logging.fatal('X3DNA ERROR')
                        sys.exit()
                    # Obtain the hydration number by counting the number of atom indices (waters) are
                    # neighbors to the rna: d(rna, water) < angstrom_cutoff
                    hydration_number = len(hydration_values[frame_index])
                    ion_density = len(ion_density_values[frame_index])
                    # Write data to file
                    with open('3dnaout.txt', mode='r') as x3dna:
                        for line in x3dna:
                            logfile.write(
                                '{:5} {:3} {:4} {:>6} {:0<7.6} {:0<7.6} {:<4} {:<4} {:<}'.format(project_number,
                                                                                                 run_number,
                                                                                                 clone_number,
                                                                                                 frame,
                                                                                                 rms[frame],
                                                                                                 gyrate[frame],
                                                                                                 hydration_number,
                                                                                                 ion_density,
                                                                                                 line))
                    # Unlink all temp files
                    unlink('frame_{}.pdb'.format(frame_index))
                    # In the case that X3DNA does not find any pairs, the files in the try/catch are not
                    # generated. If not handled they cause the program to break.
                    try:
                        unlink('allpairs.ana')
                        unlink('allpairs.pdb')
                        unlink('ref_frames.dat')
                        unlink('mref_frames.dat')
                        unlink('mulbp.inp')
                        unlink('multiplets.pdb')
                    except OSError:
                        pass
                    unlink('3dnaout.txt')
                    unlink('pdbout_{}'.format(frame_index))

        queue.task_done()

# Function to check the existence of a file.
# Used in conjunction with argparse to check that the given parameter files exist.
def valid_file(path):
    value = str(path)
    if not os.path.isfile(value):
        raise argparse.ArgumentTypeError(
            '\"%s\" does not exist (must be in the same directory or specify full path).' % value)
    return value


# Function to check the existence of a directory.
# Used in conjunction with argparse to check that the given parameter dataset exist.
def valid_dir(path):
    value = str(path)
    if not os.path.exists(path):
        raise argparse.ArgumentTypeError(
            '\"%s\" does not exist (must be in the same directory or specify full path).' % value)
    return value


# Function to check that the name of logfile does not conflict with the name of a temp file in use by the program.
def valid_outfile(name):
    value = str(name)
    if name == '3dnaout.txt':
        raise argparse.ArgumentTypeError(
            '\"{}\" is used within the program. Please choose another name for the outfile.'.format(name))
    return value


# Initialization of the argument parser.
parser = argparse.ArgumentParser(
    description='Generate a log containing rms, gyrate, hbond, hydation number, and X3DNA string for every frame off '
                'every .xtc file in a FAH dataset. Tested using Gromacs 3.3, X3DNA 2.2, and MDTraj 1.7.2. '
                'Relies on the Fix_3DNA_output.pl script (located on the wiki).',
    epilog="Designed by Xavier Martinez on July 20th, 2016")
parser.add_argument('AngstromCutoff', type=float,
                    help='Cutoff value used to decide if water atom will be counted into the Hydration Number.')
parser.add_argument('pdb', type=valid_file, help='The .pdb file')
parser.add_argument('tpr', type=valid_file, help='The .tpr file.')
parser.add_argument('nbp', type=valid_file, help='The native base pairs file.')
parser.add_argument('ndx', type=valid_file, help='The .ndx file. (Protein before system)')
parser.add_argument('wd', type=valid_dir, help='The working directory (must use FAH directory structure).')
parser.add_argument('o', type=str, help='The name of the logfile.')
parser.add_argument('t', type=int, default=2, help='The number of threads to run. Default is 2.')
args = parser.parse_args()

# Initialization of the variables that correspond to the arguments passed by the user.
ANGSTROM_CUTOFF = args.AngstromCutoff / 10.00
PDB = args.pdb
NDX = args.ndx
NBP = args.nbp
TPR = args.tpr
DATASET_DIRECTORY = args.wd
OUT_FILE = args.o
NUM_THREADS = args.t

# Assert that the 'Fix_3DNA_output.pl' Pearl script exists within the current working directory.
# This script is needed to parse output of the X3DNA program.
assert os.path.exists(
    'Fix_3DNA_output.pl'), 'Fix_3DNA_output.pl script not found in the directory and needed to continue...'

check_call = subprocess.check_call
process = process_file
unlink = os.unlink
load = md.load
compute_neighbors = md.compute_neighbors

# Load just the topology once to select the atom indices of the rna(protein) and water.
# This is used to calculate the wHydration number of a particular frame.
# We do this once to save processing time since the topology does not change from frame to frame.
topology = md.load(PDB).topology
rna_atoms = [atom.index for atom in topology.atoms if
             ('Na+' not in atom.residue.name and "HOH" not in atom.residue.name)]
sodium_atoms = [atom.index for atom in topology.atoms if 'Na+' in atom.residue.name]
ow_atoms = topology.select('name O and water')

# Initialize the commands that will be used throughout the program, saving processing time.
# Note: the trjconv command generates pdb's for every frame in a .xtc (important when using X3DNA)
rms_cmd = 'echo 0 0 | g_rms -noxvgr -s {} -f {} -n {} -o {}'
gyrate_cmd = 'echo 0 | g_gyrate -noxvgr -s {} -f {} -n {} -o {}'
hbond_cmd = 'echo 0 0 | g_hbond -noxvgr -s {} -f {} -n {} -num {}'
trjconv_cmd = 'echo 0 | trjconv -f {} -s {} -sep -o frame.pdb -n {}'
X3DNA_cmd = 'find_pair -z -p frame_{}.pdb pdbout_{}'
fix_X3DNA_output_cmd = './Fix_3DNA_output.pl pdbout_{} {} > 3dnaout.txt'

# Create a queue of xtcs for the threads to operate on.
xtc_queue = create_queue(DATASET_DIRECTORY)
threading_lock = threading.Lock()
threads = []
exitFlag = 0

with open(OUT_FILE, mode='w') as datafile:
    for t in range(NUM_THREADS):
        thread = DataProcessor((t + 1), "T{}".format(t), xtc_queue, threading_lock, datafile)
        thread.setDaemon(True)
        thread.start()
        threads.append(thread)
    xtc_queue.join()
    exitFlag += 1
    for t in threads:
        t.join()