This repository was archived by the owner on Mar 11, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsample_tsv_builder.py
More file actions
239 lines (181 loc) · 7.48 KB
/
sample_tsv_builder.py
File metadata and controls
239 lines (181 loc) · 7.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#!/usr/bin/env python3
"""
Tool to build samples.tsv files for NGS Sampes from QBIC.
A samples.tsv file has the following contents:
<patient-id>\t<sex=XX/XY>\t<status=disease/control>\t<sample>\t<lane>[_<flow_cell>]\t<read-pair1>\t<read-pair2>
A Samples.tsv file is build either from a <sample>.metadata file or read from a complete sample
directory.
Currently assumes FastQ files and pair-ended reads.
"""
import os
import argparse
import json
import re
# STATIC
# file ending of json description file
METAINFO_SUFFIX = ".metadata"
class SarekSampleFile:
class Patient:
def __init__(self, id, sex, diseased):
self.id = id
self.sex = sex
self.diseased = "0" if diseased == "no" else "1"
class ReadInfo:
def __init__(self, r1, r2, lane, sample_id):
self.sample_id = sample_id
self.r1 = r1
self.r2 = r2
self.lane = lane
def __init__(self, directory):
self.is_valid = False
# directoy is the absolute path to the sample-directory
self.directory = directory
self.patient = None
self.reads = None
self.metainfo_file = None
self.read_info = []
if os.path.exists(self.directory) and self._contains_ngs_samples(self.directory):
self.is_valid = True
readfiles = list(map(lambda file: os.path.join(self.directory, file),
filter(self._is_readfile, os.listdir(self.directory))))
self.reads = readfiles
mb_metainfo_files = list(filter(lambda file: file.endswith(METAINFO_SUFFIX),
os.listdir(self.directory)))
if len(mb_metainfo_files) == 1 and mb_metainfo_files[0]:
self.metainfo_file = os.path.join(self.directory, mb_metainfo_files[0])
self._collect_pinfo()
self._collect_rinfo()
else:
print("[!] Invalid Path or no Readfiles! ")
def _is_readfile(self, file):
"""
return True if the given file is a readfile
"""
return file.endswith("fasta") or \
file.endswith("fasta.gz") or \
file.endswith(".fastq") or \
file.endswith(".fastq.gz")
def _contains_ngs_samples(self, directory):
"""
Return True, if the given diretory contains readfiles
"""
if os.path.exists(directory):
return any([self._is_readfile(f) for f in os.listdir(directory)])
return False
def _collect_pinfo(self):
"""
Extract Information about Sample and Patient from dict with metainfo
"""
if self.metainfo_file:
with open(self.metainfo_file, "r") as file:
metainfo = json.loads(file.read())
readfiles = list(
map(lambda file: os.path.join(self.directory, file),
filter(self._is_readfile, metainfo["files"])))
self.reads = readfiles
diseased = metainfo["sample1"]["tumor"]
id_str = metainfo["sample1"]["id_genetics"]
if re.search(r"_\d{2}", id_str):
patient_id, _ = id_str.split("_")
else:
reads = list(filter(lambda f: self._is_readfile(f), os.listdir(self.directory)))
patient_id = reads[0].split("_")[0]
# alway set to diseased for now
diseased = "1"
# TODO: Get information about sex
sex = "XX"
self.patient = self.Patient(patient_id, sex, diseased)
def _collect_rinfo(self):
"""
Extract the read information from a list of all fastq files.
"""
if len(self.reads) % 2 != 0:
print("[!] Not all reads could be paired")
r1_filter = lambda r: r.find("R1") != -1
r2_filter = lambda r: r.find("R2") != -1
left_reads = list(filter(r1_filter, self.reads))
right_reads = list(filter(r2_filter, self.reads))
# regex for sample id
sample_id_re = re.compile(self.patient.id + r"_\d{2}_")
# regex for lane information
lane_info_re = re.compile(r"_L\d{3}_")
# TODO: VERIFY!
# flow cell identifier to solve ambiguities with lane numbering
flow_cell_re = re.compile(r"_[A-Z]{3}\d{3}_")
# regex for read-pair information
readnum_re = re.compile(r"_R[12]{1}_")
# anonymous function for stripping the read filename of its "read-code" (R1 or R2)
replace_readnum = lambda read: re.sub(readnum_re, "", read)
while left_reads:
r1 = left_reads.pop()
r1_stripped = replace_readnum(r1)
for i in range(len(right_reads)):
read = right_reads[i]
if r1_stripped == replace_readnum(read):
r2 = right_reads.pop(i)
break
if not r2:
print("[!] No read-pair for {}".format(r1))
exit(1)
# extract lane information
lane_re_match = re.search(lane_info_re, r1)
lane = lane_re_match.group(0).replace("_", "")
# extract flow_cell information, if exists
flow_cell_match = re.search(flow_cell_re, r1)
if flow_cell_match:
flow_cell = flow_cell_match.group(0).replace("_", "")
lane += "_" + flow_cell
# extract sample id
sample_id = self.patient.id
# gather info into ReadInfo object
read_pair = self.ReadInfo(r1, r2, lane, sample_id)
# collect
self.read_info.append(read_pair)
def to_tsv(self, sample_id_filter=""):
"""
Print the object to a tsv string
"""
line_template = "{}\t{}\t{}\t{}\t{}\t{}\t{}\n"
output = ""
for read_pair in self.read_info:
if re.match(sample_id_filter, read_pair.sample_id):
output += line_template.format(
self.patient.id,
self.patient.sex,
self.patient.diseased,
# this makes more sense
read_pair.sample_id,
read_pair.lane,
read_pair.r1,
read_pair.r2)
return(output)
def write_tsv(self, outfile="samples.tsv"):
"""
Write the tsv to a file
"""
if self.is_valid:
with open(outfile, "w") as file:
file.write(self.to_tsv())
else:
print("[!] tsv for invalid sample directory can't be written!")
def __str__(self):
if self.is_valid:
return(self.to_tsv())
else:
return("Invalid Sample directory!")
if __name__ == "__main__":
parser = argparse.ArgumentParser("Sarek Samples Script")
input_group = parser.add_argument_group("Input", "Input files or directory")
input_group_me = input_group.add_mutually_exclusive_group(required=True)
input_group_me.add_argument("-d", "--directory", type=str,
help="Absolute or relative path to the read directory.")
options_group = parser.add_argument_group("Filters", "Filter options for reads")
options_group.add_argument("-s", "--sample_id", type=str,
help="Regex or string to describe the sample IDs that should be included in the .tsv")
args = parser.parse_args()
if args.directory:
sample_tsv = SarekSampleFile(args.directory)
if args.sample_id:
print(sample_tsv.to_tsv(sample_id_filter=args.sample_id))
else:
print(sample_tsv.to_tsv())