forked from lobrien20/CowPi_workflow_utility
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconversion_of_read_names.py
More file actions
executable file
·135 lines (90 loc) · 3.92 KB
/
conversion_of_read_names.py
File metadata and controls
executable file
·135 lines (90 loc) · 3.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python
import shutil, os
from os import path
import sys
def main():
    """Rename reads for one dataset or for every dataset under a parent directory.

    Reads its arguments from ``sys.argv``:

    * ``sys.argv[1]`` -- input directory (a single dataset, or a parent
      directory of datasets when the mode is ``"multiple"``).
    * ``sys.argv[2]`` -- output directory to create.
    * ``sys.argv[3]`` -- ``"single"`` or ``"multiple"``.

    In both modes all file names are checked with
    ``check_for_bad_fastq_names`` before anything is written, so an aborted
    run leaves no partial output behind. A missing or unrecognised mode
    prints a hint and returns without doing any work.

    Raises:
        OSError: if an output directory already exists or cannot be created.
        SystemExit: propagated from ``check_for_bad_fastq_names``.
    """
    # A missing mode argument is treated like an unrecognised one so the user
    # gets the hint below instead of an IndexError traceback.
    mode = sys.argv[3] if len(sys.argv) > 3 else None

    if mode == "multiple":
        mega_directory = sys.argv[1]
        fixed_mega_directory = sys.argv[2]
        fastq_directories_dict = get_directories(mega_directory)
        # Validate every dataset first: a bad name in a later dataset must not
        # leave a half-converted output tree on disk.
        for fastq_paths in fastq_directories_dict.values():
            check_for_bad_fastq_names(fastq_paths)
        for directory, fastq_paths in fastq_directories_dict.items():
            new_directory = "%s/%s" % (fixed_mega_directory, directory)
            # makedirs also creates fixed_mega_directory itself when missing;
            # it still fails if new_directory already exists.
            os.makedirs(new_directory)
            initial_directory = "%s/%s" % (mega_directory, directory)
            for fastq in fastq_paths:
                run_sequence_file_fix(initial_directory, new_directory, fastq)
    elif mode == "single":
        initial_directory = sys.argv[1]
        new_directory = sys.argv[2]
        fastq_paths = get_fastq_paths(initial_directory)
        # Check names before creating the output directory; otherwise an abort
        # leaves an empty directory that makes the re-run fail on creation.
        check_for_bad_fastq_names(fastq_paths)
        os.makedirs(new_directory)
        for fastq in fastq_paths:
            run_sequence_file_fix(initial_directory, new_directory, fastq)
    else:
        print("please choose whether single or multiple datasets.")
        print("usage: conversion_of_read_names.py <input_dir> <output_dir> <single|multiple>")
def run_sequence_file_fix(initial_fastq_directory, new_fastq_directory, fastq):
    """Rewrite one sequence file into the output directory with renamed reads.

    Args:
        initial_fastq_directory: Directory containing the input file.
        new_fastq_directory: Directory the rewritten file is written to.
        fastq: File name, used unchanged for both input and output.
    """
    source_path = "{}/{}".format(initial_fastq_directory, fastq)
    target_path = "{}/{}".format(new_fastq_directory, fastq)
    # Load the records, relabel them from the source file name, then write.
    renamed_records = generate_new_sequence_names(
        get_all_sequences(source_path), source_path
    )
    fix_sequence_file(renamed_records, target_path)
def get_all_sequences(sequence_file_path):
    """Read a sequence file and group its lines into consecutive 4-line records.

    Args:
        sequence_file_path: Path of the text file to read.

    Returns:
        A list of 4-element lists of raw lines (newlines kept), in file
        order. Lines left over after the last complete group of four are
        dropped.
    """
    with open(sequence_file_path, "r") as handle:
        lines = handle.readlines()
    record_size = 4
    # Only start indices that can still supply a full record are visited.
    stop = len(lines) - record_size + 1
    return [
        lines[start:start + record_size]
        for start in range(0, stop, record_size)
    ]
def generate_new_sequence_names(sequence_line_list, sequence_file_path):
    """Relabel every record with a name derived from the file name.

    The name is the last ``/``-separated component of ``sequence_file_path``
    with its extension removed and every ``-`` replaced by ``_``. All records
    receive the same name.

    Args:
        sequence_line_list: List of 4-line records (lists of strings). It is
            modified in place.
        sequence_file_path: Path of the file the records came from.

    Returns:
        The same ``sequence_line_list`` object, with line 0 of each record
        set to ``"@<name>\\n"`` and line 2 set to ``"+\\n"``.
    """
    file_name = sequence_file_path.split("/")[-1]
    # splitext drops whatever extension is present. A fixed 6-character slice
    # is only right for ".fastq" and would eat into the name of ".fq" files.
    sample_name = os.path.splitext(file_name)[0].replace("-", "_")
    header_line = "@%s\n" % sample_name
    for sequence_chunk in sequence_line_list:
        sequence_chunk[0] = header_line
        sequence_chunk[2] = "+\n"
    return sequence_line_list
def fix_sequence_file(new_sequence_line_list, sequence_file_path):
    """Write all records to ``sequence_file_path``, replacing any existing file.

    Args:
        new_sequence_line_list: Iterable of records, each an iterable of
            lines. Lines are written as-is, so they must already carry their
            newline characters.
        sequence_file_path: Destination path, opened in write mode.
    """
    with open(sequence_file_path, "w") as out_handle:
        for record in new_sequence_line_list:
            out_handle.writelines(record)
def get_fastq_paths(study_directory):
    """List the entries of a directory whose names end in ``.fastq`` or ``.fq``.

    Args:
        study_directory: Directory to scan (not recursive).

    Returns:
        A list of matching entry names (not full paths), in the order
        ``os.listdir`` reports them.
    """
    wanted_suffixes = (".fastq", ".fq")
    return [
        entry
        for entry in os.listdir(study_directory)
        if entry.endswith(wanted_suffixes)
    ]
def check_for_bad_fastq_names(fastq_paths):
    """Abort the program if any file name contains more than one ``.``.

    Only the last ``/``-separated component of each path is inspected, so
    dots in directory names are ignored.

    Args:
        fastq_paths: Iterable of file names or paths to check.

    Returns:
        None when every name is acceptable.

    Raises:
        SystemExit: with status 1, after printing the offending paths, when
            at least one name has more than one ``.``.
    """
    bad_fastq_names = [
        fastq_path
        for fastq_path in fastq_paths
        if fastq_path.split("/")[-1].count(".") > 1
    ]
    if not bad_fastq_names:
        return
    print("The following files have '.' in sample name. Will cause issues with vsearch. Pls fix.")
    for bad in bad_fastq_names:
        print(bad)
    print("Exiting to allow for fixing")
    # sys.exit rather than the site-provided exit(), which is meant for the
    # interactive shell. The non-zero status lets a calling workflow see the abort.
    sys.exit(1)
def get_directories(mega_directory):
    """Map each sub-directory that holds sequence files to those files' names.

    Args:
        mega_directory: Parent directory whose immediate children are scanned.

    Returns:
        A dict of sub-directory name to the non-empty list returned by
        ``get_fastq_paths`` for it. Sub-directories without matching files
        and non-directory entries are left out. A message is printed when the
        dict ends up empty.
    """
    datasets = {}
    for entry in os.listdir(mega_directory):
        entry_path = "{}/{}".format(mega_directory, entry)
        if not os.path.isdir(entry_path):
            continue
        found = get_fastq_paths(entry_path)
        if found:
            datasets[entry] = found
    if not datasets:
        print("unable to find any directories with .fastq extension")
    return datasets
# Run only when executed as a script, so importing this module (for reuse or
# testing) does not trigger a conversion.
if __name__ == "__main__":
    main()