-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathencodeMergedat
More file actions
executable file
·263 lines (236 loc) · 10.7 KB
/
encodeMergedat
File metadata and controls
executable file
·263 lines (236 loc) · 10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#!/bin/bash
# Positional arguments:
#   $1  meta file (CSV)
#   $2  cell name, embedded in the merged file names
#   $3  output directory
#   $4  optional file listing the experiment IDs to merge
meta=$1
cell=$2
output=$3
exp_id=$4

# A meta file is mandatory: without one, print the usage text and stop.
[[ -n ${meta} ]] || {
    printf '%s\n' "Usage: "
    printf '%b\n' \
        "\t\$1: meta file, 来自python爬虫所得。\n\t\t1.文件地址的上2级目录应该是fastq数据所在。\n\t\t2.column1: Experiment ID\n\t\t3.column3: File basename\n\t\t4.column11: marker\n\t\t5.column14: rep" \
        "\t\$2: cell,用于重命名文件" \
        "\t\$3: output_dir,输出目录" \
        "\t\$4: exp_id,需要对哪些实验ID进行合并,默认为meta file中的全部实验ID" \
        "\t-h: 给出脚本书写逻辑"
    exit 100
}
# logic for script
# Print the description of the merging algorithm that is shown for "-h".
# The text is emitted verbatim (quoted here-document, so ${...} is not
# expanded). The PE example lists the R1/R2 pair; the original text named the
# R1 file twice.
print_logic() {
    cat << 'EOF'
The logic for script:
(0) 格式化的meta文件要求:
1> column1: Experiment ID
2> column3: File basename
3> column11: marker
4> column14: rep
(1) 以Experiment ID为单位 ($id, column1),检测每个Experiment ID下有多少技术重复 ($rep, column14)
(2) 以每个技术重复为单位,找出每个技术重复下有哪些文件 (记录在${id}_rep${rep}.txt中, column3)
(3) 通过替换文件末尾的fq.gz等结尾,统计每个文件是SE还是PE (${id}_rep${rep}_mode.txt)
(4) 因为在合并数据时,同一个文件如果同时有SE和PE数据,需要将SE和PE数据分别进行合并,故统计SE以及PE文
件的数目 (2行2列数据, num mode),根据统计结果进行循环处理:
1> if (num == 1): 只有一个文件,故直接使用软链接处理即可
1>> if (mode == 1): SE数据
2>> if (mode == 2): PE数据
3>> if (mode == 3): SE + PE数据
2> if (num != 1): 有多个文件,使用zcat | pigz进行合并文件
1>> if (mode == 1): SE数据
2>> if (mode == 2): PE数据
3>> if (mode == 3): SE + PE数据
(5) 关于新旧文件名的命名:
1> 旧文件名:根据column3中所得
2> 新文件名:
SE: mergedExpID_cell_mk_rep1_1_RR.fq.gz
PE: mergedExpID_cell_mk_rep1_1_R1.fq.gz
mergedExpID_cell_mk_rep1_1_R2.fq.gz
EOF
}

# "-h" as the first argument prints the explanation and stops (exit code 100,
# the same code the usage message uses).
if [[ $1 == "-h" ]]; then
    print_logic
    exit 100
fi
# format the para
# Turn ${output} and ${meta} into absolute paths and, when no experiment-ID
# list was passed, build one from column 1 of the meta file.
#
# Paths are resolved inside subshells, so the working directory of the script
# itself never changes. The previous cd/cd-back sequence only remembered the
# start directory when an output directory had been given; without one, the
# final "cd" received no argument and moved the script to $HOME.
if [[ -z ${output} ]]; then
    # No output directory given: write next to where the script was started.
    output=$PWD
else
    if [[ ! -d ${output} ]]; then
        echo "The ${output} is not exist, the program will create this directory."
        mkdir -p -- "${output}"
    fi
    # output full path
    output=$(cd -- "${output}" > /dev/null && pwd) || {
        echo "Cannot enter the output directory, please check." >&2
        exit 100
    }
fi

# meta file full path
meta_dir=$(cd -- "$(dirname -- "${meta}")" > /dev/null && pwd) || {
    echo "Cannot enter the directory of the meta file ${meta}, please check." >&2
    exit 100
}
meta=${meta_dir}/$(basename -- "${meta}")

if [[ -z ${exp_id} ]]; then
    # Default: every distinct experiment ID of the meta file (header skipped).
    # rm_id marks the list as generated so it can be deleted at the end.
    rm_id=T
    exp_id=${output}/exp799.id
    cut -d "," -f1 "${meta}" | tail -n+2 | sort -u > "${exp_id}"
fi
# Root of the fastq data: the parent of the parent of the meta file's
# directory. Nested dirname calls keep the path as one word; piping through
# xargs would split it on whitespace and strip quotes.
dat_root=$(dirname -- "$(dirname -- "$(dirname -- "${meta}")")")

# Generated shell script that will hold the ln/zcat commands.
script=${output}/0.merge_dat.sh
# Table mapping every merged target file to its source file(s).
file_relathion=${output}/file_correspondence.txt

# Start both output files from scratch: a header line for the table, and a
# shebang followed by one blank line for the script.
printf 'Target\tSource\n' > "${file_relathion}"
printf '#!/bin/bash\n\n' > "${script}"
# ---------------------------------------------------------------------------
# Build the merge script.
#
# For every experiment ID listed in ${exp_id} and every replicate of it (meta
# column 14), the fastq names of meta column 3 are grouped by sample prefix
# (the name without _R1/_R2/_RR.fq.gz). The number of files sharing a prefix
# is the "mode": 1 = SE (_RR), 2 = PE (_R1 + _R2), 3 = SE + PE.
# One command per merged target is appended to ${script}, and one
# "target<TAB>source" line per input file to ${file_relathion}.
#
# Globals read: exp_id, meta, cell, output, dat_root, script, file_relathion,
# rm_id.
#
# NOTE(review): prefixes of different modes within one replicate map to the
# same target names (e.g. modes 1 and 3 both write ..._RR.fq.gz), as they did
# before this rewrite -- confirm whether such mixed replicates occur.
# ---------------------------------------------------------------------------

# Delete the experiment-ID list when this script generated it itself.
remove_generated_id_list() {
    if [[ ${rm_id} == "T" ]]; then
        rm -f -- "${exp_id}"
    fi
}

# Print a fatal message on stderr, clean up and stop the whole script.
abort() {
    printf '%s\n' "$*" >&2
    remove_generated_id_list
    exit 100
}

# Print, one per line, the source path of every prefix in the mode table $1
# ("count<TAB>prefix" lines) whose count equals $2, once for each read suffix
# given in $3... (RR, R1 or R2).
list_sources() {
    local table=$1 wanted=$2
    shift 2
    local count prefix _rest suffix
    while IFS=$'\t' read -r count prefix _rest; do
        [[ ${count} == "${wanted}" ]] || continue
        for suffix in "$@"; do
            printf '%s/%s_%s.fq.gz\n' "${dat_root}" "${prefix}" "${suffix}"
        done
    done <<< "${table}"
}

# Record the sources ($2...) of one merged target ($1) in the correspondence
# table and append the command that creates the target to the merge script.
emit_merge() {
    local target=$1
    shift
    local src
    for src in "$@"; do
        printf '%s\t%s\n' "${target}" "${src}" >> "${file_relathion}"
    done
    if (($# == 1)); then
        # A single input needs no re-compression: a symlink is enough.
        printf 'ln -s %s %s\n' "$1" "${target}" >> "${script}"
    else
        printf 'zcat %s | pigz > %s\n' "$*" "${target}" >> "${script}"
    fi
}

# Handle all prefixes of one replicate that share the same mode.
#   $1 experiment ID   $2 replicate   $3 marker   $4 mode table   $5 mode
# Every source file is checked before anything is written. The same mode is
# used to select SE and PE files; the single-sample SE+PE case used to select
# its SE file with count 1 instead of 3 and so picked nothing or the wrong file.
process_mode() {
    local id=$1 rep=$2 mk=$3 table=$4 mode=$5
    local stem=${output}/merged${id}_${cell}_${mk}_rep${rep}
    local -a se=() r1=() r2=()
    local path

    case ${mode} in
        1 | 2 | 3) ;;
        *)
            abort "The ${id} rep${rep} has a prefix shared by ${mode} files; only 1 (SE), 2 (PE) or 3 (SE + PE) are supported, please check."
            ;;
    esac

    if [[ ${mode} != 2 ]]; then
        while IFS= read -r path; do
            se+=("${path}")
        done < <(list_sources "${table}" "${mode}" RR)
    fi
    if [[ ${mode} != 1 ]]; then
        while IFS= read -r path; do
            r1+=("${path}")
        done < <(list_sources "${table}" "${mode}" R1)
        while IFS= read -r path; do
            r2+=("${path}")
        done < <(list_sources "${table}" "${mode}" R2)
    fi

    for path in "${se[@]}" "${r1[@]}" "${r2[@]}"; do
        [[ -e ${path} ]] || abort "The file for ${id}: ${path} is not exist, please check."
    done

    if ((${#se[@]} > 0)); then
        emit_merge "${stem}_RR.fq.gz" "${se[@]}"
    fi
    if ((${#r1[@]} > 0)); then
        emit_merge "${stem}_R1.fq.gz" "${r1[@]}"
        emit_merge "${stem}_R2.fq.gz" "${r2[@]}"
    fi
}

# Walk over every experiment ID and replicate and emit the merge commands.
# All loops are fed by redirection or process substitution rather than a
# pipe, so they run in the main shell and abort() really ends the script.
merge_all() {
    local id rows mk rep table mode
    while read -r id || [[ -n ${id} ]]; do
        # An empty pattern would match every row of the meta file.
        [[ -n ${id} ]] || continue
        rows=$(grep -E -- "${id}" "${meta}")
        if [[ -z ${rows} ]]; then
            echo "No record for ${id} in ${meta}, skipped." >&2
            continue
        fi
        mk=$(cut -d "," -f11 <<< "${rows}" | sort -u)
        # for each rep
        while read -r rep; do
            # Mode table: "count<TAB>prefix", count = files sharing the prefix.
            table=$(awk -F "," -v rep="${rep}" '$14 == rep {print $3}' <<< "${rows}" \
                | sed 's/_R[12R]\.fq\.gz//g' \
                | sort | uniq -c \
                | sed -r -e 's/^ *//' -e 's/ +/\t/g')
            [[ -n ${table} ]] || continue
            # distinct pe and se data
            while read -r mode; do
                process_mode "${id}" "${rep}" "${mk}" "${table}" "${mode}"
            done < <(cut -f1 <<< "${table}" | sort -u)
        done < <(cut -d "," -f14 <<< "${rows}" | sort -u)
        echo "The ${id} has been merged"
    done < "${exp_id}"
}

merge_all
remove_generated_id_list