#!/bin/bash
# Custom_tools/encodeMergedat at main · fatyang799/Custom_tools · GitHub

# Positional parameters (English rendering of the usage text printed below):
#   $1  meta    meta file produced by a Python crawler; the directory two
#               levels above the file's own directory should hold the fastq
#               data. Columns used: 1 = Experiment ID, 3 = file basename,
#               11 = marker, 14 = rep.
#   $2  cell    cell name, used when renaming the files
#   $3  output  output directory
#   $4  exp_id  which experiment IDs to merge; defaults to every experiment
#               ID found in the meta file
#   -h          print the logic of the script
meta="$1"
cell="$2"
output="$3"
exp_id="$4"

# Print the usage text to stdout.
print_usage() {
	echo "Usage: "
	echo -e "\t\$1: meta file, 来自python爬虫所得。\n\t\t1.文件地址的上2级目录应该是fastq数据所在。\n\t\t2.column1: Experiment ID\n\t\t3.column3: File basename\n\t\t4.column11: marker\n\t\t5.column14: rep"
	echo -e "\t\$2: cell，用于重命名文件"
	echo -e "\t\$3: output_dir，输出目录"
	echo -e "\t\$4: exp_id，需要对哪些实验ID进行合并，默认为meta file中的全部实验ID"
	echo -e "\t-h: 给出脚本书写逻辑"
}

# Without a first argument there is nothing to work on: show the usage and
# stop with the script's generic status 100.
if [[ "${meta}" == "" ]]; then
	print_usage
	exit 100
fi

# -h: describe how the script works, then stop with status 100.
#
# English summary of the (Chinese) text printed by print_script_logic:
#   (0) Required meta columns: 1 = Experiment ID, 3 = file basename,
#       11 = marker, 14 = rep.
#   (1) Per Experiment ID, find its technical replicates (column 14).
#   (2) Per replicate, list the files that belong to it (column 3).
#   (3) Strip the fq.gz suffix to tell whether each sample is SE or PE.
#   (4) SE and PE data are merged separately. Samples are counted per mode
#       (1 = SE, 2 = PE, 3 = SE + PE): a single sample is symlinked,
#       several samples are merged with zcat | pigz.
#   (5) Old names come from column 3; new names are
#       merged<ExpID>_<cell>_<mk>_rep<rep>_{RR,R1,R2}.fq.gz.
#
# Fix: the PE naming example listed the _R1 file twice; the second entry is
# the _R2 mate.
print_script_logic() {
	echo 'The logic for script:
	(0) 格式化的meta文件要求:
		1> column1: Experiment ID
		2> column3: File basename
		3> column11: marker
		4> column14: rep
	(1)  以Experiment ID为单位 ($id, column1)，检测每个Experiment ID下有多少技术重复 ($rep, column14)
	(2)  以每个技术重复为单位，找出每个技术重复下有哪些文件 (记录在${id}_rep${rep}.txt中, column3)
	(3)  通过替换文件末尾的fq.gz等结尾，统计每个文件是SE还是PE (${id}_rep${rep}_mode.txt)
	(4)  因为在合并数据时，同一个文件如果同时有SE和PE数据，需要将SE和PE数据分别进行合并，故统计SE以及PE文
		件的数目 (2行2列数据, num mode)，根据统计结果进行循环处理:
		1> if (num == 1): 只有一个文件，故直接使用软链接处理即可
			1>> if (mode == 1): SE数据
			2>> if (mode == 2): PE数据
			3>> if (mode == 3): SE + PE数据
		2> if (num != 1): 有多个文件，使用zcat | pigz进行合并文件
			1>> if (mode == 1): SE数据
			2>> if (mode == 2): PE数据
			3>> if (mode == 3): SE + PE数据
	(5) 关于新旧文件名的命名:
		1> 旧文件名：根据column3中所得
		2> 新文件名：
			SE: mergedExpID_cell_mk_rep1_1_RR.fq.gz
			PE: mergedExpID_cell_mk_rep1_1_R1.fq.gz
			    mergedExpID_cell_mk_rep1_1_R2.fq.gz'
}

if [[ $1 == "-h" ]]; then
	print_script_logic
	exit 100
fi

# Normalise the parameters:
#   output  -> absolute path (created when missing; defaults to the cwd)
#   meta    -> absolute path of the meta file
#   exp_id  -> when not given, a generated list (${output}/exp799.id) holding
#              every distinct value of column 1 of the meta file, header
#              line skipped; rm_id=T marks the list as generated.
#
# Fix: directories are resolved in a subshell, so the working directory is
# never changed. Previously the script returned with `cd ${now}`, but `now`
# was only assigned when an output directory was given; with an empty $3 the
# bare `cd` moved the rest of the script into $HOME.

# Print the absolute path of directory $1; non-zero status when it cannot be
# entered. cd's stdout is discarded because cd may print the target path
# (e.g. when CDPATH is set).
abs_dir() {
	(cd -- "$1" > /dev/null && pwd)
}

if [[ -z ${output} ]]; then
	output=$PWD
else
	if [[ ! -d ${output} ]]; then
		echo "The ${output} is not exist, the program will create this directory."
		mkdir -p -- "${output}"
	fi
	# output full path
	output=$(abs_dir "${output}") || exit 100
fi

# meta file full path
meta_dir=$(abs_dir "$(dirname -- "${meta}")") || exit 100
meta=${meta_dir}/$(basename -- "${meta}")

if [[ -z ${exp_id} ]]; then
	rm_id=T
	exp_id=${output}/exp799.id
	cut -d "," -f1 "${meta}" | tail -n+2 | sort -u > "${exp_id}"
fi

# Derived paths.
#   dat_root        two levels above the directory that holds the meta file;
#                   the source fastq files are looked up below it
#   script          generated shell script that will perform the merge
#   file_relathion  tab-separated "Target<TAB>Source" table
# Every expansion is quoted (and the `dirname | xargs dirname` chain is
# replaced by nested calls) so that paths containing whitespace survive.
dat_root=$(dirname -- "$(dirname -- "$(dirname -- "${meta}")")")
script="${output}/0.merge_dat.sh"
file_relathion="${output}/file_correspondence.txt"

# Start both output files from scratch: a header row for the table, a
# shebang followed by one empty line for the script.
printf 'Target\tSource\n' > "${file_relathion}"
printf '#!/bin/bash\n\n' > "${script}"

# ---------------------------------------------------------------------------
# Build the merge plan: for every experiment ID listed in ${exp_id}, append
# the commands that create the merged fastq files to ${script} and the
# "Target<TAB>Source" rows to ${file_relathion}.
#
# Reads the globals meta, exp_id, cell, output, dat_root, script and
# file_relathion.
#
# Changes compared with the previous inline loops:
#   * All loops run in the main shell (redirection / process substitution
#     instead of `cmd | while`), so `exit 100` really aborts the script.
#     Before, it only left the pipeline's subshell and processing went on.
#   * A single sample with SE + PE data (mode 3) now takes its _RR file from
#     the mode-3 rows; the old code filtered on mode 1 there and produced an
#     empty source.
#   * Source files are tested one by one with `-e`; `ls` with an empty list
#     used to succeed.
#   * No temporary files are written to the working directory any more, so
#     the two messages that named those files were reworded.
# ---------------------------------------------------------------------------

# require_files <label> <path>...
# Abort the script with status 100 unless every <path> exists. <label> is
# the experiment ID shown in the message.
require_files() {
	local label=$1 path
	shift
	for path in "$@"; do
		if [[ ! -e ${path} ]]; then
			echo "The file for ${label}: ${path} is not exist, please check."
			exit 100
		fi
	done
}

# record_merge <target> <source>...
# Log one "Target<TAB>Source" row per source, then emit the command that
# builds <target>: a symlink for a single source, zcat | pigz for several.
record_merge() {
	local target=$1 src
	local IFS=' '    # "$*" below must join the sources with single spaces
	shift
	for src in "$@"; do
		printf '%s\t%s\n' "${target}" "${src}" >> "${file_relathion}"
	done
	if (( $# == 1 )); then
		echo "ln -s $1 ${target}" >> "${script}"
	else
		echo "zcat $* | pigz > ${target}" >> "${script}"
	fi
}

# plan_mode <id> <rep> <target-stem> <mode> <prefix>...
# Plan the merge of all samples (file prefixes) of one replicate that share
# a mode. The mode is the number of files found per prefix:
#   1 = SE only (_RR), 2 = PE only (_R1/_R2), 3 = SE + PE (all three).
# All sources are verified before anything is written.
# NOTE(review): as before, modes 1 and 3 (and modes 2 and 3) of the same
# replicate write to the same target name - confirm such mixes cannot occur.
plan_mode() {
	local id=$1 rep=$2 stem=$3 mode=$4 prefix
	local -a se=() r1=() r2=()
	shift 4

	case "${mode}" in
		1 | 2 | 3) ;;
		*)
			echo "The replicate ${rep} of ${id} have some wrong mode (${mode}), please check."
			exit 100
			;;
	esac

	for prefix in "$@"; do
		if [[ ${mode} != 2 ]]; then
			se+=("${dat_root}/${prefix}_RR.fq.gz")
		fi
		if [[ ${mode} != 1 ]]; then
			r1+=("${dat_root}/${prefix}_R1.fq.gz")
			r2+=("${dat_root}/${prefix}_R2.fq.gz")
		fi
	done

	require_files "${id}" "${se[@]}" "${r1[@]}" "${r2[@]}"

	if (( ${#se[@]} > 0 )); then
		record_merge "${stem}_RR.fq.gz" "${se[@]}"
	fi
	if (( ${#r1[@]} > 0 )); then
		record_merge "${stem}_R1.fq.gz" "${r1[@]}"
		record_merge "${stem}_R2.fq.gz" "${r2[@]}"
	fi
}

# Walk over experiment IDs -> replicates (column 14) -> modes.
plan_all_experiments() {
	local id rep mk rows stem mode_table mode prefix
	local -a prefixes

	# `|| [[ -n ... ]]` keeps a last line that has no trailing newline.
	while read -r id || [[ -n ${id} ]]; do
		# An empty pattern would match every row of the meta file.
		[[ -n ${id} ]] || continue

		# Rows of this experiment. The ID is matched as a regular
		# expression anywhere in the row, exactly as before.
		rows=$(grep -E -- "${id}" "${meta}")
		if [[ -z ${rows} ]]; then
			echo "No record for ${id} in ${meta}, skipped." >&2
			continue
		fi
		mk=$(cut -d "," -f11 <<< "${rows}" | sort -u)

		# for each rep
		while read -r rep; do
			stem=${output}/merged${id}_${cell}_${mk}_rep${rep}

			# distinct pe and se data: "<files per prefix><TAB><prefix>",
			# sorted by prefix
			mode_table=$(
				awk -F "," -v rep="${rep}" '$14 == rep {print $3}' <<< "${rows}" \
					| sed "s/_R[12R]\.fq\.gz//g" \
					| sort \
					| uniq -c \
					| sed "s/^ *//g" \
					| sed -r "s/ +/\t/g"
			)

			while read -r mode; do
				prefixes=()
				while IFS= read -r prefix; do
					prefixes+=("${prefix}")
				done < <(awk -F "\t" -v mode="${mode}" '$1 == mode {print $2}' <<< "${mode_table}")
				plan_mode "${id}" "${rep}" "${stem}" "${mode}" "${prefixes[@]}"
			done < <(cut -f1 <<< "${mode_table}" | sort -u)
		done < <(cut -d "," -f14 <<< "${rows}" | sort -u)

		echo "The ${id} has been merged"
	done < "${exp_id}"
}

plan_all_experiments

# Delete the experiment-ID list, but only when the script generated it
# itself (rm_id is "T" in that case); a list passed in by the caller is
# kept. The path is quoted and guarded with `--` so that whitespace or a
# leading dash in it cannot make rm act on other files.
remove_generated_id_list() {
	if [[ ${rm_id} == "T" ]]; then
		rm -- "${exp_id}"
	fi
}

remove_generated_id_list