-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathbed2conservation
More file actions
executable file
·152 lines (136 loc) · 7.29 KB
/
bed2conservation
File metadata and controls
executable file
·152 lines (136 loc) · 7.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/bin/bash
#PBS -l nodes=1:ppn=4
GENOME="mm9"
AVG=0
#### usage ####
usage() {
echo Program: "bed2conservation (compute phastcons or phyloP conservation score corresponding to input BED file)"
echo " [MORE INFO: https://www.biostars.org/p/16724/]"
echo " [MORE INFO: https://www.biostars.org/p/129981/]"
echo " [MORE INFO: https://www.biostars.org/p/86847/]"
echo Author: BRIC, University of Copenhagen, Denmark
echo Version: 1.0
echo Contact: pundhir@binf.ku.dk
echo "Usage: bed2conservation -i <file> -o <dir> -s <dir>"
echo "Options:"
echo " -i <file> [input file containing genomic coordinate in BED format (can be stdin)]"
echo " [format: chr start end unique_id class]"
echo " [NOTE: fourth column (name) should be unique]"
echo "[OPTIONS]"
echo " -g <string> [genome (default: mm9)]"
echo " -y [compute phyloP scores instead]"
echo " -h [help]"
echo
exit 0
}
#### parse options ####
while getopts i:g:yh ARG; do
case "$ARG" in
i) INPUTBEDFILE=$OPTARG;;
g) GENOME=$OPTARG;;
y) PHYLOP=1;;
h) HELP=1;;
esac
done
## usage, if necessary file and directories are given/exist
if [ -z "$INPUTBEDFILE" -o "$HELP" ]; then
usage
fi
## populating files based on input genome
if [ "$GENOME" == "mm9" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/mouse.mm9.genome"
PHASTCONS_DIR="/home/pundhir/project/genome_annotations/phastCons/mm9/phastCons30way/vertebrate"
elif [ "$GENOME" == "hg19" ]; then
GENOME_FILE="/home/pundhir/project/genome_annotations/human.hg19.genome"
PHASTCONS_DIR="/home/pundhir/project/genome_annotations/phastCons/hg19/phastCons46way/vertebrate"
else
echo "Presently the program only support analysis for mm9 or hg19"
echo
usage
fi
## create temporary BED file if input is from stdin
if [ "$INPUTBEDFILE" == "stdin" ]; then
TMP=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1)
while read LINE; do
echo ${LINE}
done | perl -ane '$line=""; foreach(@F) { $line.="$_\t"; } $line=~s/\t$//g; print "$line\n";' > $TMP
INPUTBEDFILE=$TMP
fi
if [ "$GENOME" == "mm9" ]; then
while IFS=$'\t' read -r -a ARR; do
if [ -f "$PHASTCONS_DIR/${ARR[0]}.data.bw" ]; then
#echo "bigWigSummary -type=mean $PHASTCONS_DIR/${ARR[0]}.data.bw ${ARR[0]} ${ARR[1]} ${ARR[2]} 1"; exit
MEAN_SCORE=$(bigWigSummary -type=mean $PHASTCONS_DIR/${ARR[0]}.data.bw ${ARR[0]} ${ARR[1]} ${ARR[2]} 1 2>/dev/null)
if [ "$?" -ne 0 ]; then
MEAN_SCORE="NA"
fi
MAX_SCORE=$(bigWigSummary -type=max $PHASTCONS_DIR/${ARR[0]}.data.bw ${ARR[0]} ${ARR[1]} ${ARR[2]} 1 2>/dev/null)
if [ "$?" -ne 0 ]; then
MAX_SCORE="NA"
fi
else
MEAN_SCORE="NA"
MAX_SCORE="NA"
fi
echo -e "${ARR[@]}\t$MEAN_SCORE\t$MAX_SCORE"
done < $INPUTBEDFILE | perl -an -F'/\s+/' -e 'if($F[0]=~/^chr/i) { print "$F[0]"; foreach(@F[1..scalar(@F)-1]) { print "\t$_"; } print "\n"; }'
else
while IFS=$'\t' read -r -a ARR; do
if [ -f "$PHASTCONS_DIR/${ARR[0]}.phastCons46way.wigFix.bw" ]; then
#echo "bigWigSummary -type=mean $PHASTCONS_DIR/${ARR[0]}.phastCons46way.wigFix.bw ${ARR[0]} ${ARR[1]} ${ARR[2]} 1"; exit
MEAN_SCORE=$(bigWigSummary -type=mean $PHASTCONS_DIR/${ARR[0]}.phastCons46way.wigFix.bw ${ARR[0]} ${ARR[1]} ${ARR[2]} 1 2>/dev/null)
if [ "$?" -ne 0 ]; then
MEAN_SCORE="NA"
fi
MAX_SCORE=$(bigWigSummary -type=max $PHASTCONS_DIR/${ARR[0]}.phastCons46way.wigFix.bw ${ARR[0]} ${ARR[1]} ${ARR[2]} 1 2>/dev/null)
if [ "$?" -ne 0 ]; then
MAX_SCORE="NA"
fi
else
MEAN_SCORE="NA"
MAX_SCORE="NA"
fi
echo -e "${ARR[@]}\t$MEAN_SCORE\t$MAX_SCORE"
done < $INPUTBEDFILE | perl -an -F'/\s+/' -e 'if($F[0]=~/^chr/i) { print "$F[0]"; foreach(@F[1..scalar(@F)-1]) { print "\t$_"; } print "\n"; }'
fi
## remove temporary file, if exists
if [ ! -z "$TMP" ]; then
rm $TMP
fi
exit
## OLD (uses starch files)
<<"COMMENT"
if [ ! -z "$PHYLOP" ]; then
awk '{ regionChromosome = $1; regionStart = $2; regionStop = $3; regionID = $4; ID = $5; baseIdx = 0; for (baseStart = regionStart; baseStart < regionStop; baseStart++) { baseStop = baseStart + 1; print regionChromosome"\t"baseStart"\t"baseStop"\t"regionID"-"baseIdx"\t"ID; baseIdx++; } }' $INPUTBEDFILE > $OUTDIR/INPUTFILE_PER_BASE.BED
## compute phylop conservation scores for input BED file
for i in `seq 1 22` X Y M;
#for i in 15;
do
phyloPFn="$PHASTCONS_DIR/chr${i}.phyloP46way.wigFix.bed"
#echo "$phyloPFn"
if [ $(ls $phyloPFn | wc -l) -eq 1 ]; then
echo "mapping signal for chromosome chr${i}...";
bedmap --echo --echo-map-score --chrom chr${i} --delim '\t' $OUTDIR/INPUTFILE_PER_BASE.BED ${phyloPFn} > $OUTDIR/regions_with_perBase_phyloP.${i}.bed &
fi
done
#cat $OUTDIR/*.bed | perl -ane 'if($F[5]!~/^$/) { $F[3]=~s/\-.*//g; print "$F[0]\t$F[1]\t$F[2]\t$F[3]\t$F[4]\t$F[5]\n"; }' | mergeBed -i stdin -d 1 -c 4,5,6 -o distinct,distinct,median > $OUTDIR/ALL.CONSERVATION
cat $OUTDIR/*.bed | perl -ane 'if($F[5]!~/^$/) { $F[3]=~s/\-.*//g; print "$F[0]\t$F[1]\t$F[2]\t$F[3]\t$F[4]\t$F[5]\n"; }' | perl -ane 'if(!defined($info{$F[3]}) && $id!~/^$/) { print "$info{$id}{'0'}\t$info{$id}{'1'}\t$info{$id}{'2'}\t$info{$id}{'3'}\t$info{$id}{'4'}\t$info{$id}{'5'}\t$info{$id}{count}\n"; %info=(); $info{$F[3]}{'0'}=$F[0]; $info{$F[3]}{'1'}=$F[1]; $info{$F[3]}{'2'}=$F[2]; $info{$F[3]}{'3'}=$F[3]; $info{$F[3]}{'4'}=$F[4]; $info{$F[3]}{'5'}=$F[5]; $info{$F[3]}{count}=1; } elsif(defined($info{$F[3]})) { $id=$F[3]; $info{$id}{'2'}=$F[2]; $info{$id}{'5'}+=$F[5]; $info{$id}{count}++; } else { $info{$F[3]}{'0'}=$F[0]; $info{$F[3]}{'1'}=$F[1]; $info{$F[3]}{'2'}=$F[2]; $info{$F[3]}{'3'}=$F[3]; $info{$F[3]}{'4'}=$F[4]; $info{$F[3]}{'5'}=$F[5]; $info{$F[3]}{count}=1; }' > $OUTDIR/ALL.CONSERVATION
else
## compute phastcons conservation scores for input BED file
for i in `seq 1 22` X Y M;
do
phastConFn="$PHASTCONS_DIR/chr${i}.phastCons*.wigFix.starch";
#echo "$phastConFn"
if [ $(ls $phastConFn | wc -l) -eq 1 ]; then
echo "mapping signal for chromosome chr${i}...";
bedmap --echo --sum --echo-map-score --chrom chr${i} --delim '\t' $INPUTBEDFILE ${phastConFn} > $OUTDIR/regions_with_avg_phastcons.${i}.bed &
fi
done
wait
## make final output file
echo -e "make final output file.. "
NCOL=$(cat $OUTDIR/regions_with_avg_phastcons.*.bed | head -n 1 | perl -ane 'print scalar(@F);')
join -j 4 -o 1.1 1.2 1.3 1.4 1.5 2.$((NCOL-1)) 2.$NCOL <(sort -k 4,4 $INPUTBEDFILE) <(cat $OUTDIR/regions_with_avg_phastcons.*.bed | sort -k 4,4) | perl -ane '$des=(); foreach(@F) { chomp($_); $des.="$_\t"; } $des=~s/\t$//g; print "$des\n";' > $OUTDIR/ALL.CONSERVATION
cat $OUTDIR/ALL.CONSERVATION | perl -ane '$count=0; if($F[6]!~/^$/) { @t=split(/\;/,$F[6]); foreach(@t) { if($_>=0.5) { $count++; }} } print "$F[0]\t$F[1]\t$F[2]\t$F[3]\t$F[4]\t$F[5]\t$count\n";' > $OUTDIR/ALL.CONSERVATION.COUNT
fi
COMMENT