-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathprocess_graph.sh
More file actions
executable file
·165 lines (136 loc) · 6.03 KB
/
process_graph.sh
File metadata and controls
executable file
·165 lines (136 loc) · 6.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/bin/bash
################################################################################
# process_graph.sh
################################################################################
#
# Construct SPR and MCMC graphs of a Bayesian posterior tree space.
# See the README for details.
#
# Copyright 2014 Chris Whidden
# cwhidden@fhcrc.org
# May 2, 2014
# Version 1.0
#
# This file is part of sprspace.
#
# sprspace is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# sprspace is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with sprspace. If not, see <http://www.gnu.org/licenses/>.
################################################################################
# min graph size constructed with iterative deepening
MIN_GRAPH_SIZE=128
dir=$1
# max graph size constructed
MAX_GRAPH_SIZE=${2:-1024}
if [ "$MAX_GRAPH_SIZE" -lt "$MIN_GRAPH_SIZE" ]; then
MIN_GRAPH_SIZE=$MAX_GRAPH_SIZE;
fi
ROOTED=${3:--unrooted}
# script location
SCRIPT=$(readlink -f "$0")
SCRIPTPATH=$(dirname "$SCRIPT")
# settings
. $SCRIPTPATH/settings.sh
fig_dir=$dir/figs
graph_dir=$dir/spr_graphs
cluster_dir=$dir/clusters
distance_dir=$dir/distances
LL_uniq=$dir/LL_uniq_trees_T
mkdir -p $graph_dir
mkdir -p $cluster_dir
mkdir -p $distance_dir
# PP values
awk '{print 0","$1}' < $dir/uniq_shapes_C_sorted_by_PP |
perl $SCRIPTPATH/cytoscape_PP.pl > $graph_dir/graph_pp.tab
end=`cat $dir/uniq_shapes_C_sorted_by_PP | wc -l`
size=$MIN_GRAPH_SIZE
while [ "$size" -lt "$end" ]; do
if [ $size -gt $MAX_GRAPH_SIZE ]; then
exit;
fi
echo $size
size_plus_one=$(($size+1))
# clusters
cluster_file=$cluster_dir/clusters_shapes_${size}_spr_sorted_by_PP_auto.csv
head -$size < $dir/uniq_shapes_C_sorted_by_PP |
grep -o '(.*' |
perl $SCRIPTPATH/cluster_posterior_by_likelihood_spr.pl > $cluster_file
perl $SCRIPTPATH/cytoscape_cluster.pl < $cluster_file > $graph_dir/graph_${size}_color.tab
# distances from peak
head -$size < $dir/uniq_shapes_C_sorted_by_PP |
$RSPR -rf $ROOTED -pairwise 0 1 > $distance_dir/rf_distances_${size}_PP_1_shape.csv
perl $SCRIPTPATH/cytoscape_cluster.pl < $distance_dir/rf_distances_${size}_PP_1_shape.csv > $graph_dir/graph_${size}_dist_rf.tab
sed -i 's/cluster/dist_rf/' $graph_dir/graph_${size}_dist_rf.tab
head -$size < $dir/uniq_shapes_C_sorted_by_PP |
$RSPR $ROOTED -pairwise 0 1 > $distance_dir/spr_distances_${size}_PP_1_shape.csv
perl $SCRIPTPATH/cytoscape_cluster.pl < $distance_dir/spr_distances_${size}_PP_1_shape.csv > $graph_dir/graph_${size}_dist_spr.tab
sed -i 's/cluster/dist_spr/' $graph_dir/graph_${size}_dist_spr.tab
# attribute file
paste -d"\t" $graph_dir/graph_${size}_color.tab <(awk '{print $2}' < $graph_dir/graph_${size}_dist_rf.tab) <(awk '{print $2}' < $graph_dir/graph_${size}_dist_spr.tab) <(awk '{print $2"\t"$3}' < $graph_dir/graph_pp.tab | head -$size_plus_one) |
awk 'NF>=4' > $graph_dir/graph_${size}_attr.tab
# peak 1 neighbourhood size
echo "dist,PP,size" > $graph_dir/graph_${size}_neighbourhood_pp.csv
paste $graph_dir/graph_pp.tab $graph_dir/graph_${size}_dist_spr.tab |
head -$size |
awk '{print $1,$2,$5}' |
tr " " "," |
sed '1d' |
perl $SCRIPTPATH/neighbourhood_pp.pl \
>> $graph_dir/graph_${size}_neighbourhood_pp.csv
# weighted MCMC graph
if [ -f $dir/uniq_trees_T ]; then
echo "source,target,weight" > $graph_dir/graph_${size}_weighted.csv
cat $dir/uniq_trees_T | awk '{print $4}' | perl $SCRIPTPATH/cytoscape_edge_weights.pl <(grep -o '(.*' $dir/uniq_shapes_C_sorted_by_PP | head -$size) | grep -v '^0,,' >> $graph_dir/graph_${size}_weighted.csv
fi
# SPR graph
grep -o '(.*' < $dir/uniq_shapes_C_sorted_by_PP |
head -$size |
$RSPR $ROOTED -pairwise_max 1 -fpt -q |
$SCRIPTPATH/matrix2pairs 1 |
sed "s/\t/,/g" > $graph_dir/graph_${size}.csv
size=$(($size * 2))
done
# clusters
cluster_file=$cluster_dir/clusters_shapes_spr_sorted_by_PP_auto.csv
cat $dir/uniq_shapes_C_sorted_by_PP |
grep -o '(.*' |
perl $SCRIPTPATH/cluster_posterior_by_likelihood_spr.pl > $cluster_file
perl $SCRIPTPATH/cytoscape_cluster.pl < $cluster_file > $graph_dir/graph_color.tab
# distances from peak
cat $dir/uniq_shapes_C_sorted_by_PP |
$RSPR -rf $ROOTED -pairwise 0 1 > $distance_dir/rf_distances_PP_1_shape.csv
perl $SCRIPTPATH/cytoscape_cluster.pl < $distance_dir/rf_distances_PP_1_shape.csv > $graph_dir/graph_dist_rf.tab
sed -i 's/cluster/dist_rf/' $graph_dir/graph_dist_rf.tab
cat $dir/uniq_shapes_C_sorted_by_PP |
$RSPR $ROOTED -pairwise 0 1 > $distance_dir/spr_distances_PP_1_shape.csv
perl $SCRIPTPATH/cytoscape_cluster.pl < $distance_dir/spr_distances_PP_1_shape.csv > $graph_dir/graph_dist_spr.tab
sed -i 's/cluster/dist_spr/' $graph_dir/graph_dist_spr.tab
# attribute file
paste -d"\t" $graph_dir/graph_color.tab <(awk '{print $2}' < $graph_dir/graph_dist_rf.tab) <(awk '{print $2}' < $graph_dir/graph_dist_spr.tab) <(awk '{print $2"\t"$3}' < $graph_dir/graph_pp.tab ) |
awk 'NF>=4' > $graph_dir/graph_attr.tab
# peak 1 neighbourhood size
echo "dist,PP,size" > $graph_dir/graph_neighbourhood_pp.csv
paste $graph_dir/graph_pp.tab $graph_dir/graph_dist_spr.tab |
awk '{print $1,$2,$5}' |
tr " " "," |
sed '1d' |
perl $SCRIPTPATH/neighbourhood_pp.pl \
>> $graph_dir/graph_neighbourhood_pp.csv
# weighted MCMC graph
if [ -f $dir/uniq_trees_T ]; then
echo "source,target,weight" > $graph_dir/graph_weighted.csv
cat $dir/uniq_trees_T | awk '{print $4}' | perl $SCRIPTPATH/cytoscape_edge_weights.pl <(grep -o '(.*' $dir/uniq_shapes_C_sorted_by_PP ) | grep -v '^0,,' >> $graph_dir/graph_weighted.csv
fi
# SPR graph
grep -o '(.*' < $dir/uniq_shapes_C_sorted_by_PP |
$RSPR $ROOTED -q -pairwise_max 1 |
$SCRIPTPATH/matrix2pairs 1 |
sed "s/\t/,/g" > $graph_dir/graph.csv