Skip to content

Commit d08babe

Browse files
authored
Merge pull request #10 from SAMtoBAM/SAMtoBAM-patch-5
Simple fixes of stargraph
2 parents 715da35 + 9a8b07a commit d08babe

2 files changed

Lines changed: 14 additions & 7 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ the _Starship_ annotations (_starfish_output/elementFinder/\*.elements.ann.feat_
106106
-i | --identifier The identifying tag used for tyrosine recombinases; given as the -i option for starfish annotate (Default: tyr)
107107

108108
pggb specific inputs:
109-
-i | --identity -p option in pggb (Default: Automatically calulated using mash distances)
109+
-pi | --identity -p option in pggb (Default: Automatically calulated using mash distances)
110110
-l | --length -s option in pggb (Default: 20000 ; a conservative value increased from default pggb values)
111111
-k | --kmersize -k option in pggb (Default: 19 ; same as pggb)
112112
-G | --poaparam -G option in pggb (Default: 7919,8069; a conservative value increased from default pggb values)

bin/stargraph.sh

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ case "$key" in
105105
shift
106106
shift
107107
;;
108-
-i|--identity)
108+
-pi|--identity)
109109
identity="$2"
110110
shift
111111
shift
@@ -188,7 +188,7 @@ case "$key" in
188188
-i | --identifier The identifying tag used for tyrosine recombinases; given as the -i option for starfish annotate (Default: tyr)
189189
190190
pggb specific inputs:
191-
-i | --identity -p option in pggb (Default: Automatically calulated using mash distances)
191+
-pi | --identity -p option in pggb (Default: Automatically calulated using mash distances)
192192
-l | --length -s option in pggb (Default: 20000 ; a conservative value increased from default pggb values)
193193
-k | --kmersize -k option in pggb (Default: 19 ; same as pggb)
194194
-G | --poaparam -G option in pggb (Default: 7919,8069; a conservative value increased from default pggb values)
@@ -241,6 +241,9 @@ elementspath=$( realpath ${elements} )
241241

242242
#check if the files given actually exist
243243
[ ! -f "${assembliespath}" ] && echo "ERROR: Cannot find path to assemblies file provided by -a; check path is correct and file exists" && exit
244+
[ ! -f "${tyrRspath}" ] && echo "ERROR: Cannot find path to Tyrosine Recombinases file provided by -r; check path is correct and file exists" && exit
245+
[ ! -f "${elementspath}" ] && echo "ERROR: Cannot find path to Starship elements file provided by -e; check path is correct and file exists" && exit
246+
244247

245248
##check if output directory already exists
246249
[ -d "${output}" ] && echo "ERROR: output folder already exists" && exit
@@ -277,23 +280,27 @@ echo "Step 1: Building the genome-graph"
277280
##use mash triangle to generate a all vs all comparison per assembly file
278281
##then use this as a proxy for nucleotide identity distances and add an increase of 5% divergence to be used for the -p option in pggb (can be quite lenient here)
279282
mash triangle -s 100000 -l path_to_assemblies.txt -E > ${prefix}.assemblies.mash_distances.txt
283+
284+
##set automatically calculated identity value
285+
identity2=$( awk -F "\t" '{v=(1-$3)*100-5; if(NR==1||v<min) min=v} END{print int(min)}' ${prefix}.assemblies.mash_distances.txt )
286+
##use automatically calculated distance parameter if not set by user
280287
if [[ ${identity} == "" ]]
281288
then
282-
identity=$( cat ${prefix}.assemblies.mash_distances.txt | cut -f3 | awk '{print (1-$1)*100}' | sort -n | head -n1 | awk -F "." '{print $1-5}' )
289+
identity="${identity2}"
283290
fi
284291

285292

286293
##concatenate and compress the assemblies using the absolute paths to each assembly
287294
##and removing all soft-masking
288-
cat path_to_assemblies.txt | while read genome
295+
while read -r genome
289296
do
290297
if [[ ${genome} =~ ".gz"$ || ${genome} =~ ".bgzip"$ || ${genome} =~ ".gzip"$ ]]
291298
then
292299
zcat ${genome} | awk '{if($1 ~ ">") {print } else {print toupper($0)}}'
293300
else
294-
cat ${genome} | awk '{if($1 ~ ">") {print } else {print toupper($0)}}'
301+
awk '{if($1 ~ ">") {print } else {print toupper($0)}}' ${genome}
295302
fi
296-
done | bgzip > ${prefix}.assemblies.fa.gz
303+
done < path_to_assemblies.txt | bgzip > ${prefix}.assemblies.fa.gz
297304

298305
##get the number of assemblies provided to be used as the -n option in pggb
299306
genomecount=$( cat path_to_assemblies.txt | wc -l )

0 commit comments

Comments
 (0)