@@ -105,7 +105,7 @@ case "$key" in
105105 shift
106106 shift
107107 ;;
108- -i |--identity)
108+ -pi |--identity)
109109 identity=" $2 "
110110 shift
111111 shift
@@ -188,7 +188,7 @@ case "$key" in
188188 -i | --identifier The identifying tag used for tyrosine recombinases; given as the -i option for starfish annotate (Default: tyr)
189189
190190 pggb specific inputs:
191- -i | --identity -p option in pggb (Default: Automatically calulated using mash distances)
191+ -pi | --identity -p option in pggb (Default: Automatically calulated using mash distances)
192192 -l | --length -s option in pggb (Default: 20000 ; a conservative value increased from default pggb values)
193193 -k | --kmersize -k option in pggb (Default: 19 ; same as pggb)
194194 -G | --poaparam -G option in pggb (Default: 7919,8069; a conservative value increased from default pggb values)
@@ -241,6 +241,9 @@ elementspath=$( realpath ${elements} )
241241
242242# check if the files given actually exist
243243[ ! -f " ${assembliespath} " ] && echo " ERROR: Cannot find path to assemblies file provided by -a; check path is correct and file exists" && exit
244+ [ ! -f " ${tyrRspath} " ] && echo " ERROR: Cannot find path to Tyrosine Recombinases file provided by -r; check path is correct and file exists" && exit
245+ [ ! -f " ${elementspath} " ] && echo " ERROR: Cannot find path to Starship elements file provided by -e; check path is correct and file exists" && exit
246+
244247
245248# #check if output directory already exists
246249[ -d " ${output} " ] && echo " ERROR: output folder already exists" && exit
@@ -277,23 +280,27 @@ echo "Step 1: Building the genome-graph"
277280# #use mash triangle to generate a all vs all comparison per assembly file
278281# #then use this as a proxy for nucleotide identity distances and add an increase of 5% divergence to be used for the -p option in pggb (can be quite lenient here)
279282mash triangle -s 100000 -l path_to_assemblies.txt -E > ${prefix} .assemblies.mash_distances.txt
283+
284+ # #set automatically calculated identity value
285+ identity2=$( awk -F " \t" ' {v=(1-$3)*100-5; if(NR==1||v<min) min=v} END{print int(min)}' ${prefix} .assemblies.mash_distances.txt )
286+ # #use automatically calculated distance parameter if not set by user
280287if [[ ${identity} == " " ]]
281288then
282- identity=$( cat ${prefix} .assemblies.mash_distances.txt | cut -f3 | awk ' {print (1-$1)*100} ' | sort -n | head -n1 | awk -F " . " ' {print $1-5} ' )
289+ identity=" ${identity2} "
283290fi
284291
285292
286293# #concatenate and compress the assemblies using the absolute paths to each assembly
287294# #and removing all soft-masking
288- cat path_to_assemblies.txt | while read genome
295+ while read -r genome
289296do
290297if [[ ${genome} =~ " .gz" $ || ${genome} =~ " .bgzip" $ || ${genome} =~ " .gzip" $ ]]
291298then
292299zcat ${genome} | awk ' {if($1 ~ ">") {print } else {print toupper($0)}}'
293300else
294- cat ${genome} | awk ' {if($1 ~ ">") {print } else {print toupper($0)}}'
301+ awk ' {if($1 ~ ">") {print } else {print toupper($0)}}' ${genome}
295302fi
296- done | bgzip > ${prefix} .assemblies.fa.gz
303+ done < path_to_assemblies.txt | bgzip > ${prefix} .assemblies.fa.gz
297304
298305# #get the number of assemblies provided to be used as the -n option in pggb
299306genomecount=$( cat path_to_assemblies.txt | wc -l )
0 commit comments