test-scripts/macse2_prep.sh at master · dmvelasco/test-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/bin/bash
# Slurm batch header. Plain comments are allowed between #SBATCH lines;
# sbatch only stops reading directives at the first executable command.
#
# Working directory for the job, and where stdout/stderr are logged
# (%j is replaced by the job ID).
#SBATCH -D /home/dmvelasc/Projects/Prunus/Data/fasta/
#SBATCH -o /home/dmvelasc/Projects/Prunus/slurm-log/%j-stdout-macse2_prep.txt
#SBATCH -e /home/dmvelasc/Projects/Prunus/slurm-log/%j-stderr-macse2_prep.txt
# Job name, partition, task count, CPUs per task, and wall-time limit
# (days-hours:minutes:seconds).
#SBATCH -J fastacat
#SBATCH -p bigmemm
#SBATCH -n 1
#SBATCH -c 2
#SBATCH -t 2-00:00:00
# Node to avoid, and e-mail notification settings.
#SBATCH --exclude=bigmem1
#SBATCH --mail-user=dmvelasco@ucdavis.edu
#SBATCH --mail-type=ALL

# Stop at the first failing command (-e) and treat any reference to an
# unset variable as an error (-u).
set -eu

# Load the zlib environment module (the original note says 1.2.8, but no
# version is pinned by this command).
module load zlib

# ---- Directory layout -------------------------------------------------
# software binary directory (not referenced elsewhere in the visible script)
dir1='/home/dmvelasc/bin'
# VCF directory (not referenced elsewhere in the visible script)
dir2='/home/dmvelasc/Projects/Prunus/Analysis/VCF'
# FASTA reference directory; also holds the gene list read by the main loop
dir3='/home/dmvelasc/Data/references/persica-SCF'
# scratch directory where the concatenated FASTA files are built
dir4='/scratch/dmvelasc/fasta-concat'
# directory of CDS fasta sequences, one subdirectory per sample
dir5='/home/dmvelasc/Projects/Prunus/Data/fasta'

# Purpose of the script: for every gene, concatenate the FASTA sequences of
# all samples into one multi-sequence FASTA, intended as input for the
# MAFFT multi-sequence alignment program.

# ---- Sample list ------------------------------------------------------
# Whitespace-separated table: column 1 is the sample ID, column 2 holds
# another ID or extra information.
list='/home/dmvelasc/Projects/Prunus/Script/sample.txt'

####################
### Begin script ###
####################
# Create the scratch directory that receives the concatenated FASTA files.
# The path comes from dir4 (declared above) rather than a repeated literal,
# so the directory created here is always the one the rest of the script
# writes to and later moves. -p makes this a no-op if it already exists.
mkdir -p "$dir4"

# Build one multi-sequence FASTA per gene, and one per gene CDS, by appending
# every sample's single-sequence FASTA to a per-gene file in scratch.
#
# Variables used (all declared earlier in this script):
#   list - sample table; the first whitespace-separated column is the sample ID
#   dir3 - directory holding Prunus_persica_v1.0_genes_list.gff3, which is
#          read here one line at a time, each line being used as a gene ID
#   dir5 - directory with one subdirectory per sample, containing
#          <gene>_<sample>_gene.fa and <gene>_<sample>_cds.fa
#   dir4 - scratch directory receiving <gene>_gene.fa and <gene>_cds.fa
#
# Steps:
# 1. the outer loop takes the sample ID from each row of the sample list
# 2. samples without a FASTA directory under dir5 are reported and skipped
# 3. the inner loop walks the gene list and appends that sample's files
#
# Every row of the sample list is processed, so the list can grow or shrink
# without editing a hard-coded row range. Blank lines in either list are
# ignored. A per-gene file missing from an existing sample directory still
# makes cat fail, which aborts the job while errexit (set -e) is active.
#
# NOTE(review): output is appended with >>, so files left in the scratch
# directory by an earlier aborted run would be extended, not replaced.

echo "begin CDS FASTA concatenation"
date

gene_list="$dir3"/Prunus_persica_v1.0_genes_list.gff3

# The sample list is read on file descriptor 3 so that nothing inside the
# loop body that touches stdin can consume rows of the list.
# "|| [[ -n ... ]]" keeps a final row that lacks a trailing newline.
while read -r acc _ <&3 || [[ -n "${acc:-}" ]]; do
  # skip blank rows
  [[ -n "$acc" ]] || continue
  printf '%s\n' "$acc"

  # Test the sample directory by absolute path so the check does not depend
  # on the job's current working directory.
  if [[ ! -d "$dir5/$acc" ]]; then
    printf 'warning: no FASTA directory for sample %s under %s; skipping\n' \
      "$acc" "$dir5" >&2
    continue
  fi

  while read -r p || [[ -n "${p:-}" ]]; do
    # skip blank lines in the gene list
    [[ -n "$p" ]] || continue

    # gene FASTA concatenation; the two newlines written after each record
    # match the original output (end of line plus one blank separator line)
    cat "$dir5/$acc/${p}_${acc}_gene.fa" >> "$dir4/${p}_gene.fa"
    printf '\n\n' >> "$dir4/${p}_gene.fa"

    # cds FASTA concatenation
    cat "$dir5/$acc/${p}_${acc}_cds.fa" >> "$dir4/${p}_cds.fa"
    printf '\n\n' >> "$dir4/${p}_cds.fa"
  done < "$gene_list"
done 3< "$list"

echo "end CDS FASTA concatenation"
date

# Relocate the results. The scratch directory itself (not only the files in
# it) is moved into the CDS FASTA directory, so afterwards it exists as a
# subdirectory there and is gone from scratch.
printf '%s\n' "move concatenated CDS FASTA files and remove temporary fasta scratch directory"
date
mv "${dir4}" "${dir5}/"