From edafc670880803433b7f2255058bf9696699b581 Mon Sep 17 00:00:00 2001 From: jlboat Date: Sun, 27 Jun 2021 14:37:37 -0400 Subject: [PATCH] Added bash scripts from sbatch, converted py2 to py3, edited design files --- bash/run_ase_align_and_count_testData.bash | 220 ++++++++++++++++++ bash/run_ase_bayesian.bash | 57 +++++ ...n_ase_create_design_file_4_prior_calc.bash | 9 + ...genotype_specific_references_testData.bash | 51 ++++ ...ase_merge_priors_2_comparate_testData.bash | 32 +++ bash/run_ase_prior_calculation_testData.bash | 22 ++ bash/run_ase_summarize_counts_testData.bash | 46 ++++ example_in/comparate_design_file.csv | 1 - example_in/df_BASE_galaxy_W55_noEmptyFq.csv | 1 - example_in/df_list_G2_VCF_split.csv | 1 - hpc/ase_scripts/BWASplitSAM_07mai.py | 38 +-- hpc/ase_scripts/sam_compare_w_feature.py | 10 +- ...e_cnts_table_1cond_and_output_APN_06amm.py | 2 +- .../NBmodel_stan2_slurm_02amm.py | 2 +- 14 files changed, 463 insertions(+), 29 deletions(-) create mode 100644 bash/run_ase_align_and_count_testData.bash create mode 100644 bash/run_ase_bayesian.bash create mode 100644 bash/run_ase_create_design_file_4_prior_calc.bash create mode 100644 bash/run_ase_genotype_specific_references_testData.bash create mode 100644 bash/run_ase_merge_priors_2_comparate_testData.bash create mode 100644 bash/run_ase_prior_calculation_testData.bash create mode 100644 bash/run_ase_summarize_counts_testData.bash diff --git a/bash/run_ase_align_and_count_testData.bash b/bash/run_ase_align_and_count_testData.bash new file mode 100644 index 0000000..c33b358 --- /dev/null +++ b/bash/run_ase_align_and_count_testData.bash @@ -0,0 +1,220 @@ +#!/usr/bin/env bash + +## Set / Create Directories and Variables +#Everything is relative to current directory +SCRIPTS=hpc/ase_scripts +REF=example_out/reference/updated_genomes_vcfConsensus +ORIG=example_in/reads + +BEDFILE=example_in/reformatted_BASE_testdata_bedfile.bed + +## Create output directory for alignments to updated genome +OUTALN=example_out/aln_upd_genome + if [ ! -e $OUTALN ]; then mkdir -p $OUTALN; fi +PSAM=example_out/aln_upd_genome_bwa_parse + if [ ! -e $PSAM ]; then mkdir -p $PSAM; fi + +## Create output directory for ase counts from sam compare +SAMC=example_out/ase_counts_updated + if [ ! -e $SAMC ]; then mkdir -p $SAMC; fi + +## Create output directory for upd aln summaries and checks +CHKALN=example_out/check_aln + if [ ! -e $CHKALN ]; then mkdir -p $CHKALN; fi + +## Create output directory for Sam Compare check +CHKSC=example_out/check_samcomp + if [ ! -e $CHKSC ]; then mkdir -p $CHKSC; fi + +# G1,G2,sampleID,fqName,fqExtension,readLength,techRep +DESIGN_FILE=example_in/df_BASE_galaxy_W55_noEmptyFq.csv +DESIGN=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $DESIGN_FILE) +IFS=',' read -ra ARRAY <<< "$DESIGN" + +G1=${ARRAY[0]} +G2=${ARRAY[1]} +READLEN=${ARRAY[5]} +FQ=${ARRAY[3]} +EXT=${ARRAY[4]} + +## Create temp dir +ROZ=roz_${FQ} + mkdir -p ${ROZ} + +## set READ and calculate ave readlength +READ=${ORIG}/${FQ}${EXT} +echo " read is $READ +" + +AVE_RL=$(awk '{if(NR%4==2 && length) {count++; bases += length} } END {print bases/count}' ${READ} | awk '{ printf "%.0f\\n", $1 }') + +###### Create modified BED file to use in SAM compare - this bed has features 1st with start and end positions in genome +## use awk to reorder columms in bed file +SBED=example_out/snp_feature_first.bed + +echo "Reformatting $BEDFILE" +awk -v OFS='\t' '{print $4,$2,$3,$1}' $BEDFILE > $SBED + + +###### (1) Align Reads to Updated Genomes - first to G2 ref then to G1 reference - and Parse sam file +FQLINEFN=$(wc -l $READ) +FQLINE=$(echo $FQLINEFN | cut -d" " -f1) +NUMREAD=$(( FQLINE / 4 )) +FN=$(echo $FQLINEFN | cut -d" " -f2) + +## count number of starting reads - same for G1 and G2 refs +echo $NUMREAD | awk -v fq=${FQ} -v gq=pre_aln_read_count '{print "filename" "," gq "\n" fq "," $0}' > $CHKALN/pre_aln_reads_${FQ}.csv + + + +for FOO in G1 G2 +do + if [[ $FOO == 'G2' ]] + then + BREF=$REF/${G2}_snp_upd_genome_BWA + + echo "Aligning G2 to reference $BREF" + bwa mem -t 8 -M $BREF $READ > $OUTALN/${G2}_${FQ}_upd.sam + + echo -e "Start BWASplitSam on: $OUTALN/${G2}_${FQ}_upd.sam" + $SCRIPTS/BWASplitSAM_07mai.py -s $OUTALN/${G2}_${FQ}_upd.sam --outdir $ROZ -fq1 $READ + + ## cat together mapped and opposite + cat $ROZ/${G2}_${FQ}_upd_mapped.sam $ROZ/${G2}_${FQ}_upd_oposite.sam > $PSAM/${G2}_${FQ}_upd_uniq.sam + + elif [[ $FOO == 'G1' ]] + then + BREF=$REF/${G1}_snp_upd_genome_BWA + + echo "Aligning G1 to reference $BREF" + bwa mem -t 8 -M $BREF $READ > $OUTALN/${G1}_${FQ}_upd.sam + + echo -e "Start BWASplitSam on: $OUTALN/${G1}_${FQ}_upd.sam" + $SCRIPTS/BWASplitSAM_07mai.py -s $OUTALN/${G1}_${FQ}_upd.sam --outdir $ROZ -fq1 $READ + + ## cat together mapped and opposite + cat $ROZ/${G1}_${FQ}_upd_mapped.sam $ROZ/${G1}_${FQ}_upd_oposite.sam > $PSAM/${G1}_${FQ}_upd_uniq.sam + fi +done + +### move alignment summary csv files +mv ${ROZ}/${G1}_${FQ}_upd_summary.csv ${CHKALN}/${G1}_${FQ}_upd_summary.csv +mv ${ROZ}/${G2}_${FQ}_upd_summary.csv ${CHKALN}/${G2}_${FQ}_upd_summary.csv + +### for every FQ file run, should have 2 sam files (NOTE default is SE) +echo " + checking for 2 sam files" + +python $SCRIPTS/check_sam_present_04amm.py \ + -fq $FQ \ + -alnType SE \ + -samPath $PSAM \ + -G1 $G1 \ + -G2 $G2 \ + -o $CHKALN/check_2_sam_files_${FQ}.txt + +## run python script to count reads into aln and in each SAM file +echo " + checking for missing reads in sam files" + +python $SCRIPTS/check_for_lost_reads_05amm.py \ + -a1 $CHKALN/${G1}_${FQ}_upd_summary.csv \ + -a2 $CHKALN/${G2}_${FQ}_upd_summary.csv \ + -numread $CHKALN/pre_aln_reads_${FQ}.csv \ + -fq $FQ \ + -o $CHKALN/check_start_reads_vs_aln_reads_${FQ}.csv + + +## Insert bedtools intersect script + any checks (reads in aln sam output) +###### (2) Bedtools Intersect: Here we will call the shell script to reformat the sam file so that the have feature names instead of CHR names +## In parsed SAM, + +for SAMFILE in $PSAM/*_${FQ}_upd_uniq.sam +do + MYSAMFILE2=$(basename $SAMFILE) + + AWKTMP=$PSAM/${MYSAMFILE2/_uniq.sam/_uniq_AWK.txt} + NEWSAM=$PSAM/${MYSAMFILE2/_uniq.sam/_uniq_FEATURE.sam} + + #Create a bed file to write the starting position of every read and an end postion (end = start + readlength) + awk -v readLen=${READLEN} -v OFS='\t' '{print $3,$4,$4+readLen}' $SAMFILE > $AWKTMP + + BED4=${PSAM}/${MYSAMFILE2/_uniq.sam/_uniq_int_all.bed} + BED3=$PSAM/${MYSAMFILE2/_uniq.sam/_uniq_int.bed} + SUM=${PSAM}/${MYSAMFILE2/_uniq.sam/_drop_summary.txt} + + #Run bedtools intersect with -loj between the reads and the features. + #We will have one result for each region + # pipe to awk to remove rows where a read does NOT overlap with a feature + bedtools intersect -a $AWKTMP -b $SBED -loj > ${BED4} + + awk -v OFS='\t' '$4 !="."' ${BED4} > $BED3 + + ## create file with counts for before and after dropping + awk -v a=0 -v b=${G2}_${FQ} -v OFS=',' 'BEGIN{print "fqName", "number_overlapping_rows", "total_number_rows"delimited}; { if ($4 !=".") a++} END { print b, a, NR}' ${BED4} > ${SUM} + + #With awk substitute column 3 of sam file with column 7 (Feature name) of bed file (using chrom and pos as keys). + ##omit reads with no feature assigned + awk -v OFS='\t' 'FNR==NR{a[$1,$2]=$7; next} {$3=a[$3,$4]}1' ${BED3} $SAMFILE | awk -F'\t' '$3!=""' > $NEWSAM + + echo initial sam file $SAMFILE + echo awk outfile $AWKTMP + echo bed intersect outfile $BED3 + echo new sam file "$NEWSAM" + +done + + + +## Grab sam files and bed files + ## sam1 (samA) = G1 and sam2 (samB) = G2 +SAM1=$PSAM/${G1}_${FQ}_upd_uniq_FEATURE.sam +SAM2=$PSAM/${G2}_${FQ}_upd_uniq_FEATURE.sam +BED1=$PSAM/${G1}_${FQ}_upd_uniq_int.bed +BED2=$PSAM/${G2}_${FQ}_upd_uniq_int.bed + +awk 'NR==FNR{c[$3]++;next};c[$7] == 0' $SAM1 $BED1 > $CHKSC/check_sam_bed_${G1}_${FQ}.txt +awk 'NR==FNR{c[$3]++;next};c[$7] == 0' $SAM2 $BED2 > $CHKSC/check_sam_bed_${G2}_${FQ}.txt + +###### (3) Run Sam Compare + +READ1=${ORIG}/${FQ}${EXT} + +echo -e "READ1: '${READ1}" +echo -e "SAM1: '${SAM1}'" +echo -e "SAM2: '${SAM2}'" +echo -e "BED: '${SBED}'" + +echo -e "starting sam compare for $FQ " +## NOTE using average read length!! +python $SCRIPTS/sam_compare_w_feature.py \ + -n \ + -l ${AVE_RL} \ + -f $BEDFILE \ + -q $READ1 \ + -A $SAM1 \ + -B $SAM2 \ + -c $SAMC/ase_counts_${FQ}.csv \ + -t $SAMC/ase_totals_${FQ}.txt \ + --log $CHKSC/ase_log_${FQ}.log + +### mv drop summary to check_aln dir +echo " + Moving drop read summary files to check_aln dir" +mv ${PSAM}/*_${FQ}_upd_drop_summary.txt ${CHKALN}/ + +echo -e "run sam compare check for $FQ " +# Check to make sure counts in csv summary file is within range of minimum unique reads from respective sam files and +# the summation of the unique reads of both sam files +python $SCRIPTS/check_samcomp_for_lost_reads_03amm.py \ + -b1 $CHKALN/${G1}_${FQ}_upd_drop_summary.txt \ + -b2 $CHKALN/${G2}_${FQ}_upd_drop_summary.txt \ + -G1 $G1 \ + -G2 $G2 \ + -s $SAMC/ase_totals_${FQ}.txt \ + -fq $FQ \ + -o $CHKSC/check_samcomp_${FQ}_aln_2_upd.csv + +rm -r $ROZ + + diff --git a/bash/run_ase_bayesian.bash b/bash/run_ase_bayesian.bash new file mode 100644 index 0000000..3c566b8 --- /dev/null +++ b/bash/run_ase_bayesian.bash @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +## 2 conditions - M vs V + +## Set / Create Directories and Variables +SCRIPTS=hpc/stan2_scripts + +## Create temp output directory for initial datafile with modified headers before entering Bayesian +ROZ=example_out/temp_directory + mkdir -p $ROZ + +## directory for model testing +BAYES=example_out/bayesian_in + +BAYESOUT=example_out/bayesian_out + mkdir -p $BAYESOUT + +## Set design file with G1, G2, c1, c2, and input filename columns +DESIGN_FILE=example_in/comparate_design_file.csv + +DESIGN=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $DESIGN_FILE) +IFS=',' read -ra ARRAY <<< "$DESIGN" + +COMP_1=${ARRAY[0]} +COMP_2=${ARRAY[1]} +COMPID=${ARRAY[2]} + +echo "comparate 1 is $COMP_1" + +## Set number of comparates to be analyzed + ## M vs F for each line = 2 +COMPNUM=2 + +## set number of iterations = 100,000 +ITER=100000 + +## set burn in of 10,000 +WARMUP=10000 + +###### Run python script calling environmental bayesian model (stan2, 2 conditions) +python3 $SCRIPTS/NBmodel_stan2_slurm_02amm.py \ + -comparate_1 $COMP_1 \ + -comparate_2 $COMP_2 \ + -c1_g1 M \ + -c1_g2 M \ + -c2_g1 V \ + -c2_g2 V \ + -compID $COMPID \ + -datafile $BAYES \ + -datafile2 $BAYESOUT/ \ + -cond $COMPNUM \ + -workdir . \ + -routput $BAYESOUT \ + -subpath $SCRIPTS/NBModel_stan2_flex_prior.R \ + -iterations $ITER \ + -warmup $WARMUP \ + -o $BAYESOUT diff --git a/bash/run_ase_create_design_file_4_prior_calc.bash b/bash/run_ase_create_design_file_4_prior_calc.bash new file mode 100644 index 0000000..33951d9 --- /dev/null +++ b/bash/run_ase_create_design_file_4_prior_calc.bash @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +### create design file for calculating priors from df_ase_samcomp_summed.dsv + +## Set / Create Directories and Variables +DF=example_out +DESIGN=example_in/df_ase_samcomp_summed.csv + +awk 'BEGIN{FS=","; OFS=","} { print $1, $2, $4}' ${DESIGN} | sort -u > ${DF}/df_priors.csv diff --git a/bash/run_ase_genotype_specific_references_testData.bash b/bash/run_ase_genotype_specific_references_testData.bash new file mode 100644 index 0000000..f18e569 --- /dev/null +++ b/bash/run_ase_genotype_specific_references_testData.bash @@ -0,0 +1,51 @@ +#!/usr/bin/env bash + +## set directories +## All directories are relative to the main BayesASE folder +VCFIN=example_in/vcf_by_genotype +VCFOUT=example_out/vcf_by_genotype +UREF=example_out/reference/updated_genomes_vcfConsensus +mkdir -p $VCFOUT +mkdir -p $UREF + +FASTA=example_in/reference/dmel_r551_chromosome_2R_X.fasta + +# line,old +# W55,Winters_55 +# W1118,w1118_w118 +DESIGN_FILE=example_in/df_list_G2_VCF_split.csv +DESIGN=$(sed -n "${SLURM_ARRAY_TASK_ID}p" $DESIGN_FILE) +IFS=',' read -ra ARRAY <<< "$DESIGN" + +G2=${ARRAY[0]} + +#### (1) UPDATE SNPs +### Index starting VCF files + + + ## remove and replace if gz file already exists (so we can overwrite) + if [[ -f ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf.gz ]] ; then + rm ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf.gz + fi + + # sort split VCF files + bcftools sort -O v ${VCFIN}/${G2}_snp_testGenes.vcf -o ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf + + echo "indexing individual SNP vcf files: ${G2} " + bgzip ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf + tabix ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf.gz + + echo "updating SNPs: ${G2} " + cat ${FASTA} | vcf-consensus ${VCFOUT}/${G2}_snp_testGenes_sorted.vcf.gz > ${UREF}/${G2}_snp_upd_genome.fasta + + + #### (2) Build indexes from the references + echo `date`": building BWA index for ${G2} UPD genome " + ##BWA-mem references -p is the prefix for the output database, -a is the indexing algorithm ('bwtsw' is for ref>2G, 'is' for ref<2G). + bwa index -p ${UREF}/${G2}_snp_upd_genome_BWA -a bwtsw ${UREF}/${G2}_snp_upd_genome.fasta + + echo `date`": building samtools index for ${G2} UPD genome" + samtools faidx ${UREF}/${G2}_snp_upd_genome.fasta + + + diff --git a/bash/run_ase_merge_priors_2_comparate_testData.bash b/bash/run_ase_merge_priors_2_comparate_testData.bash new file mode 100644 index 0000000..29a1ae9 --- /dev/null +++ b/bash/run_ase_merge_priors_2_comparate_testData.bash @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +## merge priors to comparate +## merge comparates together + + +SCRIPTS=hpc/ase_scripts + +## user must provide following design file for merging comparates: +DESIGN2=example_in/df_merge_comparates_4_bayesian.csv + +DESIGN=example_out/df_priors.csv +PRIORS=example_out/priors_fromData +FILT=example_out/ase_counts_summarized +BAYESIN=example_out/bayesian_in + mkdir -p $BAYESIN + + echo "running merge priors to comparate + " + python3 ${SCRIPTS}/merge_priors_to_comparate_03amm.py \ + --output ${BAYESIN} \ + --comp ${FILT} \ + --prior ${PRIORS} \ + --design ${DESIGN} + + + python3 ${SCRIPTS}/merge_comparates_and_gen_headers_for_bayesian_02amm.py \ + --output ${BAYESIN} \ + --comp ${BAYESIN} \ + --design ${DESIGN2} + + diff --git a/bash/run_ase_prior_calculation_testData.bash b/bash/run_ase_prior_calculation_testData.bash new file mode 100644 index 0000000..1bde190 --- /dev/null +++ b/bash/run_ase_prior_calculation_testData.bash @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +### (1) calculate priors from ASE count tables (generated from simulated reads, DNA reads, or RNA data reads) + +## Set / Create Directories and Variables +SCRIPTS=hpc/ase_scripts + +## this example calculates from data +FILT=example_out/ase_counts_summarized + +OUTPUT=example_out/priors_fromData +mkdir -p $OUTPUT + +DESIGN=example_out/df_priors.csv + +##### (1) calculate priors from ASE counts - data + +python3 $SCRIPTS/calculate_priors_from_ase_count_tables_03amm.py \ + --output $OUTPUT \ + --design $DESIGN \ + --input $FILT \ + --debug diff --git a/bash/run_ase_summarize_counts_testData.bash b/bash/run_ase_summarize_counts_testData.bash new file mode 100644 index 0000000..f901cfa --- /dev/null +++ b/bash/run_ase_summarize_counts_testData.bash @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +## combine ase count tables +## summarize data + + +SCRIPTS=hpc/ase_scripts +SAMC=example_out/ase_counts_updated + +SSUM=example_out/ase_counts_summed_techReps + mkdir -p $SSUM +FILT=example_out/ase_counts_summarized + mkdir -p $FILT + +BEDFILE=example_out/snp_feature_first.bed +DESIGN=example_in/df_BASE_galaxy_W55_combinedFq.csv + + + mkdir -p ${SSUM} + + SIM=False + APN=1 + + python3 ${SCRIPTS}/combine_cnt_tables_13amm.py \ + -design ${DESIGN} \ + -sim ${SIM} \ + --bed ${BEDFILE} \ + --path ${SAMC} \ + --designdir example_in \ + --out ${SSUM} + + + ## design2 is created in above script + DESIGN2=example_in/df_ase_samcomp_summed.csv + + mkdir -p ${FILT} + + python3 ${SCRIPTS}/summarize_sam_compare_cnts_table_1cond_and_output_APN_06amm.py \ + --output ${FILT} \ + --design ${DESIGN2} \ + --parent1 G1 \ + --parent2 G2 \ + --sampleCol sample \ + --sampleIDCol sampleID \ + --sam-compare-dir ${SSUM} \ + --apn ${APN} \ diff --git a/example_in/comparate_design_file.csv b/example_in/comparate_design_file.csv index fd29fb6..0439f19 100755 --- a/example_in/comparate_design_file.csv +++ b/example_in/comparate_design_file.csv @@ -1,2 +1 @@ -Comparate_1,Comparate_2,compID W55_M,W55_V,W55_M_V diff --git a/example_in/df_BASE_galaxy_W55_noEmptyFq.csv b/example_in/df_BASE_galaxy_W55_noEmptyFq.csv index f29c561..bb98727 100755 --- a/example_in/df_BASE_galaxy_W55_noEmptyFq.csv +++ b/example_in/df_BASE_galaxy_W55_noEmptyFq.csv @@ -1,4 +1,3 @@ -G1,G2,sampleID,fqName,fqExtension,readLength,techRep W1118,W55,W55_M_1,W55_M_1_1,.fastq,96,1 W1118,W55,W55_M_1,W55_M_1_2,.fastq,96,2 W1118,W55,W55_M_1,W55_M_1_3,.fastq,96,3 diff --git a/example_in/df_list_G2_VCF_split.csv b/example_in/df_list_G2_VCF_split.csv index 3d51daa..bd9531f 100644 --- a/example_in/df_list_G2_VCF_split.csv +++ b/example_in/df_list_G2_VCF_split.csv @@ -1,3 +1,2 @@ -line,old W55,Winters_55 W1118,w1118_w118 diff --git a/hpc/ase_scripts/BWASplitSAM_07mai.py b/hpc/ase_scripts/BWASplitSAM_07mai.py index 5dd3c45..a869013 100755 --- a/hpc/ase_scripts/BWASplitSAM_07mai.py +++ b/hpc/ase_scripts/BWASplitSAM_07mai.py @@ -132,12 +132,12 @@ def SplitSAMPE (fname,odir,summname): # Getting & printing "gray" reads elif elements[1] in flags_grayones: - print >> GRAY,'\t'.join(elements) + print('\t'.join(elements), file=GRAY) counter_grayones += 1 total += 1 # Getting & printing "OPOSITE" reads elif elements[1] in flags_opositestrand: - print >> OPOSITE,'\t'.join(elements) + print('\t'.join(elements), file=OPOSITE) counter_opositestrand += 1 total += 1 @@ -146,31 +146,31 @@ def SplitSAMPE (fname,odir,summname): elif elements[1] in flags_mapped1: regmatch=re.match(".+\tAS:i:([0-9]+)\tXS:i:([0-9]+).*",line) if int(regmatch.group(1))-int(regmatch.group(2))==0: - print >> AMBIGUOUS,'\t'.join(elements) + print('\t'.join(elements), file=AMBIGUOUS) ambiguous1.append(elements[0]) counter_ambiguous1 += 1 total += 1 else: - print >> MAPPED,'\t'.join(elements) + print('\t'.join(elements), file=MAPPED) counter_mapped1 += 1 total += 1 elif elements[1] in flags_mapped2: regmatch=re.match(".+\tAS:i:([0-9]+)\tXS:i:([0-9]+).*",line) if int(regmatch.group(1))-int(regmatch.group(2))==0: - print >> AMBIGUOUS,'\t'.join(elements) + print('\t'.join(elements), file=AMBIGUOUS) ambiguous2.append(elements[0]) counter_ambiguous2 += 1 total += 1 else: - print >> MAPPED,'\t'.join(elements) + print('\t'.join(elements), file=MAPPED) counter_mapped2 += 1 total += 1 # If not in the previous categories then unknown else: - print "Warning: "+elements[1]+" key is not recognized" - print >> UNRECOGNIZED,'\t'.join(elements) + print("Warning: "+elements[1]+" key is not recognized") + print('\t'.join(elements), file=UNRECOGNIZED) # Print summary @@ -186,9 +186,9 @@ def SplitSAMPE (fname,odir,summname): counter_bothunmapped2,counter_mapped1, counter_mapped2,counter_ambiguous1, counter_ambiguous2] - count_values = map(str,count_values) - print >> SUMMARY,','.join(count_names) - print >> SUMMARY,','.join(count_values) + count_values = list(map(str,count_values)) + print(','.join(count_names), file=SUMMARY) + print(','.join(count_values), file=SUMMARY) # Clossing all files @@ -253,12 +253,12 @@ def SplitSAMSE (sam,odir,summname): counter_unmappedread += 1 # Getting & printing "OPOSITE" reads elif elements[1] in flags_opositestrand: - print >> OPOSITE,'\t'.join(elements) + print('\t'.join(elements), file=OPOSITE) counter_total += 1 counter_opositestrand += 1 # Getting & printing "CHIMERIC" reads elif elements[1] in flags_chimeric: - print >> CHIMERIC,"\t".join(elements) + print("\t".join(elements), file=CHIMERIC) counter_total += 1 counter_chimeric += 1 # Getting & printing AMBIGUOUS reads, those who are not ambiguous are @@ -266,18 +266,18 @@ def SplitSAMSE (sam,odir,summname): elif elements[1] in flags_mapped: regmatch=re.match(".+\tAS:i:([0-9]+)\tXS:i:([0-9]+).*",line) if int(regmatch.group(1))-int(regmatch.group(2))==0: - print >> AMBIGUOUS,'\t'.join(elements) + print('\t'.join(elements), file=AMBIGUOUS) ambiguous.append(elements[0]) counter_total += 1 counter_ambiguous += 1 else: - print >> MAPPED,'\t'.join(elements) + print('\t'.join(elements), file=MAPPED) counter_total += 1 counter_mapped += 1 #If not in the previous categories then unknown else: - print "Warning: "+elements[1]+" key is not recognized" + print("Warning: "+elements[1]+" key is not recognized") #Print summary @@ -296,9 +296,9 @@ def SplitSAMSE (sam,odir,summname): counter_ambiguous, counter_chimeric] - count_values = map(str,count_values) - print >> SUMMARY ,','.join(count_names) - print >> SUMMARY ,','.join(count_values) + count_values = list(map(str,count_values)) + print(','.join(count_names), file=SUMMARY) + print(','.join(count_values), file=SUMMARY) #Clossing all files SAM.close() diff --git a/hpc/ase_scripts/sam_compare_w_feature.py b/hpc/ase_scripts/sam_compare_w_feature.py index bda9454..fe8139e 100755 --- a/hpc/ase_scripts/sam_compare_w_feature.py +++ b/hpc/ase_scripts/sam_compare_w_feature.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -from __future__ import print_function + ## @package sam_compare # This script compares two SAM files (sequence alignment maps) and @@ -21,7 +21,7 @@ # Changed from sending data to stderr to file/stdout option import sys, os, time, datetime, locale -import csv, cStringIO, logging, pprint, re, mmap, argparse, subprocess +import csv, io, logging, pprint, re, mmap, argparse, subprocess import shlex, gc, getpass, random from Bio import SeqIO @@ -246,11 +246,11 @@ def get_sam_reads(samdata, fqids, read_length, samid): reads[name][1:] = [1, 1, 0] else: # not an exact match, check if we got closer than previous (read_length initially) - if edit_distance < reads[name][3]: + if edit_distance < int(reads[name][3].replace("\\n","")): # we got closer to a match, set new edit distance and reset count reads[name][3] = edit_distance reads[name][1] = 1 - elif edit_distance == reads[name][3]: + elif edit_distance == int(reads[name][3].replace("\\n","")): # same edit distance, inc count reads[name][1] += 1 else: @@ -565,7 +565,7 @@ def write_counts(filename, counts): f = open(filename, 'w') if filename.lower() != "stdout" else sys.stdout f.write(",".join(header)) f.write("\n") - for row in counts.itervalues(): + for row in counts.values(): rowstr = "" for field in fields: if rowstr: diff --git a/hpc/ase_scripts/summarize_sam_compare_cnts_table_1cond_and_output_APN_06amm.py b/hpc/ase_scripts/summarize_sam_compare_cnts_table_1cond_and_output_APN_06amm.py index df74a7d..298ee92 100755 --- a/hpc/ase_scripts/summarize_sam_compare_cnts_table_1cond_and_output_APN_06amm.py +++ b/hpc/ase_scripts/summarize_sam_compare_cnts_table_1cond_and_output_APN_06amm.py @@ -65,7 +65,7 @@ def main(): sampleIDCol = args.sampleIDCol # groupby G2 and comparison - df_design['numReps'] = df_design.groupby([parent2, sampleCol]).sample.transform('count') + df_design['numReps'] = df_design.groupby([parent2, sampleCol])['sample'].transform('count') print(df_design[:5]) #counter variables for each comparate diff --git a/hpc/stan2_scripts/NBmodel_stan2_slurm_02amm.py b/hpc/stan2_scripts/NBmodel_stan2_slurm_02amm.py index a1af91e..65bb08d 100644 --- a/hpc/stan2_scripts/NBmodel_stan2_slurm_02amm.py +++ b/hpc/stan2_scripts/NBmodel_stan2_slurm_02amm.py @@ -81,7 +81,7 @@ def main(): infile=pd.read_csv(os.path.join(args.datafile, infileName)) infile.set_index('FEATURE_ID') - pre_headers=list(infile.columns.get_values()) + pre_headers=list(infile.columns) pre_headers_split=pre_headers[:4] pre_headers_split2=pre_headers[4:]