diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..59409aa --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,16 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "pythonPath": "/home/wanglab/anaconda3/bin/python" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..3469003 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "/home/wanglab/anaconda3/bin/python" +} \ No newline at end of file diff --git a/ChIP_analysis/README.md b/ChIP_analysis/README.md index 725b281..338a010 100755 --- a/ChIP_analysis/README.md +++ b/ChIP_analysis/README.md @@ -7,22 +7,43 @@ binding sites ## Pre-processing of ChIP data ## For each fastq file, the following steps were taken to preprocess the data +Reads were concatenated from multiple files into a single large fastq.gz file + +Here: +arg0 is substituted for a samples prefix + +```bash +cat *R1*.fastq.gz > arg0_all_R1.fastq.gz +cat *R2*.fastq.gz > arg0_all_R2.fastq.gz +``` + +Illumina adapters were trimmed from reads. + Here: arg0 is substituted for a samples prefix arg1 represents the fastq for R1 of the paired end sequencing arg2 represents the fastq for R2 of the paired end sequencing -``` +```bash fastqc arg1 -f fastq -o fastqc_before/ fastqc arg2 -f fastq -o fastqc_before/ cutadapt --quality-base=33 -a AGATCGGAAGAGC -A AGATCGGAAGAGC -n 3 -m 20 --mask-adapter --match-read-wildcards -o arg0_R1_cutadapt.fastq.gz -p arg0_R2_cutadapt.fastq.gz arg1 arg2 > arg0_cutadapt.log 2> arg0_cutadapt.err -TrimmomaticPE -phred33 arg0_R1_cutadapt.fastq.gz arg0_R2_cutadapt.fastq.gz arg0_R1_trim_paired.fastq.gz arg0_R1_trim_unpaired.fastq.gz arg0_R2_trim_paired.fastq.gz arg0_R2_trim_unpaired.fastq.gz TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:20 > arg0_trim.log 2> arg0_trim.err +java -jar trimmomatic-0.39.jar PE -phred33 arg0_R1_cutadapt.fastq.gz arg0_R2_cutadapt.fastq.gz arg0_R1_trim_paired.fastq.gz arg0_R1_trim_unpaired.fastq.gz arg0_R2_trim_paired.fastq.gz arg0_R2_trim_unpaired.fastq.gz TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:20 > arg0_trim.log 2> arg0_trim.err fastqc arg0_R1_trim_paired.fastq.gz -f fastq -o fastqc_after/ fastqc arg0_R2_trim_paired.fastq.gz -f fastq -o fastqc_after/ ``` +If you have single end data: + +```bash +fastqc arg1 -f fastq -o fastqc_before/ +cutadapt --quality-base=33 -a AGATCGGAAGAGC -a CGGAAGAGCACAC -n 3 -m 20 --mask-adapter --match-read-wildcards -o arg0_R1_cutadapt.fastq.gz arg1 > arg0_cutadapt.log 2> arg0_cutadapt.err +java -jar trimmomatic-0.39.jar SE -phred33 arg0_R1_cutadapt.fastq.gz arg0_R1_trim_unpaired.fastq.gz TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:20 > arg0_trim.log 2> arg0_trim.err +fastqc arg0_R1_trim_paired.fastq.gz -f fastq -o fastqc_after/ +``` + ## Alignment of processed reads ## -Next, each pair of processed reads was aligned with bowtie2. +Next, each pair of processed reads was aligned with bowtie2. Here arg0 is substituted for a samples prefix @@ -30,19 +51,22 @@ arg1 represents the trimmed paired fastq for R1 of the paired end sequencing arg2 represents the trimmed paired fastq for R2 of the paired end sequencing arg3 represents the trimmed unpaired fastq for R1 of the paired end sequencing arg4 represents the trimmed unpaired fastq for R2 of the paired end sequencing -``` -bowtie2 -x /home/mbwolfe/genomes/bowtie2indices/ATCC_47076 -1 arg1 -2 arg2 -U arg3,arg4 -X 2000 -q --end-to-end --very-sensitive -p 5 --phred33 --dovetail 2> arg0_bow.log | samtools view -bSh - > arg0.bam + +```bash +bowtie2 -x /home/mbwolfe/genomes/bowtie2indices/ATCC_47076 -1 arg1 -2 arg2 -U arg3,arg4 -X 2000 -q --end-to-end --very-sensitive -p 5 --phred33 --dovetail 2> arg0_bow.log | samtools view -bSh - > arg0.bam ``` ## Mapping of coverage and preparation for bootstrapping ## Next, reads were filtered with samtools and made into a sampling object using the custom script `bootstrap_sam_file.py`'s `parse` option. +### Note: if you have single-end reads, DO NOT include the `-f 3` flag, as it will exclude all reads without a proper pair, and you ain't got 'em. Here arg0 is the sample prefix arg1 is the input bam file -``` -samtools view -f 3 -F 2828 -q 30 arg1 | python bootstrap_sam_file.py parse - arg0.ob --paired 2> arg0_sampler.err + +```bash +samtools view -f 3 -F 2828 -q 30 arg1 | python3 bootstrap_sam_file.py parse - arg0.ob --paired 2> arg0_sampler.err ``` ## Obtaining summary tracks at 10 bp resolution ## @@ -58,8 +82,8 @@ arg7 and arg8 are samplers for the KO input samples Note that arg1 and arg3 are paired, arg2 and arg4 are paired etc. -``` -python bootstrapped_chip_no_consolidation.py 4639676 arg0 --ext_samps arg1 arg2 --inp_samps arg3 arg4 --ext_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1 --identity -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0.log +```bash +python3 bootstrapped_chip_no_consolidation.py --numba --sample_name_luts run_info --genome_size 4215607 --out_prefix arg0 --ChIP_samps arg1 arg2 --inp_samps arg3 arg4 --ChIP_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1 --identity -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0.log ``` ## Obtaining bootstrap replicate summary statistics ## @@ -76,8 +100,8 @@ arg7 and arg8 are samplers for the KO input samples Note that arg1 and arg3 are paired, arg2 and arg4 are paired etc. -``` -python bootstrapped_chip_no_consolidation.py 4639676 arg0 --ext_samps arg1 arg2 --inp_samps arg3 arg4 --ext_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1000 -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0_1000_bootstrap.log +```bash +python3 bootstrapped_chip_no_consolidation.py --numba --sample_name_luts run_info --genome_size 4215607 --out_prefix arg0 --ChIP_samps arg1 arg2 --inp_samps arg3 arg4 --ChIP_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1000 -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0_1000_bootstrap.log ``` ## Calculating IDR statistic ## @@ -89,7 +113,8 @@ arg0 is the sample prefix arg1 is one WT-KO replicate arg2 is a second WT-KO replicate arg3 is the estimated number of Lrp octamers for that condition -``` + +```bash Rscript calculate_idr.R arg1 arg2 arg3 arg0 > arg0.log 2> arg0.err ``` @@ -105,8 +130,9 @@ mad1-4.npy represent the corresponding bootstrap mad for each RSE replicate idr_combo1-2.npy represent each combination of subtracted replicates for the idr calculation out_peaks is the output prefix -``` -python calculate_peaks.py --log2ratios actual_sub1.npy actual_sub2.npy actual_sub3.npy actual_sub4.npy --mad mad_sub1.npy mad_sub2.npy mad_sub3.npy mad_sub4.npy --idr idr_combo1.npy idr_combo2.npy --resolution 10 --bins 3 --outpre out_peaks --idralpha 0.01 --bioalpha 0.001 --techalpha 0.001 + +```bash +python2.7 calculate_peaks.py --log2ratios actual_sub1.npy actual_sub2.npy actual_sub3.npy actual_sub4.npy --mad mad_sub1.npy mad_sub2.npy mad_sub3.npy mad_sub4.npy --idr idr_combo1.npy idr_combo2.npy --resolution 10 --bins 3 --outpre out_peaks --idralpha 0.01 --bioalpha 0.001 --techalpha 0.001 ``` For any questions or clarifications please contact Michael Wolfe at diff --git a/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-36.pyc b/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-36.pyc new file mode 100644 index 0000000..fa74946 Binary files /dev/null and b/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-36.pyc differ diff --git a/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-37.pyc b/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-37.pyc new file mode 100644 index 0000000..795f362 Binary files /dev/null and b/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-37.pyc differ diff --git a/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-36.pyc b/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-36.pyc new file mode 100644 index 0000000..ddc5366 Binary files /dev/null and b/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-36.pyc differ diff --git a/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-37.pyc b/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-37.pyc new file mode 100644 index 0000000..ebcc1b5 Binary files /dev/null and b/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-37.pyc differ diff --git a/ChIP_analysis/__pycache__/convert_multiple_to_average_robustz.cpython-37.pyc b/ChIP_analysis/__pycache__/convert_multiple_to_average_robustz.cpython-37.pyc new file mode 100644 index 0000000..16a7620 Binary files /dev/null and b/ChIP_analysis/__pycache__/convert_multiple_to_average_robustz.cpython-37.pyc differ diff --git a/ChIP_analysis/__pycache__/peak.cpython-37.pyc b/ChIP_analysis/__pycache__/peak.cpython-37.pyc new file mode 100644 index 0000000..59ea773 Binary files /dev/null and b/ChIP_analysis/__pycache__/peak.cpython-37.pyc differ diff --git a/ChIP_analysis/__pycache__/sam_utils.cpython-36.pyc b/ChIP_analysis/__pycache__/sam_utils.cpython-36.pyc new file mode 100644 index 0000000..93b1189 Binary files /dev/null and b/ChIP_analysis/__pycache__/sam_utils.cpython-36.pyc differ diff --git a/ChIP_analysis/__pycache__/sam_utils.cpython-37.pyc b/ChIP_analysis/__pycache__/sam_utils.cpython-37.pyc new file mode 100644 index 0000000..d8fb406 Binary files /dev/null and b/ChIP_analysis/__pycache__/sam_utils.cpython-37.pyc differ diff --git a/ChIP_analysis/bootstrap_helper_funcs.py b/ChIP_analysis/bootstrap_helper_funcs.py index 4676954..4d30044 100755 --- a/ChIP_analysis/bootstrap_helper_funcs.py +++ b/ChIP_analysis/bootstrap_helper_funcs.py @@ -28,10 +28,12 @@ #CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. import numpy as np +import numba from math import ceil, floor +# @numba.jit(nopython=True, parallel=True) def credible_interval(array, alpha = 0.05): - """ Take a bootstrapped based 95% credible interval + """ Take a bootstrap-based 95% credible interval Args: array (np.array): 1xn array (n number of bootstraps) @@ -41,15 +43,36 @@ def credible_interval(array, alpha = 0.05): pos 0: mean pos 1: min_ci at alpha pos 2: max_ci at alpha + pos 3: median """ - out_array = np.zeros(3, dtype=float) + out_array = np.zeros(4) mean = np.mean(array) out_array[0] = mean - sorted_array = np.sort(array) - out_array[1] = sorted_array[int(floor((alpha/2)*array.size))] - out_array[2] = sorted_array[int(floor(array.size - alpha/2*array.size))] + array_sort = np.sort(array) + out_array[1] = array_sort[int(floor((alpha/2)*array.size))] + out_array[2] = array_sort[int(floor(array.size - alpha/2*array.size))] + # print(array_sort.size) + # print(array_sort.size//2) + if array_sort.size % 2 == 0: + median = (array_sort[array_sort.size//2]+array_sort[array_sort.size//2-1])/2 + else: + median = array_sort[int(floor(array_sort.size//2))] + # print("Median: ", median) + out_array[3] = median return out_array +# @numba.jit(nopython=True) +# def bubblesort_jit(arr): +# N = arr.shape[0] +# for end in range(N, 1, -1): +# for i in range(end - 1): +# cur = arr[i] +# if cur > arr[i + 1]: +# tmp = arr[i] +# arr[i] = arr[i + 1] +# arr[i + 1] = tmp +# return(arr) + def least_extreme_value(stats): """ Take the value closest to zero for the min/max ci @@ -58,7 +81,7 @@ def least_extreme_value(stats): Returns: lev (float): least extreme value """ - mean, minci, maxci = stats + mean, minci, maxci, med = stats if minci <= 0.0 and maxci >= 0.0: return 0.0 elif minci >= 0.0 and maxci > 0.0: diff --git a/ChIP_analysis/bootstrap_helper_funcs.pyc b/ChIP_analysis/bootstrap_helper_funcs.pyc index 5c90a35..0184fc2 100644 Binary files a/ChIP_analysis/bootstrap_helper_funcs.pyc and b/ChIP_analysis/bootstrap_helper_funcs.pyc differ diff --git a/ChIP_analysis/bootstrap_sam_file.py b/ChIP_analysis/bootstrap_sam_file.py index 4592456..732c0fb 100755 --- a/ChIP_analysis/bootstrap_sam_file.py +++ b/ChIP_analysis/bootstrap_sam_file.py @@ -34,20 +34,26 @@ ################################################################################ ## Python base imports ## -from __future__ import division +from __future__ import division, print_function import argparse import sys import random import logging from math import floor, ceil +import time # Wildely available import numpy as np +import numba +from Bio import SeqIO # custom module import sam_utils +from bootstrap_helper_funcs import least_extreme_value, credible_interval +######################################################## ## TO DO: # add support for multiple chromosomed organisms # add support for strandedness +######################################################## class ReadSampler(object): """Class to hold and sample from processed reads. Reads are stored internally @@ -56,7 +62,7 @@ class ReadSampler(object): Attributes: reads (list): holds (start, end) tuples of reads. Converted to numpy array when sampling - total (int): number of reads in the list + total (int): number of reads in the list sampling (boolean): True if ready for sampling, False otherwise """ def __init__(self): @@ -69,11 +75,11 @@ def add_read(self, new_read): If sampler is in sampling mode, then converts self.reads to a list and sets the sampler flag back to False. - + Args: - new_read (tuple): new (start, end) read - """ + new_read (tuple): new (start, end) read + """ if self.sampling: self.convert_to_list() self.reads.append(new_read) @@ -83,24 +89,24 @@ def add_reads(self, new_reads): """ Method to add a multiple new reads to the sampler. If sampler is in sampling mode, then converts self.reads to a list - and sets the sampler flag back to False. + and sets the sampler flag back to False. Args: new_reads (list): new list of (start, end) reads - """ + """ if self.sampling: self.convert_to_list() self.reads.extend(new_reads) - + def convert_to_array(self): """ Method to convert reads to a numpy array for efficient sampling - """ + """ self.reads = np.asarray(self.reads, dtype="int64") self.sampling=True def convert_to_list(self): """ Method to convert reads to a list for efficient addition of reads - """ + """ self.reads = list(self.reads) self.sampling = False @@ -111,7 +117,7 @@ def pull_read(self, prng): Args: prng (np.RandomState object): random state to use for sampling - """ + """ if not self.sampling: self.convert_to_array() index = prng.random.randint(0, self.total) @@ -125,7 +131,7 @@ def pull_reads(self, n, prng): Args: n (int): number of reads to sample prng (np.RandomState object): random state to use for sampling - """ + """ if not self.sampling: self.convert_to_array() index = prng.randint(0, self.total, size=n) @@ -140,29 +146,66 @@ def save_data(self, f): Args: f (fhandle): a file handle to save to - """ + """ if not self.sampling: self.convert_to_array() np.save(f, self.reads) - + def sort_reads(self): """ Method to sort reads by starting location Will convert sampler to sampling mode if not already. - """ + """ if not self.sampling: self.convert_to_array() self.reads = self.reads[self.reads[:,0].argsort()] + def load_data(self, f): """ Method to load data from a saved sampler object Will overwrite any attributes that were previously in object - """ + """ self.sampling = True self.reads = np.load(f) self.total = self.reads.shape[0] +@numba.jit(nopython=True, parallel=True) +def get_cpm(array, cpm_array): + for i in range(array.shape[1]): + cpm_array[:,i] = array[:,i]/np.sum(array[:,i]) * 1e6 +def summary_stats_only_finite(array, alpha=0.05): + """ This version of summary stats only considers values that aren't nan or + inf. Creates summaries a single location based on bootstrap + + Args: + array (np.array): np.array that is 1 x n (n is the number of bootstrap + replicates) + + Returns: + summary_stats (np.array): np.array that is 1 x 9 + pos 0 average + pos 1 min_ci at alpha + pos 2 max_ci at alpha + pos 3 variance + pos 4 least extreme value + pos 5 median + pos 6 median absolute deviation + pos 7 number of infinite values in bootstrap + pos 8 number of nans in bootstrap + """ + finite_vals = np.isfinite(array) + num_inf = np.sum(np.isinf(array)) + num_nan = np.sum(np.isnan(array)) + if np.sum(finite_vals) == 0: + return(np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, num_inf, num_nan) + else: + these_stats = credible_interval(array[finite_vals], alpha) + this_lev = least_extreme_value(these_stats) + var = np.var(array[finite_vals]) + # median = np.median(array[finite_vals]) + mad = np.median(np.abs(array[finite_vals] - these_stats[3])) + return(these_stats[0], these_stats[1], these_stats[2], var, this_lev, these_stats[3], mad, num_inf, num_nan) def merge(intervals): """ Merge several individual intervals into one (start, stop) interval @@ -226,7 +269,7 @@ def get_paired_blocks(r1, r2): def create_read_list(samfile): """ Read in a samfile and convert it to a list of reads for the sampler object - + This function is for single end reads only. Skips any reads that are gapped reads @@ -242,13 +285,13 @@ def create_read_list(samfile): line = sam_utils.SamAlignment(line) vals = line.get_aligned_blocks() if len(vals) > 1: - logging.info("Skipping gapped read %s %s"%(line.QNAME, str(vals))) + logging.info("Skipping gapped read %s %s"%(line.QNAME, str(vals))) read_sampler.add_read(vals[0]) return read_sampler def create_read_list_paired(samfile): """ Read in a samfile and convert it to a list of reads for the sampler - object. + object. This function is for paired end reads only. Skips any reads that are gapped reads or are not properly paired. Assumes samfile is sorted by readname and @@ -260,17 +303,17 @@ def create_read_list_paired(samfile): Returns: read_sampler(obj(ReadSampler)): final read sampler for the samfile - + Raises: ValueError: If pair readnames don't match. Not considered a failsafe for catching violations of assumptions above but should catch most mistakes. """ read_sampler = ReadSampler() - while True: + while True: line1 = samfile.readline() line2 = samfile.readline() - if not line2: + if not line2: break line1 = sam_utils.SamAlignment(line1) line2 = sam_utils.SamAlignment(line2) @@ -288,11 +331,11 @@ def create_read_list_paired(samfile): logging.error("Skipping pair %s"%err) return read_sampler -def map_read(array, read, res=1.0): +def map_reads(array, reads, res=1.0): """ Take in a [start, stop] read and map it to an array. Read must always be in single basepair coordinates [0,end). Array can - be any resolution desired through control of res parameter. Modifies + be any resolution desired through control of res parameter. Modifies array in place. Args: @@ -301,24 +344,57 @@ def map_read(array, read, res=1.0): res (optional, float): resolution the numpy array is in """ - start, stop = read - array[int(ceil(start/res)):int(ceil(stop/res))] += 1 + numba_sum_coverage_from_sample(reads, array, res) + # start, stop = reads + # array[int(ceil(start/res)):int(ceil(stop/res))] += 1 def sample(read_sampler, n, array, res=1.0, prng = np.random.RandomState()): """ Sample reads with replacement from a sampler and map them to an array - Modifies array in place - Args: read_sampler (class ReadSampler): object holding reads to sample n (int): number of reads to sample array (1d np.array): array to store coverage in res (optional, float): resolution the numpy array is in - prng (optional, np.RandomState): random state to pull random numbers + prng (optional, np.RandomState): random state to pull random numbers from """ - for read in read_sampler.pull_reads(n, prng): - map_read(array, read, res) + sampled_reads = read_sampler.pull_reads(n, prng) + numba_sum_coverage_from_sample(sampled_reads, array, res) + # return(array) + # for read in read_sampler.sampled_reads(n, prng): + # map_read(array, read, res) + +@numba.jit(nopython=True, parallel=True) +def numba_sum_coverage_from_sample(sampled_reads, array, res): + """ Map sampled reads to an array + + Args: + reads (): sampled read (start,end) positions + array (): array to be populated with coverage + res (float): resolution the array is in + """ + for i in range(sampled_reads.shape[0]): + read = sampled_reads[i,:] + # read = sampled_reads[pos,:] + start, stop = read + start = float(start) + stop = float(stop) + res = float(res) + x = int(ceil(start/res)) + y = int(ceil(stop/res)) + # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION + #array[start:stop] +=1 + # array[x:y] += 1 + # sum_range_vectorized(x,y,array) + for idx in range(x,y): + array[idx] += 1 + # return(array) + +# @numba.guvectorize(["void(int32, int32, int32[:])"], '(), (), (n)') +# def sum_range_vectorized(x,y,array): +# for idx in range(x,y): +# array[idx] += 1 if __name__ == "__main__": @@ -346,10 +422,12 @@ def sample(read_sampler, n, array, res=1.0, prng = np.random.RandomState()): on the samfile of interest") sample_parser.add_argument('outpre', help="output file to np.save the\ numpy array that is created") - sample_parser.add_argument('array_size',type=int, help="length of genome") + sample_parser.add_argument('--reference_genome', type=str, help="fasta\ + file containing reference genome. Will be used to infer genome size") + # sample_parser.add_argument('array_size',type=int, help="length of genome") sample_parser.add_argument('--num_samples', type=int, default=1, help="number of full samples to pull from the sampler, default is 1") - sample_parser.add_argument('--num_reads', type=int, default=None, + sample_parser.add_argument('--num_reads', type=int, default=None, help="number of reads to pull for each sample. Default is the size of\ sampling object.") sample_parser.add_argument('--identity', action="store_true", @@ -359,13 +437,25 @@ def sample(read_sampler, n, array, res=1.0, prng = np.random.RandomState()): help="only report every x basepairs, default=1") sample_parser.add_argument('--seed', type=int, default=1234, help="psuedo-random number generator seed, default=1234") - args = parser.parse_args() + # summarize + summarize_parser = subparsers.add_parser('summarize', help="Summarize\ + coverage from bootstrap samples.") + summarize_parser.add_argument('--samples_path', help="output file from using sample\ + on the sampler of interest") + summarize_parser.add_argument('--out_prefix', help="output file to np.save the\ + numpy array that is created") + args = parser.parse_args() + + logging.basicConfig(format='%(asctime)s %(message)s',level=logging.INFO) if args.command == "parse": + logging.info("Preparing sampler object {}".format(args.outpre + ".npy")) if args.samfile == "-": + logging.info("Reading stream from stdin") f = sys.stdin else: + logging.info("Reading file {}".format(args.samfile)) f = open(args.samfile, mode="r") if args.paired: sampler = create_read_list_paired(f) @@ -374,20 +464,56 @@ def sample(read_sampler, n, array, res=1.0, prng = np.random.RandomState()): f.close() sampler.sort_reads() sampler.save_data(args.outpre) + elif args.command == "sample": + array_size = len(SeqIO.read(args.reference_genome, 'fasta')) + logging.info("Inferred genome size is {}".format(array_size)) + + logging.info("Sampling from file {}".format(args.outpre + ".npy")) + logging.info("Using seed {}".format(args.seed)) prng = np.random.RandomState(args.seed) - array = np.zeros((int(floor(args.array_size/args.resolution)), args.num_samples)) + array = np.zeros((int(floor(array_size/args.resolution)), args.num_samples), dtype=np.int32) sampler = ReadSampler() sampler.load_data(args.samplerfile) if args.identity: - for read in sampler.reads: - map_read(array, read, args.resolution) + map_reads(array, sampler.reads, args.resolution) np.save(args.outpre, array) else: - for i in xrange(args.num_samples): + for i in range(args.num_samples): + begin = time.time() if args.num_reads: num_reads = args.num_reads else: - num_reads = sampler.total + num_reads = sampler.total + # array = sample(sampler, num_reads, array[:,i], args.resolution, prng) sample(sampler, num_reads, array[:,i], args.resolution, prng) + finish = time.time() + logging.info("Sample {} took {} seconds".format(i, finish-begin)) + logging.info("Saving output array.") np.save(args.outpre, array) + + elif args.command == "summarize": + + logging.info("Reading in samples from file {}".format(args.samples_path)) + + array = np.load(args.samples_path) + cpm_array = np.zeros(array.shape) + get_cpm(array, cpm_array) # modifies cpm_array in place + logging.info("Created cpm_array of shape {}".format(cpm_array.shape)) + logging.info("Calculating mean, minci, maxci, mad, and median cpm for each position from bootstrapped coverage values") + stats = np.apply_along_axis(summary_stats_only_finite, axis=1, arr=cpm_array) + + logging.info("Saved summary stats to {0}_mean.npy, {0}_minci.npy, {0}_maxci.npy, {0}_median.npy, {0}_mad.npy".format(args.out_prefix)) + np.save(args.out_prefix+"_mean", stats[:,0]) + np.save(args.out_prefix+"_minci", stats[:,1]) + np.save(args.out_prefix+"_maxci", stats[:,2]) + np.save(args.out_prefix+"_median", stats[:,5]) + np.save(args.out_prefix+"_mad", stats[:,6]) + + + + + + + + diff --git a/ChIP_analysis/bootstrap_sam_file.pyc b/ChIP_analysis/bootstrap_sam_file.pyc index bf1197f..ae7c8b4 100644 Binary files a/ChIP_analysis/bootstrap_sam_file.pyc and b/ChIP_analysis/bootstrap_sam_file.pyc differ diff --git a/ChIP_analysis/bootstrapped_chip_enrichment.py b/ChIP_analysis/bootstrapped_chip_enrichment.py new file mode 100644 index 0000000..ec1dfeb --- /dev/null +++ b/ChIP_analysis/bootstrapped_chip_enrichment.py @@ -0,0 +1,672 @@ +################################################################################ +# bootstrap sampling for ChIP analysis with paired extraced and input files and +# a KO to subtract +# +# Written by Michael Wolfe +# Copyright (c) 2018 Michael Wolfe University of Michigan. All rights reserved. +# +# +#Developed by: Michael Wolfe +#University of Michigan +#http://freddolino-lab.med.umich.edu/ +#Modified by : Jeremy Schroeder +#University of Wisconsin - Madison +#http://wanglab.bact.wisc.edu +# +#Permission is hereby granted, free of charge, to any person obtaining a copy of +#this software and associated documentation files (the "Software"), to deal with +#the Software without restriction, including without limitation the rights to +#use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +#the Software, and to permit persons to whom the Software is furnished to do so, +#subject to the following conditions: +# +#Redistributions of source code must retain the above copyright notice, this +#list of conditions and the following disclaimers. Redistributions in binary +#form must reproduce the above copyright notice, this list of conditions and the +#following disclaimers in the documentation and/or other materials provided with +#the distribution. Neither the names of Michael Wolfe, Jeremy Schroeder +#University of Michigan, University of Wisconsin - Madison, +#nor the names of their contributors may be used to endorse or promote products +#derived from this Software without specific prior written permission. +# +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +#FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS +#OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +#WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +#CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. +################################################################################ + +# base python +from __future__ import print_function +import argparse +import sys +import logging +import os +import multiprocessing as mp +from math import floor, ceil, log +import re +import time + +# Widely available +import numpy as np +from scipy import stats +import pandas as pd +from dfply import * + +# More custom +import numba + +# custom +from bootstrap_sam_file import ReadSampler +from bootstrap_helper_funcs import least_extreme_value, credible_interval + +def summary_stats_only_finite(array, alpha=0.05): + """ This version of summary stats only considers values that aren't nan or + inf. Creates summaries a single location based on bootstrap + + Args: + array (np.array): np.array that is 1 x n (n is the number of bootstrap + replicates) + + Returns: + summary_stats (np.array): np.array that is 1 x 9 + pos 0 average + pos 1 min_ci at alpha + pos 2 max_ci at alpha + pos 3 variance + pos 4 least extreme value + pos 5 median + pos 6 median absolute deviation + pos 7 number of infinite values in bootstrap + pos 8 number of nans in bootstrap + """ + finite_vals = np.isfinite(array) + num_inf = np.sum(np.isinf(array)) + num_nan = np.sum(np.isnan(array)) + if np.sum(finite_vals) == 0: + return(np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, num_inf, num_nan) + else: + these_stats = credible_interval(array[finite_vals], alpha) + this_lev = least_extreme_value(these_stats) + var = np.var(array[finite_vals]) + median = np.median(array[finite_vals]) + mad = np.median(np.abs(array[finite_vals] - median)) + return(these_stats[0], these_stats[1], these_stats[2], var, this_lev, these_stats[3], mad, num_inf, num_nan) + +def summary_stats_nan_zeros(array, alpha=0.05): + """ This version of summary stats turns nans to 0. NOTE THAT THIS MODIFIES + THE ARRAY + + Creates summaries for a single location based on bootstraps + + Args: + array (np.array): np.array that is 1 x n (n is the number of bootstrap + replicates) + + Returns: + summary_stats (np.array): np.array that is 1 x 9 + pos 0 average + pos 1 min_ci at alpha + pos 2 max_ci at alpha + pos 3 variance + pos 4 least extreme value + pos 5 median + pos 6 median absolute deviation + """ + array[np.isnan(array)] = 0 + these_stats = credible_interval(array, alpha) + this_lev = least_extreme_value(these_stats) + var = np.var(array) + median = np.median(array) + mad = np.sum(np.abs(array - median))/array.size + return(these_stats[0], these_stats[1], these_stats[2], var, this_lev, median, mad) + + +def actual_reads(sampler, size, res=1): + """ Take all reads from a sampler and map them to an array + + Args: + sampler (class ReadSampler): object holding reads to sample + size (int): size of numpy array to create + res (optional, int): resolution the numpy array is in + """ + ############# modify to support jit acceleration ############ + array = np.zeros(size) + for read in sampler.reads: + start, stop = read + # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION + #array[start:stop] +=1 + array[int(ceil(start/res)):int(ceil(stop/res))] += 1 + return array + +def sp_sample_reads(sampler, size, i, j, res, jit=False): + """ Sample reads from samplers using multiprocessing + + Args: + sampler (list): List of ReadSampler objects to sample from + size (int): Size of numpy array to create + res (float): resolution of numpy array + start (int): starting seed for random state + p (optional, int): number of processors + + Returns: + arrays (np.array) final result of sampling + """ + start = args.s+i+len(all_sim) + array = sample_reads(sampler, size, np.random.RandomState(start+j), res, jit) + return array + +def sample_reads(sampler, size, prng, res=1, jit=False): + """ Sample with replacement all reads from a sampler and map them to an + array + + Args: + sampler (class ReadSampler): object holding reads to sample + size (int): size of numpy array to create + prng (np.RandomState): random state to pull random numbers + from + res (optional, int): resolution the numpy array is in + """ + array = np.zeros(size) + sampled_reads = sampler.pull_reads(sampler.total, prng) + # # define heirarchy for using GPU + # threadsperblock = 32 + # blockspergrid = (array.size + (threadsperblock - 1)) // threadsperblock + + if jit: + # threadsperblock = 32 + # blockspergrid = (sampled_reads.shape[0] + (threadsperblock - 1)) // threadsperblock + # note that using cuda.jit here causes us to write directly to the array, + # without explicitly returning it from sum_coverage_from_sample function + # sum_coverage_from_sample[blockspergrid, threadsperblock](sampled_reads, array, res) + # device_array = cuda.to_device(array) + # device_samples = cuda.to_device(sampled_reads) + # device_res = cuda.to_device(res) + array = numba_sum_coverage_from_sample(sampled_reads, array, res) + # cuda_sum_coverage_from_sample[blockspergrid, threadsperblock](device_samples, device_array, res) + # array = device_array.copy_to_host() + else: + array = sum_coverage_from_sample(sampled_reads, array, res) + + return array + +def sum_coverage_from_sample(sampled_reads, array, res): + """ Map sampled reads to an array + + Args: + reads (): sampled read (start,end) positions + array (): array to be populated with coverage + res (float): resolution the array is in + """ + + for read in sampled_reads: + start, stop = read + res = float(res) + x = int(ceil(start/res)) + y = int(ceil(stop/res)) + # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION + #array[start:stop] +=1 + array[x:y] += 1 + + return(array) + +@numba.jit(nopython=True, parallel=True) +def numba_sum_coverage_from_sample(sampled_reads, array, res): + """ Map sampled reads to an array + + Args: + reads (): sampled read (start,end) positions + array (): array to be populated with coverage + res (float): resolution the array is in + """ + # pos = cuda.grid(1) + for i in range(sampled_reads.shape[0]): + read = sampled_reads[i,:] + # read = sampled_reads[pos,:] + start, stop = read + start = float(start) + stop = float(stop) + res = float(res) + x = int(ceil(start/res)) + y = int(ceil(stop/res)) + # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION + #array[start:stop] +=1 + for idx in range(x,y): + array[idx] += 1 + return(array) + +# @numba.cuda.jit +# def cuda_sum_coverage_from_sample(sampled_reads, array, res): +# """ Map sampled reads to an array + +# Args: +# reads (): sampled read (start,end) positions +# array (): array to be populated with coverage +# res (float): resolution the array is in +# """ +# # pos = cuda.grid(1) +# for i in range(sampled_reads.shape[0]): +# read = sampled_reads[i,:] +# # read = sampled_reads[pos,:] +# start, stop = read +# start = float(start) +# stop = float(stop) +# res = float(res) +# x = int(ceil(start/res)) +# y = int(ceil(stop/res)) +# # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION +# #array[start:stop] +=1 +# for idx in range(x,y): +# array[idx] += 1 + +def log2_ratio(array1, array2): + """ Take the median normalized log2 ratio of two arrays + + Args: + array1 (np.array): numpy array holding extracted signal + array2 (np.array): numpy array holding input signal + cu (bool): whether to use cuda.jit to accelerate calculations + + Returns: + ratio (np.array): median normalized log2 ratio + """ + # lets normalize by the median + arr1_median = float(np.median(array1)) + arr2_median = float(np.median(array2)) + ratio = (array1/arr1_median)/(array2/arr2_median) + + # if not cu: + # ratio = (array1/arr1_median)/(array2/arr2_median) + # only nans should be np.log2(0/0) which should be 0 + #ratio[np.isnan(ratio)] = 0.0 + # else: + # threadsperblock = 32 + # blockspergrid = (array1.size + (threadsperblock - 1)) // threadsperblock + + # device_array1 = cuda.to_device(array1) + # device_arr1_med = cuda.to_device(arr1_median) + # device_array2 = cuda.to_device(array2) + # device_arr2_med = cuda.to_device(arr2_median) + + # median_array1 = np.empty(array1.shape) + # device_median_array1 = cuda.to_device(median_array1) + # median_array2 = np.empty(array2.shape) + # device_median_array2 = cuda.to_device(median_array2) + + # ratio = np.empty(array1.shape) + # device_ratio = cuda.to_device(ratio) + # cuda_ratio[blockspergrid, threadsperblock](device_array1, device_array2, + # arr1_median, arr2_median, + # device_median_array1, device_median_array2, + # device_ratio) + # ratio = device_ratio.copy_to_host() + + log2_ratio = np.log2(ratio) + + return log2_ratio + +# @cuda.jit +# def cuda_ratio(array1, array2, arr1_med, arr2_med, +# median_array1, median_array2, +# ratio): + +# pos = cuda.grid(1) +# if pos < ratio.shape[0]: +# # calculate median for position pos of each array +# median_array1[pos] = array1[pos]/arr1_med +# median_array2[pos] = array2[pos]/arr2_med + +# # calculate ratio +# ratio[pos] = median_array1[pos]/median_array2[pos] + + + +def floored_sub(samp, control): + """ Subtract control signal from sample signal. Only subtract if control is + greater than 0. + + Args: + samp (np.array): sample numpy array holding signal + control (np.array): control numpy array holding signal + + Returns: + sub (np.array): subtracted signal + """ + # make copy as to not modify control array + this_control = np.copy(control) + this_control[this_control < 0] = 0 + return samp-this_control + +def read_in_sampler(samplerfile): + """ Take a saved numpy array file and load it back into a sampler + + Args: + samplerfile (str): name of stored numpy array + + Returns: + sampler (ReadSampler object): a sampler ready for sampling + """ + sampler = ReadSampler() + sampler.load_data(samplerfile) + # sampler.sort_reads() # our reads are already sorted + return sampler + +def mp_sample_reads_helper(args): + """ Helper function for sampling reads with multiprocessing + """ + sampler, size, res, prng = args + return sample_reads(sampler, size, prng, res) + +def mp_sample_reads(samplers, size, res, start, p=1): + """ Sample reads from samplers using multiprocessing + + Args: + samplers (list): List of ReadSampler objects to sample from + size (int): Size of numpy array to create + res (int): resolution of numpy array + start (int): starting seed for random state + p (optional, int): number of processors + + Returns: + arrays (np.array) final result of sampling + """ + pool = mp.Pool(processes=p) + arrays = pool.map(mp_sample_reads_helper, + ((sampler, size, res, np.random.RandomState(start+x)) for x,sampler in enumerate(samplers))) + pool.close() + pool.join() + return arrays + +def mp_actual_reads_helper(args): + """ Helper function for mapping reads with multiprocessing + """ + sampler, size, res = args + np.random.RandomState() + return actual_reads(sampler, size, res) + +def mp_actual_reads(samplers, size, res, p=1): + """ Map reads from samplers using multiprocessing + + Args: + samplers (list): List of ReadSampler objects to sample from + size (int): Size of numpy array to create + res (int): resolution of numpy array + p (optional, int): number of processors + + Returns: + arrays (np.array) final result of mapping + """ + pool = mp.Pool(processes=p) + arrays = pool.map(mp_actual_reads_helper, + ((sampler, size, res) for sampler in samplers)) + pool.close() + pool.join() + return arrays + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Given sampling objects,\ + goes through the entire chip-seq pipeline with x bootstraps.\ + outputs a numpy g by x matrix where g is the size of the genome\ + and x is the number of bootstrap replicates to do.") + parser.add_argument("--genome_size", type=int, help="size of genome\ + controls how large to build the arrays", required=True) + parser.add_argument('--out_prefix', help="Output prefix for final matrix.", + required=False, default='') + parser.add_argument("--sample_name_luts", type=str, nargs="+", + help="Paths to files containing the\ + lookup run information provided by Illumina. The\ + file will be used to match sample IDs\ + with sample names. Multiple file names should\ + be separated by spaces.", required=True) + parser.add_argument('--ChIP_samps', type=str, nargs="+", + help="Extracted read simulators for samples") + parser.add_argument('--inp_samps', type=str, nargs="+", + help="Input read simulators for samples") + # parser.add_argument('--ChIP_conts', type=str, nargs="+", + # help="Extracted read simulators for controls") + # parser.add_argument('--inp_conts', type=str, nargs="+", + # help="Input read simulators for controls") + parser.add_argument('--num_replicates', type=int, default=5, + help="Number of bootstrap replicates to do, default is 5") + parser.add_argument('--identity', action="store_true", + help="Don't sample, just use the data as is") + parser.add_argument('-p', type=int, default=10, + help="Number of processors to use, default is 5") + parser.add_argument('-s', type=int, default=None, + help="Seed for random number generation. Default is 1") + parser.add_argument('--resolution', type=int, default=1, + help="resolution of data to map, default is 1bp") + parser.add_argument('--save_summaries', type=float, default=None, + help="Don't save all bootstrap replicates. Just save the summaries:\ + minci, maxci, lev, mean, var. Specify the alpha level here") + parser.add_argument('--numba', action="store_true", + help="Adding this flag will enable jit compilation\ + on the cpu to speed up calculations.") + args = parser.parse_args() + logging.basicConfig(format='%(asctime)s %(message)s',level=logging.INFO) + ## TO DO: + # allow functions to deal with more than two samples for each + prng= np.random.RandomState(seed=args.s) + array_size = int(floor(args.genome_size/args.resolution)) + + num_ext_samp = len(args.ChIP_samps) + num_inp_samp = len(args.inp_samps) + # num_ext_cont = len(args.ChIP_conts) + # num_inp_cont = len(args.inp_conts) + + if args.out_prefix == '': + save_name_prefix = '' + else: + save_name_prefix = '{}_'.format(args.out_prefix) + + if args.save_summaries is not None: + this_summary_stats_only_finite = lambda x: summary_stats_only_finite(x, alpha=args.save_summaries) + this_summary_stats_nan_zeros = lambda x: summary_stats_nan_zeros(x, alpha=args.save_summaries) + + samp_final = np.zeros((array_size, args.num_replicates, num_ext_samp)) + # cont_final = np.zeros((array_size, args.num_replicates, num_ext_cont)) + + pat = re.compile(r'Sample_\d+') + # grab the sample names from the treatment sampler files + samp_matches = [pat.search(s) for s in args.ChIP_samps] + if None in samp_matches: + print("Your sampler file names should begin with the sample id\ + Illumina gave you, i.e., Sample_NNNNNN,\ + where N is any integer. Exiting script.") + sys.exit() + else: samp_strings = [match.group() for match in samp_matches] + # grab the sample names from the control sampler files + # cont_matches = [pat.search(s) for s in args.ChIP_conts] + # if None in cont_matches: + # print("Your sampler file names should begin with the sample id\ + # Illumina gave you, i.e., Sample_NNNNNN,\ + # where N is any integer. Exiting script.") + # sys.exit() + # else: cont_strings = [match.group() for match in cont_matches] + + # read in sample info from illumina to look up descriptions from sample ids + for i,sample_info_file_name in enumerate(args.sample_name_luts): + if sample_info_file_name.endswith('.csv'): + sample_info_tmp = (pd.read_csv(sample_info_file_name, header=18) >> + select(X.Sample_ID, X.Description)) + else: + sample_info_tmp = (pd.read_csv(sample_info_file_name, sep='\t') >> + mutate(Sample_ID = 'Sample_' + X.SampleID.astype(str)) >> + select(X.Sample_ID, X.Description)) + if i == 0: + sample_info = sample_info_tmp + else: + sample_info = sample_info.append(sample_info_tmp) + + samp_names = [] + for samp_id in samp_strings: + sample_rows = sample_info[sample_info.Sample_ID == samp_id] + samp_names.append(sample_rows['Description'].iloc[0]) # grab the first description for this Sample_ID + + # cont_names = [] + # for samp_id in cont_strings: + # cont_rows = sample_info[sample_info.Sample_ID == samp_id] + # cont_names.append(cont_rows['Description'].iloc[0]) + + # samp_names = [os.path.basename(x).split(".")[0] for x in args.ChIP_samps] + # cont_names = [os.path.basename(x).split(".")[0] for x in args.ChIP_conts] + + ## Read in all samplers + all_sims = [] + all_sims.extend(args.ChIP_samps) + all_sims.extend(args.inp_samps) + # all_sims.extend(args.ChIP_conts) + # all_sims.extend(args.inp_conts) + if num_ext_samp != num_inp_samp: # or num_ext_cont != num_inp_cont: + logging.error("Mismatch number of extracted and input samples Ext_samp: %s\ + Inp_samp: %s"%(num_ext_samp, num_inp_samp))#, num_ext_cont, num_inp_cont)) + + all_sim = [] + for sampler in all_sims: + logging.info("Reading in sampler {}".format(sampler)) + all_sim.append(read_in_sampler(sampler)) + + # all_sim = [read_in_sampler(sampler) for sampler in all_sim] + logging.info("Finished reading in samplers") + + ## sample reads + for i in range(args.num_replicates): + logging.info("Starting bootstrap replicate %s"%i) + logging.info("Sampling reads %s"%i) + begin = time.time() + if args.numba: + if args.identity: + # arrays = mp_actual_reads(all_sim, array_size, args.resolution, args.p) + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + arrays = [actual_reads(sampler, array_size, args.resolution) for sampler in all_sim] + else: + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + arrays = [sp_sample_reads(sampler, array_size, i, j, args.resolution, jit=args.numba) for j,sampler in enumerate(all_sim)] + # arrays = mp_sample_reads(all_sim, array_size, args.resolution, args.s+i+len(all_sim), args.p) + else: + if args.identity: + arrays = mp_actual_reads(all_sim, array_size, args.resolution, args.p) + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + # arrays = [actual_reads(sampler, args.genome_size, args.resolution) for sampler in all_sim] + else: + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + # arrays = [sp_sample_reads(sampler, array_size, i, j, args.resolution, cu=args.numba) for j,sampler in enumerate(all_sim)] + arrays = mp_sample_reads(all_sim, array_size, args.resolution, args.s+i+len(all_sim), args.p) + finish = time.time() + logging.info("Sampling bootstrap replicate {} took {} seconds".format(i, finish-begin)) + ## Calculate ratios + logging.info("Calculating Ratios %s"%i) + begin = time.time() + for j, (ext_ind, inp_ind) in enumerate(zip(range(0, num_ext_samp), + range(num_ext_samp, + num_ext_samp+num_inp_samp))): + samp_final[:,i,j] = log2_ratio(arrays[ext_ind], arrays[inp_ind]) + # for j, (ext_ind, inp_ind) in enumerate(zip(range(num_ext_samp + num_inp_samp, + # num_ext_samp + num_inp_samp+num_ext_cont), + # range(num_ext_samp + num_inp_samp + num_ext_cont, + # num_ext_samp + num_inp_samp + num_ext_cont + num_inp_cont))): + # cont_final[:,i,j]=log2_ratio(arrays[ext_ind], arrays[inp_ind]) + finish = time.time() + logging.info("Calculating Ratios for replicate {} took {} seconds".format(i, finish-begin)) + + + # Write out final output + if args.identity and args.save_summaries: + # save sample summaries + for j, name in enumerate(samp_names): + save_name = "{}{}_actual".format(save_name_prefix,name) + np.save(save_name, samp_final[:,:,j]) + # save control summaries + # for j, name in enumerate(cont_names): + # np.save(args.out_prefix+"_%s_actual"%name, cont_final[:,:,j]) + # save each combination of sample - control summary + # for j, samp_name in enumerate(samp_names): + # for k, cont_name in enumerate(cont_names): + # # note that floored sub MODIFIES the control array. Since we have already written the control array + # # that is not a big deal but be aware that this modification happens + # np.save(args.out_prefix+"_%s_Sub_%s_actual"%(samp_name,cont_name), floored_sub(samp_final[:,:,j], cont_final[:,:,k])) + + elif args.save_summaries: + # ALL OF THIS IS HARDCODED QUICK CODING. Saves a lot of output files to be + # used downstream + logging.info("Calculating Summary Stats for finite values") + begin = time.time() + # save sample summaries + for j, name in enumerate(samp_names): + these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, samp_final[:,:,j]) + np.save("{}{}_mean".format(save_name_prefix,name), these_stats[:,0]) + np.save("{}{}_minci".format(save_name_prefix,name), these_stats[:,1]) + np.save("{}{}_maxci".format(save_name_prefix,name), these_stats[:,2]) + np.save("{}{}_var".format(save_name_prefix,name), these_stats[:,3]) + np.save("{}{}_lev".format(save_name_prefix,name), these_stats[:,4]) + np.save("{}{}_median".format(save_name_prefix,name), these_stats[:,5]) + np.save("{}{}_mad".format(save_name_prefix,name), these_stats[:,6]) + np.save("{}{}_num_inf".format(save_name_prefix,name), these_stats[:,7]) + np.save("{}{}_num_nan".format(save_name_prefix,name), these_stats[:,8]) + # save control summaries + # for j, name in enumerate(cont_names): + # these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, cont_final[:,:,j]) + # np.save("{}{}_mean".format(save_name_prefix,name), these_stats[:,0]) + # np.save("{}{}_minci".format(save_name_prefix,name), these_stats[:,1]) + # np.save("{}{}_maxci".format(save_name_prefix,name), these_stats[:,2]) + # np.save("{}{}_var".format(save_name_prefix,name), these_stats[:,3]) + # np.save("{}{}_lev".format(save_name_prefix,name), these_stats[:,4]) + # np.save("{}{}_median".format(save_name_prefix,name), these_stats[:,5]) + # np.save("{}{}_mad".format(save_name_prefix,name), these_stats[:,6]) + # np.save("{}{}_num_inf".format(save_name_prefix,name), these_stats[:,7]) + # np.save("{}{}_num_nan".format(save_name_prefix,name), these_stats[:,8]) + # # save each combination of sample - control summary + # for j, samp_name in enumerate(samp_names): + # for k, cont_name in enumerate(cont_names): + # these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, floored_sub(samp_final[:,:,j], cont_final[:,:,k])) + # np.save(args.out_prefix+"_%s_Sub_%s_mean"%(samp_name,cont_name), these_stats[:,0]) + # np.save(args.out_prefix+"_%s_Sub_%s_minci"%(samp_name,cont_name), these_stats[:,1]) + # np.save(args.out_prefix+"_%s_Sub_%s_maxci"%(samp_name, cont_name), these_stats[:,2]) + # np.save(args.out_prefix+"_%s_Sub_%s_var"%(samp_name,cont_name), these_stats[:,3]) + # np.save(args.out_prefix+"_%s_Sub_%s_lev"%(samp_name,cont_name), these_stats[:,4]) + # np.save(args.out_prefix+"_%s_Sub_%s_median"%(samp_name,cont_name), these_stats[:,5]) + # np.save(args.out_prefix+"_%s_Sub_%s_mad"%(samp_name,cont_name), these_stats[:,6]) + # np.save(args.out_prefix+"_%s_Sub_%s_num_inf"%(samp_name,cont_name), these_stats[:,7]) + # np.save(args.out_prefix+"_%s_Sub_%s_num_nan"%(samp_name,cont_name), these_stats[:,8]) + # finish = time.time() + # logging.info("Calculating Summary Stats took {} seconds".format(finish-begin)) + +# logging.info("Calculating Summary Stats for nans as zeros") +# # save sample summaries +# for j, name in enumerate(samp_names): +# these_stats = np.apply_along_axis(this_summary_stats_nan_zeros, 1, samp_final[:,:,j]) +# np.save("{}{}_mean".format(save_name_prefix,name), these_stats[:,0]) +# np.save("{}{}_minci".format(save_name_prefix,name), these_stats[:,1]) +# np.save("{}{}_maxci".format(save_name_prefix,name), these_stats[:,2]) +# np.save("{}{}_var".format(save_name_prefix,name), these_stats[:,3]) +# np.save("{}{}_lev".format(save_name_prefix,name), these_stats[:,4]) +# np.save("{}{}_median".format(save_name_prefix,name), these_stats[:,5]) +# np.save("{}{}_mad".format(save_name_prefix,name), these_stats[:,6]) +# # save control summaries +# for j, name in enumerate(cont_names): +# these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, cont_final[:,:,j]) +# np.save("{}{}_mean".format(save_name_prefix,name), these_stats[:,0]) +# np.save("{}{}_minci".format(save_name_prefix,name), these_stats[:,1]) +# np.save("{}{}_maxci".format(save_name_prefix,name), these_stats[:,2]) +# np.save("{}{}_var".format(save_name_prefix,name), these_stats[:,3]) +# np.save("{}{}_lev".format(save_name_prefix,name), these_stats[:,4]) +# np.save("{}{}_median".format(save_name_prefix,name), these_stats[:,5]) +# np.save("{}{}_mad".format(save_name_prefix,name), these_stats[:,6]) +# # save each combination of sample - control summary +# for j, samp_name in enumerate(samp_names): +# for k, cont_name in enumerate(cont_names): +# these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, floored_sub(samp_final[:,:,j], cont_final[:,:,k])) +# np.save(args.out_prefix+"_%s_Sub_%s_mean"%(samp_name,cont_name), these_stats[:,0]) +# np.save(args.out_prefix+"_%s_Sub_%s_minci"%(samp_name,cont_name), these_stats[:,1]) +# np.save(args.out_prefix+"_%s_Sub_%s_maxci"%(samp_name, cont_name), these_stats[:,2]) +# np.save(args.out_prefix+"_%s_Sub_%s_var"%(samp_name,cont_name), these_stats[:,3]) +# np.save(args.out_prefix+"_%s_Sub_%s_lev"%(samp_name,cont_name), these_stats[:,4]) +# np.save(args.out_prefix+"_%s_Sub_%s_median"%(samp_name,cont_name), these_stats[:,5]) +# np.save(args.out_prefix+"_%s_Sub_%s_mad"%(samp_name,cont_name), these_stats[:,6]) + + else: + # write final array + logging.info("Writing final array") + np.save("{}samp".format(save_name_prefix), samp_final) + # np.save(args.out_prefix+"_cont", cont_final) diff --git a/ChIP_analysis/bootstrapped_chip_no_consolidation.py b/ChIP_analysis/bootstrapped_chip_no_consolidation.py index dc08946..d00e450 100644 --- a/ChIP_analysis/bootstrapped_chip_no_consolidation.py +++ b/ChIP_analysis/bootstrapped_chip_no_consolidation.py @@ -9,6 +9,9 @@ #Developed by: Michael Wolfe #University of Michigan #http://freddolino-lab.med.umich.edu/ +#Modified by : Jeremy Schroeder +#University of Wisconsin - Madison +#http://wanglab.bact.wisc.edu # #Permission is hereby granted, free of charge, to any person obtaining a copy of #this software and associated documentation files (the "Software"), to deal with @@ -21,8 +24,9 @@ #list of conditions and the following disclaimers. Redistributions in binary #form must reproduce the above copyright notice, this list of conditions and the #following disclaimers in the documentation and/or other materials provided with -#the distribution. Neither the names of Michael Wolfe, University of Michigan, -#nor the names of its contributors may be used to endorse or promote products +#the distribution. Neither the names of Michael Wolfe, Jeremy Schroeder +#University of Michigan, University of Wisconsin - Madison, +#nor the names of their contributors may be used to endorse or promote products #derived from this Software without specific prior written permission. # #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR @@ -34,21 +38,28 @@ ################################################################################ # base python +from __future__ import print_function import argparse import sys import logging import os import multiprocessing as mp -from math import floor, ceil +from math import floor, ceil, log +import re +import time # Widely available import numpy as np from scipy import stats +import pandas as pd +from dfply import * + +# More custom +import numba # custom from bootstrap_sam_file import ReadSampler -from bootstrap_helper_funcs import credible_interval, least_extreme_value - +from bootstrap_helper_funcs import least_extreme_value, credible_interval def summary_stats_only_finite(array, alpha=0.05): """ This version of summary stats only considers values that aren't nan or @@ -75,18 +86,18 @@ def summary_stats_only_finite(array, alpha=0.05): num_nan = np.sum(np.isnan(array)) if np.sum(finite_vals) == 0: return(np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, num_inf, num_nan) - else: + else: these_stats = credible_interval(array[finite_vals], alpha) this_lev = least_extreme_value(these_stats) var = np.var(array[finite_vals]) median = np.median(array[finite_vals]) mad = np.median(np.abs(array[finite_vals] - median)) - return(these_stats[0], these_stats[1], these_stats[2], var, this_lev, median, mad, num_inf, num_nan) + return(these_stats[0], these_stats[1], these_stats[2], var, this_lev, these_stats[3], mad, num_inf, num_nan) def summary_stats_nan_zeros(array, alpha=0.05): - """ This version of summary stats turns nans to 0. NOTE THAT THIS MODIFIES + """ This version of summary stats turns nans to 0. NOTE THAT THIS MODIFIES THE ARRAY - + Creates summaries for a single location based on bootstraps Args: @@ -112,14 +123,15 @@ def summary_stats_nan_zeros(array, alpha=0.05): return(these_stats[0], these_stats[1], these_stats[2], var, this_lev, median, mad) -def actual_reads(sampler, size, res=1.0): +def actual_reads(sampler, size, res=1): """ Take all reads from a sampler and map them to an array Args: sampler (class ReadSampler): object holding reads to sample size (int): size of numpy array to create - res (optional, float): resolution the numpy array is in + res (optional, int): resolution the numpy array is in """ + ############# modify to support gpu acceleration ############ array = np.zeros(size) for read in sampler.reads: start, stop = read @@ -128,24 +140,125 @@ def actual_reads(sampler, size, res=1.0): array[int(ceil(start/res)):int(ceil(stop/res))] += 1 return array -def sample_reads(sampler, size, prng, res=1.0): +def sp_sample_reads(sampler, size, i, j, res, jit=False): + """ Sample reads from samplers using multiprocessing + + Args: + sampler (list): List of ReadSampler objects to sample from + size (int): Size of numpy array to create + res (float): resolution of numpy array + start (int): starting seed for random state + p (optional, int): number of processors + + Returns: + arrays (np.array) final result of sampling + """ + start = args.s+i+len(all_sim) + array = sample_reads(sampler, size, np.random.RandomState(start+j), res, jit) + return array + +def sample_reads(sampler, size, prng, res=1, jit=False): """ Sample with replacement all reads from a sampler and map them to an array Args: sampler (class ReadSampler): object holding reads to sample size (int): size of numpy array to create - prng (np.RandomState): random state to pull random numbers + prng (np.RandomState): random state to pull random numbers from - res (optional, float): resolution the numpy array is in + res (optional, int): resolution the numpy array is in """ array = np.zeros(size) - for read in sampler.pull_reads(sampler.total, prng): + sampled_reads = sampler.pull_reads(sampler.total, prng) + # # define heirarchy for using GPU + # threadsperblock = 32 + # blockspergrid = (array.size + (threadsperblock - 1)) // threadsperblock + + if jit: + # threadsperblock = 32 + # blockspergrid = (sampled_reads.shape[0] + (threadsperblock - 1)) // threadsperblock + # note that using cuda.jit here causes us to write directly to the array, + # without explicitly returning it from sum_coverage_from_sample function + # sum_coverage_from_sample[blockspergrid, threadsperblock](sampled_reads, array, res) + # device_array = cuda.to_device(array) + # device_samples = cuda.to_device(sampled_reads) + # device_res = cuda.to_device(res) + array = numba_sum_coverage_from_sample(sampled_reads, array, res) + # cuda_sum_coverage_from_sample[blockspergrid, threadsperblock](device_samples, device_array, res) + # array = device_array.copy_to_host() + else: + array = sum_coverage_from_sample(sampled_reads, array, res) + + return array + +def sum_coverage_from_sample(sampled_reads, array, res): + """ Map sampled reads to an array + + Args: + reads (): sampled read (start,end) positions + array (): array to be populated with coverage + res (float): resolution the array is in + """ + + for read in sampled_reads: start, stop = read + res = float(res) + x = int(ceil(start/res)) + y = int(ceil(stop/res)) # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION #array[start:stop] +=1 - array[int(ceil(start/res)):int(ceil(stop/res))] += 1 - return array + array[x:y] += 1 + + return(array) + +@numba.jit(nopython=True, parallel=True) +def numba_sum_coverage_from_sample(sampled_reads, array, res): + """ Map sampled reads to an array + + Args: + reads (): sampled read (start,end) positions + array (): array to be populated with coverage + res (float): resolution the array is in + """ + # pos = cuda.grid(1) + for i in range(sampled_reads.shape[0]): + read = sampled_reads[i,:] + # read = sampled_reads[pos,:] + start, stop = read + start = float(start) + stop = float(stop) + res = float(res) + x = int(ceil(start/res)) + y = int(ceil(stop/res)) + # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION + #array[start:stop] +=1 + for idx in range(x,y): + array[idx] += 1 + return(array) + +# @numba.cuda.jit +# def cuda_sum_coverage_from_sample(sampled_reads, array, res): +# """ Map sampled reads to an array + +# Args: +# reads (): sampled read (start,end) positions +# array (): array to be populated with coverage +# res (float): resolution the array is in +# """ +# # pos = cuda.grid(1) +# for i in range(sampled_reads.shape[0]): +# read = sampled_reads[i,:] +# # read = sampled_reads[pos,:] +# start, stop = read +# start = float(start) +# stop = float(stop) +# res = float(res) +# x = int(ceil(start/res)) +# y = int(ceil(stop/res)) +# # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION +# #array[start:stop] +=1 +# for idx in range(x,y): +# array[idx] += 1 def log2_ratio(array1, array2): """ Take the median normalized log2 ratio of two arrays @@ -153,19 +266,64 @@ def log2_ratio(array1, array2): Args: array1 (np.array): numpy array holding extracted signal array2 (np.array): numpy array holding input signal + cu (bool): whether to use cuda.jit to accelerate calculations Returns: ratio (np.array): median normalized log2 ratio """ # lets normalize by the median - ratio = (array1/(np.median(array1)+0.0))/(array2/(np.median(array2)+0.0)) - ratio = np.log2(ratio) - # only nans should be np.log2(0/0) which should be 0 - #ratio[np.isnan(ratio)] = 0.0 - return ratio - + arr1_median = float(np.median(array1)) + arr2_median = float(np.median(array2)) + ratio = (array1/arr1_median)/(array2/arr2_median) + + # if not cu: + # ratio = (array1/arr1_median)/(array2/arr2_median) + # only nans should be np.log2(0/0) which should be 0 + #ratio[np.isnan(ratio)] = 0.0 + # else: + # threadsperblock = 32 + # blockspergrid = (array1.size + (threadsperblock - 1)) // threadsperblock + + # device_array1 = cuda.to_device(array1) + # device_arr1_med = cuda.to_device(arr1_median) + # device_array2 = cuda.to_device(array2) + # device_arr2_med = cuda.to_device(arr2_median) + + # median_array1 = np.empty(array1.shape) + # device_median_array1 = cuda.to_device(median_array1) + # median_array2 = np.empty(array2.shape) + # device_median_array2 = cuda.to_device(median_array2) + + # ratio = np.empty(array1.shape) + # device_ratio = cuda.to_device(ratio) + # cuda_ratio[blockspergrid, threadsperblock](device_array1, device_array2, + # arr1_median, arr2_median, + # device_median_array1, device_median_array2, + # device_ratio) + # ratio = device_ratio.copy_to_host() + + log2_ratio = np.log2(ratio) + + return log2_ratio + +# @cuda.jit +# def cuda_ratio(array1, array2, arr1_med, arr2_med, +# median_array1, median_array2, +# ratio): + +# pos = cuda.grid(1) +# if pos < ratio.shape[0]: +# # calculate median for position pos of each array +# median_array1[pos] = array1[pos]/arr1_med +# median_array2[pos] = array2[pos]/arr2_med + +# # calculate ratio +# ratio[pos] = median_array1[pos]/median_array2[pos] + + + def floored_sub(samp, control): - """ Subtract control signal from sample signal. Only subtract is control is + """ Subtract control signal from sample signal. Only subtract if control is greater than 0. Args: @@ -190,8 +348,8 @@ def read_in_sampler(samplerfile): sampler (ReadSampler object): a sampler ready for sampling """ sampler = ReadSampler() - sampler.load_data(samplerfile) - sampler.sort_reads() + sampler.load_data(samplerfile) + # sampler.sort_reads() # our reads are already sorted return sampler def mp_sample_reads_helper(args): @@ -206,7 +364,7 @@ def mp_sample_reads(samplers, size, res, start, p=1): Args: samplers (list): List of ReadSampler objects to sample from size (int): Size of numpy array to create - res (float): resolution of numpy array + res (int): resolution of numpy array start (int): starting seed for random state p (optional, int): number of processors @@ -214,7 +372,7 @@ def mp_sample_reads(samplers, size, res, start, p=1): arrays (np.array) final result of sampling """ pool = mp.Pool(processes=p) - arrays = pool.map(mp_sample_reads_helper, + arrays = pool.map(mp_sample_reads_helper, ((sampler, size, res, np.random.RandomState(start+x)) for x,sampler in enumerate(samplers))) pool.close() pool.join() @@ -233,14 +391,14 @@ def mp_actual_reads(samplers, size, res, p=1): Args: samplers (list): List of ReadSampler objects to sample from size (int): Size of numpy array to create - res (float): resolution of numpy array + res (int): resolution of numpy array p (optional, int): number of processors Returns: arrays (np.array) final result of mapping """ pool = mp.Pool(processes=p) - arrays = pool.map(mp_actual_reads_helper, + arrays = pool.map(mp_actual_reads_helper, ((sampler, size, res) for sampler in samplers)) pool.close() pool.join() @@ -251,182 +409,262 @@ def mp_actual_reads(samplers, size, res, p=1): goes through the entire chip-seq pipeline with x bootstraps.\ outputs a numpy g by x matrix where g is the size of the genome\ and x is the number of bootstrap replicates to do.") - parser.add_argument("genome_size", type=int, help="size of genome\ - controls how large to build the arrays") - parser.add_argument('outpre', help="Output prefix for final matrix.") - parser.add_argument('--ext_samps', type=str, nargs="+", + parser.add_argument("--genome_size", type=int, help="size of genome\ + controls how large to build the arrays", required=True) + parser.add_argument('--out_prefix', help="Output prefix for final matrix.", + required=True) + parser.add_argument("--sample_name_luts", type=str, nargs="+", + help="Paths to file containing the\ + lookup run information provided by Illumina. The\ + file will be used to match sample IDs\ + with sample names", required=True) + parser.add_argument('--ChIP_samps', type=str, nargs="+", help="Extracted read simulators for samples") - parser.add_argument('--inp_samps', type=str, nargs="+", + parser.add_argument('--inp_samps', type=str, nargs="+", help="Input read simulators for samples") - parser.add_argument('--ext_conts', type=str, nargs="+", + parser.add_argument('--ChIP_conts', type=str, nargs="+", help="Extracted read simulators for controls") - parser.add_argument('--inp_conts', type=str, nargs="+", + parser.add_argument('--inp_conts', type=str, nargs="+", help="Input read simulators for controls") - parser.add_argument('--num_replicates', type=int, default=5, + parser.add_argument('--num_replicates', type=int, default=5, help="Number of bootstrap replicates to do, default is 5") parser.add_argument('--identity', action="store_true", help="Don't sample, just use the data as is") - parser.add_argument('-p', type=int, default=5, + parser.add_argument('-p', type=int, default=10, help="Number of processors to use, default is 5") - parser.add_argument('-s', type=int, default=None, + parser.add_argument('-s', type=int, default=None, help="Seed for random number generation. Default is 1") - parser.add_argument('--resolution', type=float, default=1.0, + parser.add_argument('--resolution', type=int, default=1, help="resolution of data to map, default is 1bp") parser.add_argument('--save_summaries', type=float, default=None, help="Don't save all bootstrap replicates. Just save the summaries:\ minci, maxci, lev, mean, var. Specify the alpha level here") + parser.add_argument('--numba', action="store_true", + help="Adding this flag will enable jit compilation\ + on the cpu to speed up calculations.") args = parser.parse_args() - logging.basicConfig(format='%(asctime)s %(message)s',level=logging.INFO) + logging.basicConfig(format='%(asctime)s %(message)s',level=logging.INFO) ## TO DO: # allow functions to deal with more than two samples for each prng= np.random.RandomState(seed=args.s) array_size = int(floor(args.genome_size/args.resolution)) - num_ext_samp = len(args.ext_samps) + num_ext_samp = len(args.ChIP_samps) num_inp_samp = len(args.inp_samps) - num_ext_cont = len(args.ext_conts) + num_ext_cont = len(args.ChIP_conts) num_inp_cont = len(args.inp_conts) if args.save_summaries is not None: this_summary_stats_only_finite = lambda x: summary_stats_only_finite(x, alpha=args.save_summaries) this_summary_stats_nan_zeros = lambda x: summary_stats_nan_zeros(x, alpha=args.save_summaries) - + samp_final = np.zeros((array_size, args.num_replicates, num_ext_samp)) cont_final = np.zeros((array_size, args.num_replicates, num_ext_cont)) - samp_names = [os.path.basename(x).split(".")[0] for x in args.ext_samps] - cont_names = [os.path.basename(x).split(".")[0] for x in args.ext_conts] + # pat = re.compile(r'Sample_\d+') + pat = re.compile(r'SRR.+') + # grab the sample names from the treatment sampler files + samp_matches = [pat.search(s) for s in args.ChIP_samps] + if None in samp_matches: + print("Your sampler file names should begin with the sample id\ + Illumina gave you, i.e., Sample_NNNNNN,\ + where N is any integer. Exiting script.") + sys.exit() + else: samp_strings = [match.group() for match in samp_matches] + # grab the sample names from the control sampler files + cont_matches = [pat.search(s) for s in args.ChIP_conts] + if None in cont_matches: + print("Your sampler file names should begin with the sample id\ + Illumina gave you, i.e., Sample_NNNNNN,\ + where N is any integer. Exiting script.") + sys.exit() + else: cont_strings = [match.group() for match in cont_matches] + + # read in sample info from illumina to look up descriptions from sample ids + for i,sample_info_file_name in enumerate(args.sample_name_luts): + if sample_info_file_name.endswith('.csv'): + sample_info_tmp = (pd.read_csv(sample_info_file_name, header=18) >> + select(X.Sample_ID, X.Description)) + else: + # sample_info_tmp = (pd.read_csv(sample_info_file_name, sep='\t') >> + # mutate(Sample_ID = 'Sample_' + X.SampleID.astype(str)) >> + # select(X.Sample_ID, X.Description)) + sample_info_tmp = (pd.read_csv(sample_info_file_name, sep='\t') >> + mutate(Sample_ID = X.SampleID.astype(str)) >> + select(X.Sample_ID, X.Description)) + if i == 0: + sample_info = sample_info_tmp + else: + sample_info = sample_info.append(sample_info_tmp) + + samp_names = [] + for samp_id in samp_strings: + sample_rows = sample_info[sample_info.Sample_ID == samp_id] + samp_names.append(sample_rows['Description'].iloc[0]) # grab the first description for this Sample_ID + + cont_names = [] + for samp_id in cont_strings: + cont_rows = sample_info[sample_info.Sample_ID == samp_id] + cont_names.append(cont_rows['Description'].iloc[0]) + + # samp_names = [os.path.basename(x).split(".")[0] for x in args.ChIP_samps] + # cont_names = [os.path.basename(x).split(".")[0] for x in args.ChIP_conts] ## Read in all samplers - all_sim = [] - all_sim.extend(args.ext_samps) - all_sim.extend(args.inp_samps) - all_sim.extend(args.ext_conts) - all_sim.extend(args.inp_conts) + all_sims = [] + all_sims.extend(args.ChIP_samps) + all_sims.extend(args.inp_samps) + all_sims.extend(args.ChIP_conts) + all_sims.extend(args.inp_conts) if num_ext_samp != num_inp_samp or num_ext_cont != num_inp_cont: logging.error("Mismatch number of extracted and input samples Ext_samp: %s\ Inp_samp: %s, Ext_cont: %s, Inp_cont: %s"%(num_ext_samp, num_inp_samp, num_ext_cont, num_inp_cont)) - logging.info("Reading in samplers") - all_sim = [read_in_sampler(sampler) for sampler in all_sim] + all_sim = [] + for sampler in all_sims: + logging.info("Reading in sampler {}".format(sampler)) + if not sampler.endswith(".ob.npy"): + sampler = sampler + ".ob.npy" + all_sim.append(read_in_sampler(sampler)) + + # all_sim = [read_in_sampler(sampler) for sampler in all_sim] logging.info("Finished reading in samplers") ## sample reads - for i in xrange(args.num_replicates): + for i in range(args.num_replicates): logging.info("Starting bootstrap replicate %s"%i) logging.info("Sampling reads %s"%i) - if args.identity: - arrays = mp_actual_reads(all_sim, array_size, args.resolution, args.p) - ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING -# arrays = [actual_reads(sampler, args.genome_size, args.resolution) for sampler in all_sim] + begin = time.time() + if args.numba: + if args.identity: + # arrays = mp_actual_reads(all_sim, array_size, args.resolution, args.p) + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + arrays = [actual_reads(sampler, array_size, args.resolution) for sampler in all_sim] + else: + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + arrays = [sp_sample_reads(sampler, array_size, i, j, args.resolution, jit=args.numba) for j,sampler in enumerate(all_sim)] + # arrays = mp_sample_reads(all_sim, array_size, args.resolution, args.s+i+len(all_sim), args.p) else: - ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING - #arrays = [sample_reads(sampler, args.genome_size, args.resolution) for sampler in all_sim] - arrays = mp_sample_reads(all_sim, array_size, args.resolution, args.s+i+len(all_sim), args.p) + if args.identity: + arrays = mp_actual_reads(all_sim, array_size, args.resolution, args.p) + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + # arrays = [actual_reads(sampler, args.genome_size, args.resolution) for sampler in all_sim] + else: + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + # arrays = [sp_sample_reads(sampler, array_size, i, j, args.resolution, cu=args.numba) for j,sampler in enumerate(all_sim)] + arrays = mp_sample_reads(all_sim, array_size, args.resolution, args.s+i+len(all_sim), args.p) + finish = time.time() + logging.info("Sampling bootstrap replicate {} took {} seconds".format(i, finish-begin)) ## Calculate ratios logging.info("Calculating Ratios %s"%i) - for j, (ext_ind, inp_ind) in enumerate(zip(range(0, num_ext_samp), - range(num_ext_samp, + begin = time.time() + for j, (ext_ind, inp_ind) in enumerate(zip(range(0, num_ext_samp), + range(num_ext_samp, num_ext_samp+num_inp_samp))): samp_final[:,i,j] = log2_ratio(arrays[ext_ind], arrays[inp_ind]) - for j, (ext_ind, inp_ind) in enumerate(zip(range(num_ext_samp + num_inp_samp, - num_ext_samp + num_inp_samp+num_ext_cont), - range(num_ext_samp + num_inp_samp + num_ext_cont, + for j, (ext_ind, inp_ind) in enumerate(zip(range(num_ext_samp + num_inp_samp, + num_ext_samp + num_inp_samp+num_ext_cont), + range(num_ext_samp + num_inp_samp + num_ext_cont, num_ext_samp + num_inp_samp + num_ext_cont + num_inp_cont))): cont_final[:,i,j]=log2_ratio(arrays[ext_ind], arrays[inp_ind]) + finish = time.time() + logging.info("Calculating Ratios for replicate {} took {} seconds".format(i, finish-begin)) # Write out final output if args.identity and args.save_summaries: for j, name in enumerate(samp_names): - np.save(args.outpre+"_%s_actual"%name, samp_final[:,:,j]) + np.save(args.out_prefix+"_%s_actual"%name, samp_final[:,:,j]) # save control summaries for j, name in enumerate(cont_names): - np.save(args.outpre+"_%s_actual"%name, cont_final[:,:,j]) + np.save(args.out_prefix+"_%s_actual"%name, cont_final[:,:,j]) # save each combination of sample - control summary for j, samp_name in enumerate(samp_names): for k, cont_name in enumerate(cont_names): # note that floored sub MODIFIES the control array. Since we have already written the control array # that is not a big deal but be aware that this modification happens - np.save(args.outpre+"_%s_Sub_%s_actual"%(samp_name,cont_name), floored_sub(samp_final[:,:,j], cont_final[:,:,k])) + np.save(args.out_prefix+"_%s_Sub_%s_actual"%(samp_name,cont_name), floored_sub(samp_final[:,:,j], cont_final[:,:,k])) elif args.save_summaries: # ALL OF THIS IS HARDCODED QUICK CODING. Saves a lot of output files to be # used downstream logging.info("Calculating Summary Stats for finite values") + begin = time.time() # save sample summaries for j, name in enumerate(samp_names): these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, samp_final[:,:,j]) - np.save(args.outpre+"_%s_mean"%name, these_stats[:,0]) - np.save(args.outpre+"_%s_minci"%name, these_stats[:,1]) - np.save(args.outpre+"_%s_maxci"%name, these_stats[:,2]) - np.save(args.outpre+"_%s_var"%name, these_stats[:,3]) - np.save(args.outpre+"_%s_lev"%name, these_stats[:,4]) - np.save(args.outpre+"_%s_median"%name, these_stats[:,5]) - np.save(args.outpre+"_%s_mad"%name, these_stats[:,6]) - np.save(args.outpre+"_%s_num_inf"%name, these_stats[:,7]) - np.save(args.outpre+"_%s_num_nan"%name, these_stats[:,8]) + np.save(args.out_prefix+"_%s_mean"%name, these_stats[:,0]) + np.save(args.out_prefix+"_%s_minci"%name, these_stats[:,1]) + np.save(args.out_prefix+"_%s_maxci"%name, these_stats[:,2]) + np.save(args.out_prefix+"_%s_var"%name, these_stats[:,3]) + np.save(args.out_prefix+"_%s_lev"%name, these_stats[:,4]) + np.save(args.out_prefix+"_%s_median"%name, these_stats[:,5]) + np.save(args.out_prefix+"_%s_mad"%name, these_stats[:,6]) + np.save(args.out_prefix+"_%s_num_inf"%name, these_stats[:,7]) + np.save(args.out_prefix+"_%s_num_nan"%name, these_stats[:,8]) # save control summaries for j, name in enumerate(cont_names): these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, cont_final[:,:,j]) - np.save(args.outpre+"_%s_mean"%name, these_stats[:,0]) - np.save(args.outpre+"_%s_minci"%name, these_stats[:,1]) - np.save(args.outpre+"_%s_maxci"%name, these_stats[:,2]) - np.save(args.outpre+"_%s_var"%name, these_stats[:,3]) - np.save(args.outpre+"_%s_lev"%name, these_stats[:,4]) - np.save(args.outpre+"_%s_median"%name, these_stats[:,5]) - np.save(args.outpre+"_%s_mad"%name, these_stats[:,6]) - np.save(args.outpre+"_%s_num_inf"%name, these_stats[:,7]) - np.save(args.outpre+"_%s_num_nan"%name, these_stats[:,8]) + np.save(args.out_prefix+"_%s_mean"%name, these_stats[:,0]) + np.save(args.out_prefix+"_%s_minci"%name, these_stats[:,1]) + np.save(args.out_prefix+"_%s_maxci"%name, these_stats[:,2]) + np.save(args.out_prefix+"_%s_var"%name, these_stats[:,3]) + np.save(args.out_prefix+"_%s_lev"%name, these_stats[:,4]) + np.save(args.out_prefix+"_%s_median"%name, these_stats[:,5]) + np.save(args.out_prefix+"_%s_mad"%name, these_stats[:,6]) + np.save(args.out_prefix+"_%s_num_inf"%name, these_stats[:,7]) + np.save(args.out_prefix+"_%s_num_nan"%name, these_stats[:,8]) # save each combination of sample - control summary for j, samp_name in enumerate(samp_names): for k, cont_name in enumerate(cont_names): these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, floored_sub(samp_final[:,:,j], cont_final[:,:,k])) - np.save(args.outpre+"_%s_Sub_%s_mean"%(samp_name,cont_name), these_stats[:,0]) - np.save(args.outpre+"_%s_Sub_%s_minci"%(samp_name,cont_name), these_stats[:,1]) - np.save(args.outpre+"_%s_Sub_%s_maxci"%(samp_name, cont_name), these_stats[:,2]) - np.save(args.outpre+"_%s_Sub_%s_var"%(samp_name,cont_name), these_stats[:,3]) - np.save(args.outpre+"_%s_Sub_%s_lev"%(samp_name,cont_name), these_stats[:,4]) - np.save(args.outpre+"_%s_Sub_%s_median"%(samp_name,cont_name), these_stats[:,5]) - np.save(args.outpre+"_%s_Sub_%s_mad"%(samp_name,cont_name), these_stats[:,6]) - np.save(args.outpre+"_%s_Sub_%s_num_inf"%(samp_name,cont_name), these_stats[:,7]) - np.save(args.outpre+"_%s_Sub_%s_num_nan"%(samp_name,cont_name), these_stats[:,8]) + np.save(args.out_prefix+"_%s_Sub_%s_mean"%(samp_name,cont_name), these_stats[:,0]) + np.save(args.out_prefix+"_%s_Sub_%s_minci"%(samp_name,cont_name), these_stats[:,1]) + np.save(args.out_prefix+"_%s_Sub_%s_maxci"%(samp_name, cont_name), these_stats[:,2]) + np.save(args.out_prefix+"_%s_Sub_%s_var"%(samp_name,cont_name), these_stats[:,3]) + np.save(args.out_prefix+"_%s_Sub_%s_lev"%(samp_name,cont_name), these_stats[:,4]) + np.save(args.out_prefix+"_%s_Sub_%s_median"%(samp_name,cont_name), these_stats[:,5]) + np.save(args.out_prefix+"_%s_Sub_%s_mad"%(samp_name,cont_name), these_stats[:,6]) + np.save(args.out_prefix+"_%s_Sub_%s_num_inf"%(samp_name,cont_name), these_stats[:,7]) + np.save(args.out_prefix+"_%s_Sub_%s_num_nan"%(samp_name,cont_name), these_stats[:,8]) + finish = time.time() + logging.info("Calculating Summary Stats took {} seconds".format(finish-begin)) # logging.info("Calculating Summary Stats for nans as zeros") # # save sample summaries # for j, name in enumerate(samp_names): # these_stats = np.apply_along_axis(this_summary_stats_nan_zeros, 1, samp_final[:,:,j]) -# np.save(args.outpre+"_%s_mean"%name, these_stats[:,0]) -# np.save(args.outpre+"_%s_minci"%name, these_stats[:,1]) -# np.save(args.outpre+"_%s_maxci"%name, these_stats[:,2]) -# np.save(args.outpre+"_%s_var"%name, these_stats[:,3]) -# np.save(args.outpre+"_%s_lev"%name, these_stats[:,4]) -# np.save(args.outpre+"_%s_median"%name, these_stats[:,5]) -# np.save(args.outpre+"_%s_mad"%name, these_stats[:,6]) +# np.save(args.out_prefix+"_%s_mean"%name, these_stats[:,0]) +# np.save(args.out_prefix+"_%s_minci"%name, these_stats[:,1]) +# np.save(args.out_prefix+"_%s_maxci"%name, these_stats[:,2]) +# np.save(args.out_prefix+"_%s_var"%name, these_stats[:,3]) +# np.save(args.out_prefix+"_%s_lev"%name, these_stats[:,4]) +# np.save(args.out_prefix+"_%s_median"%name, these_stats[:,5]) +# np.save(args.out_prefix+"_%s_mad"%name, these_stats[:,6]) # # save control summaries # for j, name in enumerate(cont_names): # these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, cont_final[:,:,j]) -# np.save(args.outpre+"_%s_mean"%name, these_stats[:,0]) -# np.save(args.outpre+"_%s_minci"%name, these_stats[:,1]) -# np.save(args.outpre+"_%s_maxci"%name, these_stats[:,2]) -# np.save(args.outpre+"_%s_var"%name, these_stats[:,3]) -# np.save(args.outpre+"_%s_lev"%name, these_stats[:,4]) -# np.save(args.outpre+"_%s_median"%name, these_stats[:,5]) -# np.save(args.outpre+"_%s_mad"%name, these_stats[:,6]) +# np.save(args.out_prefix+"_%s_mean"%name, these_stats[:,0]) +# np.save(args.out_prefix+"_%s_minci"%name, these_stats[:,1]) +# np.save(args.out_prefix+"_%s_maxci"%name, these_stats[:,2]) +# np.save(args.out_prefix+"_%s_var"%name, these_stats[:,3]) +# np.save(args.out_prefix+"_%s_lev"%name, these_stats[:,4]) +# np.save(args.out_prefix+"_%s_median"%name, these_stats[:,5]) +# np.save(args.out_prefix+"_%s_mad"%name, these_stats[:,6]) # # save each combination of sample - control summary # for j, samp_name in enumerate(samp_names): # for k, cont_name in enumerate(cont_names): # these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, floored_sub(samp_final[:,:,j], cont_final[:,:,k])) -# np.save(args.outpre+"_%s_Sub_%s_mean"%(samp_name,cont_name), these_stats[:,0]) -# np.save(args.outpre+"_%s_Sub_%s_minci"%(samp_name,cont_name), these_stats[:,1]) -# np.save(args.outpre+"_%s_Sub_%s_maxci"%(samp_name, cont_name), these_stats[:,2]) -# np.save(args.outpre+"_%s_Sub_%s_var"%(samp_name,cont_name), these_stats[:,3]) -# np.save(args.outpre+"_%s_Sub_%s_lev"%(samp_name,cont_name), these_stats[:,4]) -# np.save(args.outpre+"_%s_Sub_%s_median"%(samp_name,cont_name), these_stats[:,5]) -# np.save(args.outpre+"_%s_Sub_%s_mad"%(samp_name,cont_name), these_stats[:,6]) +# np.save(args.out_prefix+"_%s_Sub_%s_mean"%(samp_name,cont_name), these_stats[:,0]) +# np.save(args.out_prefix+"_%s_Sub_%s_minci"%(samp_name,cont_name), these_stats[:,1]) +# np.save(args.out_prefix+"_%s_Sub_%s_maxci"%(samp_name, cont_name), these_stats[:,2]) +# np.save(args.out_prefix+"_%s_Sub_%s_var"%(samp_name,cont_name), these_stats[:,3]) +# np.save(args.out_prefix+"_%s_Sub_%s_lev"%(samp_name,cont_name), these_stats[:,4]) +# np.save(args.out_prefix+"_%s_Sub_%s_median"%(samp_name,cont_name), these_stats[:,5]) +# np.save(args.out_prefix+"_%s_Sub_%s_mad"%(samp_name,cont_name), these_stats[:,6]) else: - # write final array + # write final array logging.info("Writing final array") - np.save(args.outpre+"_samp", samp_final) - np.save(args.outpre+"_cont", cont_final) + np.save(args.out_prefix+"_samp", samp_final) + np.save(args.out_prefix+"_cont", cont_final) diff --git a/ChIP_analysis/calculate_peaks.py b/ChIP_analysis/calculate_peaks.py index bc8ac39..7f958e6 100644 --- a/ChIP_analysis/calculate_peaks.py +++ b/ChIP_analysis/calculate_peaks.py @@ -170,11 +170,11 @@ def bio_and_tech_filter(bio_qs, tech_qs): for bioq, techq in zip(bio_qs, tech_qs): this_rep += 1*np.logical_and(bioq > -np.log10(args.bioalpha), techq > -np.log10(args.techalpha)) temp_peaks = np.logical_or(temp_peaks, this_rep > 0) - print "Fractions passing bio and tech filter" - print np.sum(this_rep == 1)/(bio_qs[0].size + 0.0) - print np.sum(this_rep == 2)/(bio_qs[0].size + 0.0) - print np.sum(this_rep == 3)/(bio_qs[0].size + 0.0) - print np.sum(this_rep == 4)/(bio_qs[0].size + 0.0) + print ("Fractions passing bio and tech filter") + print (np.sum(this_rep == 1)/(bio_qs[0].size + 0.0)) + print (np.sum(this_rep == 2)/(bio_qs[0].size + 0.0)) + print (np.sum(this_rep == 3)/(bio_qs[0].size + 0.0)) + print (np.sum(this_rep == 4)/(bio_qs[0].size + 0.0)) return temp_peaks def idr_filter(temp_peaks, idrs): @@ -233,6 +233,7 @@ def find_min_value(start, stop, vallist, ret_index=False): parser.add_argument('--resolution', type=float, help="resolution of data", default=1) parser.add_argument('--bins', type=int, help="bins to consolidate over", default=30) parser.add_argument('--outpre', type=str, help="output prefix") + parser.add_argument('--chrom_name', type=str, help="Chromosome name") args = parser.parse_args() actual_sigs = [np.load(x) for x in args.log2ratios] @@ -259,7 +260,7 @@ def find_min_value(start, stop, vallist, ret_index=False): max_valbio = find_max_value(entry[0], entry[1], bio_qs) max_validr = find_max_value(entry[0], entry[1], idrs, False) max_valtech = find_max_value(entry[0], entry[1], tech_qs, False) - this_peak = peak.Peak("gi|48994873|gb|U00096.2|mod|ATCC.47076|", start=int(start), end=int(end), + this_peak = peak.Peak(args.chrom_name, start=int(start), end=int(end), name = "%s_%s"%(args.outpre,i), score = max_value, signalval = max_valtech, diff --git a/ChIP_analysis/sam_utils.py b/ChIP_analysis/sam_utils.py index 101964f..e6c6fb5 100755 --- a/ChIP_analysis/sam_utils.py +++ b/ChIP_analysis/sam_utils.py @@ -35,6 +35,7 @@ ################################################################################ ## Python base imports ## +from __future__ import print_function import re import itertools import logging @@ -62,7 +63,7 @@ def grouper(iterable, n, fillvalue=None): ('G', 'x', 'x') """ args = [iter(iterable)] * n - return itertools.izip_longest(fillvalue=fillvalue, *args) + return itertools.zip_longest(fillvalue=fillvalue, *args) def complement(sequence): """Complement a nucleotide sequence @@ -89,7 +90,7 @@ class SamAlignment: """ Class for holding a single line out of a SAM file. Along with methods to manipulate and extract data from the alignment. """ - + def __init__(self, line, sep = '\t'): """ Class initalizer. Takes in a single line from a Sam alignment and stores each and every field as a class attribute. @@ -213,8 +214,8 @@ def __init__(self, line, sep = '\t'): self.aligned_muts = None def parse_opt_fields(self, fields_list): - """ Parses each field from the optional fields in the - SAM file. + """ Parses each field from the optional fields in the + SAM file. Inputs: List. Contains all the optional fields in a sam file Modifies: nothing @@ -243,7 +244,7 @@ def parse_opt_fields(self, fields_list): # right now, could be implemented in the future # Have a dictionary to hold the functions to convert each type of # field to the type it indicates - d_type_func = {'A': str, 'i': int, 'f':float, 'Z':str, + d_type_func = {'A': str, 'i': int, 'f':float, 'Z':str, 'H': str, 'B': str} # initialize a dictionary to hold the parsed fields field_dict = {} @@ -263,9 +264,9 @@ def quality_filter(self, good_flags=[], bad_flags=[], mapq=None, stats=None): Inputs allow additional, filters for determining if a read is good or not. - Input: good_flags - flags that are required to be present in self.FLAG + Input: good_flags - flags that are required to be present in self.FLAG in order to pass the filter - bad_flags - flags that are required to be absent from self.FLAG + bad_flags - flags that are required to be absent from self.FLAG in order to pass the filter [mapq] - int. If positive, only return true if self.MAPQ is >= mapq. If negative only return true if self.MAPQ @@ -275,7 +276,7 @@ def quality_filter(self, good_flags=[], bad_flags=[], mapq=None, stats=None): on how many reads are filtered for what issues. Modifies: stats - Returns: True if the read passes filters and False otherwise. + Returns: True if the read passes filters and False otherwise. Tests: @@ -344,27 +345,27 @@ def quality_filter(self, good_flags=[], bad_flags=[], mapq=None, stats=None): elif mapq >= 0: # if the mapq is greater than 0 then we are looking for reads # ABOVE the mapq score specified - mapq_condition = self.MAPQ >= mapq + mapq_condition = self.MAPQ >= mapq elif mapq < 0: # if the mapq is less than 0 then we are looking for reads # BELOW the mapq score specified - mapq_condition = self.MAPQ < abs(mapq) + mapq_condition = self.MAPQ < abs(mapq) else: raise ValueError("mapq must be a positive or negative integer "\ - "value.") + "value.") good_al = good_al and mapq_condition if stats and not mapq_condition: stats.total_mapq += 1 # finally return whether the alignment was good or not return bool(good_al) - + def is_gapped(self): """ Determine if the alignment has any gaps in it as determined by the CIGAR field. Inputs: nothing Modifies: nothing - Returns: True if 'N' is in the CIGAR field. + Returns: True if 'N' is in the CIGAR field. False if not. Raises error if the CIGAR field = "*" Tests: >>> line = "testQ.1.testR.20.30.3M.testR.25.9.AAA.(((.NM:i:0" @@ -399,7 +400,7 @@ def is_rc(self, paired): reference. If paired, forces each read of the pair to have opposite orientations, otherwise it raises an error - Inputs: paired - boolean. if true, checks reads mate to see if it is + Inputs: paired - boolean. if true, checks reads mate to see if it is in a consistent orientation. If not, raises error. if false, just checks if read is rc to reference. Modifies: nothing @@ -441,7 +442,7 @@ def is_rc(self, paired): this_is_rc = eval(bin(self.FLAG)) & 0x10 # check if the read's pair is reverse complement to the reference other_is_rc = eval(bin(self.FLAG)) & 0x20 - + # if paired, force the read's pair to have the opposite orientation # of the read. Raise error if not if paired: @@ -460,14 +461,14 @@ def is_rc(self, paired): def sense_to_ref(self, paired, library): """ Function to determine whether the ORIGINAL RNA molecule was sense - to the reference or antisense to the reference. Only implemented for + to the reference or antisense to the reference. Only implemented for pairs when each read is in a different orientation - + Inputs: paired - boolean. If True than additional checking will be done to make sure the paired read is consistent with this read. If False, then it is considered an individual read. - library - str. ["R1" | "R2" | "unstranded"]. R1 indicates that + library - str. ["R1" | "R2" | "unstranded"]. R1 indicates that the first read sequenced (*_R1.fasta) is sense to the original RNA strand. R2 indicates that the 2nd read sequenced @@ -502,7 +503,7 @@ def sense_to_ref(self, paired, library): Traceback (most recent call last): RuntimeError: Read mate ... - Test a paired end read + Test a paired end read >>> read.FLAG = 32 | 64 >>> read.sense_to_ref(True, "R1") True @@ -527,7 +528,7 @@ def sense_to_ref(self, paired, library): """ # if the read is paired, we have to consider the orientation of # the reads - if paired: + if paired: # first segment in the template is_first = eval(bin(self.FLAG)) & 0x40 # last segment in the template @@ -537,7 +538,7 @@ def sense_to_ref(self, paired, library): is_rc = self.is_rc(True) # raise the error if this can't be determined except RuntimeError: - raise + raise # determine what orientation that entire segment is in if ( (is_first and not(is_rc)) or (is_second and is_rc) ): if library == "R1" or library == "unstranded": @@ -600,7 +601,7 @@ def get_cigar_tuples(self): [(7, 'M')] More complex test - >>> read.CIGAR = "3M4D2M1I1M" + >>> read.CIGAR = "3M4D2M1I1M" >>> read.cigar_tuples = None >>> read.get_cigar_tuples() [(3, 'M'), (4, 'D'), (2, 'M'), (1, 'I'), (1, 'M')] @@ -616,8 +617,8 @@ def get_cigar_tuples(self): # split using regular expressions. The parenthesis in the re gives us # the seperators as well cigar_list = re.split('([MIDNSHP=X])', self.CIGAR) - - # loop through by twos using itertools grouper recipe from the + + # loop through by twos using itertools grouper recipe from the # python itertools documentation. # The cigar string always starts with a number and ends in a char, # so we cut the list short by one since the last value by twos will @@ -634,13 +635,13 @@ def get_cigar_tuples(self): def get_aligned_blocks(self): """ Function to take the cigar field and determine the locations - where there is continuous mapping coverage. - + where there is continuous mapping coverage. + Inputs: nothing Modifies: nothing - Returns: a list of (start, end) locations where continuous mapping - coverage occurs. Continuous coverage includes locations where - there is continuous 'M', '=', 'D', or 'X' in the CIGAR field. + Returns: a list of (start, end) locations where continuous mapping + coverage occurs. Continuous coverage includes locations where + there is continuous 'M', '=', 'D', or 'X' in the CIGAR field. Breaks occur at 'S', 'I', or 'N' in the CIGAR field. Tests: All matching test, read.POS is at 1, cigar is 7M @@ -690,7 +691,7 @@ def get_aligned_blocks(self): start = self.POS # parse the cigar string into tuples cigar_tuples = self.get_cigar_tuples() - + # go through each cigar number and character for val, char in cigar_tuples: # if it is an alignment match of any sort, @@ -699,13 +700,13 @@ def get_aligned_blocks(self): # just add the value to the previous end. if char in ['M', '=', 'X', 'D']: if end is not None: - end += val + end += val else: end = start + val # If there is an intron, go ahead and append # the previous (start, end) pair to the list and reset the # next end to being unknown. Additionally, push the next start - # to the end of the intron. If an end had not been + # to the end of the intron. If an end had not been # determined yet, do not add the (start, end) pair to the list # as this is a continuing insertion of some sort. elif char == 'N': @@ -732,7 +733,7 @@ def get_aligned_seq_and_phred(self): """This function uses the CIGAR field of the read to determine what the sequence that aligns to the reference looks like after - removing any insertions and accounting for deletions. It likewise + removing any insertions and accounting for deletions. It likewise returns the corresponding quality scores associated with the sequence. Inputs: nothing @@ -741,7 +742,7 @@ def get_aligned_seq_and_phred(self): removed phred - phred scores corresponding to each base in seq Tests: - + Test an all matching sequence >>> line = "testQ.1.testR.2.30.7M.testR.4.9.AGTCGCT.!#%()+,.NM:i:0" >>> read = SamAlignment(line, sep = '.') @@ -797,11 +798,11 @@ def get_aligned_seq_and_phred(self): # expand out the tuples to make a big cigar string for val, char in cigar_tuples: expanded_cigar += val*char - + # remove the elements that would not be included in the SEQ field expanded_cigar = expanded_cigar.replace('N', '') expanded_cigar = expanded_cigar.replace('H', '') - + #Initialize a new seq variable new_seq = '' new_phred = '' @@ -839,10 +840,10 @@ def get_aligned_seq_and_phred(self): def get_gap_blocks(self): """ Function to take the cigar field and determine the locations where there is a gap in coverage - + Inputs: nothing Modifies: nothing - Returns: a list of (start, end) locations where gaps occur in mapping + Returns: a list of (start, end) locations where gaps occur in mapping coverage. Tests: @@ -896,7 +897,7 @@ def get_gap_blocks(self): new_end = start[1:] # zip them together into a new list of tuples and return return zip(new_start, new_end) - + def get_aligned_locs(self): """ Function to get the corresponding locations with @@ -953,7 +954,7 @@ def get_aligned_locs(self): >>> read.get_aligned_locs() [1, 2, 3, 4, 5] """ - + # get both the seq with no gaps and the start and stop # locations if self.aligned_locs: @@ -975,7 +976,7 @@ def start_end_gaps(self, paired): consideration. Thus, if paired is True, it will return the start and end for the entire pair, but only the locations within gaps for the individual read it was called on. - + Inputs: paired - boolean. Treat read as SE or PE. If paired is True, returns the pos + TLEN as the end. If false then returns the right most mapped location @@ -988,7 +989,7 @@ def start_end_gaps(self, paired): gaps - list. locations that reside within gaps Tests: - + All matching test, read.POS is at 1, cigar is 7M, read.TLEN is 12 >>> line = "testQ.1.testR.2.30.7M.=.4.12.AGTCGCT.!#%()+,.NM:i:0" >>> read = SamAlignment(line, sep = '.') @@ -1007,7 +1008,7 @@ def start_end_gaps(self, paired): (1, 13, []) Test internal introns - >>> line = "testQ.1.testR.2.30.2M3N5M.=.4.12.AGTCGCT.!#%()+,.NM:i:0" + >>> line = "testQ.1.testR.2.30.2M3N5M.=.4.12.AGTCGCT.!#%()+,.NM:i:0" >>> read = SamAlignment(line, sep = '.') >>> read.start_end_gaps(False) (1, 11, [(3, 6)]) @@ -1068,7 +1069,7 @@ def reconstruct_reference(self): Input: nothing Modifies: nothing Returns: string. Genome sequence reconstructed from MD field - list. Tuples containing mutations in the read from the + list. Tuples containing mutations in the read from the reference, [(loc, ref_base, mut_base, phred, loc)] Tests: @@ -1097,14 +1098,14 @@ def reconstruct_reference(self): >>> read.reconstruct_reference() ('GGTCGCG', [(1, 'G', 'A', '!'), (7, 'G', 'T', ',')]) - Test internal deletions, + Test internal deletions, >>> line = "testQ.1.testR.2.30.2M3D5M.=.4.12.AGTCGCT.!#%()+,.MD:Z:2^AAA5" >>> read = SamAlignment(line, sep = '.') >>> read.reconstruct_reference() ('AGAAATCGCT', [(3, 'A', '-', '-'), (4, 'A', '-', '-'), (5, 'A', '-', '-')]) Test internal introns - >>> line = "testQ.1.testR.2.30.2M3N5M.=.4.12.AGTCGCT.!#%()+,.MD:Z:7" + >>> line = "testQ.1.testR.2.30.2M3N5M.=.4.12.AGTCGCT.!#%()+,.MD:Z:7" >>> read = SamAlignment(line, sep = '.') >>> read.reconstruct_reference() ('AGTCGCT', []) @@ -1130,7 +1131,7 @@ def reconstruct_reference(self): if self.aligned_reference and self.aligned_muts: return self.aligned_reference, self.aligned_muts # get the aligned sequence and phred scores - aligned_seq, aligned_phred = self.get_aligned_seq_and_phred() + aligned_seq, aligned_phred = self.get_aligned_seq_and_phred() # get the aligned locations aligned_locs = self.get_aligned_locs() # create a list to hold the split up MD values @@ -1140,7 +1141,7 @@ def reconstruct_reference(self): # split using regular expressions. The parenthesis give us the # seperators as well try: - MD_list = re.split('([AGTC^])', self.OPT['MD']) + MD_list = re.split('([AGTC^])', self.OPT['MD']) except KeyError: raise KeyError("MD field not found for read %s"%self.QNAME) # make a list to hold of the mutations that are found @@ -1162,7 +1163,7 @@ def reconstruct_reference(self): # append this value to the reference sequence reference_seq.append(val) # we also append mutation information to the mutation list - mut_list.append((aligned_locs[start], val, + mut_list.append((aligned_locs[start], val, aligned_seq[start], aligned_phred[start])) # finally we increment the index of where we are on the @@ -1177,7 +1178,7 @@ def reconstruct_reference(self): # we append all of this to the reference sequence reference_seq.append(aligned_seq[start:end]) # and change the end to become the start - start = end + start = end # finally we join the list of bases in the reference sequence to a # string and cache it along with the mutation list self.aligned_reference = ''.join(reference_seq) @@ -1187,7 +1188,7 @@ def reconstruct_reference(self): def record_muts(self, strand, phred_encoding=33): """ Function to find mutations found in the read as compared to the - reference. Considers all reads individually (doesn't matter if paired + reference. Considers all reads individually (doesn't matter if paired or not). This depends on the presence of both a CIGAR field and an MD field in the alignment. If these are not present then this cannot be used. It also requires the XM field to be present @@ -1198,7 +1199,7 @@ def record_muts(self, strand, phred_encoding=33): [phred_encoding] - int. what is the phred-encoding for the read? the default is 33. Modifies: nothing - Returns: A list of mutation records in the form: + Returns: A list of mutation records in the form: (chrm, loc, strand, base, mut_base, phred) Tests: @@ -1276,16 +1277,16 @@ def get_ref_base_locs(self, base, strand): that actually have sequence for the read, not its pair or the interpolated region between the pair and itself. - Inputs: base - char. The identity of the reference base + Inputs: base - char. The identity of the reference base strand - char. ["+" | "-" | "."] If + treats the base as is, - if - searches for the bases complement. If . it + if - searches for the bases complement. If . it searches for both the base and its complement ( essentially search for base pairs) Modifies: nothing Returns: a numpy array of locations where the base occurs on the read. Tests: - + All base of interest >>> line = "testQ.1.testR.2.30.7M.testR.4.9.TTTTTTT.!#%()+,.NM:i:0.MD:Z:7" >>> read = SamAlignment(line, sep = '.') @@ -1327,15 +1328,15 @@ def get_ref_base_locs(self, base, strand): if strand == "-": # NOTE only complementing! not reverse complementing for - # minus strand read - base = complement(base) + # minus strand read + base = complement(base) # use numpy arrays to quickly find where bases are in the reference # sequence base_index = np.where(np.asarray(list(genome_sequence)) == base)[0] locs_list = [aligned_locs[i] for i in base_index] # if we don't have stranded information, then we have to consider that - # the "base" could be from either strand. We therefore search for a + # the "base" could be from either strand. We therefore search for a # "base-pair" i.e. find all the locations of both the base and its # complement if strand == ".": @@ -1367,7 +1368,7 @@ def __init__(self): Returns: None Tests: - + Test initialization >>> x = MutDataFrame() >>> x.mutation_list @@ -1391,11 +1392,11 @@ def __init__(self): def create_df_from_list(self): """Takes the list of mutation data and enters it into the dataframe all at once. Only to be done once all mutations are found. Mutation_list - has to be in the form of: + has to be in the form of: [(chrom1, loc1, strand1, base1, mut1, phred1), ...,(chromN, locN, strandN, baseN, mutN, phredN)] - + Inputs: None Modifies: self.df @@ -1483,7 +1484,7 @@ def do_fdr(self, alpha=0.05): >>> x.df chrom loc strand base mut phred pvalue qvalue 0 test 1 + A T 30 0.001 0.002 - + Test a non-passing value >>> x = MutDataFrame() >>> x.mutation_list = [("test", 2, "-","G","C",2)] @@ -1497,7 +1498,7 @@ def do_fdr(self, alpha=0.05): # defaults to assuming phred scores havent been converted to p values self.phred_to_p() # calculate the qvalues, alpha does nothing in this regard here - self.df["qvalue"] = multicomp.fdrcorrection0(self.df["pvalue"], + self.df["qvalue"] = multicomp.fdrcorrection0(self.df["pvalue"], alpha)[1] # filter by alpha self.df = self.df[self.df["qvalue"] < alpha] @@ -1517,9 +1518,9 @@ def filter_mut_type(self, base, mut, data_frame=None): Example: get all the T to C mutations from region chr1:1-100 that were from the "+" strand: - - filtered_df = Mut_Df.filter_strand('+', - data_frame= Mut_Df.filter_region("chr1", (1,100), + + filtered_df = Mut_Df.filter_strand('+', + data_frame= Mut_Df.filter_region("chr1", (1,100), data_frame=Mut_Df.filter_mut_type(T, C))) Tests: >>> x = MutDataFrame() @@ -1556,7 +1557,7 @@ def filter_mut_type(self, base, mut, data_frame=None): def filter_region(self, chrom, locs=None, data_frame=None): """Function to grab mutations within a certain region. Can be coupled - with filter_mut_type or filter_strand by specifying + with filter_mut_type or filter_strand by specifying data_frame = output_from on each subsequent function Inputs: chrom - char chromosome you are looking for @@ -1571,9 +1572,9 @@ def filter_region(self, chrom, locs=None, data_frame=None): Example: get all the T to C mutations from region chr1:1-100 that were from the "+" strand: - - filtered_df = Mut_Df.filter_strand('+', - data_frame= Mut_Df.filter_region("chr1", (1,100), + + filtered_df = Mut_Df.filter_strand('+', + data_frame= Mut_Df.filter_region("chr1", (1,100), data_frame=Mut_Df.filter_mut_type(T, C))) Tests: >>> x = MutDataFrame() @@ -1592,7 +1593,7 @@ def filter_region(self, chrom, locs=None, data_frame=None): else: df = self.df if locs: - df = df[(df['chrom'] == chrom) & + df = df[(df['chrom'] == chrom) & ((df['loc'] >= locs[0]) & (df['loc'] <= locs[1]))] else: df = df[(df['chrom']==chrom)] @@ -1616,9 +1617,9 @@ def filter_strand(self, strand, data_frame=None): Example: get all the T to C mutations from region chr1:1-100 that were from the "+" strand: - - filtered_df = Mut_Df.filter_strand('+', - data_frame= Mut_Df.filter_region("chr1", (1,100), + + filtered_df = Mut_Df.filter_strand('+', + data_frame= Mut_Df.filter_region("chr1", (1,100), data_frame=Mut_Df.filter_mut_type(T, C))) Tests: >>> x = MutDataFrame() @@ -1644,12 +1645,12 @@ def filter_strand(self, strand, data_frame=None): def count_muts(self, data_frame=None): """Increments counters for each type of mutation. - + Inputs: [data_frame] - dataframe from the results of a filter. If None (default), then uses self.df Modifies: nothing - Returns: dictionary of mutations with mutation type as the key. For + Returns: dictionary of mutations with mutation type as the key. For example, dict['AT']: 30, if there were 30 mutations that were A->T Tests: @@ -1757,7 +1758,7 @@ def __add__(self, other): for key in set(self.bad_flag_reads.keys(), other.bad_flag_reads.keys()): new.bad_flag_reads[key] = self.bad_flag_reads.get(key, 0) +\ other.bad_flag_reads.get(key, 0) - for key in set(self.missed_good_flag_reads.keys(), + for key in set(self.missed_good_flag_reads.keys(), other.missed_good_flag_reads.keys()): new.missed_good_flag_reads[key] = self.missed_good_flag_reads.get(key, 0) +\ other.missed_good_flag_reads.get(key, 0) @@ -1809,50 +1810,50 @@ def increment_bases(self, read, strand): self.base_counts['G'] += seq.count('G') self.base_counts['C'] += seq.count('C') self.base_counts['A'] += seq.count('A') - + def print_map_stats(self, section, mapq, paired, locs=False): """ This prints out each of the counters in an easy to read format """ - print "MAPPING STATS FOR %s"%section + print("MAPPING STATS FOR {}".format(section)) if paired: - print "Paired-End Mode" + print("Paired-End Mode") else: - print "Single-End Mode" - print 20*"-" - print "Total reads in file: %s" %self.total_reads - print "" - print "Quality Filtering" - print "Total reads filtered: %s"%(self.total_reads - self.total_mapped) - print "Filtered for:" + print("Single-End Mode") + print(20*"-") + print("Total reads in file: {}".format(self.total_reads)) + print("") + print("Quality Filtering") + print("Total reads filtered: {}".format(self.total_reads - self.total_mapped)) + print("Filtered for:") for flag in self.missed_good_flag_reads.keys(): - print "\tMissing %s: %s"%(self.flag_mapping[flag], self.missed_good_flag_reads[flag]) + print("\tMissing {}: {}".format(self.flag_mapping[flag], self.missed_good_flag_reads[flag])) for flag in self.bad_flag_reads.keys(): - print "\t%s: %s"%(self.flag_mapping[flag], self.bad_flag_reads[flag]) + print("\t{}: {}".format(self.flag_mapping[flag], self.bad_flag_reads[flag])) if mapq is None: pass elif mapq >= 0: - print "\tMAPQ < %s : %s"%(mapq, self.total_mapq) + print("\tMAPQ < {} : {}".format(mapq, self.total_mapq)) elif mapq < 0: - print "\tMAPQ > %s : %s"%(mapq, self.total_mapq) + print("\tMAPQ > {} : {}".format(mapq, self.total_mapq)) if locs: - print "\tOverlap with locations: %s"%(self.total_overlap) - print "\tStrandedness could not be determined: %s"%(self.total_bad_sense) + print("\tOverlap with locations: {}".format(self.total_overlap)) + print("\tStrandedness could not be determined: {}".format(self.total_bad_sense)) if paired: - print "Filtered pairs that mapped to plus strand: %s"%( - self.paired["+"]) - print "Filtered pairs that mapped to minus strand: %s"%( - self.paired["-"]) - print "Filtered pairs that mapped to either strand: %s"%( - self.paired["."]) - print "Total pair basepairs considered: %s"%(self.total_paired_bp) + print("Filtered pairs that mapped to plus strand: {}".format( + self.paired["+"])) + print("Filtered pairs that mapped to minus strand: {}".format( + self.paired["-"])) + print("Filtered pairs that mapped to either strand: {}".format( + self.paired["."])) + print("Total pair basepairs considered: {}".format(self.total_paired_bp)) else: - print "Filtered reads that mapped to plus strand: %s"%( - self.unpaired["+"]) - print "Filtered reads that mapped to minus strand: %s"%( - self.unpaired["-"]) - print "Filtered reads that mapped to either strand: %s"%( - self.unpaired["."]) - print "Total read basepairs considered: %s"%(self.total_read_bp) + print("Filtered reads that mapped to plus strand: {}".format( + self.unpaired["+"])) + print("Filtered reads that mapped to minus strand: {}".format( + self.unpaired["-"])) + print("Filtered reads that mapped to either strand: {}".format( + self.unpaired["."])) + print("Total read basepairs considered: {}".format(self.total_read_bp)) def calc_mut_rates(self, stranded): """ Calculates mutation rate for each type of mutation @@ -1888,37 +1889,37 @@ def calc_mut_rates(self, stranded): mut_rate = (self.mut_counts[base+mut] + 0.0 + self.mut_counts[complement(base) +complement(mut)])/( - self.base_counts[base] + + self.base_counts[base] + self.base_counts[complement(base)]) except ZeroDivisionError: mut_rate = 0.0 out_dict[base+mut] = mut_rate return out_dict - def print_mut_stats(self, stranded): + def print_mut_stats(self, stranded): - print "Mutation Stats" - print 20*"-" - print "Total mutations before FDR: %s"%self.muts_before_filt - print "Total mutations after FDR: %s"%sum(self.mut_counts.values()) + print("Mutation Stats") + print(20*"-") + print("Total mutations before FDR: %s"%self.muts_before_filt) + print("Total mutations after FDR: %s"%sum(self.mut_counts.values())) for mut_type in self.mut_counts.keys(): - print "Total %s->%s mutations after FDR: %s"%(mut_type[0], - mut_type[1], - self.mut_counts[mut_type]) + print("Total {}->{} mutations after FDR: {}".format(mut_type[0], + mut_type[1], + self.mut_counts[mut_type])) for base in self.base_counts.keys(): - print "Total reference %s sequenced: %s"%(base, self.base_counts[base]) - print "Mutation Rate Matrix (after filtering):" - print "Original Base on left, Mutation base on top" - print "\tA\tG\tT\tC" + print("Total reference %s sequenced: %s"%(base, self.base_counts[base])) + print("Mutation Rate Matrix (after filtering):") + print("Original Base on left, Mutation base on top") + print("\tA\tG\tT\tC") mut_rates = self.calc_mut_rates(stranded) - print "A\tX\t%.3e\t%.3e\t%.3e"%(mut_rates["AG"], mut_rates["AT"], - mut_rates["AC"]) - print "G\t%.3e\tX\t%.3e\t%.3e"%(mut_rates["GA"], mut_rates["GT"], - mut_rates["GC"]) - print "T\t%.3e\t%.3e\tX\t%.3e"%(mut_rates["TA"], mut_rates["TG"], - mut_rates["TC"]) - print "C\t%.3e\t%.3e\t%.3e\tX"%(mut_rates["CA"], mut_rates["CG"], - mut_rates["CT"]) + print("A\tX\t%.3e\t%.3e\t%.3e"%(mut_rates["AG"], mut_rates["AT"], + mut_rates["AC"])) + print("G\t%.3e\tX\t%.3e\t%.3e"%(mut_rates["GA"], mut_rates["GT"], + mut_rates["GC"])) + print("T\t%.3e\t%.3e\tX\t%.3e"%(mut_rates["TA"], mut_rates["TG"], + mut_rates["TC"])) + print("C\t%.3e\t%.3e\t%.3e\tX"%(mut_rates["CA"], mut_rates["CG"], + mut_rates["CT"])) def write_mut_stats(self, out_prefix, stranded): mut_rates = self.calc_mut_rates(stranded) @@ -1931,7 +1932,7 @@ def write_exposure_df(self, out_prefix, paired=False): file_out = out_prefix + "_exposure.txt" with open(file_out, 'w') as fn: if paired: - fn.write("%s\t%s\n" %((self.total_mapped/2), + fn.write("%s\t%s\n" %((self.total_mapped/2), self.total_paired_bp)) else: fn.write("%s\t%s\n" %((self.total_mapped), self.total_read_bp)) diff --git a/ChIP_analysis/sam_utils.pyc b/ChIP_analysis/sam_utils.pyc index bfe5039..158a793 100644 Binary files a/ChIP_analysis/sam_utils.pyc and b/ChIP_analysis/sam_utils.pyc differ diff --git a/ChIP_analysis/save_csv_files.py b/ChIP_analysis/save_csv_files.py new file mode 100644 index 0000000..efc170c --- /dev/null +++ b/ChIP_analysis/save_csv_files.py @@ -0,0 +1,34 @@ +#%% +import numpy as np +import os +import fnmatch +import pandas as pd + +# %% +file_names = ['PcrA_Myc-PcrA_Sub_Myc_control_median.npy', + 'PcrA_Myc-PcrA_Sub_Myc_control_maxci.npy', + 'PcrA_Myc-PcrA_Sub_Myc_control_mean.npy', + 'PcrA_Myc-PcrA_Sub_Myc_control_mad.npy', + 'PcrA_Myc-PcrA_Sub_Myc_control_minci.npy', + 'PcrA_Myc-PcrA_Sub_Myc_control_var.npy'] +# %% +arr_list = [] +for fname in file_names: + arr_list.append(np.load(fname)) +dat = np.array(arr_list) +# %% +dat = np.transpose(dat) + +# %% +dat.shape +# %% +df = pd.DataFrame(dat) + +# %% +df.columns = ['median','maxci','mean','mad','minci','var'] + +# %% +df['position'] = np.arange(0, dat.shape[1]) * 10 + 5 + +# %% +df.to_csv('PcrA_over_Myc_control_enrichment_stats.csv') \ No newline at end of file