mikewolfe · jwschroeder3 · Mar 12, 2019 · May 17, 2019 · May 20, 2019 · May 20, 2019
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "pythonPath": "/home/wanglab/anaconda3/bin/python"
+        }
+    ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.pythonPath": "/home/wanglab/anaconda3/bin/python"
+}
diff --git a/ChIP_analysis/README.md b/ChIP_analysis/README.md
@@ -7,42 +7,66 @@ binding sites
 ## Pre-processing of ChIP data ##
 For each fastq file, the following steps were taken to preprocess the data
 
+Reads were concatenated from multiple files into a single large fastq.gz file
+
+Here:  
+arg0 is substituted for a samples prefix
+
+```bash
+cat *R1*.fastq.gz > arg0_all_R1.fastq.gz
+cat *R2*.fastq.gz > arg0_all_R2.fastq.gz
+```
+
+Illumina adapters were trimmed from reads.
+
 Here:
 arg0 is substituted for a samples prefix
 arg1 represents the fastq for R1 of the paired end sequencing
 arg2 represents the fastq for R2 of the paired end sequencing
 
-```
+```bash
 fastqc arg1 -f fastq -o fastqc_before/
 fastqc arg2 -f fastq -o fastqc_before/
 cutadapt --quality-base=33 -a AGATCGGAAGAGC -A AGATCGGAAGAGC -n 3 -m 20 --mask-adapter --match-read-wildcards -o arg0_R1_cutadapt.fastq.gz -p arg0_R2_cutadapt.fastq.gz arg1 arg2 > arg0_cutadapt.log 2> arg0_cutadapt.err
-TrimmomaticPE -phred33 arg0_R1_cutadapt.fastq.gz arg0_R2_cutadapt.fastq.gz arg0_R1_trim_paired.fastq.gz arg0_R1_trim_unpaired.fastq.gz arg0_R2_trim_paired.fastq.gz arg0_R2_trim_unpaired.fastq.gz TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:20 > arg0_trim.log 2> arg0_trim.err
+java -jar trimmomatic-0.39.jar PE -phred33 arg0_R1_cutadapt.fastq.gz arg0_R2_cutadapt.fastq.gz arg0_R1_trim_paired.fastq.gz arg0_R1_trim_unpaired.fastq.gz arg0_R2_trim_paired.fastq.gz arg0_R2_trim_unpaired.fastq.gz TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:20 > arg0_trim.log 2> arg0_trim.err
 fastqc arg0_R1_trim_paired.fastq.gz -f fastq -o fastqc_after/
 fastqc arg0_R2_trim_paired.fastq.gz -f fastq -o fastqc_after/
 ```
 
+If you have single end data:
+
+```bash
+fastqc arg1 -f fastq -o fastqc_before/
+cutadapt --quality-base=33 -a AGATCGGAAGAGC -a CGGAAGAGCACAC -n 3 -m 20 --mask-adapter --match-read-wildcards -o arg0_R1_cutadapt.fastq.gz arg1 > arg0_cutadapt.log 2> arg0_cutadapt.err
+java -jar trimmomatic-0.39.jar SE -phred33 arg0_R1_cutadapt.fastq.gz arg0_R1_trim_unpaired.fastq.gz TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:20 > arg0_trim.log 2> arg0_trim.err
+fastqc arg0_R1_trim_paired.fastq.gz -f fastq -o fastqc_after/
+```
+
 ## Alignment of processed reads ##
-Next, each pair of processed reads was aligned with bowtie2. 
+Next, each pair of processed reads was aligned with bowtie2.
 
 Here
 arg0 is substituted for a samples prefix
 arg1 represents the trimmed paired fastq for R1 of the paired end sequencing
 arg2 represents the trimmed paired fastq for R2 of the paired end sequencing
 arg3 represents the trimmed unpaired fastq for R1 of the paired end sequencing
 arg4 represents the trimmed unpaired fastq for R2 of the paired end sequencing
-```
-bowtie2 -x /home/mbwolfe/genomes/bowtie2indices/ATCC_47076 -1 arg1 -2 arg2 -U arg3,arg4 -X 2000 -q --end-to-end --very-sensitive -p 5 --phred33 --dovetail 2> arg0_bow.log | samtools view -bSh - > arg0.bam 
+
+```bash
+bowtie2 -x /home/mbwolfe/genomes/bowtie2indices/ATCC_47076 -1 arg1 -2 arg2 -U arg3,arg4 -X 2000 -q --end-to-end --very-sensitive -p 5 --phred33 --dovetail 2> arg0_bow.log | samtools view -bSh - > arg0.bam
 ```
 
 ## Mapping of coverage and preparation for bootstrapping ##
 Next, reads were filtered with samtools and made into a sampling object using
 the custom script `bootstrap_sam_file.py`'s `parse` option.
+### Note: if you have single-end reads, DO NOT include the `-f 3` flag, as it will exclude all reads without a proper pair, and you ain't got 'em.
 
 Here
 arg0 is the sample prefix
 arg1 is the input bam file
-```
-samtools view -f 3 -F 2828 -q 30 arg1 | python bootstrap_sam_file.py parse - arg0.ob --paired 2> arg0_sampler.err
+
+```bash
+samtools view -f 3 -F 2828 -q 30 arg1 | python3 bootstrap_sam_file.py parse - arg0.ob --paired 2> arg0_sampler.err
 ```
 
 ## Obtaining summary tracks at 10 bp resolution ##
@@ -58,8 +82,8 @@ arg7 and arg8 are samplers for the KO input samples
 
 Note that arg1 and arg3 are paired, arg2 and arg4 are paired etc.
 
-```
-python bootstrapped_chip_no_consolidation.py 4639676 arg0 --ext_samps arg1 arg2 --inp_samps arg3 arg4 --ext_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1 --identity -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0.log
+```bash
+python3 bootstrapped_chip_no_consolidation.py --numba --sample_name_luts run_info --genome_size 4215607 --out_prefix arg0 --ChIP_samps arg1 arg2 --inp_samps arg3 arg4 --ChIP_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1 --identity -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0.log
 ```
 
 ## Obtaining bootstrap replicate summary statistics ##
@@ -76,8 +100,8 @@ arg7 and arg8 are samplers for the KO input samples
 
 Note that arg1 and arg3 are paired, arg2 and arg4 are paired etc.
 
-```
-python bootstrapped_chip_no_consolidation.py 4639676 arg0 --ext_samps arg1 arg2 --inp_samps arg3 arg4 --ext_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1000 -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0_1000_bootstrap.log
+```bash
+python3 bootstrapped_chip_no_consolidation.py --numba --sample_name_luts run_info --genome_size 4215607 --out_prefix arg0 --ChIP_samps arg1 arg2 --inp_samps arg3 arg4 --ChIP_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1000 -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0_1000_bootstrap.log
 ```
 
 ## Calculating IDR statistic ##
@@ -89,7 +113,8 @@ arg0 is the sample prefix
 arg1 is one WT-KO replicate
 arg2 is a second WT-KO replicate
 arg3 is the estimated number of Lrp octamers for that condition
-```
+
+```bash
 Rscript calculate_idr.R arg1 arg2 arg3 arg0 > arg0.log 2> arg0.err
 ```
 
@@ -105,8 +130,9 @@ mad1-4.npy represent the corresponding bootstrap mad for each RSE replicate
 idr_combo1-2.npy represent each combination of subtracted replicates for the idr
                  calculation
 out_peaks is the output prefix
-```
-python calculate_peaks.py --log2ratios actual_sub1.npy actual_sub2.npy actual_sub3.npy actual_sub4.npy --mad mad_sub1.npy mad_sub2.npy mad_sub3.npy mad_sub4.npy --idr idr_combo1.npy idr_combo2.npy --resolution 10 --bins 3 --outpre out_peaks --idralpha 0.01 --bioalpha 0.001 --techalpha 0.001
+
+```bash
+python2.7 calculate_peaks.py --log2ratios actual_sub1.npy actual_sub2.npy actual_sub3.npy actual_sub4.npy --mad mad_sub1.npy mad_sub2.npy mad_sub3.npy mad_sub4.npy --idr idr_combo1.npy idr_combo2.npy --resolution 10 --bins 3 --outpre out_peaks --idralpha 0.01 --bioalpha 0.001 --techalpha 0.001
 ```
 
 For any questions or clarifications please contact Michael Wolfe at

diff --git a/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-36.pyc b/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-36.pyc
diff --git a/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-37.pyc b/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-37.pyc
diff --git a/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-36.pyc b/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-36.pyc
diff --git a/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-37.pyc b/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-37.pyc
diff --git a/ChIP_analysis/__pycache__/convert_multiple_to_average_robustz.cpython-37.pyc b/ChIP_analysis/__pycache__/convert_multiple_to_average_robustz.cpython-37.pyc
diff --git a/ChIP_analysis/__pycache__/peak.cpython-37.pyc b/ChIP_analysis/__pycache__/peak.cpython-37.pyc
diff --git a/ChIP_analysis/__pycache__/sam_utils.cpython-36.pyc b/ChIP_analysis/__pycache__/sam_utils.cpython-36.pyc
diff --git a/ChIP_analysis/__pycache__/sam_utils.cpython-37.pyc b/ChIP_analysis/__pycache__/sam_utils.cpython-37.pyc
diff --git a/ChIP_analysis/bootstrap_helper_funcs.py b/ChIP_analysis/bootstrap_helper_funcs.py
@@ -28,10 +28,12 @@
 #CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
 
 import numpy as np
+import numba
 from math import ceil, floor
 
+# @numba.jit(nopython=True, parallel=True)
 def credible_interval(array, alpha = 0.05):
-    """ Take a bootstrapped based 95% credible interval
+    """ Take a bootstrap-based 95% credible interval
 
     Args:
         array (np.array): 1xn array (n number of bootstraps)
@@ -41,15 +43,36 @@ def credible_interval(array, alpha = 0.05):
                                   pos 0: mean
                                   pos 1: min_ci at alpha
                                   pos 2: max_ci at alpha
+                                  pos 3: median
     """
-    out_array = np.zeros(3, dtype=float)
+    out_array = np.zeros(4)
     mean = np.mean(array)
     out_array[0] = mean
-    sorted_array = np.sort(array)
-    out_array[1] = sorted_array[int(floor((alpha/2)*array.size))]
-    out_array[2] = sorted_array[int(floor(array.size - alpha/2*array.size))]
+    array_sort = np.sort(array)
+    out_array[1] = array_sort[int(floor((alpha/2)*array.size))]
+    out_array[2] = array_sort[int(floor(array.size - alpha/2*array.size))]
+    # print(array_sort.size)
+    # print(array_sort.size//2)
+    if array_sort.size % 2 == 0:
+        median = (array_sort[array_sort.size//2]+array_sort[array_sort.size//2-1])/2
+    else:
+        median = array_sort[int(floor(array_sort.size//2))]
+    # print("Median: ", median)
+    out_array[3] = median
     return out_array
 
+# @numba.jit(nopython=True)
+# def bubblesort_jit(arr):
+#     N = arr.shape[0]
+#     for end in range(N, 1, -1):
+#         for i in range(end - 1):
+#             cur = arr[i]
+#             if cur > arr[i + 1]:
+#                 tmp = arr[i]
+#                 arr[i] = arr[i + 1]
+#                 arr[i + 1] = tmp
+#     return(arr)
+
 def least_extreme_value(stats):
     """ Take the value closest to zero for the min/max ci
 
@@ -58,7 +81,7 @@ def least_extreme_value(stats):
     Returns:
         lev (float): least extreme value
     """
-    mean, minci, maxci = stats
+    mean, minci, maxci, med = stats
     if minci <= 0.0 and maxci >= 0.0:
         return 0.0
     elif minci >= 0.0 and maxci > 0.0:

diff --git a/ChIP_analysis/bootstrap_helper_funcs.pyc b/ChIP_analysis/bootstrap_helper_funcs.pyc