From 8ba5ea9e8ca0ab0d4e62e148cf6c3d02d65f793c Mon Sep 17 00:00:00 2001 From: jwschroeder3 Date: Tue, 12 Mar 2019 12:35:31 -0500 Subject: [PATCH 01/13] First commit --- ChIP_analysis/README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ChIP_analysis/README.md b/ChIP_analysis/README.md index 725b281..5d02472 100755 --- a/ChIP_analysis/README.md +++ b/ChIP_analysis/README.md @@ -22,7 +22,7 @@ fastqc arg0_R2_trim_paired.fastq.gz -f fastq -o fastqc_after/ ``` ## Alignment of processed reads ## -Next, each pair of processed reads was aligned with bowtie2. +Next, each pair of processed reads was aligned with bowtie2. Here arg0 is substituted for a samples prefix @@ -31,18 +31,19 @@ arg2 represents the trimmed paired fastq for R2 of the paired end sequencing arg3 represents the trimmed unpaired fastq for R1 of the paired end sequencing arg4 represents the trimmed unpaired fastq for R2 of the paired end sequencing ``` -bowtie2 -x /home/mbwolfe/genomes/bowtie2indices/ATCC_47076 -1 arg1 -2 arg2 -U arg3,arg4 -X 2000 -q --end-to-end --very-sensitive -p 5 --phred33 --dovetail 2> arg0_bow.log | samtools view -bSh - > arg0.bam +bowtie2 -x /home/mbwolfe/genomes/bowtie2indices/ATCC_47076 -1 arg1 -2 arg2 -U arg3,arg4 -X 2000 -q --end-to-end --very-sensitive -p 5 --phred33 --dovetail 2> arg0_bow.log | samtools view -bSh - > arg0.bam ``` ## Mapping of coverage and preparation for bootstrapping ## Next, reads were filtered with samtools and made into a sampling object using the custom script `bootstrap_sam_file.py`'s `parse` option. +### Note: if you have single-end reads, DO NOT include the `-f 3` flag, as it will exclude all reads without a proper pair, and you ain't got 'em. Here arg0 is the sample prefix arg1 is the input bam file ``` -samtools view -f 3 -F 2828 -q 30 arg1 | python bootstrap_sam_file.py parse - arg0.ob --paired 2> arg0_sampler.err +samtools view -f 3 -F 2828 -q 30 arg1 | python2.7 bootstrap_sam_file.py parse - arg0.ob --paired 2> arg0_sampler.err ``` ## Obtaining summary tracks at 10 bp resolution ## @@ -59,7 +60,7 @@ arg7 and arg8 are samplers for the KO input samples Note that arg1 and arg3 are paired, arg2 and arg4 are paired etc. ``` -python bootstrapped_chip_no_consolidation.py 4639676 arg0 --ext_samps arg1 arg2 --inp_samps arg3 arg4 --ext_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1 --identity -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0.log +python2.7 bootstrapped_chip_no_consolidation.py 4639676 arg0 --ext_samps arg1 arg2 --inp_samps arg3 arg4 --ext_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1 --identity -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0.log ``` ## Obtaining bootstrap replicate summary statistics ## @@ -77,7 +78,7 @@ arg7 and arg8 are samplers for the KO input samples Note that arg1 and arg3 are paired, arg2 and arg4 are paired etc. ``` -python bootstrapped_chip_no_consolidation.py 4639676 arg0 --ext_samps arg1 arg2 --inp_samps arg3 arg4 --ext_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1000 -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0_1000_bootstrap.log +python2.7 bootstrapped_chip_no_consolidation.py 4639676 arg0 --ext_samps arg1 arg2 --inp_samps arg3 arg4 --ext_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1000 -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0_1000_bootstrap.log ``` ## Calculating IDR statistic ## @@ -106,7 +107,7 @@ idr_combo1-2.npy represent each combination of subtracted replicates for the idr calculation out_peaks is the output prefix ``` -python calculate_peaks.py --log2ratios actual_sub1.npy actual_sub2.npy actual_sub3.npy actual_sub4.npy --mad mad_sub1.npy mad_sub2.npy mad_sub3.npy mad_sub4.npy --idr idr_combo1.npy idr_combo2.npy --resolution 10 --bins 3 --outpre out_peaks --idralpha 0.01 --bioalpha 0.001 --techalpha 0.001 +python2.7 calculate_peaks.py --log2ratios actual_sub1.npy actual_sub2.npy actual_sub3.npy actual_sub4.npy --mad mad_sub1.npy mad_sub2.npy mad_sub3.npy mad_sub4.npy --idr idr_combo1.npy idr_combo2.npy --resolution 10 --bins 3 --outpre out_peaks --idralpha 0.01 --bioalpha 0.001 --techalpha 0.001 ``` For any questions or clarifications please contact Michael Wolfe at From 2e9eb3b73ce6d72bc3a7ebc30ed48c3be0e129af Mon Sep 17 00:00:00 2001 From: jwschroeder3 Date: Fri, 17 May 2019 12:40:24 -0500 Subject: [PATCH 02/13] Updates for sample_id lookups --- ChIP_analysis/README.md | 45 ++- .../bootstrap_helper_funcs.cpython-36.pyc | Bin 0 -> 1858 bytes .../bootstrap_sam_file.cpython-36.pyc | Bin 0 -> 12272 bytes .../__pycache__/sam_utils.cpython-36.pyc | Bin 0 -> 55399 bytes ChIP_analysis/bootstrap_helper_funcs.pyc | Bin 2391 -> 2292 bytes ChIP_analysis/bootstrap_sam_file.py | 54 ++-- ChIP_analysis/bootstrap_sam_file.pyc | Bin 15026 -> 14399 bytes .../bootstrapped_chip_no_consolidation.py | 211 ++++++++------ ChIP_analysis/sam_utils.py | 259 +++++++++--------- ChIP_analysis/sam_utils.pyc | Bin 60169 -> 58849 bytes 10 files changed, 317 insertions(+), 252 deletions(-) create mode 100644 ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-36.pyc create mode 100644 ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-36.pyc create mode 100644 ChIP_analysis/__pycache__/sam_utils.cpython-36.pyc diff --git a/ChIP_analysis/README.md b/ChIP_analysis/README.md index 5d02472..cd4f2fd 100755 --- a/ChIP_analysis/README.md +++ b/ChIP_analysis/README.md @@ -7,20 +7,41 @@ binding sites ## Pre-processing of ChIP data ## For each fastq file, the following steps were taken to preprocess the data +Reads were concatenated from multiple files into a single large fastq.gz file + +Here: +arg0 is substituted for a samples prefix + +```bash +cat *R1*.fastq.gz > arg0_all_R1.fastq.gz +cat *R2*.fastq.gz > arg0_all_R2.fastq.gz +``` + +Illumina adapters were trimmed from reads. + Here: arg0 is substituted for a samples prefix arg1 represents the fastq for R1 of the paired end sequencing arg2 represents the fastq for R2 of the paired end sequencing -``` +```bash fastqc arg1 -f fastq -o fastqc_before/ fastqc arg2 -f fastq -o fastqc_before/ cutadapt --quality-base=33 -a AGATCGGAAGAGC -A AGATCGGAAGAGC -n 3 -m 20 --mask-adapter --match-read-wildcards -o arg0_R1_cutadapt.fastq.gz -p arg0_R2_cutadapt.fastq.gz arg1 arg2 > arg0_cutadapt.log 2> arg0_cutadapt.err -TrimmomaticPE -phred33 arg0_R1_cutadapt.fastq.gz arg0_R2_cutadapt.fastq.gz arg0_R1_trim_paired.fastq.gz arg0_R1_trim_unpaired.fastq.gz arg0_R2_trim_paired.fastq.gz arg0_R2_trim_unpaired.fastq.gz TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:20 > arg0_trim.log 2> arg0_trim.err +java -jar trimmomatic-0.39.jar PE -phred33 arg0_R1_cutadapt.fastq.gz arg0_R2_cutadapt.fastq.gz arg0_R1_trim_paired.fastq.gz arg0_R1_trim_unpaired.fastq.gz arg0_R2_trim_paired.fastq.gz arg0_R2_trim_unpaired.fastq.gz TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:20 > arg0_trim.log 2> arg0_trim.err fastqc arg0_R1_trim_paired.fastq.gz -f fastq -o fastqc_after/ fastqc arg0_R2_trim_paired.fastq.gz -f fastq -o fastqc_after/ ``` +If you have single end data: + +```bash +fastqc arg1 -f fastq -o fastqc_before/ +cutadapt --quality-base=33 -a AGATCGGAAGAGC -a CGGAAGAGCACAC -n 3 -m 20 --mask-adapter --match-read-wildcards -o arg0_R1_cutadapt.fastq.gz arg1 > arg0_cutadapt.log 2> arg0_cutadapt.err +java -jar trimmomatic-0.39.jar SE -phred33 arg0_R1_cutadapt.fastq.gz arg0_R1_trim_unpaired.fastq.gz TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:20 > arg0_trim.log 2> arg0_trim.err +fastqc arg0_R1_trim_paired.fastq.gz -f fastq -o fastqc_after/ +``` + ## Alignment of processed reads ## Next, each pair of processed reads was aligned with bowtie2. @@ -30,7 +51,8 @@ arg1 represents the trimmed paired fastq for R1 of the paired end sequencing arg2 represents the trimmed paired fastq for R2 of the paired end sequencing arg3 represents the trimmed unpaired fastq for R1 of the paired end sequencing arg4 represents the trimmed unpaired fastq for R2 of the paired end sequencing -``` + +```bash bowtie2 -x /home/mbwolfe/genomes/bowtie2indices/ATCC_47076 -1 arg1 -2 arg2 -U arg3,arg4 -X 2000 -q --end-to-end --very-sensitive -p 5 --phred33 --dovetail 2> arg0_bow.log | samtools view -bSh - > arg0.bam ``` @@ -42,7 +64,8 @@ the custom script `bootstrap_sam_file.py`'s `parse` option. Here arg0 is the sample prefix arg1 is the input bam file -``` + +```bash samtools view -f 3 -F 2828 -q 30 arg1 | python2.7 bootstrap_sam_file.py parse - arg0.ob --paired 2> arg0_sampler.err ``` @@ -59,8 +82,8 @@ arg7 and arg8 are samplers for the KO input samples Note that arg1 and arg3 are paired, arg2 and arg4 are paired etc. -``` -python2.7 bootstrapped_chip_no_consolidation.py 4639676 arg0 --ext_samps arg1 arg2 --inp_samps arg3 arg4 --ext_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1 --identity -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0.log +```bash +python2.7 bootstrapped_chip_no_consolidation.py 4215607 arg0 --ext_samps arg1 arg2 --inp_samps arg3 arg4 --ext_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1 --identity -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0.log ``` ## Obtaining bootstrap replicate summary statistics ## @@ -77,8 +100,8 @@ arg7 and arg8 are samplers for the KO input samples Note that arg1 and arg3 are paired, arg2 and arg4 are paired etc. -``` -python2.7 bootstrapped_chip_no_consolidation.py 4639676 arg0 --ext_samps arg1 arg2 --inp_samps arg3 arg4 --ext_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1000 -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0_1000_bootstrap.log +```bash +python2.7 bootstrapped_chip_no_consolidation.py 4215607 arg0 --ext_samps arg1 arg2 --inp_samps arg3 arg4 --ext_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1000 -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0_1000_bootstrap.log ``` ## Calculating IDR statistic ## @@ -90,7 +113,8 @@ arg0 is the sample prefix arg1 is one WT-KO replicate arg2 is a second WT-KO replicate arg3 is the estimated number of Lrp octamers for that condition -``` + +```bash Rscript calculate_idr.R arg1 arg2 arg3 arg0 > arg0.log 2> arg0.err ``` @@ -106,7 +130,8 @@ mad1-4.npy represent the corresponding bootstrap mad for each RSE replicate idr_combo1-2.npy represent each combination of subtracted replicates for the idr calculation out_peaks is the output prefix -``` + +```bash python2.7 calculate_peaks.py --log2ratios actual_sub1.npy actual_sub2.npy actual_sub3.npy actual_sub4.npy --mad mad_sub1.npy mad_sub2.npy mad_sub3.npy mad_sub4.npy --idr idr_combo1.npy idr_combo2.npy --resolution 10 --bins 3 --outpre out_peaks --idralpha 0.01 --bioalpha 0.001 --techalpha 0.001 ``` diff --git a/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-36.pyc b/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e9d4b5a837a7e45f26674f604a98333b1b929ea6 GIT binary patch literal 1858 zcma)6OK)5?6t?eUa_1)VXp**xs3L=k)LGDJQdNQ|0;)(LB!nu$>Z+3ETzfJ%xi8tC zG!so$Q^X31Eo@o!C-6sDc*_E1!5?77u_u#ENf-5%ALr?F&-sp@wp#Hg|NQjw&lW;| zp@k8``AeAUBa9G79Oo#(uzR^z_=#TxNq`aW@PLP}aT4+dUwVy_hFId!iT4YZTh1m* zA`u-od29`$^Lfl>ER>1YzW`s{MrWs*+_CN1wxC4gl?3UrB5Af(=6LLMSg@mG^OADh*d+J5TD{B*;3t1_LHl|Hr zqcwr5q&7ZOPFfE@LpFP%30;+7v#B=CYNV;-$Q95^R;-oaGu9g<*MRx&*ZpBti2f^9 z9_8$?uVmWad9wW}{Z`g=fB4OdzFjCQSw2>o>R*G24n+?#wGm=YaYU z{Hr!BXr+x~9Kqbe53#%poYelI9RW=H5Bo6H3s@%TrGJ4hJm?R2Gfv>0nfS1W@ZLZd zI$kJu94~+!@a}j>e&u-B?gy^t4&W_=Zt&0ln6-k19pHQN;`Y0%k?u-hwpSlTM~?wSxDgnB#+G$@!JN{eQPQCIbA z|0OE~rN&odWdbITPE8{#2U#vm!{$^96Tqv%lQ1j6jqzT=JgH2SsX3Z^QH~?Di-o!aV}jlqP?Qxqg;Hx1uXRtmI!yfcs*L1!87(^d=KCA Ox&YQTd=DTq5Ac6q&(;S3 literal 0 HcmV?d00001 diff --git a/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-36.pyc b/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dfedf813d8f1af04e521e7a666c4e2a1d08b88fe GIT binary patch literal 12272 zcmd5?O>o=Rb_PI@qG(x`6~%vX9OTEbm`HRa=ht!4_&N4_X%odycIuSL8$O7*lqit| zeF54MM{Z}TWSUK8-lEf0y6$w_Zo2BC(?wTZymrySw9}b((aEke-L%=fe&4wu2+5XT zU*=6)NCYl``+v^)&UeqnrK3kH&o=(%^IyDWS^sL~jtQjS!!P;=$Fc$|usc@QuG?Lw z?%2F{I)!>cu5R6xYq4IGYpGtswa}U9mh0v2WPLKtU#TCF_fz$$A6kJM6u+>7qB^=) zs80u_VB!m_ek>>lleo?VmEZ`j$AhWhD6X@?Tfwnl<_o+2L@=$6Y!)}?_N`8SZcw)=>nWO(+6*f=qS@l!u)aL16E_gCH@wHPwqt0wT6`TxC1*dU$R-N5^Iye(N z#XH-QyR*U5xw~`0Gl3U8`*i_tS#mabPD-9r=TP$b;Dt0V@6HD2b9XPQXMzjCyp-hI zD(+tjUJe$%c2O6fEaHh1)N)TQqP17j)|g{S$}Hz{T*8xAgV)k0r#3x|@LBcj=5xVG zjM~=!iS+Zq>yrL6Uspo>(bZ%wd@njJ-)F6e6>KRqW{VVt_J!S3Ldp1(bEeG!} zscH0b9<{!TXRom~TwjNbcDK*r=^O7`Kbfn)sh&`m)SE27huQv_bNBWg3pKxmCkyJ* z9^}*ebo)o@GVYi6Y?QfNe;es5sOKAA>sNXu)b>vOih5Gbt1IdfX4m=BQE#`L(`<1` z)%J=~^H&FJ4xC$7_42n=`KptYgZ904)DC;ebYHi7aicZpHDlbb;AOGZ2}7N@P1Wu+ zSzvU)G>nn_6bWV3?ZB!#fepiOf&vuRHEIsSFbZBu6?`I9@N%l)lfiUw4EZa;OmH06 zBT%|0aGjzO<9akWfzF;xj;ty_xaD{I9i{!ywybL%KZ?9K^ftmy;Q774i{!1>(qY%@ z>#(V!NClooxv1u`c;xw7c~Kl{JjY0t?)jb0PQ^2aA9vY>iF)oWtVLA&S2DymiF z{na?u?X>|OTpr~&ReSTDb`)1HOFtrSK8k%EFL+W-JV19NFKmr^aV_lKQ#wZT=y-3? z?PrareHdMQt^FPE{)XzAVYPefS+j8%`yFo{13;^&WKHR;2@K1NH$k@7;+qwlvPCG|8h<#O;!;a|fqI)T%=wQDti zk~rIegR>J@5LsbGGwY3NAt{OslA?4xDUZ5SwG%f|oz|aP`UrA;$wab7f4_u9{Z$Nm7`NoLI{v!1<~O%kKxB%>Go`qy*K?}O{-DPVHsH;R z-cq?RhJ6an*I5nUQobwIqMpPxIhOXc5r++GpQam4N`Ai&Q_yAHR2^NxrCuI2Bw{zG zDP}HX=A;KA1s1U?CA(}-9gyBA+dk=KI(ZJo4wqhFLpL2C$Y?zzvhJW0w_&lF>;zwO zQ~G}`JW7ej;j#k&rEo;qF-d56NQO}+#FeTm^3%sKZK=%Ec1-w6#Wgt+S9DNblOsVy zt0-)YZIS{TKAAVREZ05^aZyC%d=`lqc19ZufFNHR3@HFf3V{o06^9?>M3t&p3cg|r z0;IrpLMD++wOZ|FTlM11qO##8MZo6kuL63CLA=vfw=w?2>GhKdKavV$3S>sAwy5jD zLC@s$TuL-Jc8%-X}V`H>&Yx^Dt9g(Zl_R9V1_fGE#!<^<8tnA#~! zOj0t1lL=m-lmd~Webs^}UgENGDr}&(4{G6Lf6(a&R&G&Bf|X^O-pm2x%kniT8y=KT z5kE1$D@YuWkwY`YGq^h#W8py|b|Iz0p8WtOxmnsR1o+G3l3n&n)XnwR42y5)=y!Yjeu$Fpgo~B4ie}~pP1M^O&ruu^F%ge zPU03TnGBYyf}U`oqQx z$Jg3`b#)YM)%QpdJ#$ZEm2t&kW}_n}qiLQxl@q#UY|=Qx#BLpw&Lrz;1b*!Q0m>cf z1hZWGe8A(d67CREp-Z&C1VR!YHw$~#X~+-WfpRJgbsj6BQ|YX&VM4hKA?NReO`q;y zJW=0EOyg-1EE}-~nJZy>6Fvwfx^Rfwtni%xR`iG2JQam3>{H-zwtWhmIm;JNSk@X= z3}-?{PM&~?Du9z?if~s93YZ*s_4b0`6q3OqI21^2v}DBcNf7sTr1USE`oY~S2Y?*rP07*V$flFs?oSR@H^?7iAEy`n~jD(%0`d# z^aM{LQ6YmQ()u}`&hvDEr!kp{}gu< zxWn_F@do6J{VmXe`=QtO+YB{;Opo9UsS$w%$aR?; zx@gFam@LB+eKEFC<27%EWTlUg2I4bh8ie1~m8*-9x@+oPK-Zvx@2^}<+nI0IRL$dh zdGVIITTLm>euNy=KVCA7mJ6RrPZp#rKgq@@i=nQ(k0{`7V zXIcEL;YsON?Fxag6TORs$$|2PEwv*?s;80j+h5?&%Seq+;9?#gB!#ZOt!I#`7V4AP z%q1l==K9fiqa7jeVALgNAmYG_lIu@UI$}H4)QnScD)yW`?@ZhJH9X0%j>Igp3i#vH zDz;qxBJ>y2uw@}7Q3+Us&FI8~Lhn;!J9zhDn#+|G-_nVgT+2wYjI-tF&4=okixIlR zQx`ehmU9A8&EL1ztp~2Lq@V!119gXW;fI#~Vwbeg5lh_nj!{rpuM4U;PJuq!Ry?um zf=0S=nb)9zQGFLr2JWcUxWebHf@ymfF%}W}g#XX9(%U1e${8q00(g9Tz1PKZ4-7#p zje04OkTpL-_)Xv%)+S13Xa%9Ywn$Z$-bN zJ(>!AME`OI{@Z?Mpss5T_(>y&5>e%-Fjl}5hDpfF##qI96i0`d49%RsvL9bRg z=$U|$nBnmBO%#{K8Gr*pKExqn?|g)RvC!d}ALhzSVe^(5GQ+JNsk^s7yuOmmuB*5q z+M$AU8tR3cAKywUY>LpY8{wD>*w5r9#(M$L(V;!wp zF+!W|^^Lebm7nT^=6{^sFs>l&#b2VG=tZ2YV{=%u5ej&o;@O$AUHg=M4)+z@PuXW| z{T^~==t$0(LB!{{i7lA9y?t|+PVz?&93e(9J-SOWMy}wvegk;8`bJ#bEWyy7%?X$x zo*Gk!!P=XXT-(8E!c^~0r7qI-Kcn%jQmc)%_+25=G_{c{YsbcsjU0$EGP=p^YybyG(kvns;lf-KTl)qz+DO8qs{IG^bRN@lE^f6}Bo}&aVc* z))1JKH9@*0V0YPZ`YN&BwAdluj#(Lb=7z*PhBePA0v7PT3LG%*j=4CBkx8h+8=;*@IYUuDiGbxpg4=-f?9R-&S}l7em`;^4p`f!c2s^lDrL@lWqZ4r*r( zdRu~HaiGiMZthrj%dQm{w<`LNY`8Dm8qUS#%}MCIOUGhccdXw5#S97$+~0LSHMuLu z{n!7q+$@pV#y^3k{F-eeV! zCR{!UkP5gyQP`Z?2?yX4y*_$n)Z_-^G~EHVE*^WVVac{nKRqfac3#2ljKZ2qHn7rv zY(<#R)ayw9>DZLt+%Th&en=G@;A4+2xC!RJW?*|blR>&vdp=sJ9x)(Z=KK+E^;LkP z{sB({({C^(OY!%ZB5o!{MZ%>oBaN-L>wJDgp0k3bq{z#1Qb4YQuzm6%Y^NmTzfe44 zsKJ^m*$y~H8H~db9G|iscNXb6#F5Y8_kyiIM!tVzX3P5b-^hZ<`_?J!`>y@D^8k># z3y{Y;9Kdb9i;z34A-3{;egluH1a|J;`TXqz9HCwoRszbDF$f`(y4VF%nETPT5w zWCb5@=y0&ULBbR3utU27z(GSaR1Vr)^F9iLcB`GnFr|C!M8D%VAB|Ne%lr}`tzVNH z#wevbj7{P91VR&S(?4pT%*SYneGBRaTglH@)#N}Om@pZYDTP&jD2VHK7O*oM z`m%qavXK&{^ZRJPU$iO}2t(n=AO$RNH6;%kT$H{hF8h-dFd0cvq6Uaz78QcoBmP6(%(!%zrvQq_g9_orvxL=GutF%E!w{;#-%Up z6yM%J56pHLHzP(5&37u}5dWysL$CadM`j0_XQGhQ7Hpvr{q(aut@3o6C)qn;xagqb zmCQ6>8mk>h>Zi!a7-<)gDa1v~&9K`=*P~=2O+nY)4b|xc?T~=lomzk_!SYBKt{;D9U2&y$=e0k*0LLk?; z#Aj$EcW^AIR_2adKBj0&9mxs#O$)A3)_0UJQ;-$QRrk>+w{FuPU?_`bLNQtlD0S$d zuph5ne*}fCc{c_cHS2|3Wt?cBvbY-5x$?0$$P**^gopj7a7AdmG4?(xldJ6!+@%J+ zEBW&6ooLN)S!Our2vY6JS2I#YBZFFU#FWwO#qbBID5XnMctF2M#|0XmJK*csTa=Zo z@NW{#g!T`w=Ic0I8S>8KfDR-TfviS^ebM0zA}i})j%Z?C_3*(Ljg6mTm?0lsTtrPN zq^41ycMN#2NQ;nSMkw)UffyNB+I=p+7vQ&BK0fB5rYs?Z8_?h<(ubye@&+xNx z0~tP>nn8Aj3z{Zt-uvTsYe7*~1DSKX z!89Vy3v+I6QPYBl_$K&3lxrnRaTtp2iuNltEegPaL z2y}`j+un#OMcMqlvotI%Vw*t)?Vls;G+gRO0~LgeDbT001~G!BfU#f;MFqLS%a>mL z&R$Gx%}sHB(*`uW2~kwv<-SyFfH*Dor8GnRx4(+$nY zm++6Ih^G5HvPml(8rUT}c*3tGf+sOE2}cNn6{>-cNX0|lqJt7IpkHRQ)9~q7`u5_R zGz*=Sqz^io=9f#+V9ne{^~yL6mBu^sqCO?p^e&m=8w^BhtRFQ(O<(FCv%QMRk33Q0 z@~l7O$gcApSQJoNb^;TZA*wohBk;+%4YfNdHaj7@Df@BfS&&~K@l#3_h$BpjL#0D} zKFPBXQ6Q o-lDWgGgo1X@ltG{FlNfyQ4gL>+a5|Wk~+mzj3cc(C9Gur8;4gNb^rhX literal 0 HcmV?d00001 diff --git a/ChIP_analysis/__pycache__/sam_utils.cpython-36.pyc b/ChIP_analysis/__pycache__/sam_utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93b118904acb0139c39a7f47fa8b711e62bb41ae GIT binary patch literal 55399 zcmeHwd2k$8df)V17z{uVJj6rNYFp3&Us?g|hE!YeJf+;Iav00$h* zfZYQUoE_MzrF>`Yb-X^av0X{*loQ+8BzBIClgi<8svO7ZI7cN_aq++WJZ`CPR>&3;j>O#F-t4=pwd|{|kx>c%Ft(DTSYrZgKxp>0!6S!C@ zxvsT@i{Ye~U934PR^7|U`wJg_TXoAsb!z5fdA($hR!Y^S`gL>R`damdYt|fdzEWGb;TFtU z>qe<+E;_Z9yotYZz2wwuwTf%nwS~2nQnhZeF?o47TXO3spI!!&63@8Xi44!2&=@&9I zc#fCnnOxBK_$hox67?~!d$C-p+_EZbrQwv9W?Q^&-6F52QC=-pYSkq)V>s>Q_{5s8 zlsu#AWYD`_+H#iMUrso^xcK12%hziwrI&A8)uoCx|FY{WygYtn?6u;X&T4V;`lZV+ zGbvhCt8&LJyDz)eN^z}TuDFHOJ6@OHE5|u-9Kl3ypW*c3xM1)Ke>`^K$FX)#Ah8WQ zX{YQor=#8V_3RJ&qrH*%{=}kT=j`tHbIrs#jQ<9v=CZ+=uz~r2aqnsT)MRaCwNm1E znU+~yTd0(3^|D4Z9;>VIV(^u3`$#`KEB@0Kq^t|Pkuo%19&Z1pjsLNAl7Yg_wf3i|`>wI#0 zhNbco*+k=+uulpnu`H~Ldm1Ofs~#4Xm*%CHUapm^C_NW+tNOIz97IX-4KE)Sc8=f3 zBT3}6{07!d|vjKofW&{9JW(i#YAl0>) zmf2tc%;oJKJn6D~?OizL>^^%pj@@>@y$8pisiZ{ z7xwj9#V%KuFzZ|#DkZa0u9gHg)E05`%ES~PP^DBbC%E#=+hstZl~VnB&30wB0Ya@< z)$;0E#j2M~t7@C2^}1sPAdG;?tY62I;Ohm+_JPse8BBWU%cilMxSPNdNn(kl2qF(- z?i7=Co+l4u;ovHb^OW4r$o(|$+sQ+GGb?v8{+nHRGb?ZAaMmSf-SS2*k|!^By8S#o zcr!0=_TsEZ&UQ5u%YC@QV0~{;i`_Qv7W*aLBk5jA_er{6(kCQ6VDG|E^jW`~O(gvO zFIUTT^h=}UC^+sCz{Uqc`VYfkt%N){yzspX9ml=EOx4d4Sx^v*r5jvRg72E7lV8G9Ml& zY)Z5h}SQT`V~Smq3)5cr_OoV6!qVzjXzrs(Kt5CD^geN@=lfA{!EGb(PBk zm0B!2xWN}TD{5-u^4mdCLB^YF7}@%r&9Y2hx;Qc8_uN7aSh`&0={lZABsIF}93f;x4R@vFCW|yZ`VH_X!5qhA8F?E8$F}KP*`{w!S zpeL$pEAz;P{OmhG8-fDqVZaQhymYAqEISI(nbTRSNL-7I*!x%T#VPQ2Mxz{Hiw zOP8chraws0j^C_X-nu-i>rz`KO=VToGILFLwX}d`v0$QMY9SKXmBKEbDb?4UDpt*O zty&5fAn7spMEqL1nKMs7LH*8Z8Td@VFd(-R&L>iE0;*4$19g=1c44f*<4j@vNZ}~{ zPX(993$GU@CMF6)Lqmn>sT1WBM+S5wS&A&WQ(X5NLkzLx5lpUCP>6#CpJd~F3o`gi z|5Whxu*g|wEnF!2by@XzdrW`2^LuJ#2gS~8S1czeOMQKtVr7Nm#iK{Ic~PxPe*fsy zb|0USm9q1?2Ct7F+olq-ics!$t>XIN=P|n5)m2sk-^DE0?p;jvs{_+h1J_viCLQLG z{=Fv&sjzz~CZvV#krKZL9GAF9^7vkkNjDQxQulh0Ob;LQvVf^hdDY9fs{oT;Mlgw& z0?_f&oGV^NW`dW(2zzM`pO=xb;R66DFC%cuOS7-M6ner-vyEQ=s^z$)B9>@Tkf7_O z&>pXMp}b@{#rhhkY}e}pY{mk$i;9c#dV*^J7jIW^v3lJp*$%n7u z3oC1N7ko*YFqs5rwdAF+gyy<%gM!K0^24!WZ?k2yG+Q@8VH@ce1CTXTF&|0Zx=49gL)9)mjDNrzJ z>vu5N=4E!XPj@J4Lz8-~;1Q4vfLhOaO;Uz{Pbd+9$2UQw7tF~3A6k`)fcOptf=U8O zM9Nf`8`36KAS{4T{X~Ob@)LyI&Ygn!2B;%d@w&AHLdi96lg?^A6B9hsW9Ew&RHHhj0~2RYyrD8ro;z_CXV(-f9P|QkXK4s^ zQm|~;EVBpe`U7qFCw%`ipG8@Gf3Z@tc9i3dSw9B`c_&|Zd*=S$pE!Ge?@ylF?tPz+ z_l&Y1Ts7F!BKuAx zXLEUZBAh!ik`3|Fq)EZ-%3EGW7ETK+bY4fPe~h0?Axq+uIU}D;1MKZh?N2s75d(Iu zQ)_yG7>9p^dPG42_5K@9Y-1y{{z@|m7+2WHHnZU05@!-l-cD|0nrSBws7Dc8mT=Eb zA58GNnFXxNzmr&hOg{w?CMQ8f80!`6U$w8 z`qe}-QS7Scnwh&47zW?G7@Q)~@*3E{8^+rXdPgy3%SdI{pg z7+emyGX)k2mPlY9Gl$lyWNS+Hut2(Tb77&D8bG}@6Lq1UfbUdxpZkv{E zmleT0;#W_J9_$iox&jnPYL~(W#s0bGQmtk~#;F$);CVaS%9o04mJ1F+9qZREh|d6b z9Z1c}K;G;SfYViQ$ds%Bfa*r-!aKnhyjXK=bR+VwQc}WxP(9n4Z{FHzEv$K|grLbC zRpUNtah1b*&F8!d=A}h*wdSJJZ-I)hRWVMEa3E$K#5wy~zAr>^xdTE9y5#gJf4ZQ& zYH10Q0(z}wnFb}Mqrw8sodt|E4%Jm3F|jxXz*()Rr1XpbM%v9;sp*-4HD}EvaLERsu z#yj|)F4pfz%J3kDp05FhSpta^0MYNYcIx(HNhoyc=#iG`7yv6>UQ73>Tw`Ib&I7ON zJ6{qEx@CPkl|LG-)+PhcIbWn=EvxH-F#V1R%4)hXE+BYi&F@x?Z>XEpxB~QcX$hcQ zLF(ad=Vj#e`hvMqWEc z!=PDjWSu_hs(B7-GU<%e4d}Q@&n6Ef4nptoWuuy=s>jZOD$H9qQ=kgZ`Q(*GFG8z{ zit#!%13+aIDw8TI18hQL9Ap3wdM8F2Dyf)|g%RWvaP6(^c8K#@pR{4U!RbMg0jL7; zU+go%GSWwx#YB}P2cfC}CWQQf(;>j0k^Epp2R;=MbOAF!(i)V-rTI98r)9T@t_MLq z)CvW5TEnravFWjoSj5|@mTlN?t8V;MAbI#`M9`7?JqK%tI>z{f`O^6}-@J7B%B3rM zuxFu-EX`XBH_RbX?Lf3cW&mRKiV59VeK?#KGiy~EcnFL+5uJ{lq@|VB`kfKTB}nZN za+&<%+xk1A1Bbf&Q;nCR6H({H7pp?w>UG9>+jvq_et1*O3W1>@gd08jQ!ls|rW@z) zquk9)9Zoq%0b0D?XltEO+!{87yKjYn-MXWf3H*dZKMcSJpqIqI{$%4o47lmnwLFqA z4IoKz=5siqJd)7st^u8`aVD`r*<^Spi94yL;Y>G^&MVCnIN(=o$R3+1=Ug+1t8;ex zN}>)vTOePqo~EW4&@U@I@j3(oF`hWq;n9w9VBP~UCggN+fg9clhcyuNDz*xmvyu%y zw&pAZ;!l|izBY>LEOL$4EwW0h!i9xp)toY>o>=6#V9f9tA?OKz9R<%YMFgaU97F(R zSu$9SQIqJdWK|0S^}s!hK-nqGykJvM3edx&25w0JKZ@Rp#RKGpinvyV2~3@J$wavm0S9~h(4kQGdY8+3<; zy+`7&O~nS`F?c>u)g=f-SUEk|j~g-gH1SvmG9IhSSs0RrhDf6g9J4irpp;m%E$AXlJ6L{TxfSn*-=BLUkp1kJJv z0IN5zU;C3sTr*t(VNlvTT%=i8HCTd}r0sx8=huve-}T${y3VJ@Jy^f3rEES2W$bh* zG6nV3J(`5wGm*?*XgJ|epdCJuC-Wx)W=MI;-1OBSY;Fj!Z(oP45jcoN$66B7^|rj; z-Wxv7oF2}2=@R%2FEwAT3cul8Lv5UQkPN2R7TRCXV79Th6#+U6+PsOnP%8Z<@pHd`LV(!37`h_@OvUabaVLGJ z4=57iV;DhY-bqvsq!ab@jlA=wk>I~CCqAF3Cf_Aee7BkDO}O6$k@gQYGx$N6eIx0v z;z&VsT~KGz&f@$%G70sJofE`4jT~Q-ykAEtzlXfP=jZ(uKkqmAKF*o}<>6j3ED=}zu`*9HmwVaIWw^(5uGhjg zAlHRhD}es(y=;%Y8|Z(Z^W((J`xD3 z86gzJD2eN5i{jL>M&$(3jIc!EH31ByBuLmn(V)s@@ZPt|HcY0rE3NH$r9AIIpNrmr zzN%ne9hey#0OlE(8JELa6}O@HDA@zoz%-7@rxrjWQKlc!?RDr247!+pwS6*tSNbI| zNV3hLPZejz3X9+=EKwiErU*dZQW-`=TNKBcsH%&qj<-}*#$((kSTIMGU2sUcgtZFJ zSe^0hsw30i|L7J~3;Pj_v+8l`{3u2j9S;`)f9PQh?Jba!(6fiw_dWn@W{h~zCxq%=Wi{?;KEW$qR0HHgs`71aoRJJv=>J8pvhJ7*k^ zWE6}aOtLnzLHDJ8(z(QFRXUg0IbXCUstNjwn-EeDi*k+ChgJ4_^kYWGKF(@xGZgx< zo*SBE1C-ZN`=hD zF~fHoPtK73QbZZ;2FQE(Y)a#kf?t%O0})anxPiDu%(NQMcjia$wkmcAKN@(k+1`u~lv|Vv4<^N0HYX zoq4}c29MT~iSucA`|M37p}6TY;Q3?pL(P*DiD%?Ii9e%1ork(-Kb|XzXyhDE8;4_z zVf1TS=|l4+t@QZV`#&mpP(p$V5YQ@Tt7l_CyX^SbV9MycWK`AbmGF{qL$BQ~-idc}7HlqQYV zDVu~E4{VN@&gvz&#yJb}YYC1&4t#!SW8;G6gGJ;!G~|FFum*`Ygn&MsCWVP2P|DK< zi786dI@d%-k$sT9gOnb8nMh2P^8_uq@Rhfg3`!@8+Evoif)}sYinau;p0r(utadw9=Wd zNinBHpP)ZPl(Us`<<+?M2d;n@Cua-KKKH`V@Jk~P(a;~t*9lODgZ0628V@YH+0amP3UTFbUU8`K*_ucz;yF#v<*MVvmv}gA<+-{@4VMcF2h@| zX(%wn9C#0&d_gYcPf4ykN*=n2e*{!g{;##aqSh{pX_at4@i9zzUl9N5EGo zMdVXDhLR}_1r`7ln5%e(REZ?&Ag*8AHv6=TX>1XBYLS@YckFJ}p zzR+G=r#`?k!3sIWgx=erVg}!ZvVJF;`m!%{X)~vrCmye9CI*sA1^%)X{Ut9~gedtF|SWjlbI3im|cHn}PN?(RJ6c z5z`-oMjRzxcm)=5TQ~*O2Y2(dxLxe``5DAVX{Prl zKrAG0ehLHvhzf8%2KgX42SLGe$k+=qq>o6BAx<>i=S&|@TGJUj{hhp zBW@rW|0>%=&=}n9x>>>7-5_AT+058^^`0EFH!JdNH%j{b@a>xwIBV=`<_?JQ-OUQ> zhOdB3fl*#jjP`=o4fk7r3JPo%suH+vsn{~+8g17|l|(IiA`Z=DaWwg&r@fe%Gglz} zM-lXRpq;yLMSP|u;z0W~VBty`s$nHo3MtDiL^l#5lY7{N;gDk14IK- zq!kH3pErd!xo*khqIWo1HsdJ+FsX)gv$vvvZ>!dij*bdvt~vl$!k=&62ak$adUQPy zY5ult4?vNgAR3cL3Eqc5TRpnwPtK~Le`%y`&>tT)_Nbb#$37VO0C={K>%*Yn9#yjh zsA-TE9arZtsjq!ZE-`^fInhe+nY>4;PXcBM%_r1fD4-Ruc2@rV#N@NNK);oQYGtMlT#9(vYh6!$G1q zMXD2a#3@pmeFl&GmzHBX)z6>@sm6<&QkUA4`vBfbS>Gw6#Ba+!!c`*BeyDl*BYL<$$%eb)U^Q5Fwe^C1ZA z$wKR?81MU#de<+sV-bkM9s+gnz)fha{Nuy$AEF5X1ZeNwhk>IXLd7C}h|YDucSuie zm3pI zL%OFI3t4WTw+VUUh|J;tV>UdP@zj*KB36SvY1obT&CdtreG4GSfE#AdL>T+4&|E;5 z3WyEe>T$sB5%bkn;P>&K1Ay|)H`bQ zBV*>bK`9TZ-JI*KL;SJD?+>xfSl^FX^YT$O2AY;nlR^;UA&a=hpbESU+yxNUUD-8_|&& z6&ai`l`M(n?z@Ra#`o*8x)G_dBbgCeEsTh+AjFUqv4fWs&QhIgvcuBEhVL8Ws9>l< zxJU*%Ig>aEy+D)ze+_OMq0%Do7!e~5#7n-u=8n+1-*-`gc14-h!d^|hUDRi!V5xUl zA!s=;(4yYpXX&c|vkR7kup)?iED?_Qq;vbC{yd$k%?iLsjsE4(cF+q#e>&jp{W?c{ zd1s+fCJ|A&8rm=%GP!|>u8TuIQSU4V=FPRcZ)c2G?lS58k=Lt^W zq_ZedhR~ua>Jh4SRHR-V6d6qlx@Gg4M5s#G57L+}?1OeyjxJ{=2$L%V=0~$)0Fgo@ zGP8OX{Q|lwa1Yc)1?`fN#)ZUrRz*b`#A&eOl4P#3BwF#HMu9yMrkD(!%Xs~fg%CEG zd~DPS`F_bl-{eI`6VqTU?p3?BCg@b*SYd4ZBeZal@hGAvDUWbl9h7Ni2m0u0Bpi@t zD_e4Mc8A_6%s2*=%bf9^1e|EKkLhF+Bqj{^-(||4rtmT7wlVa;hwQgt6|@Rc+U;?+ zm_+Ci80UO!dc>C!g)`+o=abH_VMS~2bfjCi?jXpVJ`NoOM(knEn4MC$o%`k(b64hR1@FC`-U8oOc&Gg zG-vmMp~%71J6G>v4qSChZoJ#d0}MtvjK&PJ8pI$W*{t3S?Vg|&ls%IQ=9Mx7C|cDLjJn~KuPv&? z1wRr)8BOuUu543ucHw+Ahv99Z+- zLgWB^L$|4wH+V41CkZTGFfW#>*zZKMTkIZ~-PWBogvJ89!|gYMKGxL@mbp*V&We*y zwANd9O?x@IuK<9CKqHehTP)J{X~aZX{+Vc#M@U1r>U$zl+SJ zBZk^%OLzSEX@fx%#jkP^W`&CBmc>)7cpb^Mc(Ui+bK-SGpCq|ANkft$xd*Hrx%Uzrt0J~?m^i9eU`~m6 z#NfC>SC$8zI|W$~7mr_jG<|YWi~-pRSH=HIxg5cOu=9(+Wt71u8lXt zEd#dcgsnL{>4@O6(B;T~o65E=kdu`$;*bxv*3(Jk97 zL2TyEhE@ zgfj{Ef!yFOkx~NA^VNQJAMB6Z1A~={`jC zEh)$-jC!ai(WOCX4u*ahTY-pYid~}HvJe*Jf7Ff{3y8$Dr~*G)F;5k;Kvq?krKh{! z647QcTA%1IP#&1MAiPGf7YFW=)q*%1VR2CtT^Kz3!Tj0cZz4ei{LMU+zs$Sbj+F5U z{1^agXOEDK3^Xh%qE!HQa2!Q2G)n@6y(EFbMkOr_T!T+6#P|ODRNO@SJW@dGsb&#B z&FwY@P@brr!cek1Ehv}{!dOCEVv<>ESJhk`u^x`SIA61ip>cDOso(W7BC+SN8eKCQKx5GImSMmLG8*^$X9^&{UKMq3`j zCY>w71Y)Z}iPFZf{v8;_2oX#g;8C!Mg?As?PKl}yis%z}Xb?2L9*CeOyc7u$c6Dfw4BiZASy&07)dAH?>VBMVw)#2wpzt}Fp;u$i$rHt&j~(gc z3L5QZtj0(;hu>mdgq9Zmg5PSdOZc31ZLdrGxzt6E=ts(;sc{>BRO1FAvg&uuq#cQk z3wbpe7tOGV+jIaGNa_*p2twC$WRr2>x1?W%7v|u!e=B@WhH)zwZOt0$aoL9M+%aeP z5kZCc(uqKYKvooBy`47bQ3=5GxJK9hgUO15CT<}s3Z|CQ9grkc4m2dE)vk3J=lghm z1c{dd;*sszmHV_JkiD!zG;@7%Z0hX_P?jQ%(}dR0My%E65TwOf@lp^~dKu*ea{$FF zGMd<`l~C@90HR+cdHf7sPaIF@d!cXWHIhke4uY8ZIinX^2Y9JtX%MaU^77) zK?OTu*lUDSV2mg1_=WD&Qye6XPfiB%{HUo$lm>3M*U^P2|LBD}BFlm*mZ1E)G!P?F zDMG0cnRX<;M02H{=u*VsV70~_?kW0TLi7g&psL?42??uAhXU%2+oHFVqih+cZ$O34 zg3tm156h!Egr33fU%`&uv3>|0JtF+vkk_HMW2~fe=V`QlxX&$=qhMo7-(UNXgugpA%J4U zC~z>yBZwiR2HnAsXl&l0D3RYnV5vK18{6=y@;V1W%`kG3AsRg)*%<=?~!&$hef-y+0tQ&h8dMN6>TE4i31#V zh1#H7u~@ea!s2Y z=RGE0W+EN=BRu%T44OxuZENSIM2qy^;HT z?tAjH4&SD&kNzVqd#9Z1=+Zh})HVkWU%-Z|0YhM0+ zW3J17!tS+q;Vfq#u>0)YIP10#+Wqz(oaOC9_Fj7*&U)+~(6ooW-l?_vIje46aL{#) zKQkw>Z!jEFg&Z9SbtOfvqU#)UO8Sk*!G~!+|I zV9XF2W7`WYGYpnI!{Z~*`NVBdvt8|O)a5~*RYkK zmT??u@)5B>mxoXdK>k4~Gd(rtJgHP5nIW_ zQwD%%nWDM^=Py@X@{A#(`79?_F!hrw=(~^)K16brmn-PB0oQ z?yA^W8URqQBx@Av z``iMrik*IfKGglbqeZSvE{pbJkrsv0*AYO_X-AGtJ`@KRfGt;t1^_1O1CZ`wXN{L| z9GGA-iv-&lkLbtXlpyvP8MuhwfQAbP(P)lGprXb!+JWSv>E7wQyhG|Y))BZu{#OrK1Z;vyfpa_2#@tfp{ zf4f6Yk%i7S_Dx5|(_ar4ric3=w=7i!KsgP74(G2Tnaf^*Np{I~evWs3hRLR|u&YvA z0++Ppb=|fc5-0)5yJfZ(xaaNbFdnOfgF@GSvM+f60fEAQ=~Uim#4t^#0o4E!)I=EW zX8;!@N8U%cAAl{W{tpWwB0vm4>tq15E+AWgn2CrL&<7<(#G{O5Ir-TN)=%OQB%}F*Dx%q9hwZNuMRvQG$DVgjL?PB)vekQ{trNrVE)GlQ|wxC z(RPNgq>qQ9=Ldr+K16vid@e8rAc4p6aL|OAg65%C9sNi;dm9?)ej)VEdh^k-YXq|V z6F4SBj`LRlNx*?QzlfvPO~;-xg@bc_mZA}aW8L^t51&gLX666Hj2~Kq} z0W1=BYe%JA4Vys=x{&{rwg50<)2%YM08zbCFwgnsa4m6w!>Yv2-l0_^={ame#9ff+ z#uo5$!HXd{4`NyCwrY~FReosJ=9huFB)kAR&&A}2cueVyRZYWwKfi8q_ni|aWXZxT za}Z~VCMP4aM%wF-IT)U?6Jw*Gj4td?Fi!Kg(ScvWQ~ak6jb` zQ%@U>r!g1L0fLGKAy^>6sKqdhmi2uR73qj!lGuTXScB9vut^c5`IZy0A1 z3X{NgJ(5G?maKJeYuw_8GYRpFT9G@#FhT1W$|eR8WLGqeWAwO$y$wjBkdBC$%#(pU zT$b-?ael+ba@eJMUg_EDdSitU3I^E67v`wNt!!tMfT!ltS#~FwhZYk7Z~)E_&tAQS z6SO9)!~VU9vo8Ur>MPwoT}1qZHMmW;U;yYA_keO0e)G!QXv=-Jr9e?Ix#e4(Ovl5n z{Wj9`hlP{i+elAO&cBFAFQ5)6Iwf^*WMnLY)gL?FYpve+$kB&Vz57Rf0l=A#NPER- z)+FAGqdyVseh+cr-(o_{*vdZb@}aqE;1fVtD$L!0U_Au7>os0PT#d$)op7GYr0rVC zTS2&^v|B?s0iUFJD(JF&o{IYjLZ=__OpM)5X_|FpYux2i^Z{q2sL=<-T&mh3=6YDn zkyM6zSG)qXn53 z-vYdl&vwR~DjqvB2r3nIfO+sreHau#JGB7jr1D!}j>{e1zq!E*4T^&RqlWQlSg_n@4 z$N;MgN{Mhsm&by9;gLBpdU`hG#s1;~6`3SOb^&-&=sG!Pl)Gan#31P$zNfm%Jax(( z(-^wV;2`??#mT{I_woAdgnIqR&Pt!WxaI4Ss0NHw>@c*!v}|-8?y&%1K~fO%5g`U? zkL2}2;}$Fv*ZJGb`ty9p-&u(7dl?FTT`vb#UV2cndujS_geF&WIkiSgZQC*A!_KY} z=7_DppyRQy0@AFcU4Z8NxBKgGMl0szUU%$XUhego2@_Q-bwuJoRA{ko z19-Cl3IH=@l0`Mb#3J!(2sEsglu;_QF-w-f%}5mHP2ggD^x7;YxU7oueHXxH%jK=) zu(1qV!-EbPeDJgdyjd^Az5q-?c(zFffh&!d>o9z!KOKGY$V-Vj4_XUoOO_OVhBC{y zqQ~>bD}tf@hcs{o0D?!K#fgp{=B}lV7-C~e5TB_BPNP4_vR36rS0%ggLKT7DgU`^~ znXl{W=+~aPa{g`K_!uf7j5O70)T^*|DCR7m`x=-nOH>BT9m{x@pI(GJ+otttNbNA! zH-BTF%P@a7alJH}^=q7Bu+Bj5tRSolyT&!lX-r=BJm%}l%eUYTgn1k6)k^uLQ>(a* zLzgFK$T!wjp=0Gnxb37YlQe^U7OiD}^k=%nJ zNTQqoEGko*T4pGh;nWAlOUjw|NBJIw=*}J{I@V4Mh*4@FRyKT*AnRHl-~^!L$Hg-R za9_k51kun0(>KLSfx?MEZ-T)O@+XPe#lz$eh++sI;h^$Td;w~c+=!`-&KIi|aDb{7 zU0$SG{j8R4V6S5s&V4JE*8c@A3U-{sC^a)Y18exc7%)<)nkBhxd3r^;+Gk zz^xAe&fBFfYM7@LYt?z1<*4aisvBx~NHo>p z9inQe_U2c;+*(y#1fSw}`9Qg|C}`yMgCuZ^^Cir!QjxRHA+g~68b6!IGaqXC%6aGS z@L`BhIV6IIe~QgLjjO~SAbR><~x|~%LrDqoRb>maMg{gZn}k>{!C00f-{Ki+_9ySK++I9EyjQflcfEjTyZpSS z1N$x($y*rkBx1U>%k{f4|`*X-`EcfI4dnETBGM-W_}$xZn^|EdCMK(l|0E(pfRG zeHzl)C-Li-BaCht1KD^6$9}^MQam+{#({VT0uD-%G1;wNKJVcZ7{nk37j_j9JD8`j zpJoct1-NKHRAy+`m%=9Qxc4dqzhd{7wA1=NJP{)IGx|P!5+e7r`aZl9)P1`PQy^y@ z1PnKoGXhY1)HhCOLFI@Z3IR9|BE*XxdBS&i`4hzn&m6o5+zIOoU|6~B6Ay%i5m`Y8 zs&12@ghy1w@B)a-`b=T`c;R)~r(v=%#6RT#{$Zo_(P<|>j%@@bG>1wkBEfmxA+`sK z`e;m#cDfLT2HUZ%ak>!J`uCY(Qe_BtyJ+L1TfQ*69g!@1zca#q#e;NN@qag5rQS%#93k_sH9b{ekrh^v9#WP2nw|G=ghEWcvNy z*!x|P_kW4t+TL{|=lo&yHJtUqi|K#i3%ikns5hs|tv)T!fLxqjJnfG^ZKiR50QdLE zce-``_sTU|!S*y$c$)aG=I5T?ckSN%xMJHEa}%{#u|R`a0J zG9fpu^#0Kff5W>GQi-$PNul+Mgc8a~sHK0-d*5Ut{0x~FMPLa9)q!~Ow|@1yz!7s| zIPWrxyeJB7x{0@eZTUVqBv`~kR(1d!i~A;COr-iCYRnUKgBoIbK+a5|jdI{Pa@fo* z4>4pGclwcL;ldRB_Ci1zUbO-JmXB(NgvQ8PK=*sNfCx=QaenM)SCr7*JQgeE4!#Wy0=j?-gd*GR4w0gXYZC6LFX*h;LPwwTOL)%W^}Qp7*YvqO zKh^dT@yXooD=n36`%3scBApTHNklZyVzMd%%f#6de)5MUsYL+7zX)ZeLTl`qD>=1u z=IE_r-hoEZ6h%$dP!0 zssp7oNaCztc%3}BAg=5_tmJ-d%ov|eEu*EEDM>UoYZ5>Ah)I9K1r9d&nooD0k6ai8Bvg0RSf_fK*eBjtW9T(mHIxhtgI%`+ghhHx(+w4U5Z zAx}4!8}5-iUQDWzZ`R)-AtVN6nXjto*t0Ual$4Xq)uS#pvEU!pIeKV2m0~z>hYI0Mo)v32uq2)1+0!yOE@Ny zhN<2UT@qeks0J0k1q1^=IH;>SLufT+K_#$m4yghc15#-CMB}Bmu(@El3IggZJ%$lS ztWsKpZ??`)c*NT<8g1mdC%P|o&vs8X_D*!a+xr|AMig@HD z7pom*KD(t9`L6Wj`|L?E)j)qqyvcUZpF!@hR{)9wP|q;yR#x{lS6%l+9rV0+W7?}UPfXaI$y>4TvoXe zDX;(MWKOJz>ukYAn3?(7dO_|X5JTWg*X2KOKFm-u$y#->yyT@X zO<#D+p?E{dOP=A`PcvaVIyKPCoy4~hZR+%kc*8w{gs2z);3knX`cgp2sr>HT6Mf&4 LX#KY)_h$Yd%Anq_ literal 0 HcmV?d00001 diff --git a/ChIP_analysis/bootstrap_helper_funcs.pyc b/ChIP_analysis/bootstrap_helper_funcs.pyc index 5c90a35385330166e0364c43fc599f88923fde1f..0184fc23fe4fc6d627b78ea97e31870d9f850a65 100644 GIT binary patch delta 147 zcmcaE^hJ<^`7 z&^IzLw1_t_iZ?Qi_bDofFGws%Es9Ue%uCFPPtH$CjZaD}PTl;K 1: - logging.info("Skipping gapped read %s %s"%(line.QNAME, str(vals))) + logging.info("Skipping gapped read %s %s"%(line.QNAME, str(vals))) read_sampler.add_read(vals[0]) return read_sampler def create_read_list_paired(samfile): """ Read in a samfile and convert it to a list of reads for the sampler - object. + object. This function is for paired end reads only. Skips any reads that are gapped reads or are not properly paired. Assumes samfile is sorted by readname and @@ -260,17 +260,17 @@ def create_read_list_paired(samfile): Returns: read_sampler(obj(ReadSampler)): final read sampler for the samfile - + Raises: ValueError: If pair readnames don't match. Not considered a failsafe for catching violations of assumptions above but should catch most mistakes. """ read_sampler = ReadSampler() - while True: + while True: line1 = samfile.readline() line2 = samfile.readline() - if not line2: + if not line2: break line1 = sam_utils.SamAlignment(line1) line2 = sam_utils.SamAlignment(line2) @@ -292,7 +292,7 @@ def map_read(array, read, res=1.0): """ Take in a [start, stop] read and map it to an array. Read must always be in single basepair coordinates [0,end). Array can - be any resolution desired through control of res parameter. Modifies + be any resolution desired through control of res parameter. Modifies array in place. Args: @@ -314,7 +314,7 @@ def sample(read_sampler, n, array, res=1.0, prng = np.random.RandomState()): n (int): number of reads to sample array (1d np.array): array to store coverage in res (optional, float): resolution the numpy array is in - prng (optional, np.RandomState): random state to pull random numbers + prng (optional, np.RandomState): random state to pull random numbers from """ for read in read_sampler.pull_reads(n, prng): @@ -349,7 +349,7 @@ def sample(read_sampler, n, array, res=1.0, prng = np.random.RandomState()): sample_parser.add_argument('array_size',type=int, help="length of genome") sample_parser.add_argument('--num_samples', type=int, default=1, help="number of full samples to pull from the sampler, default is 1") - sample_parser.add_argument('--num_reads', type=int, default=None, + sample_parser.add_argument('--num_reads', type=int, default=None, help="number of reads to pull for each sample. Default is the size of\ sampling object.") sample_parser.add_argument('--identity', action="store_true", @@ -361,7 +361,7 @@ def sample(read_sampler, n, array, res=1.0, prng = np.random.RandomState()): help="psuedo-random number generator seed, default=1234") args = parser.parse_args() - + if args.command == "parse": if args.samfile == "-": f = sys.stdin @@ -388,6 +388,6 @@ def sample(read_sampler, n, array, res=1.0, prng = np.random.RandomState()): if args.num_reads: num_reads = args.num_reads else: - num_reads = sampler.total + num_reads = sampler.total sample(sampler, num_reads, array[:,i], args.resolution, prng) np.save(args.outpre, array) diff --git a/ChIP_analysis/bootstrap_sam_file.pyc b/ChIP_analysis/bootstrap_sam_file.pyc index bf1197f578426a7c32a94b06360f82469ac980f6..ae7c8b47b7e1e4ce4decb08055dfd95bd36430a7 100644 GIT binary patch delta 852 zcmdl~y1#&f`7N*rS%MjD5v3Lsb@yfZScp-) xd848hGcgJ$gG_Z%{mDv<;?0fPXNdCG=81YMh*Ai)evu&;2Qk)zLrcbs;CXP4v` z=o=XrTErU|#T%K%`xF(#7bF&>7R9G!<|XFDC+DZ6#wR5fr%v7|sJdB;{WK%>Ota= mapq. If negative only return true if self.MAPQ @@ -275,7 +276,7 @@ def quality_filter(self, good_flags=[], bad_flags=[], mapq=None, stats=None): on how many reads are filtered for what issues. Modifies: stats - Returns: True if the read passes filters and False otherwise. + Returns: True if the read passes filters and False otherwise. Tests: @@ -344,27 +345,27 @@ def quality_filter(self, good_flags=[], bad_flags=[], mapq=None, stats=None): elif mapq >= 0: # if the mapq is greater than 0 then we are looking for reads # ABOVE the mapq score specified - mapq_condition = self.MAPQ >= mapq + mapq_condition = self.MAPQ >= mapq elif mapq < 0: # if the mapq is less than 0 then we are looking for reads # BELOW the mapq score specified - mapq_condition = self.MAPQ < abs(mapq) + mapq_condition = self.MAPQ < abs(mapq) else: raise ValueError("mapq must be a positive or negative integer "\ - "value.") + "value.") good_al = good_al and mapq_condition if stats and not mapq_condition: stats.total_mapq += 1 # finally return whether the alignment was good or not return bool(good_al) - + def is_gapped(self): """ Determine if the alignment has any gaps in it as determined by the CIGAR field. Inputs: nothing Modifies: nothing - Returns: True if 'N' is in the CIGAR field. + Returns: True if 'N' is in the CIGAR field. False if not. Raises error if the CIGAR field = "*" Tests: >>> line = "testQ.1.testR.20.30.3M.testR.25.9.AAA.(((.NM:i:0" @@ -399,7 +400,7 @@ def is_rc(self, paired): reference. If paired, forces each read of the pair to have opposite orientations, otherwise it raises an error - Inputs: paired - boolean. if true, checks reads mate to see if it is + Inputs: paired - boolean. if true, checks reads mate to see if it is in a consistent orientation. If not, raises error. if false, just checks if read is rc to reference. Modifies: nothing @@ -441,7 +442,7 @@ def is_rc(self, paired): this_is_rc = eval(bin(self.FLAG)) & 0x10 # check if the read's pair is reverse complement to the reference other_is_rc = eval(bin(self.FLAG)) & 0x20 - + # if paired, force the read's pair to have the opposite orientation # of the read. Raise error if not if paired: @@ -460,14 +461,14 @@ def is_rc(self, paired): def sense_to_ref(self, paired, library): """ Function to determine whether the ORIGINAL RNA molecule was sense - to the reference or antisense to the reference. Only implemented for + to the reference or antisense to the reference. Only implemented for pairs when each read is in a different orientation - + Inputs: paired - boolean. If True than additional checking will be done to make sure the paired read is consistent with this read. If False, then it is considered an individual read. - library - str. ["R1" | "R2" | "unstranded"]. R1 indicates that + library - str. ["R1" | "R2" | "unstranded"]. R1 indicates that the first read sequenced (*_R1.fasta) is sense to the original RNA strand. R2 indicates that the 2nd read sequenced @@ -502,7 +503,7 @@ def sense_to_ref(self, paired, library): Traceback (most recent call last): RuntimeError: Read mate ... - Test a paired end read + Test a paired end read >>> read.FLAG = 32 | 64 >>> read.sense_to_ref(True, "R1") True @@ -527,7 +528,7 @@ def sense_to_ref(self, paired, library): """ # if the read is paired, we have to consider the orientation of # the reads - if paired: + if paired: # first segment in the template is_first = eval(bin(self.FLAG)) & 0x40 # last segment in the template @@ -537,7 +538,7 @@ def sense_to_ref(self, paired, library): is_rc = self.is_rc(True) # raise the error if this can't be determined except RuntimeError: - raise + raise # determine what orientation that entire segment is in if ( (is_first and not(is_rc)) or (is_second and is_rc) ): if library == "R1" or library == "unstranded": @@ -600,7 +601,7 @@ def get_cigar_tuples(self): [(7, 'M')] More complex test - >>> read.CIGAR = "3M4D2M1I1M" + >>> read.CIGAR = "3M4D2M1I1M" >>> read.cigar_tuples = None >>> read.get_cigar_tuples() [(3, 'M'), (4, 'D'), (2, 'M'), (1, 'I'), (1, 'M')] @@ -616,8 +617,8 @@ def get_cigar_tuples(self): # split using regular expressions. The parenthesis in the re gives us # the seperators as well cigar_list = re.split('([MIDNSHP=X])', self.CIGAR) - - # loop through by twos using itertools grouper recipe from the + + # loop through by twos using itertools grouper recipe from the # python itertools documentation. # The cigar string always starts with a number and ends in a char, # so we cut the list short by one since the last value by twos will @@ -634,13 +635,13 @@ def get_cigar_tuples(self): def get_aligned_blocks(self): """ Function to take the cigar field and determine the locations - where there is continuous mapping coverage. - + where there is continuous mapping coverage. + Inputs: nothing Modifies: nothing - Returns: a list of (start, end) locations where continuous mapping - coverage occurs. Continuous coverage includes locations where - there is continuous 'M', '=', 'D', or 'X' in the CIGAR field. + Returns: a list of (start, end) locations where continuous mapping + coverage occurs. Continuous coverage includes locations where + there is continuous 'M', '=', 'D', or 'X' in the CIGAR field. Breaks occur at 'S', 'I', or 'N' in the CIGAR field. Tests: All matching test, read.POS is at 1, cigar is 7M @@ -690,7 +691,7 @@ def get_aligned_blocks(self): start = self.POS # parse the cigar string into tuples cigar_tuples = self.get_cigar_tuples() - + # go through each cigar number and character for val, char in cigar_tuples: # if it is an alignment match of any sort, @@ -699,13 +700,13 @@ def get_aligned_blocks(self): # just add the value to the previous end. if char in ['M', '=', 'X', 'D']: if end is not None: - end += val + end += val else: end = start + val # If there is an intron, go ahead and append # the previous (start, end) pair to the list and reset the # next end to being unknown. Additionally, push the next start - # to the end of the intron. If an end had not been + # to the end of the intron. If an end had not been # determined yet, do not add the (start, end) pair to the list # as this is a continuing insertion of some sort. elif char == 'N': @@ -732,7 +733,7 @@ def get_aligned_seq_and_phred(self): """This function uses the CIGAR field of the read to determine what the sequence that aligns to the reference looks like after - removing any insertions and accounting for deletions. It likewise + removing any insertions and accounting for deletions. It likewise returns the corresponding quality scores associated with the sequence. Inputs: nothing @@ -741,7 +742,7 @@ def get_aligned_seq_and_phred(self): removed phred - phred scores corresponding to each base in seq Tests: - + Test an all matching sequence >>> line = "testQ.1.testR.2.30.7M.testR.4.9.AGTCGCT.!#%()+,.NM:i:0" >>> read = SamAlignment(line, sep = '.') @@ -797,11 +798,11 @@ def get_aligned_seq_and_phred(self): # expand out the tuples to make a big cigar string for val, char in cigar_tuples: expanded_cigar += val*char - + # remove the elements that would not be included in the SEQ field expanded_cigar = expanded_cigar.replace('N', '') expanded_cigar = expanded_cigar.replace('H', '') - + #Initialize a new seq variable new_seq = '' new_phred = '' @@ -839,10 +840,10 @@ def get_aligned_seq_and_phred(self): def get_gap_blocks(self): """ Function to take the cigar field and determine the locations where there is a gap in coverage - + Inputs: nothing Modifies: nothing - Returns: a list of (start, end) locations where gaps occur in mapping + Returns: a list of (start, end) locations where gaps occur in mapping coverage. Tests: @@ -896,7 +897,7 @@ def get_gap_blocks(self): new_end = start[1:] # zip them together into a new list of tuples and return return zip(new_start, new_end) - + def get_aligned_locs(self): """ Function to get the corresponding locations with @@ -953,7 +954,7 @@ def get_aligned_locs(self): >>> read.get_aligned_locs() [1, 2, 3, 4, 5] """ - + # get both the seq with no gaps and the start and stop # locations if self.aligned_locs: @@ -975,7 +976,7 @@ def start_end_gaps(self, paired): consideration. Thus, if paired is True, it will return the start and end for the entire pair, but only the locations within gaps for the individual read it was called on. - + Inputs: paired - boolean. Treat read as SE or PE. If paired is True, returns the pos + TLEN as the end. If false then returns the right most mapped location @@ -988,7 +989,7 @@ def start_end_gaps(self, paired): gaps - list. locations that reside within gaps Tests: - + All matching test, read.POS is at 1, cigar is 7M, read.TLEN is 12 >>> line = "testQ.1.testR.2.30.7M.=.4.12.AGTCGCT.!#%()+,.NM:i:0" >>> read = SamAlignment(line, sep = '.') @@ -1007,7 +1008,7 @@ def start_end_gaps(self, paired): (1, 13, []) Test internal introns - >>> line = "testQ.1.testR.2.30.2M3N5M.=.4.12.AGTCGCT.!#%()+,.NM:i:0" + >>> line = "testQ.1.testR.2.30.2M3N5M.=.4.12.AGTCGCT.!#%()+,.NM:i:0" >>> read = SamAlignment(line, sep = '.') >>> read.start_end_gaps(False) (1, 11, [(3, 6)]) @@ -1068,7 +1069,7 @@ def reconstruct_reference(self): Input: nothing Modifies: nothing Returns: string. Genome sequence reconstructed from MD field - list. Tuples containing mutations in the read from the + list. Tuples containing mutations in the read from the reference, [(loc, ref_base, mut_base, phred, loc)] Tests: @@ -1097,14 +1098,14 @@ def reconstruct_reference(self): >>> read.reconstruct_reference() ('GGTCGCG', [(1, 'G', 'A', '!'), (7, 'G', 'T', ',')]) - Test internal deletions, + Test internal deletions, >>> line = "testQ.1.testR.2.30.2M3D5M.=.4.12.AGTCGCT.!#%()+,.MD:Z:2^AAA5" >>> read = SamAlignment(line, sep = '.') >>> read.reconstruct_reference() ('AGAAATCGCT', [(3, 'A', '-', '-'), (4, 'A', '-', '-'), (5, 'A', '-', '-')]) Test internal introns - >>> line = "testQ.1.testR.2.30.2M3N5M.=.4.12.AGTCGCT.!#%()+,.MD:Z:7" + >>> line = "testQ.1.testR.2.30.2M3N5M.=.4.12.AGTCGCT.!#%()+,.MD:Z:7" >>> read = SamAlignment(line, sep = '.') >>> read.reconstruct_reference() ('AGTCGCT', []) @@ -1130,7 +1131,7 @@ def reconstruct_reference(self): if self.aligned_reference and self.aligned_muts: return self.aligned_reference, self.aligned_muts # get the aligned sequence and phred scores - aligned_seq, aligned_phred = self.get_aligned_seq_and_phred() + aligned_seq, aligned_phred = self.get_aligned_seq_and_phred() # get the aligned locations aligned_locs = self.get_aligned_locs() # create a list to hold the split up MD values @@ -1140,7 +1141,7 @@ def reconstruct_reference(self): # split using regular expressions. The parenthesis give us the # seperators as well try: - MD_list = re.split('([AGTC^])', self.OPT['MD']) + MD_list = re.split('([AGTC^])', self.OPT['MD']) except KeyError: raise KeyError("MD field not found for read %s"%self.QNAME) # make a list to hold of the mutations that are found @@ -1162,7 +1163,7 @@ def reconstruct_reference(self): # append this value to the reference sequence reference_seq.append(val) # we also append mutation information to the mutation list - mut_list.append((aligned_locs[start], val, + mut_list.append((aligned_locs[start], val, aligned_seq[start], aligned_phred[start])) # finally we increment the index of where we are on the @@ -1177,7 +1178,7 @@ def reconstruct_reference(self): # we append all of this to the reference sequence reference_seq.append(aligned_seq[start:end]) # and change the end to become the start - start = end + start = end # finally we join the list of bases in the reference sequence to a # string and cache it along with the mutation list self.aligned_reference = ''.join(reference_seq) @@ -1187,7 +1188,7 @@ def reconstruct_reference(self): def record_muts(self, strand, phred_encoding=33): """ Function to find mutations found in the read as compared to the - reference. Considers all reads individually (doesn't matter if paired + reference. Considers all reads individually (doesn't matter if paired or not). This depends on the presence of both a CIGAR field and an MD field in the alignment. If these are not present then this cannot be used. It also requires the XM field to be present @@ -1198,7 +1199,7 @@ def record_muts(self, strand, phred_encoding=33): [phred_encoding] - int. what is the phred-encoding for the read? the default is 33. Modifies: nothing - Returns: A list of mutation records in the form: + Returns: A list of mutation records in the form: (chrm, loc, strand, base, mut_base, phred) Tests: @@ -1276,16 +1277,16 @@ def get_ref_base_locs(self, base, strand): that actually have sequence for the read, not its pair or the interpolated region between the pair and itself. - Inputs: base - char. The identity of the reference base + Inputs: base - char. The identity of the reference base strand - char. ["+" | "-" | "."] If + treats the base as is, - if - searches for the bases complement. If . it + if - searches for the bases complement. If . it searches for both the base and its complement ( essentially search for base pairs) Modifies: nothing Returns: a numpy array of locations where the base occurs on the read. Tests: - + All base of interest >>> line = "testQ.1.testR.2.30.7M.testR.4.9.TTTTTTT.!#%()+,.NM:i:0.MD:Z:7" >>> read = SamAlignment(line, sep = '.') @@ -1327,15 +1328,15 @@ def get_ref_base_locs(self, base, strand): if strand == "-": # NOTE only complementing! not reverse complementing for - # minus strand read - base = complement(base) + # minus strand read + base = complement(base) # use numpy arrays to quickly find where bases are in the reference # sequence base_index = np.where(np.asarray(list(genome_sequence)) == base)[0] locs_list = [aligned_locs[i] for i in base_index] # if we don't have stranded information, then we have to consider that - # the "base" could be from either strand. We therefore search for a + # the "base" could be from either strand. We therefore search for a # "base-pair" i.e. find all the locations of both the base and its # complement if strand == ".": @@ -1367,7 +1368,7 @@ def __init__(self): Returns: None Tests: - + Test initialization >>> x = MutDataFrame() >>> x.mutation_list @@ -1391,11 +1392,11 @@ def __init__(self): def create_df_from_list(self): """Takes the list of mutation data and enters it into the dataframe all at once. Only to be done once all mutations are found. Mutation_list - has to be in the form of: + has to be in the form of: [(chrom1, loc1, strand1, base1, mut1, phred1), ...,(chromN, locN, strandN, baseN, mutN, phredN)] - + Inputs: None Modifies: self.df @@ -1483,7 +1484,7 @@ def do_fdr(self, alpha=0.05): >>> x.df chrom loc strand base mut phred pvalue qvalue 0 test 1 + A T 30 0.001 0.002 - + Test a non-passing value >>> x = MutDataFrame() >>> x.mutation_list = [("test", 2, "-","G","C",2)] @@ -1497,7 +1498,7 @@ def do_fdr(self, alpha=0.05): # defaults to assuming phred scores havent been converted to p values self.phred_to_p() # calculate the qvalues, alpha does nothing in this regard here - self.df["qvalue"] = multicomp.fdrcorrection0(self.df["pvalue"], + self.df["qvalue"] = multicomp.fdrcorrection0(self.df["pvalue"], alpha)[1] # filter by alpha self.df = self.df[self.df["qvalue"] < alpha] @@ -1517,9 +1518,9 @@ def filter_mut_type(self, base, mut, data_frame=None): Example: get all the T to C mutations from region chr1:1-100 that were from the "+" strand: - - filtered_df = Mut_Df.filter_strand('+', - data_frame= Mut_Df.filter_region("chr1", (1,100), + + filtered_df = Mut_Df.filter_strand('+', + data_frame= Mut_Df.filter_region("chr1", (1,100), data_frame=Mut_Df.filter_mut_type(T, C))) Tests: >>> x = MutDataFrame() @@ -1556,7 +1557,7 @@ def filter_mut_type(self, base, mut, data_frame=None): def filter_region(self, chrom, locs=None, data_frame=None): """Function to grab mutations within a certain region. Can be coupled - with filter_mut_type or filter_strand by specifying + with filter_mut_type or filter_strand by specifying data_frame = output_from on each subsequent function Inputs: chrom - char chromosome you are looking for @@ -1571,9 +1572,9 @@ def filter_region(self, chrom, locs=None, data_frame=None): Example: get all the T to C mutations from region chr1:1-100 that were from the "+" strand: - - filtered_df = Mut_Df.filter_strand('+', - data_frame= Mut_Df.filter_region("chr1", (1,100), + + filtered_df = Mut_Df.filter_strand('+', + data_frame= Mut_Df.filter_region("chr1", (1,100), data_frame=Mut_Df.filter_mut_type(T, C))) Tests: >>> x = MutDataFrame() @@ -1592,7 +1593,7 @@ def filter_region(self, chrom, locs=None, data_frame=None): else: df = self.df if locs: - df = df[(df['chrom'] == chrom) & + df = df[(df['chrom'] == chrom) & ((df['loc'] >= locs[0]) & (df['loc'] <= locs[1]))] else: df = df[(df['chrom']==chrom)] @@ -1616,9 +1617,9 @@ def filter_strand(self, strand, data_frame=None): Example: get all the T to C mutations from region chr1:1-100 that were from the "+" strand: - - filtered_df = Mut_Df.filter_strand('+', - data_frame= Mut_Df.filter_region("chr1", (1,100), + + filtered_df = Mut_Df.filter_strand('+', + data_frame= Mut_Df.filter_region("chr1", (1,100), data_frame=Mut_Df.filter_mut_type(T, C))) Tests: >>> x = MutDataFrame() @@ -1644,12 +1645,12 @@ def filter_strand(self, strand, data_frame=None): def count_muts(self, data_frame=None): """Increments counters for each type of mutation. - + Inputs: [data_frame] - dataframe from the results of a filter. If None (default), then uses self.df Modifies: nothing - Returns: dictionary of mutations with mutation type as the key. For + Returns: dictionary of mutations with mutation type as the key. For example, dict['AT']: 30, if there were 30 mutations that were A->T Tests: @@ -1757,7 +1758,7 @@ def __add__(self, other): for key in set(self.bad_flag_reads.keys(), other.bad_flag_reads.keys()): new.bad_flag_reads[key] = self.bad_flag_reads.get(key, 0) +\ other.bad_flag_reads.get(key, 0) - for key in set(self.missed_good_flag_reads.keys(), + for key in set(self.missed_good_flag_reads.keys(), other.missed_good_flag_reads.keys()): new.missed_good_flag_reads[key] = self.missed_good_flag_reads.get(key, 0) +\ other.missed_good_flag_reads.get(key, 0) @@ -1809,50 +1810,50 @@ def increment_bases(self, read, strand): self.base_counts['G'] += seq.count('G') self.base_counts['C'] += seq.count('C') self.base_counts['A'] += seq.count('A') - + def print_map_stats(self, section, mapq, paired, locs=False): """ This prints out each of the counters in an easy to read format """ - print "MAPPING STATS FOR %s"%section + print("MAPPING STATS FOR {}".format(section)) if paired: - print "Paired-End Mode" + print("Paired-End Mode") else: - print "Single-End Mode" - print 20*"-" - print "Total reads in file: %s" %self.total_reads - print "" - print "Quality Filtering" - print "Total reads filtered: %s"%(self.total_reads - self.total_mapped) - print "Filtered for:" + print("Single-End Mode") + print(20*"-") + print("Total reads in file: {}".format(self.total_reads)) + print("") + print("Quality Filtering") + print("Total reads filtered: {}".format(self.total_reads - self.total_mapped)) + print("Filtered for:") for flag in self.missed_good_flag_reads.keys(): - print "\tMissing %s: %s"%(self.flag_mapping[flag], self.missed_good_flag_reads[flag]) + print("\tMissing {}: {}".format(self.flag_mapping[flag], self.missed_good_flag_reads[flag])) for flag in self.bad_flag_reads.keys(): - print "\t%s: %s"%(self.flag_mapping[flag], self.bad_flag_reads[flag]) + print("\t{}: {}".format(self.flag_mapping[flag], self.bad_flag_reads[flag])) if mapq is None: pass elif mapq >= 0: - print "\tMAPQ < %s : %s"%(mapq, self.total_mapq) + print("\tMAPQ < {} : {}".format(mapq, self.total_mapq)) elif mapq < 0: - print "\tMAPQ > %s : %s"%(mapq, self.total_mapq) + print("\tMAPQ > {} : {}".format(mapq, self.total_mapq)) if locs: - print "\tOverlap with locations: %s"%(self.total_overlap) - print "\tStrandedness could not be determined: %s"%(self.total_bad_sense) + print("\tOverlap with locations: {}".format(self.total_overlap)) + print("\tStrandedness could not be determined: {}".format(self.total_bad_sense)) if paired: - print "Filtered pairs that mapped to plus strand: %s"%( - self.paired["+"]) - print "Filtered pairs that mapped to minus strand: %s"%( - self.paired["-"]) - print "Filtered pairs that mapped to either strand: %s"%( - self.paired["."]) - print "Total pair basepairs considered: %s"%(self.total_paired_bp) + print("Filtered pairs that mapped to plus strand: {}".format( + self.paired["+"])) + print("Filtered pairs that mapped to minus strand: {}".format( + self.paired["-"])) + print("Filtered pairs that mapped to either strand: {}".format( + self.paired["."])) + print("Total pair basepairs considered: {}".format(self.total_paired_bp)) else: - print "Filtered reads that mapped to plus strand: %s"%( - self.unpaired["+"]) - print "Filtered reads that mapped to minus strand: %s"%( - self.unpaired["-"]) - print "Filtered reads that mapped to either strand: %s"%( - self.unpaired["."]) - print "Total read basepairs considered: %s"%(self.total_read_bp) + print("Filtered reads that mapped to plus strand: {}".format( + self.unpaired["+"])) + print("Filtered reads that mapped to minus strand: {}".format( + self.unpaired["-"])) + print("Filtered reads that mapped to either strand: {}".format( + self.unpaired["."])) + print("Total read basepairs considered: {}".format(self.total_read_bp)) def calc_mut_rates(self, stranded): """ Calculates mutation rate for each type of mutation @@ -1888,37 +1889,37 @@ def calc_mut_rates(self, stranded): mut_rate = (self.mut_counts[base+mut] + 0.0 + self.mut_counts[complement(base) +complement(mut)])/( - self.base_counts[base] + + self.base_counts[base] + self.base_counts[complement(base)]) except ZeroDivisionError: mut_rate = 0.0 out_dict[base+mut] = mut_rate return out_dict - def print_mut_stats(self, stranded): + def print_mut_stats(self, stranded): - print "Mutation Stats" - print 20*"-" - print "Total mutations before FDR: %s"%self.muts_before_filt - print "Total mutations after FDR: %s"%sum(self.mut_counts.values()) + print("Mutation Stats") + print(20*"-") + print("Total mutations before FDR: %s"%self.muts_before_filt) + print("Total mutations after FDR: %s"%sum(self.mut_counts.values())) for mut_type in self.mut_counts.keys(): - print "Total %s->%s mutations after FDR: %s"%(mut_type[0], - mut_type[1], - self.mut_counts[mut_type]) + print("Total {}->{} mutations after FDR: {}".format(mut_type[0], + mut_type[1], + self.mut_counts[mut_type])) for base in self.base_counts.keys(): - print "Total reference %s sequenced: %s"%(base, self.base_counts[base]) - print "Mutation Rate Matrix (after filtering):" - print "Original Base on left, Mutation base on top" - print "\tA\tG\tT\tC" + print("Total reference %s sequenced: %s"%(base, self.base_counts[base])) + print("Mutation Rate Matrix (after filtering):") + print("Original Base on left, Mutation base on top") + print("\tA\tG\tT\tC") mut_rates = self.calc_mut_rates(stranded) - print "A\tX\t%.3e\t%.3e\t%.3e"%(mut_rates["AG"], mut_rates["AT"], - mut_rates["AC"]) - print "G\t%.3e\tX\t%.3e\t%.3e"%(mut_rates["GA"], mut_rates["GT"], - mut_rates["GC"]) - print "T\t%.3e\t%.3e\tX\t%.3e"%(mut_rates["TA"], mut_rates["TG"], - mut_rates["TC"]) - print "C\t%.3e\t%.3e\t%.3e\tX"%(mut_rates["CA"], mut_rates["CG"], - mut_rates["CT"]) + print("A\tX\t%.3e\t%.3e\t%.3e"%(mut_rates["AG"], mut_rates["AT"], + mut_rates["AC"])) + print("G\t%.3e\tX\t%.3e\t%.3e"%(mut_rates["GA"], mut_rates["GT"], + mut_rates["GC"])) + print("T\t%.3e\t%.3e\tX\t%.3e"%(mut_rates["TA"], mut_rates["TG"], + mut_rates["TC"])) + print("C\t%.3e\t%.3e\t%.3e\tX"%(mut_rates["CA"], mut_rates["CG"], + mut_rates["CT"])) def write_mut_stats(self, out_prefix, stranded): mut_rates = self.calc_mut_rates(stranded) @@ -1931,7 +1932,7 @@ def write_exposure_df(self, out_prefix, paired=False): file_out = out_prefix + "_exposure.txt" with open(file_out, 'w') as fn: if paired: - fn.write("%s\t%s\n" %((self.total_mapped/2), + fn.write("%s\t%s\n" %((self.total_mapped/2), self.total_paired_bp)) else: fn.write("%s\t%s\n" %((self.total_mapped), self.total_read_bp)) diff --git a/ChIP_analysis/sam_utils.pyc b/ChIP_analysis/sam_utils.pyc index bfe50396e802022ec8a2a9b6133cb397ae236868..158a79308adda14ae341285e9c17adf4403cc5c1 100644 GIT binary patch delta 1808 zcmeCY#{BR!GY9i$UM}6Y?J*lUE-@)vF)%RbXXNLm>X#?xrRO9j=@%Cz>l+yuTEzPl z6~sGdcm_;9xKL)Z74v#Vq7{3xs&W&p_?$o}D^ZFkPLtW(EG9?1$&5;8S%@+j#*t(5=G2wM zN6enWox}&<)%qCXO`h02i<4MSPVSz1h#0k-CrGC%`|G>tLaS=XuQ-MkZGK4&JKRoJpX78)?p|p$5|+YpScypf;G!IoeCnJc z&B~g>Pc;yFrbJ$)tcLJ`e>%(L66cVP@uchgrntp_G~y> zV&D{aDXaO+6dO7>ZsAbZ3cnL9!eG>Y@`+m!lFw#=O|4HEaqG*X z*H)Um_yF1(y-woZj)^xLC|h~5XEm4+Jd52mvZvb>oV?hb9rnZFseGTs)r%_mEgVEMBtGS9uKPt3YcnZ8#b$zTyJRcHSzeHvA2`fu*YMAH Date: Mon, 20 May 2019 16:14:25 -0500 Subject: [PATCH 03/13] Sped up read sampling with numba.jit --- .vscode/launch.json | 16 + .vscode/settings.json | 3 + ChIP_analysis/README.md | 4 +- .../bootstrap_helper_funcs.cpython-36.pyc | Bin 1858 -> 2164 bytes .../bootstrap_sam_file.cpython-36.pyc | Bin 12272 -> 12272 bytes ChIP_analysis/bootstrap_helper_funcs.py | 22 +- ChIP_analysis/bootstrap_sam_file.py | 3 + .../bootstrapped_chip_no_consolidation.py | 285 +++++++++++++++--- 8 files changed, 281 insertions(+), 52 deletions(-) create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..59409aa --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,16 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "pythonPath": "/home/wanglab/anaconda3/bin/python" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..3469003 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "/home/wanglab/anaconda3/bin/python" +} \ No newline at end of file diff --git a/ChIP_analysis/README.md b/ChIP_analysis/README.md index cd4f2fd..ef71fb7 100755 --- a/ChIP_analysis/README.md +++ b/ChIP_analysis/README.md @@ -66,7 +66,7 @@ arg0 is the sample prefix arg1 is the input bam file ```bash -samtools view -f 3 -F 2828 -q 30 arg1 | python2.7 bootstrap_sam_file.py parse - arg0.ob --paired 2> arg0_sampler.err +samtools view -f 3 -F 2828 -q 30 arg1 | python3 bootstrap_sam_file.py parse - arg0.ob --paired 2> arg0_sampler.err ``` ## Obtaining summary tracks at 10 bp resolution ## @@ -83,7 +83,7 @@ arg7 and arg8 are samplers for the KO input samples Note that arg1 and arg3 are paired, arg2 and arg4 are paired etc. ```bash -python2.7 bootstrapped_chip_no_consolidation.py 4215607 arg0 --ext_samps arg1 arg2 --inp_samps arg3 arg4 --ext_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1 --identity -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0.log +python3 bootstrapped_chip_no_consolidation.py --sample_name_luts run_info --genome_size 4215607 --out_prefix arg0 --ChIP_samps arg1 arg2 --inp_samps arg3 arg4 --ChIP_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1 --identity -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0.log ``` ## Obtaining bootstrap replicate summary statistics ## diff --git a/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-36.pyc b/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-36.pyc index e9d4b5a837a7e45f26674f604a98333b1b929ea6..6efad879db3f60768ab937b3f2c43c713728d99c 100644 GIT binary patch delta 1176 zcmZ8f!EPHj5S<}+*Xxxf%PU#7kRXuJ9x4Mp6m5~F5P|^dr9Bt{8YI9X?SeAtT5Ih} z;jRojkf4LyV}b&@_mWdjJ>(Da6#=g~<=%Tb+$f0>5**IF8FDy}^Ro6|eg18=8XUD= zKl))C;5Dobm-IWd^4ka?LE;c1EO!<$+S!n&PUNI+g9Tn~a-GNC{d9teN3m)y3)+SlIWVIBYCVQ$TxA#Ep zen`UFuwV!jYJGL`wbtyNwb<&(r?KIhulC+2@mJ_Jm%re5^Mv>;AfcKTIAf%-DG@Z| zQ$A&q^`PLT?M$7*DcmVaehsFOcuk60U>hjP>A}iN_>p_XT~uC{DHQ3(c*tA82?oD9Xz8!Sb2c!D#uf zckA~~=^CnYo@z delta 809 zcmZ8ePm9w)6n}3r)1=v^t=raZ5y~n(*ux$~Wf6*a7H^7&AcSFOitQ$8nPiu(bm^fh zo&+HuK`-8fy?XLPcnEm%q6cq&0AI4}S`5rD@Av2B&18NzztmTEJ^=KGK_oEV6tn;yy=E4aJo>Ltm(hvd_S;Xj?gl>>ZkZ+2i<*cBWNT z?Sb!>l}pu@#%WEvYxGy1_7p07%`Oqf76t16snt?#D9{aRZR*Av8!!T0*Nrat9%!CSph{gbaEEYbMZ=> zS(1x!IFo74BQ4{cCo_}%u=cLkDCgIRrCJ5j7`gZerj-qO(7Q=iQYU~~9bI&2^>Kiv zbN)%}|Uo^*iH~cauNrDaf&`W3+&s2PA=l3XudGzL>|fXr76y6tO7a(bhFR)F6P-c s`@v|3gxm`CF&$DK>x~_hdnvc!jG~!33iZQ2g_8k+gH7hPv*%9#511XY%m4rY diff --git a/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-36.pyc b/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-36.pyc index dfedf813d8f1af04e521e7a666c4e2a1d08b88fe..0a80ee25094dccfda01e78b5fcd81355a1241105 100644 GIT binary patch delta 649 zcmYL_PiPZC6vp$?>}Hc~Oj|LMkXT(&YfF-XkeFIUsv*@

0}N(S)s;gvkDBXE%zW zM=!m0z*C`K1Q8GFpqHM!dGIR8UMzw)h3dgW>%n;mC<{M6zWKg4`)0qsUyojh#iBny ze!WAdUPnJkaFy>#y8var9exE+W%H3kFxZXgXJ~OveFk9h4};$Uy8P{tZGcS?NOOBg z1$e@{!$p7{_GaYiXdw^?xCsFz5&}h1;z~@B#TVqh0a!d!Gl@djG!PXPU zp};p2=76w%Cp9RrWNH=OvyD_9X86mL791^#iS&|yA2yvhuKLev*+29CEU`Noz?r~ zo=0Vzg74qoaSqi20{_-m7Y)1NSWBMa5pgZ#yv!2%(KUm*q;Alb>xYo^E_NzxD|fMK zU?Sua(&+8wOI<=8my%wQ64x<1UfZ#K)$UmJ{(f%%Eyi(319M8dL2R$>J)&Xsmf4BC MKA;SFteb!E7qP&s-T(jq delta 649 zcmYL_&ubGw6vy+n+08a-Oj|LMkXXB=SX+`5YD}Y2q9KI{Z8a&@Xkghi(-2K|+u4mj zLXU!Y?RX66MfBi79Xu60_a6{sFN$Y{;z6)I)Ojf=10NpW_x-+^H~ad&?w<<={p0#x z^(gz^|5=6w{!QKm$n&1>9YBdK4IYBdivBM^xE6Q?aG&oD{Q!8xKc46TJQao*H-`fN z&v|zw1JGk1MqiFy@OV6SSU?^0f-0*aHK;1$1pAfBw@$Vt;Oy(E3fy7Ya0Xtpt?+3` z^Jn4KfRKJiG)S{Zv<9Epb~FV!{x+&fV;99lniX(;EvwOJnhn?6G&d|dkB!UhXKWe1 zvn%lk6#1=qQqEjO{Tib0$Y3>s$RY9qu1qMkD7_@=^D-@>=_=h1;xr%MHH(O-x|GZJ4x7p#j zwo*lBftypa5>(lX>0g37GnG&sbG|y+Fgh)VR{6oqHoy(`Ffj{x_9k&Qi(9&iRl7&l z9jf4zG$My6B32OiS~pNK%(~SsJBCBVwUBh;H&OR*8q_AbK^wN~Mbf*NDX^X7{Hl%) zuT4n3zn3d_3AJpx-Vah@Tdj`Mv`jZ(cG~*!e)jQSOyiIm`qXBfm`>AqOns=8*_l*j LKpD1KH}&8z|4*ui diff --git a/ChIP_analysis/bootstrap_helper_funcs.py b/ChIP_analysis/bootstrap_helper_funcs.py index 4676954..99d4971 100755 --- a/ChIP_analysis/bootstrap_helper_funcs.py +++ b/ChIP_analysis/bootstrap_helper_funcs.py @@ -28,8 +28,10 @@ #CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. import numpy as np +import numba from math import ceil, floor +# @numba.jit(nopython=True, parallel=True) def credible_interval(array, alpha = 0.05): """ Take a bootstrapped based 95% credible interval @@ -42,14 +44,26 @@ def credible_interval(array, alpha = 0.05): pos 1: min_ci at alpha pos 2: max_ci at alpha """ - out_array = np.zeros(3, dtype=float) + out_array = np.zeros(3) mean = np.mean(array) out_array[0] = mean - sorted_array = np.sort(array) - out_array[1] = sorted_array[int(floor((alpha/2)*array.size))] - out_array[2] = sorted_array[int(floor(array.size - alpha/2*array.size))] + array_sort = bubblesort_jit(array) + out_array[1] = array_sort[int(floor((alpha/2)*array.size))] + out_array[2] = array_sort[int(floor(array.size - alpha/2*array.size))] return out_array +@numba.jit(nopython=True) +def bubblesort_jit(arr): + N = arr.shape[0] + for end in range(N, 1, -1): + for i in range(end - 1): + cur = arr[i] + if cur > arr[i + 1]: + tmp = arr[i] + arr[i] = arr[i + 1] + arr[i + 1] = tmp + return(arr) + def least_extreme_value(stats): """ Take the value closest to zero for the min/max ci diff --git a/ChIP_analysis/bootstrap_sam_file.py b/ChIP_analysis/bootstrap_sam_file.py index a30489e..c5e3fd8 100755 --- a/ChIP_analysis/bootstrap_sam_file.py +++ b/ChIP_analysis/bootstrap_sam_file.py @@ -45,9 +45,11 @@ # custom module import sam_utils +######################################################## ## TO DO: # add support for multiple chromosomed organisms # add support for strandedness +######################################################## class ReadSampler(object): """Class to hold and sample from processed reads. Reads are stored internally @@ -153,6 +155,7 @@ def sort_reads(self): if not self.sampling: self.convert_to_array() self.reads = self.reads[self.reads[:,0].argsort()] + def load_data(self, f): """ Method to load data from a saved sampler object diff --git a/ChIP_analysis/bootstrapped_chip_no_consolidation.py b/ChIP_analysis/bootstrapped_chip_no_consolidation.py index d7e9915..71e31b4 100644 --- a/ChIP_analysis/bootstrapped_chip_no_consolidation.py +++ b/ChIP_analysis/bootstrapped_chip_no_consolidation.py @@ -9,6 +9,9 @@ #Developed by: Michael Wolfe #University of Michigan #http://freddolino-lab.med.umich.edu/ +#Modified by : Jeremy Schroeder +#University of Wisconsin - Madison +#http://wanglab.bact.wisc.edu # #Permission is hereby granted, free of charge, to any person obtaining a copy of #this software and associated documentation files (the "Software"), to deal with @@ -21,8 +24,9 @@ #list of conditions and the following disclaimers. Redistributions in binary #form must reproduce the above copyright notice, this list of conditions and the #following disclaimers in the documentation and/or other materials provided with -#the distribution. Neither the names of Michael Wolfe, University of Michigan, -#nor the names of its contributors may be used to endorse or promote products +#the distribution. Neither the names of Michael Wolfe, Jeremy Schroeder +#University of Michigan, University of Wisconsin - Madison, +#nor the names of their contributors may be used to endorse or promote products #derived from this Software without specific prior written permission. # #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR @@ -40,18 +44,22 @@ import logging import os import multiprocessing as mp -from math import floor, ceil +from math import floor, ceil, log import re +import time # Widely available import numpy as np from scipy import stats import pandas as pd +from dfply import * + +# More custom +import numba # custom from bootstrap_sam_file import ReadSampler -from bootstrap_helper_funcs import credible_interval, least_extreme_value - +from bootstrap_helper_funcs import least_extreme_value, credible_interval def summary_stats_only_finite(array, alpha=0.05): """ This version of summary stats only considers values that aren't nan or @@ -115,14 +123,15 @@ def summary_stats_nan_zeros(array, alpha=0.05): return(these_stats[0], these_stats[1], these_stats[2], var, this_lev, median, mad) -def actual_reads(sampler, size, res=1.0): +def actual_reads(sampler, size, res=1): """ Take all reads from a sampler and map them to an array Args: sampler (class ReadSampler): object holding reads to sample size (int): size of numpy array to create - res (optional, float): resolution the numpy array is in + res (optional, int): resolution the numpy array is in """ + ############# modify to support gpu acceleration ############ array = np.zeros(size) for read in sampler.reads: start, stop = read @@ -131,7 +140,24 @@ def actual_reads(sampler, size, res=1.0): array[int(ceil(start/res)):int(ceil(stop/res))] += 1 return array -def sample_reads(sampler, size, prng, res=1.0): +def sp_sample_reads(sampler, size, i, j, res, jit=False): + """ Sample reads from samplers using multiprocessing + + Args: + sampler (list): List of ReadSampler objects to sample from + size (int): Size of numpy array to create + res (float): resolution of numpy array + start (int): starting seed for random state + p (optional, int): number of processors + + Returns: + arrays (np.array) final result of sampling + """ + start = args.s+i+len(all_sim) + array = sample_reads(sampler, size, np.random.RandomState(start+j), res, jit) + return array + +def sample_reads(sampler, size, prng, res=1, jit=False): """ Sample with replacement all reads from a sampler and map them to an array @@ -140,15 +166,99 @@ def sample_reads(sampler, size, prng, res=1.0): size (int): size of numpy array to create prng (np.RandomState): random state to pull random numbers from - res (optional, float): resolution the numpy array is in + res (optional, int): resolution the numpy array is in """ array = np.zeros(size) - for read in sampler.pull_reads(sampler.total, prng): + sampled_reads = sampler.pull_reads(sampler.total, prng) + # # define heirarchy for using GPU + # threadsperblock = 32 + # blockspergrid = (array.size + (threadsperblock - 1)) // threadsperblock + + if jit: + # threadsperblock = 32 + # blockspergrid = (sampled_reads.shape[0] + (threadsperblock - 1)) // threadsperblock + # note that using cuda.jit here causes us to write directly to the array, + # without explicitly returning it from sum_coverage_from_sample function + # sum_coverage_from_sample[blockspergrid, threadsperblock](sampled_reads, array, res) + # device_array = cuda.to_device(array) + # device_samples = cuda.to_device(sampled_reads) + # device_res = cuda.to_device(res) + array = numba_sum_coverage_from_sample(sampled_reads, array, res) + # cuda_sum_coverage_from_sample[blockspergrid, threadsperblock](device_samples, device_array, res) + # array = device_array.copy_to_host() + else: + array = sum_coverage_from_sample(sampled_reads, array, res) + + return array + +def sum_coverage_from_sample(sampled_reads, array, res): + """ Map sampled reads to an array + + Args: + reads (): sampled read (start,end) positions + array (): array to be populated with coverage + res (float): resolution the array is in + """ + + for read in sampled_reads: start, stop = read + res = float(res) + x = int(ceil(start/res)) + y = int(ceil(stop/res)) # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION #array[start:stop] +=1 - array[int(ceil(start/res)):int(ceil(stop/res))] += 1 - return array + array[x:y] += 1 + + return(array) + +@numba.jit(nopython=True, parallel=True) +def numba_sum_coverage_from_sample(sampled_reads, array, res): + """ Map sampled reads to an array + + Args: + reads (): sampled read (start,end) positions + array (): array to be populated with coverage + res (float): resolution the array is in + """ + # pos = cuda.grid(1) + for i in range(sampled_reads.shape[0]): + read = sampled_reads[i,:] + # read = sampled_reads[pos,:] + start, stop = read + start = float(start) + stop = float(stop) + res = float(res) + x = int(ceil(start/res)) + y = int(ceil(stop/res)) + # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION + #array[start:stop] +=1 + for idx in range(x,y): + array[idx] += 1 + return(array) + +# @numba.cuda.jit +# def cuda_sum_coverage_from_sample(sampled_reads, array, res): +# """ Map sampled reads to an array + +# Args: +# reads (): sampled read (start,end) positions +# array (): array to be populated with coverage +# res (float): resolution the array is in +# """ +# # pos = cuda.grid(1) +# for i in range(sampled_reads.shape[0]): +# read = sampled_reads[i,:] +# # read = sampled_reads[pos,:] +# start, stop = read +# start = float(start) +# stop = float(stop) +# res = float(res) +# x = int(ceil(start/res)) +# y = int(ceil(stop/res)) +# # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION +# #array[start:stop] +=1 +# for idx in range(x,y): +# array[idx] += 1 def log2_ratio(array1, array2): """ Take the median normalized log2 ratio of two arrays @@ -156,19 +266,64 @@ def log2_ratio(array1, array2): Args: array1 (np.array): numpy array holding extracted signal array2 (np.array): numpy array holding input signal + cu (bool): whether to use cuda.jit to accelerate calculations Returns: ratio (np.array): median normalized log2 ratio """ # lets normalize by the median - ratio = (array1/(np.median(array1)+0.0))/(array2/(np.median(array2)+0.0)) - ratio = np.log2(ratio) - # only nans should be np.log2(0/0) which should be 0 - #ratio[np.isnan(ratio)] = 0.0 - return ratio + arr1_median = float(np.median(array1)) + arr2_median = float(np.median(array2)) + ratio = (array1/arr1_median)/(array2/arr2_median) + + # if not cu: + # ratio = (array1/arr1_median)/(array2/arr2_median) + # only nans should be np.log2(0/0) which should be 0 + #ratio[np.isnan(ratio)] = 0.0 + # else: + # threadsperblock = 32 + # blockspergrid = (array1.size + (threadsperblock - 1)) // threadsperblock + + # device_array1 = cuda.to_device(array1) + # device_arr1_med = cuda.to_device(arr1_median) + # device_array2 = cuda.to_device(array2) + # device_arr2_med = cuda.to_device(arr2_median) + + # median_array1 = np.empty(array1.shape) + # device_median_array1 = cuda.to_device(median_array1) + # median_array2 = np.empty(array2.shape) + # device_median_array2 = cuda.to_device(median_array2) + + # ratio = np.empty(array1.shape) + # device_ratio = cuda.to_device(ratio) + # cuda_ratio[blockspergrid, threadsperblock](device_array1, device_array2, + # arr1_median, arr2_median, + # device_median_array1, device_median_array2, + # device_ratio) + # ratio = device_ratio.copy_to_host() + + log2_ratio = np.log2(ratio) + + return log2_ratio + +# @cuda.jit +# def cuda_ratio(array1, array2, arr1_med, arr2_med, +# median_array1, median_array2, +# ratio): + +# pos = cuda.grid(1) +# if pos < ratio.shape[0]: +# # calculate median for position pos of each array +# median_array1[pos] = array1[pos]/arr1_med +# median_array2[pos] = array2[pos]/arr2_med + +# # calculate ratio +# ratio[pos] = median_array1[pos]/median_array2[pos] + + def floored_sub(samp, control): - """ Subtract control signal from sample signal. Only subtract is control is + """ Subtract control signal from sample signal. Only subtract if control is greater than 0. Args: @@ -194,7 +349,7 @@ def read_in_sampler(samplerfile): """ sampler = ReadSampler() sampler.load_data(samplerfile) - sampler.sort_reads() + # sampler.sort_reads() # our reads are already sorted return sampler def mp_sample_reads_helper(args): @@ -209,7 +364,7 @@ def mp_sample_reads(samplers, size, res, start, p=1): Args: samplers (list): List of ReadSampler objects to sample from size (int): Size of numpy array to create - res (float): resolution of numpy array + res (int): resolution of numpy array start (int): starting seed for random state p (optional, int): number of processors @@ -236,7 +391,7 @@ def mp_actual_reads(samplers, size, res, p=1): Args: samplers (list): List of ReadSampler objects to sample from size (int): Size of numpy array to create - res (float): resolution of numpy array + res (int): resolution of numpy array p (optional, int): number of processors Returns: @@ -258,10 +413,11 @@ def mp_actual_reads(samplers, size, res, p=1): controls how large to build the arrays", required=True) parser.add_argument('--out_prefix', help="Output prefix for final matrix.", required=True) - parser.add_argument("--sample_name_lut", help="Path to file containing the\ - lookup run information provided by Illumina. The\ - file will be used to match sample IDs\ - with sample names", required=True) + parser.add_argument("--sample_name_luts", type=str, nargs="+", + help="Paths to file containing the\ + lookup run information provided by Illumina. The\ + file will be used to match sample IDs\ + with sample names", required=True) parser.add_argument('--ChIP_samps', type=str, nargs="+", help="Extracted read simulators for samples") parser.add_argument('--inp_samps', type=str, nargs="+", @@ -274,15 +430,18 @@ def mp_actual_reads(samplers, size, res, p=1): help="Number of bootstrap replicates to do, default is 5") parser.add_argument('--identity', action="store_true", help="Don't sample, just use the data as is") - parser.add_argument('-p', type=int, default=5, + parser.add_argument('-p', type=int, default=10, help="Number of processors to use, default is 5") parser.add_argument('-s', type=int, default=None, help="Seed for random number generation. Default is 1") - parser.add_argument('--resolution', type=float, default=1.0, + parser.add_argument('--resolution', type=int, default=1, help="resolution of data to map, default is 1bp") parser.add_argument('--save_summaries', type=float, default=None, help="Don't save all bootstrap replicates. Just save the summaries:\ minci, maxci, lev, mean, var. Specify the alpha level here") + parser.add_argument('--numba', action="store_true", + help="Adding this flag will enable jit compilation\ + on the cpu to speed up calculations.") args = parser.parse_args() logging.basicConfig(format='%(asctime)s %(message)s',level=logging.INFO) ## TO DO: @@ -307,21 +466,32 @@ def mp_actual_reads(samplers, size, res, p=1): samp_matches = [pat.search(s) for s in args.ChIP_samps] if None in samp_matches: print("Your sampler file names should begin with the sample id\ - Illumina gave you, i.e., Sample_NNNNN,\ - where N is a number. Exiting script.") + Illumina gave you, i.e., Sample_NNNNNN,\ + where N is any integer. Exiting script.") sys.exit() else: samp_strings = [match.group() for match in samp_matches] # grab the sample names from the control sampler files cont_matches = [pat.search(s) for s in args.ChIP_conts] if None in cont_matches: print("Your sampler file names should begin with the sample id\ - Illumina gave you, i.e., Sample_NNNNN,\ - where N is a number. Exiting script.") + Illumina gave you, i.e., Sample_NNNNNN,\ + where N is any integer. Exiting script.") sys.exit() else: cont_strings = [match.group() for match in cont_matches] # read in sample info from illumina to look up descriptions from sample ids - sample_info = pd.read_csv(args.sample_name_lut, header=18) + for i,sample_info_file_name in enumerate(args.sample_name_luts): + if sample_info_file_name.endswith('.csv'): + sample_info_tmp = (pd.read_csv(sample_info_file_name, header=18) >> + select(X.Sample_ID, X.Description)) + else: + sample_info_tmp = (pd.read_csv(sample_info_file_name, sep='\t') >> + mutate(Sample_ID = 'Sample_' + X.SampleID.astype(str)) >> + select(X.Sample_ID, X.Description)) + if i == 0: + sample_info = sample_info_tmp + else: + sample_info = sample_info.append(sample_info_tmp) samp_names = [] for samp_id in samp_strings: @@ -337,33 +507,51 @@ def mp_actual_reads(samplers, size, res, p=1): # cont_names = [os.path.basename(x).split(".")[0] for x in args.ChIP_conts] ## Read in all samplers - all_sim = [] - all_sim.extend(args.ChIP_samps) - all_sim.extend(args.inp_samps) - all_sim.extend(args.ChIP_conts) - all_sim.extend(args.inp_conts) + all_sims = [] + all_sims.extend(args.ChIP_samps) + all_sims.extend(args.inp_samps) + all_sims.extend(args.ChIP_conts) + all_sims.extend(args.inp_conts) if num_ext_samp != num_inp_samp or num_ext_cont != num_inp_cont: logging.error("Mismatch number of extracted and input samples Ext_samp: %s\ Inp_samp: %s, Ext_cont: %s, Inp_cont: %s"%(num_ext_samp, num_inp_samp, num_ext_cont, num_inp_cont)) - logging.info("Reading in samplers") - all_sim = [read_in_sampler(sampler) for sampler in all_sim] + all_sim = [] + for sampler in all_sims: + logging.info("Reading in sampler {}".format(sampler)) + all_sim.append(read_in_sampler(sampler)) + + # all_sim = [read_in_sampler(sampler) for sampler in all_sim] logging.info("Finished reading in samplers") ## sample reads for i in range(args.num_replicates): logging.info("Starting bootstrap replicate %s"%i) logging.info("Sampling reads %s"%i) - if args.identity: - arrays = mp_actual_reads(all_sim, array_size, args.resolution, args.p) - ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING -# arrays = [actual_reads(sampler, args.genome_size, args.resolution) for sampler in all_sim] + begin = time.time() + if args.numba: + if args.identity: + # arrays = mp_actual_reads(all_sim, array_size, args.resolution, args.p) + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + arrays = [actual_reads(sampler, args.genome_size, args.resolution) for sampler in all_sim] + else: + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + arrays = [sp_sample_reads(sampler, array_size, i, j, args.resolution, jit=args.numba) for j,sampler in enumerate(all_sim)] + # arrays = mp_sample_reads(all_sim, array_size, args.resolution, args.s+i+len(all_sim), args.p) else: - ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING - #arrays = [sample_reads(sampler, args.genome_size, args.resolution) for sampler in all_sim] - arrays = mp_sample_reads(all_sim, array_size, args.resolution, args.s+i+len(all_sim), args.p) + if args.identity: + arrays = mp_actual_reads(all_sim, array_size, args.resolution, args.p) + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + # arrays = [actual_reads(sampler, args.genome_size, args.resolution) for sampler in all_sim] + else: + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + # arrays = [sp_sample_reads(sampler, array_size, i, j, args.resolution, cu=args.numba) for j,sampler in enumerate(all_sim)] + arrays = mp_sample_reads(all_sim, array_size, args.resolution, args.s+i+len(all_sim), args.p) + finish = time.time() + logging.info("Sampling bootstrap replicate {} took {} seconds".format(i, finish-begin)) ## Calculate ratios logging.info("Calculating Ratios %s"%i) + begin = time.time() for j, (ext_ind, inp_ind) in enumerate(zip(range(0, num_ext_samp), range(num_ext_samp, num_ext_samp+num_inp_samp))): @@ -373,6 +561,8 @@ def mp_actual_reads(samplers, size, res, p=1): range(num_ext_samp + num_inp_samp + num_ext_cont, num_ext_samp + num_inp_samp + num_ext_cont + num_inp_cont))): cont_final[:,i,j]=log2_ratio(arrays[ext_ind], arrays[inp_ind]) + finish = time.time() + logging.info("Calculating Ratios for replicate {} took {} seconds".format(i, finish-begin)) # Write out final output @@ -393,6 +583,7 @@ def mp_actual_reads(samplers, size, res, p=1): # ALL OF THIS IS HARDCODED QUICK CODING. Saves a lot of output files to be # used downstream logging.info("Calculating Summary Stats for finite values") + begin = time.time() # save sample summaries for j, name in enumerate(samp_names): these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, samp_final[:,:,j]) @@ -430,6 +621,8 @@ def mp_actual_reads(samplers, size, res, p=1): np.save(args.out_prefix+"_%s_Sub_%s_mad"%(samp_name,cont_name), these_stats[:,6]) np.save(args.out_prefix+"_%s_Sub_%s_num_inf"%(samp_name,cont_name), these_stats[:,7]) np.save(args.out_prefix+"_%s_Sub_%s_num_nan"%(samp_name,cont_name), these_stats[:,8]) + finish = time.time() + logging.info("Calculating Summary Stats took {} seconds".format(finish-begin)) # logging.info("Calculating Summary Stats for nans as zeros") # # save sample summaries From 4550ae83af281831fcfb41c41ad5a2df56686d2c Mon Sep 17 00:00:00 2001 From: jwschroeder3 Date: Mon, 20 May 2019 16:20:28 -0500 Subject: [PATCH 04/13] Upated README.md --- ChIP_analysis/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ChIP_analysis/README.md b/ChIP_analysis/README.md index ef71fb7..338a010 100755 --- a/ChIP_analysis/README.md +++ b/ChIP_analysis/README.md @@ -83,7 +83,7 @@ arg7 and arg8 are samplers for the KO input samples Note that arg1 and arg3 are paired, arg2 and arg4 are paired etc. ```bash -python3 bootstrapped_chip_no_consolidation.py --sample_name_luts run_info --genome_size 4215607 --out_prefix arg0 --ChIP_samps arg1 arg2 --inp_samps arg3 arg4 --ChIP_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1 --identity -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0.log +python3 bootstrapped_chip_no_consolidation.py --numba --sample_name_luts run_info --genome_size 4215607 --out_prefix arg0 --ChIP_samps arg1 arg2 --inp_samps arg3 arg4 --ChIP_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1 --identity -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0.log ``` ## Obtaining bootstrap replicate summary statistics ## @@ -101,7 +101,7 @@ arg7 and arg8 are samplers for the KO input samples Note that arg1 and arg3 are paired, arg2 and arg4 are paired etc. ```bash -python2.7 bootstrapped_chip_no_consolidation.py 4215607 arg0 --ext_samps arg1 arg2 --inp_samps arg3 arg4 --ext_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1000 -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0_1000_bootstrap.log +python3 bootstrapped_chip_no_consolidation.py --numba --sample_name_luts run_info --genome_size 4215607 --out_prefix arg0 --ChIP_samps arg1 arg2 --inp_samps arg3 arg4 --ChIP_conts arg5 arg6 --inp_conts arg7 arg8 --num_replicates 1000 -s 1234 -p 8 --save_summaries 0.05 --resolution 10 2> arg0_1000_bootstrap.log ``` ## Calculating IDR statistic ## From 5f08e3a1c6a8b67dc646694206e26950721d9f69 Mon Sep 17 00:00:00 2001 From: jwschroeder3 Date: Tue, 21 May 2019 10:41:42 -0500 Subject: [PATCH 05/13] Added jut support for bootstrap_samfile.py --- ChIP_analysis/bootstrap_sam_file.py | 46 ++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/ChIP_analysis/bootstrap_sam_file.py b/ChIP_analysis/bootstrap_sam_file.py index c5e3fd8..644c1c8 100755 --- a/ChIP_analysis/bootstrap_sam_file.py +++ b/ChIP_analysis/bootstrap_sam_file.py @@ -42,6 +42,7 @@ from math import floor, ceil # Wildely available import numpy as np +import numba # custom module import sam_utils @@ -310,8 +311,6 @@ def map_read(array, read, res=1.0): def sample(read_sampler, n, array, res=1.0, prng = np.random.RandomState()): """ Sample reads with replacement from a sampler and map them to an array - Modifies array in place - Args: read_sampler (class ReadSampler): object holding reads to sample n (int): number of reads to sample @@ -320,9 +319,35 @@ def sample(read_sampler, n, array, res=1.0, prng = np.random.RandomState()): prng (optional, np.RandomState): random state to pull random numbers from """ - for read in read_sampler.pull_reads(n, prng): - map_read(array, read, res) + sampled_reads = read_sampler.pull_reads(n, prng) + numba_sum_coverage_from_sample(sampled_reads, array, res) + # return(array) + # for read in read_sampler.sampled_reads(n, prng): + # map_read(array, read, res) + +@numba.jit(nopython=True, parallel=True) +def numba_sum_coverage_from_sample(sampled_reads, array, res): + """ Map sampled reads to an array + Args: + reads (): sampled read (start,end) positions + array (): array to be populated with coverage + res (float): resolution the array is in + """ + for i in range(sampled_reads.shape[0]): + read = sampled_reads[i,:] + # read = sampled_reads[pos,:] + start, stop = read + start = float(start) + stop = float(stop) + res = float(res) + x = int(ceil(start/res)) + y = int(ceil(stop/res)) + # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION + #array[start:stop] +=1 + for idx in range(x,y): + array[idx] += 1 + # return(array) if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -364,11 +389,14 @@ def sample(read_sampler, n, array, res=1.0, prng = np.random.RandomState()): help="psuedo-random number generator seed, default=1234") args = parser.parse_args() - + logging.basicConfig(format='%(asctime)s %(message)s',level=logging.INFO) if args.command == "parse": + logging.info("Preparing sampler object {}".format(args.outpre + ".npy")) if args.samfile == "-": + logging.info("Reading stream from stdin") f = sys.stdin else: + logging.info("Reading file {}".format(args.samfile)) f = open(args.samfile, mode="r") if args.paired: sampler = create_read_list_paired(f) @@ -377,9 +405,12 @@ def sample(read_sampler, n, array, res=1.0, prng = np.random.RandomState()): f.close() sampler.sort_reads() sampler.save_data(args.outpre) + elif args.command == "sample": + logging.info("Sampling from file {}".format(args.outpre + ".npy")) + logging.info("Using seed {}".format(args.seed)) prng = np.random.RandomState(args.seed) - array = np.zeros((int(floor(args.array_size/args.resolution)), args.num_samples)) + array = np.zeros((int(floor(args.array_size/args.resolution)), args.num_samples), dtype=np.int32) sampler = ReadSampler() sampler.load_data(args.samplerfile) if args.identity: @@ -387,10 +418,11 @@ def sample(read_sampler, n, array, res=1.0, prng = np.random.RandomState()): map_read(array, read, args.resolution) np.save(args.outpre, array) else: - for i in xrange(args.num_samples): + for i in range(args.num_samples): if args.num_reads: num_reads = args.num_reads else: num_reads = sampler.total + # array = sample(sampler, num_reads, array[:,i], args.resolution, prng) sample(sampler, num_reads, array[:,i], args.resolution, prng) np.save(args.outpre, array) From 4096e9b8c792f2b4646696de969780b2675ba983 Mon Sep 17 00:00:00 2001 From: jwschroeder3 Date: Tue, 21 May 2019 15:41:44 -0500 Subject: [PATCH 06/13] Updates to bootstrap_sam_file.py to time sampling --- ChIP_analysis/bootstrap_sam_file.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ChIP_analysis/bootstrap_sam_file.py b/ChIP_analysis/bootstrap_sam_file.py index 644c1c8..fce9a5a 100755 --- a/ChIP_analysis/bootstrap_sam_file.py +++ b/ChIP_analysis/bootstrap_sam_file.py @@ -40,6 +40,7 @@ import random import logging from math import floor, ceil +import time # Wildely available import numpy as np import numba @@ -419,10 +420,13 @@ def numba_sum_coverage_from_sample(sampled_reads, array, res): np.save(args.outpre, array) else: for i in range(args.num_samples): + begin = time.time() if args.num_reads: num_reads = args.num_reads else: num_reads = sampler.total # array = sample(sampler, num_reads, array[:,i], args.resolution, prng) sample(sampler, num_reads, array[:,i], args.resolution, prng) + finish = time.time() + logging.info("Sample {} took {} seconds".format(i, finish-begin)) np.save(args.outpre, array) From 6d51330b0f0cc05574514c0ca67160c1c2886301 Mon Sep 17 00:00:00 2001 From: jwschroeder3 Date: Tue, 21 May 2019 16:40:15 -0500 Subject: [PATCH 07/13] Updated bootstrap_sam_file.py to accept reference genome --- ChIP_analysis/bootstrap_sam_file.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ChIP_analysis/bootstrap_sam_file.py b/ChIP_analysis/bootstrap_sam_file.py index fce9a5a..da35e6f 100755 --- a/ChIP_analysis/bootstrap_sam_file.py +++ b/ChIP_analysis/bootstrap_sam_file.py @@ -44,6 +44,7 @@ # Wildely available import numpy as np import numba +from Bio import SeqIO # custom module import sam_utils @@ -375,7 +376,9 @@ def numba_sum_coverage_from_sample(sampled_reads, array, res): on the samfile of interest") sample_parser.add_argument('outpre', help="output file to np.save the\ numpy array that is created") - sample_parser.add_argument('array_size',type=int, help="length of genome") + sample_parser.add_argument('--reference_genome', type=str, help="fasta\ + file containing reference genome. Will be used to infer genome size") + # sample_parser.add_argument('array_size',type=int, help="length of genome") sample_parser.add_argument('--num_samples', type=int, default=1, help="number of full samples to pull from the sampler, default is 1") sample_parser.add_argument('--num_reads', type=int, default=None, @@ -408,10 +411,12 @@ def numba_sum_coverage_from_sample(sampled_reads, array, res): sampler.save_data(args.outpre) elif args.command == "sample": + array_size = len(SeqIO.read(args.reference_genome, 'fasta')) + logging.info("Sampling from file {}".format(args.outpre + ".npy")) logging.info("Using seed {}".format(args.seed)) prng = np.random.RandomState(args.seed) - array = np.zeros((int(floor(args.array_size/args.resolution)), args.num_samples), dtype=np.int32) + array = np.zeros((int(floor(array_size/args.resolution)), args.num_samples), dtype=np.int32) sampler = ReadSampler() sampler.load_data(args.samplerfile) if args.identity: From d102b82e51bbdc2995868a2aa867c8f8df277b44 Mon Sep 17 00:00:00 2001 From: jwschroeder3 Date: Wed, 22 May 2019 13:50:34 -0500 Subject: [PATCH 08/13] Updates to bootstrap_sam_file.py --- ChIP_analysis/bootstrap_sam_file.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/ChIP_analysis/bootstrap_sam_file.py b/ChIP_analysis/bootstrap_sam_file.py index da35e6f..5bef813 100755 --- a/ChIP_analysis/bootstrap_sam_file.py +++ b/ChIP_analysis/bootstrap_sam_file.py @@ -294,7 +294,7 @@ def create_read_list_paired(samfile): logging.error("Skipping pair %s"%err) return read_sampler -def map_read(array, read, res=1.0): +def map_reads(array, reads, res=1.0): """ Take in a [start, stop] read and map it to an array. Read must always be in single basepair coordinates [0,end). Array can @@ -307,8 +307,9 @@ def map_read(array, read, res=1.0): res (optional, float): resolution the numpy array is in """ - start, stop = read - array[int(ceil(start/res)):int(ceil(stop/res))] += 1 + numba_sum_coverage_from_sample(reads, array, res) + # start, stop = reads + # array[int(ceil(start/res)):int(ceil(stop/res))] += 1 def sample(read_sampler, n, array, res=1.0, prng = np.random.RandomState()): """ Sample reads with replacement from a sampler and map them to an array @@ -347,10 +348,18 @@ def numba_sum_coverage_from_sample(sampled_reads, array, res): y = int(ceil(stop/res)) # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION #array[start:stop] +=1 + # array[x:y] += 1 + # sum_range_vectorized(x,y,array) for idx in range(x,y): array[idx] += 1 # return(array) +# @numba.guvectorize(["void(int32, int32, int32[:])"], '(), (), (n)') +# def sum_range_vectorized(x,y,array): +# for idx in range(x,y): +# array[idx] += 1 + + if __name__ == "__main__": parser = argparse.ArgumentParser() subparsers = parser.add_subparsers(help='commands', dest='command') @@ -412,6 +421,7 @@ def numba_sum_coverage_from_sample(sampled_reads, array, res): elif args.command == "sample": array_size = len(SeqIO.read(args.reference_genome, 'fasta')) + logging.info("Inferred genome size is {}".format(array_size)) logging.info("Sampling from file {}".format(args.outpre + ".npy")) logging.info("Using seed {}".format(args.seed)) @@ -420,8 +430,7 @@ def numba_sum_coverage_from_sample(sampled_reads, array, res): sampler = ReadSampler() sampler.load_data(args.samplerfile) if args.identity: - for read in sampler.reads: - map_read(array, read, args.resolution) + map_reads(array, sampler.reads, args.resolution) np.save(args.outpre, array) else: for i in range(args.num_samples): From e34287b8fe8b9d7e834f8337f43ff9416521b8bf Mon Sep 17 00:00:00 2001 From: bwanderson3 Date: Wed, 22 May 2019 14:58:32 -0500 Subject: [PATCH 09/13] Updated bootstrap_sam_file.py to do summary statistics --- .../bootstrap_helper_funcs.cpython-37.pyc | Bin 0 -> 2012 bytes .../bootstrap_sam_file.cpython-37.pyc | Bin 0 -> 12256 bytes .../__pycache__/sam_utils.cpython-37.pyc | Bin 0 -> 55011 bytes ChIP_analysis/bootstrap_helper_funcs.py | 39 ++++++---- ChIP_analysis/bootstrap_sam_file.py | 71 ++++++++++++++++++ .../bootstrapped_chip_no_consolidation.py | 4 +- 6 files changed, 97 insertions(+), 17 deletions(-) create mode 100644 ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-37.pyc create mode 100644 ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-37.pyc create mode 100644 ChIP_analysis/__pycache__/sam_utils.cpython-37.pyc diff --git a/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-37.pyc b/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..795f36229d89fd62fc4b8f9cee5e6d6c07d0cce6 GIT binary patch literal 2012 zcmbUi%Whjmbml%B`zCQ*lDbK$s*!-GuGG3|L84Ivlq>+LLa1eRRcSQ7GmdX^Uz)j2 z?Z{rW?Ad}?B+E$r02{u6^=uJB`2|*-89(BZE-J>Fne#s9p2zsh)ztvOdVK!Zi(eZE z{b`WdssQi>Y;_kS#1Y2{3bDPN(1F`Y+|&!b)DL}(_!{@P{~CuCUg1lxQMe?kyn5vP zh~>7`sf9IBI}CVj3x&%T3IJLW_;gh)pW(1B>Kt?YgEKu**9HFCf2cJ}ZEz$Spm|L+ z&hQ%ALt;%dHWApsr|*dhuZjAZ8?IYy39xm*HUPUWR`~J_B-ZCXoGoUf zUtN%`b>Eb&FYtmHkK^a4)wnPj{I=z2FA{O0{ce)y^6#I2GVA>F$T4hKCckFy!&bk; zQKD1Gz!@Gp068NBkUMfm4tMqp&$Se9J zAJ8b|dv+uUBb_`iRVi6IGgKSPESwHOvN%HQc2i3Y%PKq8QcOc`e1WjhqZZ1jL6GX))!YE!1!}xtB z%D&7N0#W@mWpYSW$x8J`Ku>1zfUya8hYZN0c@)On+gFh*@?e*wf@SY0y#vy5Mx&Up za`HXzsQ3^R+2DUDJ~Hv~cpBk_2c0RQu;{`x?uCV1fLcr1$qMbC2$?JGnNQN5%4Mlt zD6cdkfJ4e#746$n!e#2AYk9wz}XMy*w4|mn=I-Sf{OI z)PDG2=TrJr7IeS&&3A2cP?oV|sAAQgHy!PXq!5yJ`&p#!7ejq*R&+Y+y$`^?YQlk5 z103Kg?A!Q5w~n`+rc=ezup;xdX8Ev9BKBdcXK;+r3vUdQI(Ek%%m6;Zl8juqeM!tK zU}?Vm*wQ?>VL~ho$*(O9oBKJ;^&v=eGe7RhZfjbPk zn3X+g`rP)f?SZ+QjHQB7?W!TPJCg@sI?lRrB6P*%R|@UHv%^C_&cKXzUcx@gbv0Jq zIEzc68>}djA!SLP9Z)uimE1JZx+Y3bv17524%7Y8=A^op=DeSX&u#-rZNpK8W^K4Z ZtGYgE+rSBM6E~d((ChHK1u43P{{`Z@_e}r* literal 0 HcmV?d00001 diff --git a/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-37.pyc b/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37e798b97a16b7210d59a500bfd27a3b737626a3 GIT binary patch literal 12256 zcmd5?OKcohcCD(eZZ?|~MbWf=mgVwC7TYFUrsUtqvNIZ48auIRg_1{3+moJR*K4w= z?ylCWYDsKR2{5t&XJKGgvzj2(GFfI3WSb!CEGAh5g#=j)g20;~z-$5|KyuD~)zw8x z)MSi7k|Mk6Rn>dleHk9TO;g7r*Eq9Lox^%uEly$uBCbj*FtBaTdtS8ll958ex*Jo@2Bh2-?sucD1L4Q zMRnkQp*|Cof{D+q`oW+aOyW8lRDvm74+Yb~0bJ*T?*s>f+0X6z6Tys{+AQwOKd@R( za449=vnSQz;E9{gPaOUCs<3n9fmJ`Mj&2?c=7T4L!(Tb| zEJ#bft>XTr;N@WPD;Ko*WC>54pq6`b2EDzK_Qo2^(&lWg#yLEBHFzz3a&*VT44+lc zZaxYK85mW)cm=cQ>X6R^@Uzp9Wo_# zYNz_Z!sr%uUP`YoqxRwh+qQcDh|+VxT)MWh?s&5Fz^>QS)Xo|8j4I>)75tW;vhLgW zZIsTo9K1iLX6{?{1yFqz&t4-nTwe!|w(p))3!87eYyEJ({-%0Dol|eJ{e7(VPn=uV zZ&;xG9Xwf7=N^DRy-)7Gr_SU4?0p+;&ez{U`2y(tjMw^wUJ2CRu3u13ss(jHox|!n zUpVTmmUE0fF00!8qEP;FFXh0wW>wFBO?j_6NjYfWZb$8~m(28ayB9ZFgI+Vn{R&(cyNAH-;nkeI_)T~o|kb%-a-`nI$rdIOgzAFBQI=?MsX?Z z-BvnA_ZWC@(CufPr(+mhe69U0@6LwmnQ66q>shyP82cS>0TV#4AhM=()&-{J#T&}w zn6d`4FreTqtc77m`8~9|t_R9%n}Kb4tx#wEEP7Y`PNcjLbo8Beq=ZfrOD+emWd0@m zqQf|?TiaFxRuX4BaBy}43oI+FXjZ*aEhI(ZK~j`~C*{#_s&?W=s?+*YOHZNZH9SKyluJGrW@FApi#pWT`CNJ>3yi;jZpaN2ardvr|t;X$!T)LNT*K4iF|&)HG1f^ zBQkEZFkUwd)FNB;Ac-O|k;j#)$wHE%znjA%{wjt(jC=B09e>?h^P4x>KzNGIGoiSs z*K?xG@nFOacHk|D+)}tOhg|~A*BOPc$={V~QBUHU985>rh{Fb@Pt%PiCBNT?D(Et9 zs*a}9t(Qj~3ERzSnw85~IT^ua+(m3+RZ4c*p5DW~QME^N?UYY z*)=(0SG1R3lOsk%*U{K0+awt_d@`?WS*=|*#6=U)wi77C)+X%7CLBoMd}VA%0hXi? zxZqZC_cyExWz$WHFq^Nx3e!^z;;p{Aj`=4} zub)i#kr0q2kQJ%gBCdNUJ)4trY2kD3oQ+kOvxg`0Gv8Zv*8=P^qA^7lFhYzd&v526 zqQl>L{M3cRyXTDMZqt2dJ|Ua@l@%CcH&f2X%u=LbeLOK};8XidjN~#!ETBoo*jI2v zMU2K~)TSAwtJriOIQPKl&Ehs@O5j4+A~XKG+>DLUgtd#{zMz=2KgPj7S}Sj~JDriq z%#15!WS3>7+v|m~=XXeME2q>C>s4RVZT9-LRhZbYdo2be@WQoC)ui*)@(_rEPtnZ) zZZq#nvy~TDxKb?{RXd7>NlK=1GQlf^QcR>sU$r2Dm$>Yl0vo8id!=x)Kj?JCR(8oH zv6W@2-s~Rb%jz}B8y@6O5kFDB3n=X2kwZ1aGq^h_W8q#Qc6W-~g$MRMsN`m8yAa?n za8=ynE;tkUw=?vTT3yO`2QsfI@ncBwt+dbksMpzog5FkRr$ps8Qsa+c zw+D3YduzbtddnbZWAMklKdiiPe60;=S4TlteUA{)Gxt_=ma0>4b0` zWRkH%a46u~C}c$VBo=+o5{2OX9o>d$_j_AjCW!>ue(c>*h$Z`4-EN122oenZo>Cc^ zmFv5$zL6*n#K5d#wBh$v9vN#+?jiYN3z4rs$*&tc@-C2-)mo{}FbL}TUDaqLl|}=> z#Gu3Sbfa-=;CIqD6OBd?HX99nfSn%V=?R{MqXGsAr1f(=o#N>5xdSOpxP#Rf;el@FLBTdSuQ}8Ua`sxh`u%77e%&m1TOOFUmIRyymSCtn`u6Kz@cugXVX2<>FGL zZW+3Z7#c+I-Ia@JKMU=es(Fl;m#(Q>)fD3FwvZ$G_}#U@f9EgI_Luls!y_`T+65ZIPV^26rUu#REY^OlC`!l*}aS2jY!(giM2xkDNw`$1X~(KSS$?G_C1br{YxXd3(W` zvGp5xk{LH5u}lr%fKxTN>FO7^Z7Re~3nfWIK>Tf{BkmP?pBQDqyARS@E=%$jUL&n_ zmi`0potuu{e2B(;j9eXRxTxW_oWn?C{@7l(?zvE%0@RC&2CR+}e(32hw}}HE@s4Rv zsH?kEGU_EI;6>YtC$`=D_B}T)^BTCA)i?2EP#pCdSNObDFnw<$sUlo1`TtC{ygpK+ zoJJBA0Jyi;dtJnLfCQpi6hujVtoaf0ZDOCHULsWnN6?mc{62(vtZ(~>qO{trDtZ}u zk0jF|Zg-;EHNSgtGy()<@%Gg<+j zr+IefZPz|(pTvCy_tW-qTVF!W%n}kG5?%3O^chaEb>D(Ryt++?_})E7KoIngu94`F z(b{$ax@JSyuEfQi5_H_zoPY-6DYaa5)!vz89Cu9aZcV4|(Dgs1-mOxgjiUHn0nOC1 zk=ts=#%Yb5g^>!nq+CcY0+9z**I_Qb(Z+%i1Z13z-y6F}da;^!?Pj}A&E85SPShEh zdm);Wr^xuB-Kq*Tl_BSmKad%ckb?hH_|M^T;PgwPxM{O}JR7Sr%goJ(1x#y!O9V5( z`zlO;ac!(+G7V6R+=6gChHxs*oM4|_=O&X1J=Wip;jDP?kc)Sk2T2Y8lAD*YG(Q<2oaMIn{czs~$l>Xo(&(aEcpM4ckHQZm^E2fhd} z{$@e1#zlbq^nT=e$Ujdp!VPV&uV{y7MX7R6ClZNST|B!_DJP;2s=JrIZ{~?9U-r=>!fA7 zib`#TJQs#s$W}Q5P)K1ybHiYLafN^oU&Z=8&O-~08paW$Eb{$TwzY8JHR%@V~;h2XS?Ln zqkv+U6|l|-tRb?2sQ$5pu%M~ek@3^9DZja4W+USe5}e>;k1h}i*1u-V_HZ^c=|c8Y zv{IciM!d}Rqq)`Z!YJy?Jc*gcS0`!MO_Qmw;FJ^Gwp~(a<1_DC_{%W4k0!kwok9h7Jeo8w5PD4m;E<7&u6XhR8vhYu<-p&~CNUe5DMJ zgXnks=A&(usj{#Plh&`v4fByQ9OfbUJpy`T_5o>#wizF#Pu61;PG5t#L09rCRy8$1 z0}CdzGOZBNhhlO4&LVb!LtpkPR5nvu>HMA<>@Rv10)kNZ;YR_nttRIIgNriO#N~LB z0v01FN`?S=Y$Pl8ieedq5%};Tcx^Vu4wP+O2dyRv@?gZvvY-Wm`({enKC$>o|S$+nE;|!nRGYJtXP?)vJx1=Iv zCOqP0%8wBr&CA`;lHH9^ptsPJ?@w0o%#22Zrv~xLmmoCGW_)C^j5p)ZudzMx?M3I? z3BkzdnJtmA6z$#tW7G>h#dkK41GDwTjfc@g^G(V)oj)SF@0FkI$m&4yOvaJYg6%R| zKm9CEAMMw$Xct*1W=y{}u=Zrn2yCCn#&D@dlS-v!q8sg26`EGDnwA;f!eqX#CTGP?-V-9Fp zjL&Z*F92+lBgfV-EbN5VPax0;1EemFmY+su46rUXo2riti1DU5YBH)QMr^~N{q)jl zL3e%Cq|FLEeAJ_ANhut}z-_}1c!f7cxz)bQ$PSz~J0ma>xg%SKbncQ%m&gqBGK&=q z%S%hNzbZ&7V$~WkzB2OXz>rIf?wJ+I9h?iu%Gz;z$23i$BQYVrU%@rX#*P+d39@E6 zavy$t?K=Gdrm|!f6tl&IQil!*`_anfM?lz`cV(bKSuf^@aiL+A#npi3%17QH4~*n< z9gd&O6{hjV==-Qmj@l!zOAT^Y^5r`_k(%LIS>c=`NTn-JW+aM+2eo9%w9y>J@OvpM zWk}L^K)*=G1sI+?;OjVAw3VvxZ<5D^^barQY3wfydFQc529k=HtVV=A(BUjHE9*dx z=we;<@NpNFjh|hZDIZu`0;SZXMkY9Z+ZYcvY0;#Z8A|S1OpHt{9X`YFMfmNOj}Lb! zDa**-u2bPB(uby>DbtUzXZY#3gUmh~$^g591x=SV@7*zB=@2vGuyTjgqrIclwV)} zg0LHN9Sx_KmOvrJHNyqj8-RlZfKIK+U3?a1pi-2r-`mT>(h{~5RM7r8(oVzWel$=) zxRe_Dbk)E{z!V@BY?r7Y*ZAzYSHH2B5?gZvoZqj(8Zv)heTREctpQTB*n`qc>+1qp zRQC0{h$fP4TaiA_7XpG^BuqCnGhc!~k|Mh9Z^?$NU})Gb*{u_NEg3w?nMpcA5UfBA zd@w2=>KYxCcme$~yPbhgN9fy&uTm{^Qj#&~WQJcUMT0eS8`Uf0GDI5h%!~T8T+_Q` znr|=>p;$j)n3}%S{3t4^nEI#_B`(kU_c=5A2%;!hY1#ElT&Ae%7>$@u=4~k5NwL`p zF-+NeJH>`;Y>3Y$RUnQqDGrqmBi-WT%9!iPR8B&S^OUVS#d_tDjGB*~#FswL7p1%4 z_PR>!c72lVMcB-D3u9lK?fE6;3ubTe9gZye5~o>-Fc@XbI%TI^o{h^#$}iyeQe}4j z5ObU}1@8Hl9Q;_OEHv+o*n{dSj~dp9w+>wLoQw)5{M?EE{)x!t(xv3qgV zi>oJawZrbiRUfYQ*vX5D;r_d*Wn$Ry`&QgirB+;Att`|^)yj1Jh3ALL&JCwrv6h`- z&wPH!^6-S`$8fRic%HR{i{Yf7U97sxR?W}I`wJcZ->Q>j^OOkN(&I$rIVoSi&* z(yXr5%oFC4TU}jo+@ZmV)01b;o;yD{Vjdnb2iFFN^^0hndJPp~E>%RKP~Zop)6`?h zfPzrO;Atk4K_NqfXLxy*$vJ(GpTc(}Q6KZW7fa>x4XeEB45$1w+v0cY7WqB((n_&h ztt_D#!)ZUqC)Rw~@r{a`LGSu$%U$w*CE@ns;=N-pU8ycRFWs~%OJ!^RCC^=WX?%3- zW5w6pmEz=;3vax{q-a&F@-46Az2sTT#noD=>=jmS`CUP;9Ob}q1QWe|hTDhZg25~N z@z{kQ$J#rF#5U}towC!Mj&|4Avp*7y_Il!n6N`qOv%BBTH4P4$c}0 zjC)W0XD6%6D`kh{Wm;xsb)oE3YbDz;J?HwWQ(18IU@YhthXy9j&rS}gt{a@4oH#$Y z!CTX_(Oc6K8@+WX_ST``E#H{+jY;1)?;8`oG5wx#XgKTcMr++&O!}GZWby=)JxFv1 zC2_&7;}efhmeAPCz3ax85)I=(;wO{W(-+lH$#`KEB@3f|dfxIJEXHoOvuKwVYVy?G zfdbykA1{}@8lRk;VX6E?Hc@{j>XX88EDNjboy1A_s*i=`r+Mk8uU1PHl%5N_RejoU z_n{>DhM$iLJHu}Xn9J`@=90UUdBgn>?k(V_|JcM+_<28r#7WE)mf2tc;N|TeJn6D~?HxGg>^^%Zj@@>@y$i>@z1w~Q#~yo+{UnaP z_Fj7*jyvrA_5mFG?5FIfaolMiv_FJnzx@&W!?uZU?y{e?2XJS%-G%w}yx)7#TAnDE zmMWZK^)H%}Wy|wSF6=ASvR$eyVb*y#lpV8NsyG50s*AXJabgM(sO%KX39dZzW(iPe z*{NNr+MdidK&WM_Qd(IpTQ$eDDz@pY)m$qCVFXNO?FycRUoS|u_l)i)Fv;~h$!_$K zG>qGcI|(e2B$h~uAo2j_PBB^IdGY|35U$cVPs#m^+)wkqo!rkivvMaByxD~}v+`yR zXI*mEEpOyvdGd0nJIK?6H}mplFV1@8Y)2z;yAM|wtRF-z?ywd+ZQL#POS((a-I6{b z=^ja+lytAX14GegeLI^-1pQyClxpagy5lN1?h(K?4}|m|hQV5ndXT*=ok;lW)vB)J znU1w^MfLs-$GwFibIP`9c_?i`hT5vt+|vAN%_%5Uz^oT4D*)zdzz?Ay*PO9#%BM0-DK8e@oSv9EYpT4{z>+5B3}r~V|*xz{Gno5%y? zW}Yt9Zk9a9Tr68l%*%Xups*>?R+uw7N*Oa7es|Gv2`+&sG4ZM%Fu+D-yz%-)l&b16 zdWc}hHp|Xp%|tdN*2)T(1uC^za&dz%Y*f_L#2ashMTHr!uVQ3tw>HW$dExxTOwe-+ zRbc5-g`=-qZdNP04zlK!tQEE3qORas&>d&ro(&2kv7)-Ny(@GVuUI8_qnf=jtqSA# zu#eCKRg9?{4vx85=GoWIPKP~FSzVq-HsojD0oo80NDl*MxTU2lHBR@1xG^79i;{?^yG-bTen3fP&hsl@joofMGyxH=0kR-~?2k zFb8TV=gq=cfybG`_-NrU{!fLMM++Y>OiWA^hK7a;(^JPv$3_QqBUy?px)WUY8bb`R z$O^-BZOQv;V-_y!&3lK#Cb38}EVDJGmLPXXxoY0ectBQwEIVTAoOhtJQ**zf@Wl%Eke<)_(KehNL| zr`blof5mb=r-&t56eQ^RDYVD$T_`PCZn3rsD%0GqKu?V{qM{GRX{z{TGYUaVYk z9oz2>FF{#){;u%KbprSry|BDm^T3y+36n{1RvbTtB{bK?Q_FP~B}?W+k(#+;krY6& z_&MD2hL9x2`ZD>QeW`qEcdD;HpUS6qr~8t5{M(bt8qvS{lkwGB2+LXEQ1FkP&fXV_ z>XSIBC2B@3i4^~A5v(24!WZ?j%2(TF@tU;+Ldi3) zIk#{G1-npnT`=M0TR5wLo~k(=s2O2X!GI6GIw+aI@`mNzT0zfPE~t<&J23+%d}XNv z5+@m6sorG6S3$MSJ7y_l20@b`C+3cYs;Pc77gvEyqGkt3gut19?ReoxhzK4wUa&yI zoDg(UXkCOu1So=fM$Cave(KmFkPyyZojo?E&fc6kHo-GJW`TG?HL6oOFmd|Wt19#4 znPaDMc3H8)VJ`r8Izy@s2o6E}+;oOmtY>1yGO$ugL-tsfDa9UuY`*D=|hxmCEvLrs7 zGxEtaz~1iEo@D()O~9^oYE3T?p8eQIZ8tHW6YL}gUC6P=NyK1>c<{gR)yR0QN**q5rED0$95Fr!;*C#}sVrsxi z2!BP$2JYlK#CDTc90(7apmM~UDWFK8MDqHGIkZ|KQ{&jf0_Vofp%u3Zks-iifDDJt z`BKe`xI_ic@w8mMXTHE=3KB`SZ-BYSo5>Q!gVx^LD0{ zFBRD=5Bz~T)~;9(o&oB*keQW$y4fKBrYqo&DOUsd)Q!}IcY-W01aQQV4PgxKg=3PbN01- zUr6Fo2Xqv6$;lJJbU}F)X9;rT8{jNAE3UP7!jd4C_RQl!X=j{8YqgBw@Pr#0F>hWe zL3VKyl3!p!kveb+&qu6IR7`Nj=9&UVqB$2ly5vFN?On#Cm3jl8m~V?~XegFF)*&$?1r5-YwV|7d}=MV+uTbTu9$O~6JK0yY~2Iib@~ zv}jJ4myX1+wzRaV)}69PSYA-LexJPoFe2cit@%=rVnWSvokg>!Ej5fm7gTXPR>d}| zAzpCAE8Tm+s_nXF(wJqelM)yf1>p2EU3l0Yvtf?mH-7Zx_IEaK*@!u|W#hIkVT4mI zfX3{vqc6t_-C$6|y5CQYxA8q)Y|xRE;6VsIUj+=a1QIC#qTg%n)UC&oQ0TLZ^>DZQ67u?e;an+l>3}_7a%%qGWx!7G<9gw#hweCYq<}m9 z{HLiFJnOnu*U#6gkee4-2Pll7?F`A%yw2~a{ItB85g0tzCGS{eKTRMp*T-i? zFlvfQX`xH~zUFSqK;q$%C{CONd1~a1w2@98l(S?~&eJ*Y>A6%de*I7y<&2!sm#jbC zM8YK5D9Y4q7tlj?@gQ8F8K5o7e}f#}|!C+Rk2t zatTyn&bpogRd^;KuQYlQT1^y;SEv^NDx*l5R8a|F6Z+yX131uIO{Afci3wR4Lp~wT z-pXu8IIs0d8`c|~9wZrnDiHrQeI{H+`Y5xQsFLI$6cxaPkUnrag!nU-A8hErM`L0x zUdPT&%XBBg*PrEV=n7!bkljaSk=gj>Ugu;i_Pt6TTaRl+P`s}BHP0ahjPuRmGe z+XR#J>sp9NPy|S#z|p61LLnl7&N~Jat;VUuIt7Z+oh0t08ixDHM$!e_>m9KnE^MUS zQ;no|%1&QQ)W8@Eu*%ia4U__a1)v2sb`4y66C1l>AGxr{f!q;-wPpz!;72qPp=4HZ zLQr)%HdxQ9yAaANrN;yH5%o`G^sZRsZdQbGipr|GCCnBvq4B^=g)}%prmH{@Mpd9t zMD&M%ktKsWIAjt-IaZ}00}g_J1lmdAzXbsTxkGV|8hDPtY81U0R|a4Soo=-Po0S?T za;%2ZUVz7uux?VZHWn!aGT_Rf+$3``VqPUC4{C+G;gobmrKLsPQBn5*g&`K6lfBylHw zs}B}JkoUrBDf3pMvNxTmrLX1P*Ng=JeKGNwL?!t)QR1f?8TWILKz|9xFEt>0zLxYV zI8tux3ffHCS)6}ADf(K*&I#Il4LQDo9A85T-^TIVLC#+ea{emc#yNBT220U7ch}Oi zi3Re`YL4OirLJA9Ya{a&!`-V&`%1!lPnGwTg!_M@67fcFQ<*23%X=-zWq6}OF2hxb zU+RHer<>XU)Nl8&HFhsh{|@WFLN&Yr>JMiUd4tzyE}XwG4Py>yg<+rr6WLXWML4w} zRgD?u0FCl>jj=e%EKqgw7Ljbz(}MXr#hGz!8^{E`G&t!zNcLb~D^?wx^Z~zO0iZ65 z0=NkSM=-gC2#8Yw7Zyz}K~x2_LnIwn=$Fz+5rslpVbfS7T4_0k~T#!NO;g;y4pkby3yvma58lj0eOD=BP5_jmVa$ zR^b_|Grm=IWcmjm-K1(!KZ4IyJx&E3rQV{F;UeG1*{tc_&QeW{;xE-_w}&Lwuv7q5wGg8t$LMAU<#TAS*_DhECK0V81_XEnDN z3jJ8m4Na~BX1^R6vFYaI3{rYTYRduRR=XS&y zsr%}lhgY5Z>z>Zd>^REp!=;Dg?kUjGcfWvSb}l(HHkXgk6RyY4)t?WMRH#pw85;pg zg}lWzqj&33J2m6KB(KS=pUTLjTZbqZTBHlodd zrOkSBZIl8)+8ff3!cML3fZuTNJK{5%%%oC)bxIcjYqTMy6bJuz1~njlJ+;Jf0&ZH0y+9w;Fg z^}R)F1c=6}5Zgj57Emuzm}s-4JYA63Q^XBSO=J{l0I42`n!&Az3{ja+Q2&D3&@{zy z!Eh|c%nPesAQ-V)t_4SvxNSo#^##KUb4rBb`a=Z7n`sDMX;xvto9+DMY~k7Go*x>1 zapVEoszbwAj1Nyt?{aDA6*wSH4Gv$9c(*Bd)Bti&!o1erAmNjgh+nM4kDM8w8oMwS zvliF*C;m;Sq{R5Y4U27Bxx=b*Bj(T%oShjQ#u)%3D~vOM$-xWytho{O9YdFtCg1 z^FE#p(FPDwlOGM<`9dRk8@^@@LqQK21s zf0N!{anMyG3ol@T2I?`$6^YdNlx~gW(jpEXFapyQ&yXq}y;iEMR#!b*3yAO10*oCX zg^ODZg<^?~!A9$o7_i{z6IKj?Wp<%uz)TsADn_6s>jvCpQ#~{|2&X>Qma3RpU07HJ zP!WsoAd`Mpsw|XOZ7{96>FNvZ#dYcftPm`Y6HMrP4AegOHk8fV(KMgN3a5`X>t@2L zYw#jVxe$~>mGvg21W++-DuE&-+Y5xz0!pBpp}?PhD;PACjy0^{E-qi0(v8~$bke&= zL(U^~?s=$yGqCXtTqT-7C+hyOIN%hPj#eKoouh20>@-hV4Hz}}9G^OT=IDKc&gQCZ z3SpaHZEeNa*v8F3d))ZC>)43t4?rUh6D7P1i>^%@A__ zJ#(R43S2WGmW4MmbPWJRO$(8w%za1*=IUalrA=#-|-t^N!bLRjo4Dd95ZH3$G2UI|$N^bXeLptuF>378ec z?`Wj=BtQ-%uYVLI0LTWgH3pd+IS0wWbL7n6m$lRDU28YQA)&IKYviEt$-2MLpea|n z0mpQdlMy$COmLNLblF+l?YdsZ+g%`0zSV#_NWCY=?Deud>qbex7rlMG3}=FTBez$~ z!>*T6H+%(z2%I2_VpbNmZn($#Q_xtm&50e1O{TS0?(F9Ab_kzAPrrP>nvAq5M$Cx4UW+P$igKQ5DVbx zNn$LC-L*zCFn*{>S+tRkl_ZY0iol{*m$yPoKV%MU@JtVU71|23>*_)Yss>wyIEc5R zmFt3|S}T`$$p#h>!3bZpIeQ+oqcnLXf7T^j3_8)DOjfQLYY| z>O5$y__yhvz)*^Mi{2unZBSAD!B!?uFg(>(ak?fPZJmn%Yy-aUF%d7QB0{~`Md^kZ z_D8fDMLdzj3m8v$JL{G_E{cYeWg~_$1e0n=H+m}$_%>_(p+kp+?N%LtD`CDj?t@1~ z4n4Xah&6x9wuhidPY{i(<0S0^pq?IG^CxH3(7!m+Ht3I!-g;Ec*JB@!dcf{25!Q0kLNBEX01OxX;CyT z5=w~5L~^0zqR)FE1W)kAlT1h+`dzNGQic_-`wDM;j0r{Ga0np9D3sw`AH=Vs`B#EE z_XeMkQk?6mI5#2vzCPE@69IbeIX=k9cUvjR0~-RqKO#Z-GIAzP^cuae%uhou-VaOt z-V}*TSjMJ6LgF8f{FjzvI@M3#`&9jf4GBxsF2XLFzN8ruv-V zKTuK@)F2RAX}}sTL|wIw_5R-lLKgHz2LVe=CjGbw5fva~Lt2E3?H=p?s0fJruK5sz z?&O8_RBZ11h)CBjv?B}1wH^QwaNkX6t^DId? zzpv2`btXhXllXZ=m=tNmb+SN52GCJ*%@l#eeyx9%IC~9fWkvh?OpEr_uwH|>pa~tJ z1|2x;nA{#>I=xuPSqCgk!~(}e3;!Q;+u@9-%EJ@s87w8EZoFrIF?nb6h2o5%GLY zexO=ULhCiS!<-^90dog&M#p&)ztlUT^?+$t;wt<+_t8Y+)6hbItHqrz?Rc`6@6%-C zcJ2<0zi0pk?l*;ib2Q-SzLQvFn7NGAjevn2$&ASSU_?{`5r(8B9lWG)mg-!S9hN3G z0TQ;sqjH-kUAPwT8K4{n2=yGPlXst3} zL0Bb5vM9s?1gmH9FQBVJr#D?x*e)4qTu8WKRaC4&oCe!2N#-g`qB#y~6dDF$ipkJ< z3>zO?2vL*C$HrZ2@0BbJY*%D7F%8C=eML7{_?##lDU6LjL{k+Rk0N@K@(8ch?Utr; zppP!a;?iigvLz>HcjR}$jAKB#%-P(NfD^5TE}e{f#1`S+yG$9z6g~jmHijN}pZylD zf>uFFyJ5{HlL$QmE1M5Yj|5VpXr|obeA4+ftZ40>j&$qh9R!)v2cd(&h&{*|vt8=8 zbKe}H%9%DmkG*FAM(z&=A>&wwKJDDzakiHdvd7+D01hx|?{6lAphJ{X^&#k;U|T%&2I1)8QhGO>cNuNTPaztm zvT8a?a(tEv^){ld5=RXo(cRDEUf`MTF7Ww=#CTsbZuxg4#i>!FZ$>`Zm+XgG6a6!? zh!xc$)oaeFrFx8?t$ou!3hf<;>4caobu?OnOb|%Ot&BFhZsR$ni;XPKQVo0)?;CPVGF?o| z)12K4HX;X0?Od&=kr$?-TXN&wMz`991EqBHowpE9=5}ub9_&md>#3X zOr43DUc>q+-~{NMp*%OhR+OSpz_6J?JPi`d>dnZg3A#Ypu%}>NEHQqcRdHaG4KH(T zE+sDIu>j5p8?H>dsfZ&bFXXq5rX3Ycrzg(z7+H*6L14S2QAlW^tsGsTI+KkhXzt{~ z821J|;qeXKrdE#N{>+*rs&&CU?^LkahGw$ZJ+PXsxvPlG1SW@jKZJd(s~axvfRvpT z_ndgGH}9JEa&%t->zYx8N&x$-z?sx7KWI>!idj8S$$$7E#V<%h^&nf8+>4ayc{d5GAdtwj;Yp8hpn~8I6 znHj~a-=Y&X=jWs&#y^LBJdQq^zQE+b6rsDFbO#7bHS~u#9S${5k=Q*Dq0R4Y+(n&~ z4VysC&AGb|0OV|*02>$qY~H|4xHDWsOrsWWPS zh;G^A2f{LME=MYiRO;LgO}EgvFUCUMVA(A;5l?m`cwT^3Lbl#aoTWh@O!}~A#GX{r z*b5`RJvhg|z^o7M?&*B87g~?rR6g4aw|Iu=g%j#PoboDk{LI~FQ6n83j0_k(N5rh+ zXK(`chk6TG_Hmm+IG=t&SfDkA)#7G_;#fL#omvlgfZ!gr9bL5~&eR5l*s>75Acnkm z1)C7UQ-qrku5*h*DFNsCO24|l-mUI|!Ajju!<(ZEY*ROwr`%#z%r*AHT!Z?PtTlsb zHtA0mq1hlA9B>V2M{U@!fXG7FI*vgMttK}VvOrcv zm!+qB&=S#MG4h;fEl?hqxiDNrxD^EMlGTC`8&O$N6I~d5^1=Ms;!Gm$|G{1o${FTu z?uE(70)h9w+E^kYBSXE4iqI6o9UKoKxR)gXuU?cuSBE4m3|xjwD}?sJ`&8G&`#e@a z>#1fDKf&D|22h?To1(C&+bt-Z4#HSMMsy@0= zhK<{09SUF3dM#G<6S{+28xp%_8FsWe#NoqHHQJRmVLq*^nGhzDQ${z5tJ#&wDfJ`Y z8wL9v!VZzk!USTUJBd2RsPY{c#TXGx8sJf|h=sQw*fxEt4~pm$Z)gxSy&i~5F&lzM z$o{CN9ez-hj%%x?4ZIi&d39-MkPO}oXj#|>q17SPO6q=`Ub6Z*`Jm`InW0yjo|7kP zem*wZ$&)kQ%~*}GZjQdix(F>T`~|<&UYF=O>)KwI=I2ruJ)#elMN{K8{;0+cqGQ$X znn@dN8W-|vGA^276R+nGDv;D8+!2JX=g0=*!f#2x3NOsTY5!LAoDAb;9@d&Q)Z?-R z-MMYf@F78k=A{#Z3Zbkh#CqFp(4!K7>2Zy({ri&@g-zT zjRX#QR@1x;!4i72$Uf&WdY7~>%!q%4foGEt@P7o9tpSSFsDVDkF9_xYp**aG?!bqd zH1ZE87D4Y~cl2BXfm*17P{Cu}zuKBhm8Nroe!kr$x#3HV+*#%pRY3+cZF7d~^YcOtky1ew zOMrP@8i*086rt3JOgj=^qOVdJFA}#@HFmP)`$bt-g5gUv_BciB5 z`6vk5k{8Ly!qTE645tta7Mr6Y$d=lai&P-%8Fx4&8k=`0O60c?Sn7`1!XkXCw8lYDGmM;M#4_n3?y37A zXve?D-+kyiNM~XEzMWo^0ch$q^gg=&<$K5;-xZvl9#lXEQWd4Ea+&}Ld@h)Leea$+ zS?1{ZN7~6kST@bkZT78DkxhLY_eco4Vjh6J!o4r=uS3qu-y`ji4vThsv!%lljWR0w zi8c}1!~t%)5|@RZ#$qk81S5ybz3ax8V0nBX@sr8x*h?lNeBgEUB;FLCVC;n>mee9X zWNb`#Ka!8b70P3&*+hN3iP&iaD23w$&t>lA|qWB#!8=w(j_Il92pB;wUyWgUmFSK4|Pl1fD!(tox1nGn@A&O8+qO;_$;f zJmF)B(z@n`V!OeDD!407tZL%W^abGeCJJ$E=~ z<;uAexmR}FTQIz9%yrpM+P(G;oaOAjcAvcyXWjNbyWif0v%J0E-fcgDvmUz#H0=SucWSkE z#;RH8Ty$Oi&&)||Tcv2FE7h{ZvI*G-AzDe)R%}pSHuAtAc^S43VJtljY?iS}M-`$m zlA)O7*m;9i0XhdnyF`s);8KRmQM>p>N?#Q`b>695wYZ7zp)1veEAx)K1nF&c!LhL? z0pj{y!1nfvkP54as%ZNYT^mr_UJYS*u)6|xH)^(GQBuIB6{roMM$q?hb0!<+OG`QM z9+0&|qTc;wngS&WKZ+?kkKXLD>Hq{mwMunmyu)VGbbHwmxvyAWK^P4a{CS7nB3pmO zgppc;?p+BtY{xbZT4o*zAf+J^--xp2V-Yncv01||5vH%8B5w{+ELq)9E;*v&_zcHn zlA+95f{W@NjR@08lH{PK<&*XzHnLT7hBT%$%(NGEHja9G-E^Loc(GvvboThz>YG|Z z+g!s|f|l19ewi<&x~M305u_E?U?npZTL>|07kOL=Ya`vMvqH=nCy3SxvDTjDmTsnA zTKdJwYI$|Jf+$aywlf$Q6A67WVqTYH^uY@RyeK6mQ}HuWr2AJeaRq2A@cm=F{4x^k zbR&iMSykAu;dd>l>M$f*a>btO))n@X_M-cv$ORO58i%=F^{T*FF9!4k6B3-hMr^~? zlX#l&yf0OMGTx_!B9KL?RxIk(7IOZR$l?*6)@X1D{vN(R8MguM99M{Ck)@d-!#6YF zDtY(@xN!Bt+avcXjkH^8j*-rm8@bqsh6 zVedF{{8?)afOr-$R2A15f{t)92uKrP29}O0D;)|N(_e!-DzfzGgHB}-=D5W{3ZuR##__@YlY40$S!Pda~vd@P!!Tsl5-(c3vx8s zU#1$$0Ne!7Mi(_|N_<%YyD++X0GpJTEiEbJ;* zm%t@0`CT_Hmjp^k@@|@~1@8GzbQq5n!a<>HKirqxi!eaZzjP{Z)SEC(rvcRf64XQ# zePj-9db_+cTJNb0t3EUh`O}oQ*MBWL1A2}7Uf4;HSx?pn0%hvL|ZJqo_ zSSA09Z`X&XVw6`Q>d-j5tnvsI{^0Pz%hWzfFJUopXZ>=UAR|Tf&`m5fLUnch0dLs ziAIXyM`a66?0$;vCA5Y?&82Xe2lDthPlC5eyXcY>>qP(&WIYM8WaWw#A?y&t%WZ)3 z#c~y74Rau%y6j4;p1>!-DGL*bnxY1}hEY-N&}2Y;b>Kds3HeiHgf5h>Zq>H%e+Y^M z^FKzIVpofcwmXC+eKZn1-yci~Aj>3I`)((9GvUJG>F#Cs6e~`)cF}%WUVkRt6KXl z&5AvngJu@~Ur#i_IQ7=_f`CAL6t72cjbboH4#ajAeGPD36-0Hu$_RoWJ{Xe$u>;d2 zJ{q_>0O%j+X3xld*dJ0CEBEIy(adWS#cIhi?=9uA-sYVW(A}# z!68U~h-fst5SOAL0DKs?I$8lC1Qg;~jb^9-g=#IX>ojjwSH(b(;i&mpuxR=tVNSrE z@KhHQ!Xja}wpGfds2Q}Ni}+7z3jiZ_4K8s957jFL^Gsk4*AnD6s!G$@TeNB0o$;sHPk@f~-4u)s!*w`UZMi_8lshiMc>N?4bEQ0!GHEUw5v--GWgC7Jg z!15`cSu6xIO1(^*w}?RwJ0X6IhNQIwRllJDRuD`ed`3{m;cb`JTpdi7BA3U|?1(uz z3;|wPhk)V^FaJVe!Jw;k+d_^xYTb6I>i0oXcjRXk8g8o__;nKu`-7<4wkYA-SR}+6#3I)+`U;lq z*NjsMg-Kw$9?PL|OV+x(Ic^Dpm_+zRt;lU*n6PyWVABKFWAwO0y$wjBkdBC$ z%;TXvT$b-iael*=aM*QuUg_Ctde?*q3WnGx5ay`Gt!!tMfT!xwS#~>^hZYkdZ~)E_ z&tAO)6SO8P!@<3nvo8Ur>MPwoT}1Np?L`0UM@9C_qoaJ&hQENMrCTY=K_7;pny;m2v)|LD_h(Fc_0rhcA8)EyTL@ zduTzX#diTO6tJC5P8E+G8HAOJJHXt3rQQz;pq*L>^V7;}n76hD_o@k#1ow#HlSn8g zXG1`*Ue`*2ZsgFc4fi?;jfR%e>&X%wcnw4;>UN)L7?Ch-?Tn}ll6Lk^@=W6L5pn3n zCT-%xix@@gjF*%W7u+;VjZ*75gg(t--)gzHFL7M$TupS5xB7u+E>xh3q&+)r+p&ik z?Gb2zCA)7?I6$FF2ybH`Gvp>V2bqfK*TNZCG>NQDykJGi7ed|*jzfX(C2R~(6t;dA zjR!pnVp_88h>z@&XgMrKREgRHb9V%1)y)%GA%hpO+Si<0(22yvZjO>7jhHO!(jWqv zTs{UZyaZZB23TECN`yPQJQfrPkIacfCubvG>@O}*kx5cy7l0>)u9I^{xjRNe43f^# zd#bC<6DQ0ujiK8N4&tw$pB%h=53kQosMklgSNi1nO<#{iHDEvkm!S=&m1|X;KqM&$ z`G^pMv`6xLp>YeBiRb<*v;HFA3HB4>`+kOkU(e5hm6slr?0%a58_yK#hY1yKfCRFhFYfVscTS{YD;PEYfI|Gn;4S>2yz@-s|+ zmdRNre}l;{G5I@8q*=elvtMWO51B+FAT0-SzlH~AL)lL*lTPM7l6xulbbcs*IA6%W zD9;!0j{YOTdkR1A4J47_DyLA<(emD5F$pW0ov|o3Yr-8^Fcp(QC7q;Ib;p_dNic zO_#Tl!!~8u93FJY5P+vm;LTd0=?lOVgl8LM5O0I}OEnn2(w~k#dE});od>N2v?WUl zKSP-nSkdEo{bj+>!9yB20|3FJ&*DT!4|7XWR}8T+C5X?|1GnBEW?8Lpdn?DTKVLz3 z_wX~ccINB4I{LL|E}nfeFg}J#2qR5(8ucoy9f~>4=Ya-h(-M^dbH_5C<);_n&bDEF z>QXz*^^MFfJG&0Q%j7cIh=aWcu_g?{siBn5Z&FyM911`0%DXJT*ObL?F3nu^8hCR zB|jyeDS-Qey$GVA38rs~p8|ywgWd##@8ypZvx|qx?-RukKEgre#pVU5O>$$VHacHZ zwNQqt7F}MXTK%k+ZQMrH?u3;~s>&U}JE*8c?+N^8{t0fi)Y1J7-1~#5a?-)m!@K;R zTD4}C;noKL=kHJ#Rm{_}wc@_Xa@aq9?_$|nDh3<7xu52(o$B3r%Z8Tr41!fC`Y z6MRU-QcJvp-I%~rz+&(!wohmzWus=eQ(sNWRdVeVLIPo{=-Zh)7*ln8hrG!P3_*nJ zELAplm_wg4CpMQ(fg3sYRw5{&D=0yZu@bssCFpZ0p-W0=yoKq06~T(G=A=eBTy^8B zTdo)#7ExTU=H)8CHi9qr@Jp!SGg8Bz28|Mot8(O9JVVX?=q;ApizkTpigx#|74U3F zkk^#FJA%A&l)N}Qp7@G!eZS=HL+(Dw-6y$sHs#(Kp%Dp?tEl0_XGs%spvkh+2n5JSl03I6Ye)Lh1`G2(jk zbXJUP4?;To6n_13gwZWyAREu%*l(C&il?Sg-`m`QfP+$Glk8S6pD*AO4FI|z1{d}c z5j&WJ*hMpi=mK0EASyGo>r34Rk$cB@1%h9(`%BtseIK3(vHKZ)A3h1O`&oS--U;fy z-GwQTv-Saoo5~phs6Fl*C$yk)M3003oCh)DMGr0EJG}gf;)G`o-vjQ1^#w4j+zyBb z!orxWpaWI6K~Ta`6(_s^;<7eV7(ZJ0xa`g_Ss3D#uC1SK?wN+=@1 zdA%XF2a5W5OpkWD5QYX@v8~N?A*}W9F~g+F5bbf%#z(h&VRq9mD6~sIS4C9_ebLWL zk{h%v`ypP??NgQ^9EHg4hbu14ZV`tehUdH9g1G#hExq{XTzGvbEAa$3y3Hl`!&{U7 zn$3ezXvubN0gR;lab*#G3Ud06ASVzKV#@xGfq>g7_h1dS#XPU%aE4U|3tO1D2geNF z#hSYVPa(OFFynIU3XVDT9Pgy|C-ht0d@A5qP#n;Pxt>9(d3hVLKd^p*=6Jj#-@bkW zD2?D+5Sf0zr|JE!*!#akaBcTq%ejA0`54Z2z>DdB;tPGqLDZX5LGawhY7f*LK zKW(IOe*pLUDMhb8J5Z>A?cYEw7Q0uTi_5}ByjNRKC+}jtsw_on{ zs(h8#P}%{xv!kiBr-FO&(w+|P#Y;OV_xe<6b}wMohw87Y5PT~xqDUyGM`3mX>=SaD z;+ToBzm|84d@pAsMVAQhr=eDT&(y>lZ(Nu@Z(c+|gp206*JsSnuh)0JA#~iKvv4hh z^w6pAya<-6>_j*AVAJ`wzxuvel9&+>P4@X13RE#KcfARRVTLa`Cqhm#Y5U_D6xZR{ z%u(N=UIM3vDj%!w><%LveSTe@cXg|iX<4LvKJmtz=5b^-<#}-9q`vWV_v?z-Rq&ZE z;{@377{u3xyDv&OImfQhO94Fy838#WaN#-zysFqCT`|(Pfe2HP!tCXhGE4!LB%39> z{}>ChUf4Vjd~usI5#-cg+B{QWMH%}P8KL@fsz+HjlXyy15`3D3e+)V|Y6B<5R&9t? zzvJtz)sNM@F?K=D~IrIuM$CiB9C!BDRJ5G(acMohKx*0K5vU;nxPBVilFF$L zPm|D@V6!-+KoQIpl%7q|-+$z{fPq8Kfd8=EfWKY$0FwJ%$$51J8a8&wk{>naK-QEz1nPw%zE%gPR6?^9B zbAAcWS-ieCTKJegm**$iJ|gazTYaUavTa|9p2q|;LO6+Z=2=WuMO&FTTf$HNup~A2 zFRY6YRVu2+t~tl8o+;fZd04BmTL2<&oJKZ3rywBI0pGx-`}djr115W!&}LIbUSl7& zF6h4I9zMJrI#tGGcHwGrlSE> z7En||4}m^NVc8hmqD9eT;-L+{>N~ij?yikE{ zN2t2%r$A#3;a*5%J++=fo*b4N?vXKGOsbNv*WMxxC{!Gd*sewD(8D9Vupl)^ z=>>Oiy52VxShSd;pQv{xKb}#z4=W}x(@+et2|uZ0*f*&Dq1NZtBIbmC{1Nr|^XrFB zf{KVriM<7cjuBL5P<2ziA9*G`&xj2wj0iXnOP-&=56{oz`D~OiC}}P^i|4cF@x$}U*mL>L+jGfDJfFn#$yq4x z{w;fhvIej?tIM)#{jB(id9L8(6kqrqCjTBuXfpl#I9JP(Ds)+@bkGSPUF$O48K6Ks z7rc9o>vRM~Bn0F3VNFsvK@DF&L>0upjCO-n8~bg9SHnWJkb*Es67-wg@5# zXsoceQ5qr&7*oHK{E5PPX6*%tCz2vBdx};D6ji{!06$D)EHZ{ZxIXk&0y`Zg-b~CS z-o_D=I*myk!S+IJt>zmy>)GeM?&lZt;2_ysei}-Piu5Bw3c0146F5`*d8$YuSXeBr zx$H~#Kk#*GMx7!a`N_pfM76Bk3M#q;Df|BmfV@Hctzn3j3w9H+qKW>F|C_UuCB2JBrR4_9aDx8g`ZY zAMrWWSzcZQ+?ZH@hC8T@Om{NVodW(Fd(3TB)<8%Dq)2cc?%Mbo8lYhhH_n2^~-S05@FHHWJNmMa8 zW9>w0_|p^;5R)RX(wB)BROrLw*r949kNS@)@*O+{hNrwjdx#RdY!b_ce0dIJtNQiLp4}K9j YV}A;WIF;X-d$R8b60QGM<=)Ky1G%V#WB>pF literal 0 HcmV?d00001 diff --git a/ChIP_analysis/bootstrap_helper_funcs.py b/ChIP_analysis/bootstrap_helper_funcs.py index 99d4971..4d30044 100755 --- a/ChIP_analysis/bootstrap_helper_funcs.py +++ b/ChIP_analysis/bootstrap_helper_funcs.py @@ -33,7 +33,7 @@ # @numba.jit(nopython=True, parallel=True) def credible_interval(array, alpha = 0.05): - """ Take a bootstrapped based 95% credible interval + """ Take a bootstrap-based 95% credible interval Args: array (np.array): 1xn array (n number of bootstraps) @@ -43,26 +43,35 @@ def credible_interval(array, alpha = 0.05): pos 0: mean pos 1: min_ci at alpha pos 2: max_ci at alpha + pos 3: median """ - out_array = np.zeros(3) + out_array = np.zeros(4) mean = np.mean(array) out_array[0] = mean - array_sort = bubblesort_jit(array) + array_sort = np.sort(array) out_array[1] = array_sort[int(floor((alpha/2)*array.size))] out_array[2] = array_sort[int(floor(array.size - alpha/2*array.size))] + # print(array_sort.size) + # print(array_sort.size//2) + if array_sort.size % 2 == 0: + median = (array_sort[array_sort.size//2]+array_sort[array_sort.size//2-1])/2 + else: + median = array_sort[int(floor(array_sort.size//2))] + # print("Median: ", median) + out_array[3] = median return out_array -@numba.jit(nopython=True) -def bubblesort_jit(arr): - N = arr.shape[0] - for end in range(N, 1, -1): - for i in range(end - 1): - cur = arr[i] - if cur > arr[i + 1]: - tmp = arr[i] - arr[i] = arr[i + 1] - arr[i + 1] = tmp - return(arr) +# @numba.jit(nopython=True) +# def bubblesort_jit(arr): +# N = arr.shape[0] +# for end in range(N, 1, -1): +# for i in range(end - 1): +# cur = arr[i] +# if cur > arr[i + 1]: +# tmp = arr[i] +# arr[i] = arr[i + 1] +# arr[i + 1] = tmp +# return(arr) def least_extreme_value(stats): """ Take the value closest to zero for the min/max ci @@ -72,7 +81,7 @@ def least_extreme_value(stats): Returns: lev (float): least extreme value """ - mean, minci, maxci = stats + mean, minci, maxci, med = stats if minci <= 0.0 and maxci >= 0.0: return 0.0 elif minci >= 0.0 and maxci > 0.0: diff --git a/ChIP_analysis/bootstrap_sam_file.py b/ChIP_analysis/bootstrap_sam_file.py index 5bef813..972a618 100755 --- a/ChIP_analysis/bootstrap_sam_file.py +++ b/ChIP_analysis/bootstrap_sam_file.py @@ -47,6 +47,7 @@ from Bio import SeqIO # custom module import sam_utils +from bootstrap_helper_funcs import least_extreme_value, credible_interval ######################################################## ## TO DO: @@ -168,7 +169,43 @@ def load_data(self, f): self.reads = np.load(f) self.total = self.reads.shape[0] +@numba.jit(nopython=True, parallel=True) +def get_cpm(array, cpm_array): + for i in range(array.shape[1]): + cpm_array[:,i] = array[:,i]/np.sum(array[:,i]) * 1e6 + +def summary_stats_only_finite(array, alpha=0.05): + """ This version of summary stats only considers values that aren't nan or + inf. Creates summaries a single location based on bootstrap + Args: + array (np.array): np.array that is 1 x n (n is the number of bootstrap + replicates) + + Returns: + summary_stats (np.array): np.array that is 1 x 9 + pos 0 average + pos 1 min_ci at alpha + pos 2 max_ci at alpha + pos 3 variance + pos 4 least extreme value + pos 5 median + pos 6 median absolute deviation + pos 7 number of infinite values in bootstrap + pos 8 number of nans in bootstrap + """ + finite_vals = np.isfinite(array) + num_inf = np.sum(np.isinf(array)) + num_nan = np.sum(np.isnan(array)) + if np.sum(finite_vals) == 0: + return(np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, num_inf, num_nan) + else: + these_stats = credible_interval(array[finite_vals], alpha) + this_lev = least_extreme_value(these_stats) + var = np.var(array[finite_vals]) + # median = np.median(array[finite_vals]) + mad = np.median(np.abs(array[finite_vals] - these_stats[3])) + return(these_stats[0], these_stats[1], these_stats[2], var, this_lev, these_stats[3], mad, num_inf, num_nan) def merge(intervals): """ Merge several individual intervals into one (start, stop) interval @@ -400,6 +437,15 @@ def numba_sum_coverage_from_sample(sampled_reads, array, res): help="only report every x basepairs, default=1") sample_parser.add_argument('--seed', type=int, default=1234, help="psuedo-random number generator seed, default=1234") + + # summarize + summarize_parser = subparsers.add_parser('summarize', help="Summarize\ + coverage from bootstrap samples.") + summarize_parser.add_argument('--samples_path', help="output file from using sample\ + on the sampler of interest") + summarize_parser.add_argument('--out_prefix', help="output file to np.save the\ + numpy array that is created") + args = parser.parse_args() logging.basicConfig(format='%(asctime)s %(message)s',level=logging.INFO) @@ -444,3 +490,28 @@ def numba_sum_coverage_from_sample(sampled_reads, array, res): finish = time.time() logging.info("Sample {} took {} seconds".format(i, finish-begin)) np.save(args.outpre, array) + + elif args.command == "summarize": + + logging.info("Reading in samples from file {}".format(args.samples_path)) + + array = np.load(args.samples_path) + cpm_array = np.zeros(array.shape) + get_cpm(array, cpm_array) # modifies cpm_array in place + logging.info("Created cpm_array of shape {}".format(cpm_array.shape)) + logging.info("Calculating mean, minci, maxci, and median cpm for each position from bootstrapped coverage values") + stats = np.apply_along_axis(summary_stats_only_finite, axis=1, arr=cpm_array) + + logging.info("Saved summary stats to {0}_mean.npy, {0}_minci.npy, {0}_maxci.npy, {0}_median.npy".format(args.out_prefix)) + np.save(args.out_prefix+"_mean", stats[:,0]) + np.save(args.out_prefix+"_minci", stats[:,1]) + np.save(args.out_prefix+"_maxci", stats[:,2]) + np.save(args.out_prefix+"_median", stats[:,5]) + + + + + + + + diff --git a/ChIP_analysis/bootstrapped_chip_no_consolidation.py b/ChIP_analysis/bootstrapped_chip_no_consolidation.py index 71e31b4..cb6acb1 100644 --- a/ChIP_analysis/bootstrapped_chip_no_consolidation.py +++ b/ChIP_analysis/bootstrapped_chip_no_consolidation.py @@ -90,9 +90,9 @@ def summary_stats_only_finite(array, alpha=0.05): these_stats = credible_interval(array[finite_vals], alpha) this_lev = least_extreme_value(these_stats) var = np.var(array[finite_vals]) - median = np.median(array[finite_vals]) + # median = np.median(array[finite_vals]) mad = np.median(np.abs(array[finite_vals] - median)) - return(these_stats[0], these_stats[1], these_stats[2], var, this_lev, median, mad, num_inf, num_nan) + return(these_stats[0], these_stats[1], these_stats[2], var, this_lev, these_stats[3], mad, num_inf, num_nan) def summary_stats_nan_zeros(array, alpha=0.05): """ This version of summary stats turns nans to 0. NOTE THAT THIS MODIFIES From e222feec0e1b8ce40b997e803cef1d6063e2b85e Mon Sep 17 00:00:00 2001 From: jwschroeder3 Date: Thu, 23 May 2019 08:43:20 -0500 Subject: [PATCH 10/13] updates --- .../bootstrap_helper_funcs.cpython-36.pyc | Bin 2164 -> 2008 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-36.pyc b/ChIP_analysis/__pycache__/bootstrap_helper_funcs.cpython-36.pyc index 6efad879db3f60768ab937b3f2c43c713728d99c..fa749468f0b017c2d47b0b9321f9bf45ea651a86 100644 GIT binary patch delta 965 zcmZ8gOKTKC5U!q{_r50XW|K`M3`vZP;3?)12|@58>M4qcA+SugdtqO*p=T13Fv}h` z-VJlqKOlJXsCe)fcoG@#q#knj=EdqsqN@${_`a&HsxD@JI$tU)>!p(Y{o{|_Hx9rz zm^vEz_fVo+XDh4Y3R9VON_?H^%s2(#;3hMB$_paa7%48BhQsf4Q-0?%73htvkn99;LRECOh^T`fLF%cnFlOFy4SeQEMA^aznUWh zQ58`Q(JU{r!W{7GpC(EJCJKSeSP^5jDO{S$wc7vYmZxHNyL1_3hjA2(VE^`B5OL;iZrpHNg0s$E|Luk{BIz+* z-dVrv4v-2$+r@LP;-ms=z_}t-?osqSBU8ftOgrIX7-i~!2Vtg0LlI}R6UIWOAnu%S zPfvf*h3f@L*ntqiaWhZTFS@)2QA<6Qjq8{IC7am9L|G!Mv`p5Nx?&O`LoL}^9{GvM zq4E{(pp(F^IwB)wL`NzSPor7d7NB)U3Qz?eidtoBSvie88KBLH~e1#z2IB z2!L564pFa!iiPx%vAUe;&AhI(vam;le3)0ywD4fCe=Mx@%vjXr>+!~W!zEez)!5vg rMh=GI`y4*Gn136rBWR3`4#p|+PkGnak?6LQva delta 1190 zcmZ8fOKT%X5U!e@k;WtIVM%@wP~@y3nZ1N0hdeNg!C?bAm?c@VFgOftcY8g0%xHT? z!Zs2*$RVeMg!u&_r#Z-3EQ&nAW>VG#LeNwG@$6Iet zKC1(~g_Yxwd`Of&`SsEMyd`Ym++a|WRHT(AG%XByEedZCMlbZIGMGpIE!Fun_KT4YF;m=o>4( z)$<46y}Lxs6=UteDDYeG+xa*6e$C9RM08~4*2{myZvGm-e%B%ghyRWM-RHb7yS#gv zBw3m%J{n2U{exRScfj9|II#`*GPkP z4?T40>)<}BHX+Y_)NY)NuCj|HJ~M@xV;WsHJ0WEXJ$m+CnBnDlhEo)lU<>voeE;3d zx_m~6C9EA<7(xzbYzk9)*qJ@GjU=pX*quY)R*&xu&Jw1?Q+5E;C2|0-9>0KBJ1@vW zqqUEVv8giEaQiPE-_lNc!ADX%ipMXdU($@a)_AO$jD^NXv$L_%EE|rB3EZ*PoD!%C zu}_IoGa9fwY|`gqqo6hx_T>z0axt?%B1)eTo5Bg3;mo4`Xs!FGI@BBX9n~>wQecky zxWH_>X)_88)KkJ76Yw;@cHRh=a%mRx*te;djAv=_dVw>H; Date: Fri, 6 Sep 2019 09:28:18 -0500 Subject: [PATCH 11/13] updated_chip_bootstrapping.py --- .../bootstrap_sam_file.cpython-37.pyc | Bin 12256 -> 16063 bytes ChIP_analysis/bootstrap_sam_file.py | 5 +- ChIP_analysis/bootstrapped_chip_enrichment.py | 672 ++++++++++++++++++ .../bootstrapped_chip_no_consolidation.py | 4 +- 4 files changed, 677 insertions(+), 4 deletions(-) create mode 100644 ChIP_analysis/bootstrapped_chip_enrichment.py diff --git a/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-37.pyc b/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-37.pyc index 37e798b97a16b7210d59a500bfd27a3b737626a3..ebcc1b5287c4bdec7427f41ec88309cca03b2856 100644 GIT binary patch delta 6317 zcmai2U2GdycAgn>D3YQiN|t5)*&fTXCEAkW-`LT5oj7tFd!1OedwfKXevx0Zwmx)>%{;IG}s3U`_!T@t%?FgQS4jM-pz*i zRZlX7X?ABeCEKoS7uu-ZUFym9sEV9RXVYkPySLPr?JFI~9#EC{6&7P1&lJ|d`?n(5 zgS@Laz~cBzJl9x~bv{$J)a)U)#JX7bGc`NNdRY3Ik{x2btPkxlJHYzUjNMmeU$sXs&i{tDl zGuW}`TJ{7#Q9Q}U*l{+F(FC6;PO=m1B#l%>jwaY7tM3gn>=ZlArk+QDCe{%)Em<=> z1J)V#Mw69B6YNae=qx|Q&av~7ldu=~X?Bsl$u2#QLKe+j#*D_^YMYtGYCm98^%m0= z$@IfElB<|`o4wPVA(gJNccn^GdyWaoZmZCBrg)M4NRl{`p@o$}i#kXPr17348of;|JRcr@eM!BZ}QpV2gJNZ zK9XvDi2enBa~lrje6{ZII~d>CQm-qYKg!PYWFv`VxQn@a;Qupfv-g}BWZci*<45@{ zzsGOZ;TGDDH9lX^hRFQy@wqJ>!b9c-x$_6u(>!0`cL$YijZ}IOS^eqaJ_TN=PCxk1 zvTMydLDu+4TI0d4H5wm*0wZiQ)ws@Y7C+vutJy_<2s>NcqH94+AA@NTsL&&5W>MN9 zReZk1;`>GNa-y-UMM#j%2ge4lxPeB`z)Dk$#uWl`OEH{dUa25Q=Ew%(+->F4AjDYAqf{W zIc9q1bC_||7wKQl?un5{(qW2aQtW#Sn@plc3envav%=e5 zL`?jaJtc@ks`GRIFZw>dohF)o0s{m_2uNo+M!gdRCJCG(Fil|3NU}#hClUa&{^Eh7 zb?QB!YvGhD8{P`8MZ97P)3&*t*MOs61YZZfIehMYfb|bNrKS{BrVb&BBHYbxYBeer zlpkv~jqaVNH6pLhY^x(EH2RgAx<_|KsjsFN!G;SOxg&ZZdKsVc^|P64nP{L3(^=+X z3Nu2QFXFD1GLb-+?iwWWl~U-rfoipFQ{JkMW$xub@F!q)qa;-8SH)XsfBo$r((jMn zrv23kqmU8klNix!yHa+IGX_NlbD6iM2Wa}NQL>y| z-ZID(c4ftU6@AVqnd`q5-9y~xVM<|{PJU17FwI{uWR5YKIVLoCOL7;HTr^5ZIHuE@ zHE8zD=B!~Zxn+CJ;|AkTENSi*;WQCmYBL)S%yKNyHqX^+JCPPwowzQyapA;WoOPSg z>F|Y$%kdH}rO3pNMNxcIovbJM@Zs z|Y{{0ti=4X0@Dc zxn7s*Pahsst(_Z(kLoKFJcBq@H@4;A9+8|S0#fXv6ieU{fjmHM4W#*ZA=J?(KZUzt`D6K-m$g39-ATQH{=2 z1VL?Q{Wu;PcW{13D4Li#I72R94ljJJAj+kjYnCcD$H|coh6y?%`0F)R$*hE#{SIz| zD-&dwaKV$;0q7UQNBL$BpF0XrQyQevP;^_{)IGJP*R)z>PdW3i{l7XfzcX=iO|8rH zy$YU8Oj<*XqaUgys7IAg&=u+$){L)d7z?pEQctG6&Xg+Fz?Y|6OFybUNJ**S3v|}M zX=d~KZaQDzM82y1bmzZ6W)i)x3xorLBd}DkLx($6Dj~ z`WP1JsKvK5lnecv30NedD?Vl^9b+lG{z*tzwoB0V5W#^Q6_R_-lFg&StV20gTN&YI@FDDZf1OGc`HbS z%}hWX^n)K5lm2%n57k31Bqqy|GG#an>lQfi1=vy+D}i1>)#e2eJQf7`MJ6dcLQ3b7 zCwUpeAWGW`bg42*U9VgbAJDwI0ams@)QHtFAS-mRRR1Z7el@Md5vS-6aZ69DV%7i6 zL@D+)u!8S=F?mK)zxMy&)IZd#ebdu|7dWzcp68aiQ!er9-2xsuX1GVgC|F3|C;~ha z*}_@IuAAgWNIqjMTDDF0TBE!U(;;pkF^5=W|0+ir8TB?$)RF?#uIXtC+jKm-jjPn_ z|Mc{GJ--K3(7|}YT(dp#_0E5ueyrAmL>QhSqk3^c7ANTLYDVqup%`>q_mWcc@ZFhe z_w;n=2reR_w^E%euX&XW?xHe5-ft}iu}ejs_l&2T z)#!{<+4vuIx;k_pXD?~+&NNF#UFzt1n5*{e;^dh;-3(NL*cC068PDqS(5l0?$oL5O z?e?dV32Pi&pkPL_rOc*~YS%}h0XUeRZsP4SNT)O;iMdgKx`{I_KL*NhIsAaR)uX$5 zpblsn_f?UH!#a=|tx8;0m-Mh9MV9AGJ5N`ObSZJunWAzkZ=o^Q!-h@aN;4qqRPB%) z1J*RSnO|w$-+gCZf&MLb2x}U0JagT0QAr3<{bB*N9BV6bsUU>EK0ULUBQ59%ros_v z6AtSVOKP>DWeL_cHW>{9U5d%Za&kA+5}&vtT^l&s%1gTU*(gx#B}H4H7jKZN-1O;#zKya_%d4Y}#RoMuuE$T0s*~q(AS`L>B00ChZCphx;^bXfbjk87h=tB5pMXK6XnFEcByK~*Iq`r5JtANcpcE^!ThMtQ z$5bvYpt$BTod`-(V#fdP>GF~+8@|Lin{M4>+(O}U5hP`cXWFuhGa}fDhk=!!E2At$ z@p0?Ujd?+@Fu@3XLtu*l2cX!93n3=_-=7()%O|aT+)@D})&a8VR+wr=wIC^jbdKT? zVk(?H%$jN;cIVk;G}@m2cTgBsz`%5upe*?Xt6+lg>+t zz=Mi=QKzl+=~ixlT~?3rD~JSZHor>cg8R<^>1Gz7JP?;TAqAL@r^SGtJQhC`AIEnp z+1uYcVE9ko7^@>?px%tDNg~E4`)WKA!(8*z;#yiwsRQ_c)%s8tH|f!)wUpWiH~@%p k89XVP#Y|HbEv6m8%&6L@r8Im}N*^SpA?ZkmdPM&HAEN4@;{X5v delta 2829 zcmah~Z)_Vy7T+1~I_p1i94BegHceb7PVJ^i+k_?|adSW^ZN*TkQ!FEQj_?uKImX7@wkQp=Y=<<#N7*r!lRI%W z5Aty~$xhr;0c#UgQ=p=3x~)RZNjUCQ`#6+O%S|&K7PFvvh@EL`hPKmi$D{mcHNyts z-bDNi{1}^){0CgEjvHA!!6&44o-M$ov%nwQ1kO-JA18AEHFmsTn#`*})Rp%d9UhX#*cpsnT3ka|U0l!agUAd|N^uwSz!Dn|MMfQv9 z7x^N{Gg}1qEE-F|7XasNoQ;AV2DGzAfe-K;FYsB2FMXZzr4l`eC!XfhTN)w{-Fjm` zG@~m@e$jtUJKg+HyGBXOPwJP+Bh6L4N-jsBa)X{#$NnQ1UeR-kR>k!M)^rdy>ve83 zp@Wd80$+fk+x*p{;3g9>5L+aU#=6OvTsYqxjoRcK;))(5EiAt70oE=|WEaNWj}zI& zaj*Da#Xg(C-2%b#M0&8_KH1^ADTI03^uLIInEgN7q-W*kyWJ~fMS3~}Cj?<(+RUj1 zT{u^4Pu^VmpAX5y1GO&fa-dag#@F|I5~;6QmJC6y`DD*q9l+u+UIN1?4&XVe>xfp2 z^J)v1*kTx$k099>+30V$?nZN=w@8+M^LQ7|gyi9k+u;pJX+KEB1)Cx>J@Zcx)?xpr z)Ypr9Ffv75rYNIh|0B>*T$9&x-}L|8_tOQry>%bUfr5}AntQ{n^UjhH6y|7fJuYNN#%X3|Zn@jf*n>D;T-BL5U}6 zltxKE$*sN6|Y;)>P`1k zKRtGmoNGQe_JELtKb1Z97+xuWB3K;uFzTrXXX!chj>2m7wVGL%ahf3DlnM|^ zw@!os^C|-3hdv`O{hk13J>5{YmBtups5=z8dGd}1?$d_W2x0%fmADG-f}7L~{Ik=~ z4nFO$N~yx#v~(?9vD0h+XT&(%t^fmIVdN395-%sPfgitwPFk2g{psX?tD)fkL+T+WUg7BC^r8#caseMLZ$*D z;3=v#hjHjID`1sh_9IIZ>=61Ot4OVx-VK2eJ5O%It`Kl)$x|l~h zS%Oa?hb1M`62vacHooFPiHmEfp7g(8a`IPz4We~ZxV(tHE>Mw*SI`1W7V7}oz8Lr4 zFJv}ltI6WpbzDklqxE+$D7OrjJFVMrdggyJL)n#ArJ%n8sWNScIk1x7zfQh5LklX3DN D-^h@P diff --git a/ChIP_analysis/bootstrap_sam_file.py b/ChIP_analysis/bootstrap_sam_file.py index 972a618..84d71c2 100755 --- a/ChIP_analysis/bootstrap_sam_file.py +++ b/ChIP_analysis/bootstrap_sam_file.py @@ -499,14 +499,15 @@ def numba_sum_coverage_from_sample(sampled_reads, array, res): cpm_array = np.zeros(array.shape) get_cpm(array, cpm_array) # modifies cpm_array in place logging.info("Created cpm_array of shape {}".format(cpm_array.shape)) - logging.info("Calculating mean, minci, maxci, and median cpm for each position from bootstrapped coverage values") + logging.info("Calculating mean, minci, maxci, mad, and median cpm for each position from bootstrapped coverage values") stats = np.apply_along_axis(summary_stats_only_finite, axis=1, arr=cpm_array) - logging.info("Saved summary stats to {0}_mean.npy, {0}_minci.npy, {0}_maxci.npy, {0}_median.npy".format(args.out_prefix)) + logging.info("Saved summary stats to {0}_mean.npy, {0}_minci.npy, {0}_maxci.npy, {0}_median.npy, {0}_mad.npy".format(args.out_prefix)) np.save(args.out_prefix+"_mean", stats[:,0]) np.save(args.out_prefix+"_minci", stats[:,1]) np.save(args.out_prefix+"_maxci", stats[:,2]) np.save(args.out_prefix+"_median", stats[:,5]) + np.save(args.out_prefix+"_mad", stats[:,6]) diff --git a/ChIP_analysis/bootstrapped_chip_enrichment.py b/ChIP_analysis/bootstrapped_chip_enrichment.py new file mode 100644 index 0000000..ec1dfeb --- /dev/null +++ b/ChIP_analysis/bootstrapped_chip_enrichment.py @@ -0,0 +1,672 @@ +################################################################################ +# bootstrap sampling for ChIP analysis with paired extraced and input files and +# a KO to subtract +# +# Written by Michael Wolfe +# Copyright (c) 2018 Michael Wolfe University of Michigan. All rights reserved. +# +# +#Developed by: Michael Wolfe +#University of Michigan +#http://freddolino-lab.med.umich.edu/ +#Modified by : Jeremy Schroeder +#University of Wisconsin - Madison +#http://wanglab.bact.wisc.edu +# +#Permission is hereby granted, free of charge, to any person obtaining a copy of +#this software and associated documentation files (the "Software"), to deal with +#the Software without restriction, including without limitation the rights to +#use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +#the Software, and to permit persons to whom the Software is furnished to do so, +#subject to the following conditions: +# +#Redistributions of source code must retain the above copyright notice, this +#list of conditions and the following disclaimers. Redistributions in binary +#form must reproduce the above copyright notice, this list of conditions and the +#following disclaimers in the documentation and/or other materials provided with +#the distribution. Neither the names of Michael Wolfe, Jeremy Schroeder +#University of Michigan, University of Wisconsin - Madison, +#nor the names of their contributors may be used to endorse or promote products +#derived from this Software without specific prior written permission. +# +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +#FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS +#OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +#WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +#CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. +################################################################################ + +# base python +from __future__ import print_function +import argparse +import sys +import logging +import os +import multiprocessing as mp +from math import floor, ceil, log +import re +import time + +# Widely available +import numpy as np +from scipy import stats +import pandas as pd +from dfply import * + +# More custom +import numba + +# custom +from bootstrap_sam_file import ReadSampler +from bootstrap_helper_funcs import least_extreme_value, credible_interval + +def summary_stats_only_finite(array, alpha=0.05): + """ This version of summary stats only considers values that aren't nan or + inf. Creates summaries a single location based on bootstrap + + Args: + array (np.array): np.array that is 1 x n (n is the number of bootstrap + replicates) + + Returns: + summary_stats (np.array): np.array that is 1 x 9 + pos 0 average + pos 1 min_ci at alpha + pos 2 max_ci at alpha + pos 3 variance + pos 4 least extreme value + pos 5 median + pos 6 median absolute deviation + pos 7 number of infinite values in bootstrap + pos 8 number of nans in bootstrap + """ + finite_vals = np.isfinite(array) + num_inf = np.sum(np.isinf(array)) + num_nan = np.sum(np.isnan(array)) + if np.sum(finite_vals) == 0: + return(np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, num_inf, num_nan) + else: + these_stats = credible_interval(array[finite_vals], alpha) + this_lev = least_extreme_value(these_stats) + var = np.var(array[finite_vals]) + median = np.median(array[finite_vals]) + mad = np.median(np.abs(array[finite_vals] - median)) + return(these_stats[0], these_stats[1], these_stats[2], var, this_lev, these_stats[3], mad, num_inf, num_nan) + +def summary_stats_nan_zeros(array, alpha=0.05): + """ This version of summary stats turns nans to 0. NOTE THAT THIS MODIFIES + THE ARRAY + + Creates summaries for a single location based on bootstraps + + Args: + array (np.array): np.array that is 1 x n (n is the number of bootstrap + replicates) + + Returns: + summary_stats (np.array): np.array that is 1 x 9 + pos 0 average + pos 1 min_ci at alpha + pos 2 max_ci at alpha + pos 3 variance + pos 4 least extreme value + pos 5 median + pos 6 median absolute deviation + """ + array[np.isnan(array)] = 0 + these_stats = credible_interval(array, alpha) + this_lev = least_extreme_value(these_stats) + var = np.var(array) + median = np.median(array) + mad = np.sum(np.abs(array - median))/array.size + return(these_stats[0], these_stats[1], these_stats[2], var, this_lev, median, mad) + + +def actual_reads(sampler, size, res=1): + """ Take all reads from a sampler and map them to an array + + Args: + sampler (class ReadSampler): object holding reads to sample + size (int): size of numpy array to create + res (optional, int): resolution the numpy array is in + """ + ############# modify to support jit acceleration ############ + array = np.zeros(size) + for read in sampler.reads: + start, stop = read + # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION + #array[start:stop] +=1 + array[int(ceil(start/res)):int(ceil(stop/res))] += 1 + return array + +def sp_sample_reads(sampler, size, i, j, res, jit=False): + """ Sample reads from samplers using multiprocessing + + Args: + sampler (list): List of ReadSampler objects to sample from + size (int): Size of numpy array to create + res (float): resolution of numpy array + start (int): starting seed for random state + p (optional, int): number of processors + + Returns: + arrays (np.array) final result of sampling + """ + start = args.s+i+len(all_sim) + array = sample_reads(sampler, size, np.random.RandomState(start+j), res, jit) + return array + +def sample_reads(sampler, size, prng, res=1, jit=False): + """ Sample with replacement all reads from a sampler and map them to an + array + + Args: + sampler (class ReadSampler): object holding reads to sample + size (int): size of numpy array to create + prng (np.RandomState): random state to pull random numbers + from + res (optional, int): resolution the numpy array is in + """ + array = np.zeros(size) + sampled_reads = sampler.pull_reads(sampler.total, prng) + # # define heirarchy for using GPU + # threadsperblock = 32 + # blockspergrid = (array.size + (threadsperblock - 1)) // threadsperblock + + if jit: + # threadsperblock = 32 + # blockspergrid = (sampled_reads.shape[0] + (threadsperblock - 1)) // threadsperblock + # note that using cuda.jit here causes us to write directly to the array, + # without explicitly returning it from sum_coverage_from_sample function + # sum_coverage_from_sample[blockspergrid, threadsperblock](sampled_reads, array, res) + # device_array = cuda.to_device(array) + # device_samples = cuda.to_device(sampled_reads) + # device_res = cuda.to_device(res) + array = numba_sum_coverage_from_sample(sampled_reads, array, res) + # cuda_sum_coverage_from_sample[blockspergrid, threadsperblock](device_samples, device_array, res) + # array = device_array.copy_to_host() + else: + array = sum_coverage_from_sample(sampled_reads, array, res) + + return array + +def sum_coverage_from_sample(sampled_reads, array, res): + """ Map sampled reads to an array + + Args: + reads (): sampled read (start,end) positions + array (): array to be populated with coverage + res (float): resolution the array is in + """ + + for read in sampled_reads: + start, stop = read + res = float(res) + x = int(ceil(start/res)) + y = int(ceil(stop/res)) + # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION + #array[start:stop] +=1 + array[x:y] += 1 + + return(array) + +@numba.jit(nopython=True, parallel=True) +def numba_sum_coverage_from_sample(sampled_reads, array, res): + """ Map sampled reads to an array + + Args: + reads (): sampled read (start,end) positions + array (): array to be populated with coverage + res (float): resolution the array is in + """ + # pos = cuda.grid(1) + for i in range(sampled_reads.shape[0]): + read = sampled_reads[i,:] + # read = sampled_reads[pos,:] + start, stop = read + start = float(start) + stop = float(stop) + res = float(res) + x = int(ceil(start/res)) + y = int(ceil(stop/res)) + # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION + #array[start:stop] +=1 + for idx in range(x,y): + array[idx] += 1 + return(array) + +# @numba.cuda.jit +# def cuda_sum_coverage_from_sample(sampled_reads, array, res): +# """ Map sampled reads to an array + +# Args: +# reads (): sampled read (start,end) positions +# array (): array to be populated with coverage +# res (float): resolution the array is in +# """ +# # pos = cuda.grid(1) +# for i in range(sampled_reads.shape[0]): +# read = sampled_reads[i,:] +# # read = sampled_reads[pos,:] +# start, stop = read +# start = float(start) +# stop = float(stop) +# res = float(res) +# x = int(ceil(start/res)) +# y = int(ceil(stop/res)) +# # UNCOMMENT TO ONLY EVERY DO SINGLE BP RESOLUTION +# #array[start:stop] +=1 +# for idx in range(x,y): +# array[idx] += 1 + +def log2_ratio(array1, array2): + """ Take the median normalized log2 ratio of two arrays + + Args: + array1 (np.array): numpy array holding extracted signal + array2 (np.array): numpy array holding input signal + cu (bool): whether to use cuda.jit to accelerate calculations + + Returns: + ratio (np.array): median normalized log2 ratio + """ + # lets normalize by the median + arr1_median = float(np.median(array1)) + arr2_median = float(np.median(array2)) + ratio = (array1/arr1_median)/(array2/arr2_median) + + # if not cu: + # ratio = (array1/arr1_median)/(array2/arr2_median) + # only nans should be np.log2(0/0) which should be 0 + #ratio[np.isnan(ratio)] = 0.0 + # else: + # threadsperblock = 32 + # blockspergrid = (array1.size + (threadsperblock - 1)) // threadsperblock + + # device_array1 = cuda.to_device(array1) + # device_arr1_med = cuda.to_device(arr1_median) + # device_array2 = cuda.to_device(array2) + # device_arr2_med = cuda.to_device(arr2_median) + + # median_array1 = np.empty(array1.shape) + # device_median_array1 = cuda.to_device(median_array1) + # median_array2 = np.empty(array2.shape) + # device_median_array2 = cuda.to_device(median_array2) + + # ratio = np.empty(array1.shape) + # device_ratio = cuda.to_device(ratio) + # cuda_ratio[blockspergrid, threadsperblock](device_array1, device_array2, + # arr1_median, arr2_median, + # device_median_array1, device_median_array2, + # device_ratio) + # ratio = device_ratio.copy_to_host() + + log2_ratio = np.log2(ratio) + + return log2_ratio + +# @cuda.jit +# def cuda_ratio(array1, array2, arr1_med, arr2_med, +# median_array1, median_array2, +# ratio): + +# pos = cuda.grid(1) +# if pos < ratio.shape[0]: +# # calculate median for position pos of each array +# median_array1[pos] = array1[pos]/arr1_med +# median_array2[pos] = array2[pos]/arr2_med + +# # calculate ratio +# ratio[pos] = median_array1[pos]/median_array2[pos] + + + +def floored_sub(samp, control): + """ Subtract control signal from sample signal. Only subtract if control is + greater than 0. + + Args: + samp (np.array): sample numpy array holding signal + control (np.array): control numpy array holding signal + + Returns: + sub (np.array): subtracted signal + """ + # make copy as to not modify control array + this_control = np.copy(control) + this_control[this_control < 0] = 0 + return samp-this_control + +def read_in_sampler(samplerfile): + """ Take a saved numpy array file and load it back into a sampler + + Args: + samplerfile (str): name of stored numpy array + + Returns: + sampler (ReadSampler object): a sampler ready for sampling + """ + sampler = ReadSampler() + sampler.load_data(samplerfile) + # sampler.sort_reads() # our reads are already sorted + return sampler + +def mp_sample_reads_helper(args): + """ Helper function for sampling reads with multiprocessing + """ + sampler, size, res, prng = args + return sample_reads(sampler, size, prng, res) + +def mp_sample_reads(samplers, size, res, start, p=1): + """ Sample reads from samplers using multiprocessing + + Args: + samplers (list): List of ReadSampler objects to sample from + size (int): Size of numpy array to create + res (int): resolution of numpy array + start (int): starting seed for random state + p (optional, int): number of processors + + Returns: + arrays (np.array) final result of sampling + """ + pool = mp.Pool(processes=p) + arrays = pool.map(mp_sample_reads_helper, + ((sampler, size, res, np.random.RandomState(start+x)) for x,sampler in enumerate(samplers))) + pool.close() + pool.join() + return arrays + +def mp_actual_reads_helper(args): + """ Helper function for mapping reads with multiprocessing + """ + sampler, size, res = args + np.random.RandomState() + return actual_reads(sampler, size, res) + +def mp_actual_reads(samplers, size, res, p=1): + """ Map reads from samplers using multiprocessing + + Args: + samplers (list): List of ReadSampler objects to sample from + size (int): Size of numpy array to create + res (int): resolution of numpy array + p (optional, int): number of processors + + Returns: + arrays (np.array) final result of mapping + """ + pool = mp.Pool(processes=p) + arrays = pool.map(mp_actual_reads_helper, + ((sampler, size, res) for sampler in samplers)) + pool.close() + pool.join() + return arrays + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Given sampling objects,\ + goes through the entire chip-seq pipeline with x bootstraps.\ + outputs a numpy g by x matrix where g is the size of the genome\ + and x is the number of bootstrap replicates to do.") + parser.add_argument("--genome_size", type=int, help="size of genome\ + controls how large to build the arrays", required=True) + parser.add_argument('--out_prefix', help="Output prefix for final matrix.", + required=False, default='') + parser.add_argument("--sample_name_luts", type=str, nargs="+", + help="Paths to files containing the\ + lookup run information provided by Illumina. The\ + file will be used to match sample IDs\ + with sample names. Multiple file names should\ + be separated by spaces.", required=True) + parser.add_argument('--ChIP_samps', type=str, nargs="+", + help="Extracted read simulators for samples") + parser.add_argument('--inp_samps', type=str, nargs="+", + help="Input read simulators for samples") + # parser.add_argument('--ChIP_conts', type=str, nargs="+", + # help="Extracted read simulators for controls") + # parser.add_argument('--inp_conts', type=str, nargs="+", + # help="Input read simulators for controls") + parser.add_argument('--num_replicates', type=int, default=5, + help="Number of bootstrap replicates to do, default is 5") + parser.add_argument('--identity', action="store_true", + help="Don't sample, just use the data as is") + parser.add_argument('-p', type=int, default=10, + help="Number of processors to use, default is 5") + parser.add_argument('-s', type=int, default=None, + help="Seed for random number generation. Default is 1") + parser.add_argument('--resolution', type=int, default=1, + help="resolution of data to map, default is 1bp") + parser.add_argument('--save_summaries', type=float, default=None, + help="Don't save all bootstrap replicates. Just save the summaries:\ + minci, maxci, lev, mean, var. Specify the alpha level here") + parser.add_argument('--numba', action="store_true", + help="Adding this flag will enable jit compilation\ + on the cpu to speed up calculations.") + args = parser.parse_args() + logging.basicConfig(format='%(asctime)s %(message)s',level=logging.INFO) + ## TO DO: + # allow functions to deal with more than two samples for each + prng= np.random.RandomState(seed=args.s) + array_size = int(floor(args.genome_size/args.resolution)) + + num_ext_samp = len(args.ChIP_samps) + num_inp_samp = len(args.inp_samps) + # num_ext_cont = len(args.ChIP_conts) + # num_inp_cont = len(args.inp_conts) + + if args.out_prefix == '': + save_name_prefix = '' + else: + save_name_prefix = '{}_'.format(args.out_prefix) + + if args.save_summaries is not None: + this_summary_stats_only_finite = lambda x: summary_stats_only_finite(x, alpha=args.save_summaries) + this_summary_stats_nan_zeros = lambda x: summary_stats_nan_zeros(x, alpha=args.save_summaries) + + samp_final = np.zeros((array_size, args.num_replicates, num_ext_samp)) + # cont_final = np.zeros((array_size, args.num_replicates, num_ext_cont)) + + pat = re.compile(r'Sample_\d+') + # grab the sample names from the treatment sampler files + samp_matches = [pat.search(s) for s in args.ChIP_samps] + if None in samp_matches: + print("Your sampler file names should begin with the sample id\ + Illumina gave you, i.e., Sample_NNNNNN,\ + where N is any integer. Exiting script.") + sys.exit() + else: samp_strings = [match.group() for match in samp_matches] + # grab the sample names from the control sampler files + # cont_matches = [pat.search(s) for s in args.ChIP_conts] + # if None in cont_matches: + # print("Your sampler file names should begin with the sample id\ + # Illumina gave you, i.e., Sample_NNNNNN,\ + # where N is any integer. Exiting script.") + # sys.exit() + # else: cont_strings = [match.group() for match in cont_matches] + + # read in sample info from illumina to look up descriptions from sample ids + for i,sample_info_file_name in enumerate(args.sample_name_luts): + if sample_info_file_name.endswith('.csv'): + sample_info_tmp = (pd.read_csv(sample_info_file_name, header=18) >> + select(X.Sample_ID, X.Description)) + else: + sample_info_tmp = (pd.read_csv(sample_info_file_name, sep='\t') >> + mutate(Sample_ID = 'Sample_' + X.SampleID.astype(str)) >> + select(X.Sample_ID, X.Description)) + if i == 0: + sample_info = sample_info_tmp + else: + sample_info = sample_info.append(sample_info_tmp) + + samp_names = [] + for samp_id in samp_strings: + sample_rows = sample_info[sample_info.Sample_ID == samp_id] + samp_names.append(sample_rows['Description'].iloc[0]) # grab the first description for this Sample_ID + + # cont_names = [] + # for samp_id in cont_strings: + # cont_rows = sample_info[sample_info.Sample_ID == samp_id] + # cont_names.append(cont_rows['Description'].iloc[0]) + + # samp_names = [os.path.basename(x).split(".")[0] for x in args.ChIP_samps] + # cont_names = [os.path.basename(x).split(".")[0] for x in args.ChIP_conts] + + ## Read in all samplers + all_sims = [] + all_sims.extend(args.ChIP_samps) + all_sims.extend(args.inp_samps) + # all_sims.extend(args.ChIP_conts) + # all_sims.extend(args.inp_conts) + if num_ext_samp != num_inp_samp: # or num_ext_cont != num_inp_cont: + logging.error("Mismatch number of extracted and input samples Ext_samp: %s\ + Inp_samp: %s"%(num_ext_samp, num_inp_samp))#, num_ext_cont, num_inp_cont)) + + all_sim = [] + for sampler in all_sims: + logging.info("Reading in sampler {}".format(sampler)) + all_sim.append(read_in_sampler(sampler)) + + # all_sim = [read_in_sampler(sampler) for sampler in all_sim] + logging.info("Finished reading in samplers") + + ## sample reads + for i in range(args.num_replicates): + logging.info("Starting bootstrap replicate %s"%i) + logging.info("Sampling reads %s"%i) + begin = time.time() + if args.numba: + if args.identity: + # arrays = mp_actual_reads(all_sim, array_size, args.resolution, args.p) + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + arrays = [actual_reads(sampler, array_size, args.resolution) for sampler in all_sim] + else: + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + arrays = [sp_sample_reads(sampler, array_size, i, j, args.resolution, jit=args.numba) for j,sampler in enumerate(all_sim)] + # arrays = mp_sample_reads(all_sim, array_size, args.resolution, args.s+i+len(all_sim), args.p) + else: + if args.identity: + arrays = mp_actual_reads(all_sim, array_size, args.resolution, args.p) + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + # arrays = [actual_reads(sampler, args.genome_size, args.resolution) for sampler in all_sim] + else: + ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING + # arrays = [sp_sample_reads(sampler, array_size, i, j, args.resolution, cu=args.numba) for j,sampler in enumerate(all_sim)] + arrays = mp_sample_reads(all_sim, array_size, args.resolution, args.s+i+len(all_sim), args.p) + finish = time.time() + logging.info("Sampling bootstrap replicate {} took {} seconds".format(i, finish-begin)) + ## Calculate ratios + logging.info("Calculating Ratios %s"%i) + begin = time.time() + for j, (ext_ind, inp_ind) in enumerate(zip(range(0, num_ext_samp), + range(num_ext_samp, + num_ext_samp+num_inp_samp))): + samp_final[:,i,j] = log2_ratio(arrays[ext_ind], arrays[inp_ind]) + # for j, (ext_ind, inp_ind) in enumerate(zip(range(num_ext_samp + num_inp_samp, + # num_ext_samp + num_inp_samp+num_ext_cont), + # range(num_ext_samp + num_inp_samp + num_ext_cont, + # num_ext_samp + num_inp_samp + num_ext_cont + num_inp_cont))): + # cont_final[:,i,j]=log2_ratio(arrays[ext_ind], arrays[inp_ind]) + finish = time.time() + logging.info("Calculating Ratios for replicate {} took {} seconds".format(i, finish-begin)) + + + # Write out final output + if args.identity and args.save_summaries: + # save sample summaries + for j, name in enumerate(samp_names): + save_name = "{}{}_actual".format(save_name_prefix,name) + np.save(save_name, samp_final[:,:,j]) + # save control summaries + # for j, name in enumerate(cont_names): + # np.save(args.out_prefix+"_%s_actual"%name, cont_final[:,:,j]) + # save each combination of sample - control summary + # for j, samp_name in enumerate(samp_names): + # for k, cont_name in enumerate(cont_names): + # # note that floored sub MODIFIES the control array. Since we have already written the control array + # # that is not a big deal but be aware that this modification happens + # np.save(args.out_prefix+"_%s_Sub_%s_actual"%(samp_name,cont_name), floored_sub(samp_final[:,:,j], cont_final[:,:,k])) + + elif args.save_summaries: + # ALL OF THIS IS HARDCODED QUICK CODING. Saves a lot of output files to be + # used downstream + logging.info("Calculating Summary Stats for finite values") + begin = time.time() + # save sample summaries + for j, name in enumerate(samp_names): + these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, samp_final[:,:,j]) + np.save("{}{}_mean".format(save_name_prefix,name), these_stats[:,0]) + np.save("{}{}_minci".format(save_name_prefix,name), these_stats[:,1]) + np.save("{}{}_maxci".format(save_name_prefix,name), these_stats[:,2]) + np.save("{}{}_var".format(save_name_prefix,name), these_stats[:,3]) + np.save("{}{}_lev".format(save_name_prefix,name), these_stats[:,4]) + np.save("{}{}_median".format(save_name_prefix,name), these_stats[:,5]) + np.save("{}{}_mad".format(save_name_prefix,name), these_stats[:,6]) + np.save("{}{}_num_inf".format(save_name_prefix,name), these_stats[:,7]) + np.save("{}{}_num_nan".format(save_name_prefix,name), these_stats[:,8]) + # save control summaries + # for j, name in enumerate(cont_names): + # these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, cont_final[:,:,j]) + # np.save("{}{}_mean".format(save_name_prefix,name), these_stats[:,0]) + # np.save("{}{}_minci".format(save_name_prefix,name), these_stats[:,1]) + # np.save("{}{}_maxci".format(save_name_prefix,name), these_stats[:,2]) + # np.save("{}{}_var".format(save_name_prefix,name), these_stats[:,3]) + # np.save("{}{}_lev".format(save_name_prefix,name), these_stats[:,4]) + # np.save("{}{}_median".format(save_name_prefix,name), these_stats[:,5]) + # np.save("{}{}_mad".format(save_name_prefix,name), these_stats[:,6]) + # np.save("{}{}_num_inf".format(save_name_prefix,name), these_stats[:,7]) + # np.save("{}{}_num_nan".format(save_name_prefix,name), these_stats[:,8]) + # # save each combination of sample - control summary + # for j, samp_name in enumerate(samp_names): + # for k, cont_name in enumerate(cont_names): + # these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, floored_sub(samp_final[:,:,j], cont_final[:,:,k])) + # np.save(args.out_prefix+"_%s_Sub_%s_mean"%(samp_name,cont_name), these_stats[:,0]) + # np.save(args.out_prefix+"_%s_Sub_%s_minci"%(samp_name,cont_name), these_stats[:,1]) + # np.save(args.out_prefix+"_%s_Sub_%s_maxci"%(samp_name, cont_name), these_stats[:,2]) + # np.save(args.out_prefix+"_%s_Sub_%s_var"%(samp_name,cont_name), these_stats[:,3]) + # np.save(args.out_prefix+"_%s_Sub_%s_lev"%(samp_name,cont_name), these_stats[:,4]) + # np.save(args.out_prefix+"_%s_Sub_%s_median"%(samp_name,cont_name), these_stats[:,5]) + # np.save(args.out_prefix+"_%s_Sub_%s_mad"%(samp_name,cont_name), these_stats[:,6]) + # np.save(args.out_prefix+"_%s_Sub_%s_num_inf"%(samp_name,cont_name), these_stats[:,7]) + # np.save(args.out_prefix+"_%s_Sub_%s_num_nan"%(samp_name,cont_name), these_stats[:,8]) + # finish = time.time() + # logging.info("Calculating Summary Stats took {} seconds".format(finish-begin)) + +# logging.info("Calculating Summary Stats for nans as zeros") +# # save sample summaries +# for j, name in enumerate(samp_names): +# these_stats = np.apply_along_axis(this_summary_stats_nan_zeros, 1, samp_final[:,:,j]) +# np.save("{}{}_mean".format(save_name_prefix,name), these_stats[:,0]) +# np.save("{}{}_minci".format(save_name_prefix,name), these_stats[:,1]) +# np.save("{}{}_maxci".format(save_name_prefix,name), these_stats[:,2]) +# np.save("{}{}_var".format(save_name_prefix,name), these_stats[:,3]) +# np.save("{}{}_lev".format(save_name_prefix,name), these_stats[:,4]) +# np.save("{}{}_median".format(save_name_prefix,name), these_stats[:,5]) +# np.save("{}{}_mad".format(save_name_prefix,name), these_stats[:,6]) +# # save control summaries +# for j, name in enumerate(cont_names): +# these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, cont_final[:,:,j]) +# np.save("{}{}_mean".format(save_name_prefix,name), these_stats[:,0]) +# np.save("{}{}_minci".format(save_name_prefix,name), these_stats[:,1]) +# np.save("{}{}_maxci".format(save_name_prefix,name), these_stats[:,2]) +# np.save("{}{}_var".format(save_name_prefix,name), these_stats[:,3]) +# np.save("{}{}_lev".format(save_name_prefix,name), these_stats[:,4]) +# np.save("{}{}_median".format(save_name_prefix,name), these_stats[:,5]) +# np.save("{}{}_mad".format(save_name_prefix,name), these_stats[:,6]) +# # save each combination of sample - control summary +# for j, samp_name in enumerate(samp_names): +# for k, cont_name in enumerate(cont_names): +# these_stats = np.apply_along_axis(this_summary_stats_only_finite, 1, floored_sub(samp_final[:,:,j], cont_final[:,:,k])) +# np.save(args.out_prefix+"_%s_Sub_%s_mean"%(samp_name,cont_name), these_stats[:,0]) +# np.save(args.out_prefix+"_%s_Sub_%s_minci"%(samp_name,cont_name), these_stats[:,1]) +# np.save(args.out_prefix+"_%s_Sub_%s_maxci"%(samp_name, cont_name), these_stats[:,2]) +# np.save(args.out_prefix+"_%s_Sub_%s_var"%(samp_name,cont_name), these_stats[:,3]) +# np.save(args.out_prefix+"_%s_Sub_%s_lev"%(samp_name,cont_name), these_stats[:,4]) +# np.save(args.out_prefix+"_%s_Sub_%s_median"%(samp_name,cont_name), these_stats[:,5]) +# np.save(args.out_prefix+"_%s_Sub_%s_mad"%(samp_name,cont_name), these_stats[:,6]) + + else: + # write final array + logging.info("Writing final array") + np.save("{}samp".format(save_name_prefix), samp_final) + # np.save(args.out_prefix+"_cont", cont_final) diff --git a/ChIP_analysis/bootstrapped_chip_no_consolidation.py b/ChIP_analysis/bootstrapped_chip_no_consolidation.py index cb6acb1..fb53bcb 100644 --- a/ChIP_analysis/bootstrapped_chip_no_consolidation.py +++ b/ChIP_analysis/bootstrapped_chip_no_consolidation.py @@ -90,7 +90,7 @@ def summary_stats_only_finite(array, alpha=0.05): these_stats = credible_interval(array[finite_vals], alpha) this_lev = least_extreme_value(these_stats) var = np.var(array[finite_vals]) - # median = np.median(array[finite_vals]) + median = np.median(array[finite_vals]) mad = np.median(np.abs(array[finite_vals] - median)) return(these_stats[0], these_stats[1], these_stats[2], var, this_lev, these_stats[3], mad, num_inf, num_nan) @@ -533,7 +533,7 @@ def mp_actual_reads(samplers, size, res, p=1): if args.identity: # arrays = mp_actual_reads(all_sim, array_size, args.resolution, args.p) ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING - arrays = [actual_reads(sampler, args.genome_size, args.resolution) for sampler in all_sim] + arrays = [actual_reads(sampler, array_size, args.resolution) for sampler in all_sim] else: ### UNCOMMENT TO GO BACK TO NOT USING MULTIPROCCESING arrays = [sp_sample_reads(sampler, array_size, i, j, args.resolution, jit=args.numba) for j,sampler in enumerate(all_sim)] From 24f7f2633b2c989941f13405fa90eb8065311bdc Mon Sep 17 00:00:00 2001 From: bwanderson3 Date: Mon, 28 Oct 2019 14:55:45 -0500 Subject: [PATCH 12/13] calculate_peaks.py --- ...rt_multiple_to_average_robustz.cpython-37.pyc | Bin 0 -> 2722 bytes ChIP_analysis/__pycache__/peak.cpython-37.pyc | Bin 0 -> 5234 bytes ChIP_analysis/calculate_peaks.py | 13 +++++++------ 3 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 ChIP_analysis/__pycache__/convert_multiple_to_average_robustz.cpython-37.pyc create mode 100644 ChIP_analysis/__pycache__/peak.cpython-37.pyc diff --git a/ChIP_analysis/__pycache__/convert_multiple_to_average_robustz.cpython-37.pyc b/ChIP_analysis/__pycache__/convert_multiple_to_average_robustz.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16a76205c750b430952632f3958dc74deacca40c GIT binary patch literal 2722 zcmds3O>Y}T7@nEkb)2o^xFJA+R*Z^-YKbN(7pkf%AEii@5UP@>hZSfw@r<2KcW2Ga ztihHeAzF?-@DCD+3;Y7aAK<`0*dsmV!WF3}-kDuHPE?6rxG>Jn&ey)*&olXCWhF$= z{<`|?%Rf5^J+qTp^I`B2Jo+c-IN~@(8ICZvW0I1HxVIPi?j1w{zsP+aJi$?euki+7 zg3*#_^5!F=js!Va=F4xOD0HKclOG7oF7OMy^#q9vpQ6V&YVj2}Yt^&%!fZwO2W_ze zx>lc(XjS+}9TD=*>qx8~6R?TwpzbRtU@7Y2vdNQCqLW;HvD+hd4O*jq?!sGt2i+7M zfK!h=wPr9UkMZ|-iq%y^j=U*4#)m|93_0-ORA3t1drA%o>@C?j?0PV6PSF&cfSGS+ z@GMs=Q~wgI!g_Ez=l(QUM{l92Hzl{B2Xh1V3q~lU&^{|<&w#zuX=0PV2TXg^~--C>Msw%X?#Ew_rjQrEQjIu^{!KwfUY>xuVHD z0Eq^#%-L8uqfQ`OXmWGMg`G+vxV9$miF0}BgSg7xXwd|y%Yd8lPz<_%bbkNs*S|mP zhNUkdBV~AMNJ-?dY$m#jcIoF*=+f88M3lY_eHl!I%5~Yu1W%YOJ+`O2jj~Z?uWT*m zs%+F!mg?}S74V?+GRA*KY8}SUcQ^O*Ol*F`9Bio#@TFGUBXAO)^YHZ1R{vt+0WJYaJWIak(~&{A@LzbU~Z8Dp4oIP3+?q>EIS_ z<4f>UufclHv%AnN@dWTVr0&9ChE%;i#WiLx&oPS*2&@vo@fNIk4#x=4hAr9-0d495 z52l{+j{K>2j8{>Oz6KisHMqpS7=4V|NYkE>MkvAQXQ#%ib-luXNUqioxEFrSSy11s5My@w~A6B2WEfHGs+TIH&7N-%HD&@-ui0Ec1`HI zzFLJ>G=2OPsZ(pXQ8rNl7-gqA-!e z!o>X~73a)+9^Ot9ZF+e&yfXm7+Nx?Gbp>8$)LW);EMtkusc#*E>LYZF7M0_2sSg*j z5fbdU3`fhr-J0jrI{EvsUF-XIRR&G*<1>C%R9LG~Z-cNGDS33N1a1|deVgcLxh7zs zTD)ujE|y^&_Y1g41;pQmp!D=umw~(Yt9Uk<8Vs4z!ZGkXXA+7ijzO>*jLJq{n4uDN zyluGQ_Hq7$(u(w<7)n1AOh!#O3@pV`xU=lVU<13*xgu)b&T?L);?6tPjW?mQWdlDi Ryc}L>k`T8^8@D~^$v;m8{To11eD zJpa1+<>QB!4C6~qCXbE5Dq2DyjgBFOGzX$%{@Re1v|k$14y*&KV`E;CRm`h;UctO3 z9n77eg7vC2|7f)9BiR}T{?p~*p6jj+{3HpImLJJhZzt-eVH_o`G;YbjAGCJEw6AfK zRv5K5gDBn(yRB{?># zOF967(GlRq3@mc<(l`(uTP|t(MW>R}4T|g1eQ9)RvLT!39XTiG(bwfUxq#l4=j8?T z4S7*6qHoGe;P7&0KLmqa6UXqe(B{zA(2~yq#^TU8G7g0lM`GW6aV<5+<^ia9YOC90 z^U#vkkr2kSg$-kDj7@1@HBuXWf$2k0GmemEWsyh;BFtvCSZevZ_BEayu)G z(yY=O#D3bgGOIgCGJ6n4L8nIOEA?L8cQkY>@@2Pm(<6gJ1OEM2&nDc_ax5SdTAx865=yHjciDgBB zbQ?s)!=3`uDt>^$Sd0x1$^#T*3(y?ffY!JIXpgIam2nNQI(7hS<2s--b^+@WYWG|X z8`6N1#Cp4_&S6)zKyaSm0>MRsMFRM&p_T|P6I=nvYBGqDFx|`Q-8hmVeXHHb>?9cU zzW#1nz&W_d%^lARqcHWn_b^L%o?%tRtlwx(3vYtorFHRPPs7tP`v3z8nQejhE%uPy z7UZ@Csck`KTaefmPv>?D*DTmj2GA-O4{z%XSglYJ0R*T-!;#J zp?cmutR?gZLtGFGV$>{{)QhKhEbXERWYDcSyi(}Z%(Y3qzWG4-g6I7TC<*s6ribZ; zGR|UrMhqn~#7!IdDO9cVe;Rkimhjptblh z1}Op?xHJ*k1jg1FK~9RR#=fa%)GTI)_7UZfk1I#Ub?Ee|onszWkBoilz6SrXt`=Gz zecF1|50NU8WM>;GBklVsK!8-Ct-OV`I2!D=cDaBFj6U_@(^BUVBoH9Gt>PKNpLN!W z;?}yqj!zYGOb^?ufXPPhu0^z5exrlqCRoJt&tJ5i%#4Pa6DB=6LZGFaRd)LUV_tV> z8)Rmd2HRnzJ-2OZ*>`G@A8q@|(~cWPGI-%33?`jsH;Awcyb7<*>g6KNC>7)7m<%%s zs^n-A-pB7a7`2-c0r&OH6 z`QN?g?OY%X3Yi6!oz>AC-J^`Ok58O^L7#nUc^x6?+d$E2BnQ*!bVSdZqO|82c9~l} zFLOO_JC-{Gj+>tMY{wrID~&FzD(IshOZ6@8z_g~?1m7Wem*9s4KO%Ua;Ku|^q9?#j zPN^2Mo-ofiX45rP3%`bAI+n)hVzGSq$tqg%2w+lZ_S>F6mA&D1)y$0#OsIk~uDY=FMs8-Fl9$?3G#GojiL3=An~S28xZX#y$xPlkh>_jHOcGPLRo8FHZ-?OuW@UG+g+%C6yO%2ZZp$VVdbUMcd^M#={afa>E!wx7*pz|PB&REiDRWz7K=ShSZ1K< zqC$#IkauXOC>2>kY>J*Xdfu*pQyrriVF_|rlP2y~=Dzjf=GcOCuRS*R?R}J&hfZ2Y zfnJgJE8s%f_X}xP)^*m;WAJM!(N*6ADVgOz!HqP73qL|H)~LKYlEGH%Ao^yxep)qrP!E?@{Fjj1@L5KM^nGCCV|C1*!Ws3%I`Zf!%$;qh>{@il5iCKodnRd7$=*;VYJjW zV -np.log10(args.bioalpha), techq > -np.log10(args.techalpha)) temp_peaks = np.logical_or(temp_peaks, this_rep > 0) - print "Fractions passing bio and tech filter" - print np.sum(this_rep == 1)/(bio_qs[0].size + 0.0) - print np.sum(this_rep == 2)/(bio_qs[0].size + 0.0) - print np.sum(this_rep == 3)/(bio_qs[0].size + 0.0) - print np.sum(this_rep == 4)/(bio_qs[0].size + 0.0) + print ("Fractions passing bio and tech filter") + print (np.sum(this_rep == 1)/(bio_qs[0].size + 0.0)) + print (np.sum(this_rep == 2)/(bio_qs[0].size + 0.0)) + print (np.sum(this_rep == 3)/(bio_qs[0].size + 0.0)) + print (np.sum(this_rep == 4)/(bio_qs[0].size + 0.0)) return temp_peaks def idr_filter(temp_peaks, idrs): @@ -233,6 +233,7 @@ def find_min_value(start, stop, vallist, ret_index=False): parser.add_argument('--resolution', type=float, help="resolution of data", default=1) parser.add_argument('--bins', type=int, help="bins to consolidate over", default=30) parser.add_argument('--outpre', type=str, help="output prefix") + parser.add_argument('--chrom_name', type=str, help="Chromosome name") args = parser.parse_args() actual_sigs = [np.load(x) for x in args.log2ratios] @@ -259,7 +260,7 @@ def find_min_value(start, stop, vallist, ret_index=False): max_valbio = find_max_value(entry[0], entry[1], bio_qs) max_validr = find_max_value(entry[0], entry[1], idrs, False) max_valtech = find_max_value(entry[0], entry[1], tech_qs, False) - this_peak = peak.Peak("gi|48994873|gb|U00096.2|mod|ATCC.47076|", start=int(start), end=int(end), + this_peak = peak.Peak(args.chrom_name, start=int(start), end=int(end), name = "%s_%s"%(args.outpre,i), score = max_value, signalval = max_valtech, From 0883171acf5578919bdd4c0aa656dd90ca08c980 Mon Sep 17 00:00:00 2001 From: jwschroeder3 Date: Mon, 13 Jan 2020 12:54:30 -0600 Subject: [PATCH 13/13] jit bootstrapping --- .../bootstrap_sam_file.cpython-36.pyc | Bin 12272 -> 16113 bytes ChIP_analysis/bootstrap_sam_file.py | 1 + .../bootstrapped_chip_no_consolidation.py | 10 ++++-- ChIP_analysis/save_csv_files.py | 34 ++++++++++++++++++ 4 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 ChIP_analysis/save_csv_files.py diff --git a/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-36.pyc b/ChIP_analysis/__pycache__/bootstrap_sam_file.cpython-36.pyc index 0a80ee25094dccfda01e78b5fcd81355a1241105..ddc5366fd32e18dfd3966067218b5ac29658b436 100644 GIT binary patch delta 6404 zcmai2U2GfIm7W=LD3bc4WLdH-*&fTXCE5~W`6sef$BCoFv6}c-j+IQ4G9%tAY3SjQ zzB5uJnhJMUPLTvK;4Ogy4T}DtK!E~%Xsa&vXl!`oF*bgPV%-|4Pf(hW;xkUQYkl(Ybn> zDNJ+P%?{ainw@ClPFJbh>{b;C%a|Foy3aoVsto@O)rJ4h*z&csiBh@^>x>@FtVh*w%){FKa+sFFQ4zc}p){o9GW9$GM z1nMC+z&nb1@$i{{_Gujh*A*xUflYfFW!x#D^Db&29ils6?bUVPD+A+p zJdOTEF#jPR=F>R;88^v?!XBS4er-zu|C!>YsC^08=eE?ksyxx$MGU6c5EhP$etwyE z7heYC6?St=1EVW);agbvO0+CVe_IODDm+2&Na(8(p0p%z!q}XJUA@Ln@g%IL z%e4QiOB$x$VDs3+HNGFJn}M8jK>B*5lQ*Bl#Ao~vzXteQM7%{d(owvQ{yBba3kKzW zvhMO57+>8|UsK-yz`V)R#WZ-`0_-;Ee@U%*+f9JS9rHFn!e{txeyt9(&_34q%_VJ+ z)c-o4-O|B4B%YT$zl%NHM_JPwByAs_F(#t)m zm%B~9ybTPD%=$H{mTSd(7%cDs>~VqiBbhINXn|(rBE;sMXt{Uz0d}4*uy-HF%y%WO zDHkxC3y^M1_Kg9BzZ-6m;X)5$15M4D?7ab{-kd)F-1Pe`#NT{wHs3P)!E>`#%k1LL zEYs$dEPqPQFslVgX0O?e&e-UgtQzR^{K4E!K=(P^^87qs_XRKU{F>#g@}Q?6II|ZW zp0{0}i#2QebnHD%?P@&M@2IbB|DRq|?{t7`&|N6IYh3twznph$&lhQeCag+@yG*pp zg+(VCGwJdV^MYGUbYU1)-GrR97?Tro+uv(*)mh>zdZ`L|K{D{lY9X7{4jXSTpVST? z@14f=)PJ8LYC+7hswCZupJ@@@62jWxjk)x%j{gHPDcSw)k@k6YUP?NI6@rAYTvjdx zZPY?#a_3V1Y)md5#4EyG4s_dPe0{GZsmiM3NEJNT{&mOaEsVus+61Xl4AMGruPj0x zXX7DX)fGo*+MsHWX46f+-bdU2)|FQ;HznR-XW{T@$B}vib!jh?ln0zVvwZ6@)H>X7 zGryhLWh0NM%@WHb*jFezO>=U4-sg>K&mZ5Ac$Co}qlK)ik=OzE+Gq3)xbKkfZb|HFX#7lJX$404iMQ*bUJG&dVrD{huD?u4LnZ z4#QjKVgjJ>Xk*@LDH{uPY1u)lP$}iZu4=bvKwiMMW$x#J@G}s5apJ1=sp2BqFF*e- zeLsDn+Yw+hxy|0P)Fw(!O)MB;x zQfglD6E&p<71h)&ZBzH#YC8BPK$Q&P2&bEE7^YCt5vtBWfWSc_Bx^e99d%6BanxV z?8Av#mB!SGzQ#wRZLheMoIJPejIq-VmxGv7#=7AefvIRy`EXO?JB+_-jf=$PaHDcG-c>1k#%Y7RhPBL_;C=#68714z z7i@!c;Z#lw{C_VuT8gFv$B$V?;3Bk&8 zn^1$GFGf(qTJ*}!s?QC^*KDcpCgLO^PPga{CuX}gaOE-{PViY(N+r~Z>Ou<@j(iU) zoxV_UPoBc26nR~rl8d;JZ4a+!()PlY#Cd=b7hn=g)PPqILnV?#NN<-Q2F(PCaF0RE zTC987lz35+Nj(ScAuEdAUeE?BK-69eJH)OXw&C&cu!E!z1J65rO4s2zMk&2Rtg z;O|th`*e6rO*MWod}#D8F?ok1YIDlV%ecEi911Rr1)4Ca>TCRIIJ{O=!m{TZZt?zM&#Ho7atHGiz>{k=ge>@U z_-ds6OoemQwm&)cBei}5$9kB&jFpFt;;QpMpoJTlENrpE+-K20lG zl`t8&feYfvz}YDTXbL(i`Y3=lB{pOY{V&gMxx5<_Y4P=$}SsW3`dno<4<2J4?i zQNF$t=GQk72WzqLDL`R0vnAJ&2e&l z1dFuQl3Na_aYO~-Pf*THj?|_I<;i$PUQLopqOb>siyU}JHDT+__|W!O5Fw)g zK@{}C4vg_{j}6pABE&1(l`N%`4D%V7^f{J$wo{^a^PEtA?`9Z z9ibZ`dd^s|9f#DlO5q-wgBL-(4rh@RR5{YixW9okm;|VHPEL|Dr_<3L+@zkyPfoqw z{VhxdZHzBjtBx<~+kZZFU#$nJD8H!k>iKz@vw+=>WYq4Qi9wfjH>yQv@8zmplarwx zc<_tQ`3%a9fgSmGYK|pp|vmZFHYX)^Yn~@*$s$-1Dm)lat_)uRsy@dUfFV zxaAdmyTr4eaeTbQJr6lV*2A6?aHb_3H+H$ko$AmnBrM1Vnu~$$Vv!eokyp2@?_KoxLZ@lu&_Q2UKV?!_ zWB)w)GU6?HMog-mZ-)xtAbPlo*UkYtr9p|zjrzk)oN4(!rVNk64wzT%o43}8O{7X` z{kiIq9qEu>L@IxFp+jNP$c$#%E>lr@Adz^>vzAk!`$nuw+;S%<^{N+awAOmqunAmr zwvz*aj{zMU+$yXzFZCWvs9;x3BMI{#GDmCO_K0~Ctq3sszmLW;iGL-3B%h+Nx2y|lOnu$DiQ?RKeI&n!FJmx6LdrIEGjAM#DB^QiS zpobfpM48%eG-{J4>W9z`V`dz7wIaxkOFx3^1p^Fp+^fn)n2J$<(K2JN+GRl;Wh4&z zn<;dDg*&*B`K492;K__k=4L@6v`P6c3}Qvwmro|~CRm&i^TbF-?iK1$+?6pdXul)> zDVWD|0)Lwjn1YqaH4?ex|1NJT8K?Z1IR&_HZavx8vDh7iP=IS&@;< z-MD&Fd_=qoRDDX-zfi?ck&DE=5Mzz+=SJ%CEi7Nfl+}n2QJI;hw~Cxwke1e(C$|LG z3Y!mOr(Wfv>;-AKbV{oc9D?>8F9jz>e{uL{&xDRzzPCT7^^F%W;*B3eRDlkR&ELWXKNWp_C*t(D?| z4-t2rPfI@3rCbHMOe2$*;1E_FewmU6?+a9!C>T&UNXpRAfjX1Sh<-hNG&ztQMLC)7 z>Fen?8sBjj;r9qS?WE9y~ld%L~QPGlGM(t4hQ826ZB4>{9(Pp#`wHI|iYUI+O h=^!W|ktkY1I}FG$XnVB|4Mp*m{}*SWkB5NW@lz+ zznz_#oqKTOemXkc+UkGs^^X_$tR(#;)jdtXAA!&QPJT6daw9+`Di>N(EsGLWsdiJM z8f&d6sUY>yrkhf#jq0=+#t`+>0F3Rlg|@=jK@ZV38oEhRoixY-c`e^nkuoxEryZc{ zW<9j?s(eT0-!LWLTai+ItS`TfcF}Iyb6ZYrXWR2TXfN&CpxZ#~XZ`t|bUWRF5|M<| zPj}Wy19Xsv>8{%f%;M4x8WAf8*Z`~?qR~xTl=^9`P8w!|bT=IlD{(ds@*X-$$8M{D zwSlT}P|0*+Q-zwnu-(4R?NFW+i>B%5sU2IoALVMxdg#Qlw5KRdF zW2WY#shAmL!$NzI&cLB#!0+Aw)4QyL#VRBVj=V>7%W^q?6Bgb7H*UN*>dkoUnixkIXu3IT2^Bg6bO3xb;?fVv-a2LNqD0ww`@ zDz6QI`rvWtVpnR0b+TzTg8&r>_#OG$+4B-W9|Fx7o322L%xBk5uvw6&Dg@TdrsjZ8 z0?u1Frjn))&<>}PtecInB%6l#%C}@Tmyx&Oj+1Pnq9JnMwU-;Ask|)3XWet!Wc73H zd6@*<9({=%spj-NS!#yL_1ab}^99D8v}2_UIosh_)1J>Lm6%Dn4nkb!ei%}^$zDuz zW>6jgu|`_ZST`sY6ARUt-z3Knm$xCQVX-v_u(n{Dw_xl>oaQYY`>gwYN5A_p@Zl69 za0HAK;YH(`P@Ve*=f-uzz1#9h?0=RC@3PeoT9?VP@U{P+IGi`6w0rMSr2n;3f!F^QQ;fAzXRKo$Ee#sugY8k#r)# zOw@{J0J#W~7?Ryc_8@s2Ai-e`avzX{d#`(t_^OlrC*>iWiW86gBTss=U=%M?WByb3 z?SYBmAlUu-7d(6laUKy!c^DL)QZ&|h$bB#{((`yEb<*dnw7SJq)7d*Xus45l{v zB?W!N53f#INFV7S{8Y7Tcu>27q4adT=!Klg_%RUqaU^1=L&%AA9Yt;dh^H~mEzXm` z$2Go)`V&Hr0F$1E!<46hRbB2{+&4%2ZpbPd4 zkTSU--%xI-Wyz5oQdY`xttksh-I8v}nzT)#<`W-VNn@=|$-l+}t1Ek1NZ1NY^!R#xz*a4mQq+yysf!*kD1obG+fqPa|t*NQC z?-H6XRv%97lgTA_=)hM=&oA~)+qreN9x5<1>s$$2nQ)ewR*^vnRp&a~c3imLIzDc$ z7SlC&n+IKD)ot$NQf%;&Gyok0%(N51%W;!|Ny6Bl)q-h8zd~5L6$uf`RQq8y?tI>vL z3J%rfQDn6C(@jDIR|{+&U9(>X5`vptdqc7=>w3u1d-W)Md;FoUc6V_;av~(IyI%$~ m`0(oErzmngpY#{qcl>7}}M2`Ug diff --git a/ChIP_analysis/bootstrap_sam_file.py b/ChIP_analysis/bootstrap_sam_file.py index 84d71c2..732c0fb 100755 --- a/ChIP_analysis/bootstrap_sam_file.py +++ b/ChIP_analysis/bootstrap_sam_file.py @@ -489,6 +489,7 @@ def numba_sum_coverage_from_sample(sampled_reads, array, res): sample(sampler, num_reads, array[:,i], args.resolution, prng) finish = time.time() logging.info("Sample {} took {} seconds".format(i, finish-begin)) + logging.info("Saving output array.") np.save(args.outpre, array) elif args.command == "summarize": diff --git a/ChIP_analysis/bootstrapped_chip_no_consolidation.py b/ChIP_analysis/bootstrapped_chip_no_consolidation.py index fb53bcb..d00e450 100644 --- a/ChIP_analysis/bootstrapped_chip_no_consolidation.py +++ b/ChIP_analysis/bootstrapped_chip_no_consolidation.py @@ -461,7 +461,8 @@ def mp_actual_reads(samplers, size, res, p=1): samp_final = np.zeros((array_size, args.num_replicates, num_ext_samp)) cont_final = np.zeros((array_size, args.num_replicates, num_ext_cont)) - pat = re.compile(r'Sample_\d+') + # pat = re.compile(r'Sample_\d+') + pat = re.compile(r'SRR.+') # grab the sample names from the treatment sampler files samp_matches = [pat.search(s) for s in args.ChIP_samps] if None in samp_matches: @@ -485,8 +486,11 @@ def mp_actual_reads(samplers, size, res, p=1): sample_info_tmp = (pd.read_csv(sample_info_file_name, header=18) >> select(X.Sample_ID, X.Description)) else: + # sample_info_tmp = (pd.read_csv(sample_info_file_name, sep='\t') >> + # mutate(Sample_ID = 'Sample_' + X.SampleID.astype(str)) >> + # select(X.Sample_ID, X.Description)) sample_info_tmp = (pd.read_csv(sample_info_file_name, sep='\t') >> - mutate(Sample_ID = 'Sample_' + X.SampleID.astype(str)) >> + mutate(Sample_ID = X.SampleID.astype(str)) >> select(X.Sample_ID, X.Description)) if i == 0: sample_info = sample_info_tmp @@ -519,6 +523,8 @@ def mp_actual_reads(samplers, size, res, p=1): all_sim = [] for sampler in all_sims: logging.info("Reading in sampler {}".format(sampler)) + if not sampler.endswith(".ob.npy"): + sampler = sampler + ".ob.npy" all_sim.append(read_in_sampler(sampler)) # all_sim = [read_in_sampler(sampler) for sampler in all_sim] diff --git a/ChIP_analysis/save_csv_files.py b/ChIP_analysis/save_csv_files.py new file mode 100644 index 0000000..efc170c --- /dev/null +++ b/ChIP_analysis/save_csv_files.py @@ -0,0 +1,34 @@ +#%% +import numpy as np +import os +import fnmatch +import pandas as pd + +# %% +file_names = ['PcrA_Myc-PcrA_Sub_Myc_control_median.npy', + 'PcrA_Myc-PcrA_Sub_Myc_control_maxci.npy', + 'PcrA_Myc-PcrA_Sub_Myc_control_mean.npy', + 'PcrA_Myc-PcrA_Sub_Myc_control_mad.npy', + 'PcrA_Myc-PcrA_Sub_Myc_control_minci.npy', + 'PcrA_Myc-PcrA_Sub_Myc_control_var.npy'] +# %% +arr_list = [] +for fname in file_names: + arr_list.append(np.load(fname)) +dat = np.array(arr_list) +# %% +dat = np.transpose(dat) + +# %% +dat.shape +# %% +df = pd.DataFrame(dat) + +# %% +df.columns = ['median','maxci','mean','mad','minci','var'] + +# %% +df['position'] = np.arange(0, dat.shape[1]) * 10 + 5 + +# %% +df.to_csv('PcrA_over_Myc_control_enrichment_stats.csv') \ No newline at end of file