diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 00000000..f5f9313e --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,6 @@ +# This is a file that can be passed to git blame to keep it from thinking that +# I wrote every line of this because I was the one who ran Black on it. Use +# git blame [file] --ignore-revs-file .git-blame-ignore-revs +# or to make this permanent, +# git config blame.ignoreRevsFile .git-blame-ignore-revs +6ebb6072ed57d2b73b98bc9c94c7d9fe432c8364 diff --git a/.travis.yml b/.travis.yml index 21bedead..043ebf1f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,15 +3,16 @@ os: linux language: generic python: 3.7.1 -#go: "1.5.4" dist: xenial services: - docker stages: - - name: docker-build + - name: docker-build-master if: branch = master + - name: docker-build-develop + if: branch = develop - name: test addons: @@ -33,10 +34,22 @@ addons: jobs: include: - - stage: docker-build + - stage: docker-build-master script: - - if [[ "$TRAVIS_PULL_REQUEST" == "false" ]]; then docker build -t quay.io/ucsc_cgl/cat:latest .; fi - - if [[ "$TRAVIS_PULL_REQUEST" == "false" ]]; then docker login --username $QUAY_USERNAME --password $QUAY_PASSWORD quay.io; docker push quay.io/ucsc_cgl/cat:latest; fi + - > + if [[ "$TRAVIS_PULL_REQUEST" == "false" ]]; then + docker build -t quay.io/ucsc_cgl/cat:latest . + docker login --username $QUAY_USERNAME --password $QUAY_PASSWORD quay.io + docker push quay.io/ucsc_cgl/cat:latest + fi + - stage: docker-build-develop + script: + - > + if [[ "$TRAVIS_PULL_REQUEST" == "false" ]]; then + docker build -t quay.io/ucsc_cgl/cat:develop . + docker login --username $QUAY_USERNAME --password $QUAY_PASSWORD quay.io + docker push quay.io/ucsc_cgl/cat:develop + fi - stage: test script: - set -ex @@ -50,6 +63,7 @@ jobs: - sed -i "s/maxCores = self.numCores/maxCores = 8/g" /opt/pyenv/versions/3.7.1/lib/python3.7/site-packages/toil/batchSystems/singleMachine.py - sed -i "s/maxMemory = self.physicalMemory/pass/g" /opt/pyenv/versions/3.7.1/lib/python3.7/site-packages/toil/batchSystems/singleMachine.py - > + DOCKER_IMAGE=quay.io/ucsc_cgl/cat:develop luigi --module cat RunCat --hal=test_data/vertebrates.hal --target-genomes='("hg38", "galGal4")' --ref-genome=mm10 --workers=2 --config=test_data/test.config --work-dir test_install --out-dir test_install --local-scheduler - --augustus --augustus-cgp --augustus-pb --assembly-hub --log-level DEBUG + --augustus --augustus-cgp --augustus-pb --assembly-hub --log-level DEBUG --logLevel DEBUG diff --git a/Dockerfile b/Dockerfile index d043be0e..475e45cb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,90 +1,152 @@ -FROM ubuntu:18.04 AS builder -ARG AUGUSTUS_COMMIT=36ae43d -RUN apt-get update -RUN apt-get install -y build-essential libssl-dev libncurses5-dev libcurl4-openssl-dev liblzma-dev libbz2-dev \ -libboost-all-dev sqlite3 libsqlite3-0 libsqlite3-dev libgsl0-dev lp-solve liblpsolve55-dev libbamtools-dev wget git - -# htslib -RUN git clone git://github.com/samtools/htslib.git -RUN cd htslib && make install - -# bcftools -RUN git clone git://github.com/samtools/bcftools.git -RUN cd bcftools && make - -# samtools -RUN git clone git://github.com/samtools/samtools -RUN cd samtools && make && make install - -# MOVE Directories INTO $HOME/tool -RUN mkdir /root/tools -RUN mv samtools /root/tools -RUN mv htslib /root/tools -RUN mv bcftools /root/tools - -# Augustus -RUN git clone https://github.com/Gaius-Augustus/Augustus augustus -RUN cd augustus && git reset --hard ${AUGUSTUS_COMMIT} -RUN 
echo 'COMPGENEPRED = true' >> augustus/common.mk -RUN echo 'SQLITE = true' >> augustus/common.mk -RUN cd augustus/auxprogs/homGeneMapping/src && sed 's/# BOOST = true/BOOST = true/g' -i Makefile && sed 's/# SQLITE = true/SQLITE = true/g' -i Makefile -RUN cd augustus && make - -# HDF5 -RUN wget -q http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.1/src/hdf5-1.10.1.tar.gz -RUN tar xzf hdf5-1.10.1.tar.gz -RUN cd hdf5-1.10.1 && ./configure --enable-cxx --prefix=/usr -RUN cd hdf5-1.10.1 && make && make install - -# sonLib -RUN git clone git://github.com/ComparativeGenomicsToolkit/sonLib.git - -# HAL -RUN git clone git://github.com/ComparativeGenomicsToolkit/hal.git -RUN cd sonLib && make -RUN cd hal && make - -# LibBigWig -RUN git clone https://github.com/dpryan79/libBigWig.git -RUN cd libBigWig && make install - -# WiggleTools -RUN git clone https://github.com/dahlo/WiggleTools -# Their makefile now hardcodes /bin/cc as compiler :( -RUN ln -s /usr/bin/cc /bin/cc -RUN cd WiggleTools && make - -# sambamba -RUN wget -q https://github.com/biod/sambamba/releases/download/v0.6.7/sambamba_v0.6.7_linux.tar.bz2 -RUN tar xvjf sambamba_v0.6.7_linux.tar.bz2 - -# Slimmer final Docker image - -FROM ubuntu:18.04 -RUN apt-get update -RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev exonerate -RUN add-apt-repository -y ppa:deadsnakes/ppa -RUN apt-get install -y python3.7 python3-pip -# Kent -RUN for i in wigToBigWig faToTwoBit gff3ToGenePred genePredToBed genePredToFakePsl bamToPsl transMapPslToGenePred \ -pslPosTarget axtChain chainMergeSort pslMap pslRecalcMatch pslMapPostChain gtfToGenePred genePredToGtf bedtools \ -pslCheck pslCDnaFilter clusterGenes pslToBigPsl bedSort bedToBigBed; do \ -wget -q http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/$i -O /bin/$i ; chmod +x /bin/$i ; done - -COPY --from=builder /hal/bin/* /bin/ -COPY --from=builder /sambamba /bin/ -COPY --from=builder /augustus/bin/* /bin/ -COPY --from=builder /augustus/scripts/* /bin/ -COPY --from=builder /WiggleTools/bin/* /bin/ - -RUN mkdir -p /augustus -COPY --from=builder /augustus/config /augustus/config - -# Python deps -RUN pip3 install bd2k-python-lib toil[all] pyfasta numpy matplotlib - -# make Python 3 primary python -RUN rm /usr/bin/python -RUN ln -s /usr/bin/python3.7 /usr/bin/python - -ENV AUGUSTUS_CONFIG_PATH=/augustus/config/ +# install python dependencies +FROM ubuntu:20.04 AS cat-python + +RUN apt update && apt install -y --no-install-recommends \ + gcc \ + python3-dev \ + python3-pip + +COPY ./setup.py / + +RUN mkdir cat tools \ + && python3 setup.py egg_info \ + && pip3 install -r cat.egg-info/requires.txt + +COPY ./ /cat + +RUN cd /cat \ + && sed -i'' "s#'augustus_cfgs/#'/opt/augustus/config/extrinsic/#g" cat/__init__.py \ + && python3 setup.py install + +######################################## + +FROM curlimages/curl:7.70.0 AS cat-binaries + +USER root + +WORKDIR /binaries + +# Need >= v395 for clusterGenes -minOverlappingBases option +RUN curl -LO http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/{axtChain,bamToPsl,bedSort,bedToBigBed,chainMergeSort,clusterGenes,faToTwoBit,genePredToBed,genePredToFakePsl,genePredToGtf,gff3ToGenePred,gtfToGenePred,pslCDnaFilter,pslCheck,pslMap,pslMapPostChain,pslPosTarget,pslRecalcMatch,pslToBigPsl,transMapPslToGenePred,wigToBigWig} \ + && chmod a+x /binaries/* + +RUN set -o pipefail && curl -L https://github.com/biod/sambamba/releases/download/v0.7.1/sambamba-0.7.1-linux-static.gz \ 
+ | gzip -d > /binaries/sambamba && chmod a+x /binaries/sambamba + +######################################## + +FROM ubuntu:20.04 AS cat-augustus + +# Install required packages +RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \ + autoconf \ + build-essential \ + ca-certificates \ + curl \ + libbamtools-dev \ + libboost-iostreams-dev \ + libgsl-dev \ + libhts-dev \ + liblpsolve55-dev \ + libsqlite3-dev \ + libsuitesparse-dev \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* + +# 2020-07-03 snapshot + bam2wig build simplification +# https://github.com/Gaius-Augustus/Augustus/pull/153 +RUN mkdir /src && cd /src \ + && curl -L https://github.com/harvardinformatics/Augustus/archive/08b7d320cbee586ebfbee410aeae90d81ce03f1e.tar.gz \ + | tar --strip-components=1 -xzf - \ + && make INSTALLDIR=/opt/augustus MYSQL=false HTSLIBS='-lhts' \ + && make -j install \ + && mv /opt/augustus-* /opt/augustus \ + && rm -rf /src + +######################################## +# https://github.com/Ensembl/WiggleTools/blob/597d84/Dockerfile + +FROM ubuntu:20.04 AS cat-wiggletools + +RUN apt update && apt install -y --no-install-recommends \ + ca-certificates \ + libgsl-dev \ + libhts-dev \ + libbigwig-dev \ + libcurl4-openssl-dev \ + gcc \ + python \ + make + +WORKDIR /build + +# 2020-06-02 snapshot +ADD https://github.com/Ensembl/WiggleTools/archive/c1daac89e3775bc8f96376fc1ed7f7e645ce168c.tar.gz wiggletools.tar.gz + +RUN tar --strip-components=1 -xzf wiggletools.tar.gz \ + && make LIBS='-lwiggletools -lBigWig -lcurl -lhts -lgsl -lgslcblas -lz -lpthread -lm -llzma' + +######################################## + +FROM ubuntu:20.04 AS cat-hal + +RUN apt update && apt install -y \ + libhdf5-dev \ + g++ \ + make \ + zlib1g-dev + +WORKDIR /sonLib +# 2020-06-16 snapshot +ADD https://github.com/ComparativeGenomicsToolkit/sonLib/archive/ea0b939828ba24d998a7c1aa407ff5a016912f56.tar.gz sonLib.tar.gz +RUN tar --strip-components=1 -xzf sonLib.tar.gz +RUN make -j + +WORKDIR /hal +# 2020-07-08 snapshot +ADD https://github.com/ComparativeGenomicsToolkit/hal/archive/f8f3fa2dada4751b642f0089b2bf30769967e68a.tar.gz hal.tar.gz +RUN tar --strip-components=1 -xzf hal.tar.gz \ + && make -j \ + && mkdir /binaries \ + && mv bin/hal2fasta bin/hal2maf bin/halStats bin/halLiftover /binaries \ + && strip /binaries/* \ + && rm -rf /hal + +######################################## + +FROM ubuntu:20.04 AS final + +RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \ + bamtools \ + bedtools \ + exonerate \ + libbamtools2.5.1 \ + libbigwig0 \ + libboost-iostreams1.71.0 \ + libcolamd2 \ + libcurl4 \ + libgsl23 \ + libhts3 \ + libsqlite3-0 \ + libsz2 \ + libsuitesparseconfig5 \ + python3-pip \ + samtools \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=cat-python /usr/local /usr/local +COPY --from=cat-binaries /binaries /usr/local/bin +COPY --from=cat-hal /binaries /usr/local/bin +COPY --from=cat-wiggletools /build/bin/wiggletools /usr/local/bin +COPY --from=cat-augustus /opt/augustus /opt/augustus + +# (2020-06-04) augustus_cfgs/log_reg_parameters_default.cfg identical to Augustus config/cgp/log_reg_parameters_default.cfg +COPY ./augustus_cfgs/*extrinsic*.cfg /opt/augustus/config/extrinsic/ + +# luigi looks for luigi.cfg in /etc/luigi/luigi.cfg by default +COPY ./logging.cfg ./luigi.cfg /etc/luigi/ + +# but need to tell luigi to look for logging.cfg at /etc/luigi/logging.cfg +RUN sed -i'' '/logging_conf_file/s#.*#logging_conf_file=/etc/luigi/logging.cfg#' /etc/luigi/luigi.cfg 
+ +ENV PATH=/opt/augustus/bin:/opt/augustus/scripts:${PATH} diff --git a/Dockerfile.complete b/Dockerfile.complete deleted file mode 100644 index bb3fd166..00000000 --- a/Dockerfile.complete +++ /dev/null @@ -1,161 +0,0 @@ -# install python dependencies -FROM ubuntu:20.04 AS cat-python - -RUN apt update && apt install -y --no-install-recommends \ - gcc \ - python3-dev \ - python3-pip - -COPY ./setup.py / - -RUN mkdir cat tools \ - && python3 setup.py egg_info \ - && pip3 install -r cat.egg-info/requires.txt - -COPY ./ /cat - -RUN cd /cat \ - && sed -i'' "s#'augustus_cfgs/#'/opt/augustus/config/extrinsic/#g" cat/__init__.py \ - && python3 setup.py install - -######################################## - -FROM curlimages/curl:7.70.0 AS cat-binaries - -USER root - -WORKDIR /binaries - -# Need >= v395 for clusterGenes -minOverlappingBases option -RUN curl -LO http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/{axtChain,bamToPsl,bedSort,bedToBigBed,chainMergeSort,clusterGenes,faToTwoBit,genePredToBed,genePredToFakePsl,genePredToGtf,gff3ToGenePred,gtfToGenePred,pslCDnaFilter,pslCheck,pslMap,pslMapPostChain,pslPosTarget,pslRecalcMatch,pslToBigPsl,transMapPslToGenePred,wigToBigWig} \ - && chmod a+x /binaries/* - -RUN set -o pipefail && curl -L https://github.com/biod/sambamba/releases/download/v0.7.1/sambamba-0.7.1-linux-static.gz \ - | gzip -d > /binaries/sambamba && chmod a+x /binaries/sambamba - -# CAT v2.1.0 needs more recent hal2fasta supporting the --onlySequenceNames option -#RUN set -o pipefail && curl -L https://github.com/ComparativeGenomicsToolkit/cactus/releases/download/v1.0.0/cactus-bin-v1.0.0.tar.gz \ -# | tar -C /tmp -xzf - \ -# cactus-bin-v1.0.0/bin/hal2maf \ -# cactus-bin-v1.0.0/bin/hal2fasta \ -# cactus-bin-v1.0.0/bin/halLiftover \ -# cactus-bin-v1.0.0/bin/halStats \ -# && mv /tmp/cactus-bin-v1.0.0/bin/* /binaries && chmod a+x /binaries/hal* - -######################################## - -FROM ubuntu:20.04 AS cat-augustus - -# Install required packages -RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \ - autoconf \ - build-essential \ - ca-certificates \ - curl \ - libbamtools-dev \ - libboost-iostreams-dev \ - libgsl-dev \ - libhts-dev \ - liblpsolve55-dev \ - libsqlite3-dev \ - libsuitesparse-dev \ - zlib1g-dev \ - && rm -rf /var/lib/apt/lists/* - -# 2020-07-03 snapshot + bam2wig build simplification -# https://github.com/Gaius-Augustus/Augustus/pull/153 -RUN mkdir /src && cd /src \ - && curl -L https://github.com/harvardinformatics/Augustus/archive/08b7d320cbee586ebfbee410aeae90d81ce03f1e.tar.gz \ - | tar --strip-components=1 -xzf - \ - && make INSTALLDIR=/opt/augustus MYSQL=false HTSLIBS='-lhts' \ - && make -j install \ - && mv /opt/augustus-* /opt/augustus \ - && rm -rf /src - -######################################## -# https://github.com/Ensembl/WiggleTools/blob/597d84/Dockerfile - -FROM ubuntu:20.04 AS cat-wiggletools - -RUN apt update && apt install -y --no-install-recommends \ - ca-certificates \ - libgsl-dev \ - libhts-dev \ - libbigwig-dev \ - libcurl4-openssl-dev \ - gcc \ - python \ - make - -WORKDIR /build - -# 2020-06-02 snapshot -ADD https://github.com/Ensembl/WiggleTools/archive/c1daac89e3775bc8f96376fc1ed7f7e645ce168c.tar.gz wiggletools.tar.gz - -RUN tar --strip-components=1 -xzf wiggletools.tar.gz \ - && make LIBS='-lwiggletools -lBigWig -lcurl -lhts -lgsl -lgslcblas -lz -lpthread -lm -llzma' - -######################################## - -FROM ubuntu:20.04 AS cat-hal - -RUN apt update && apt install -y \ - libhdf5-dev \ 
- g++ \ - make \ - zlib1g-dev - -WORKDIR /sonLib -# 2020-06-16 snapshot -ADD https://github.com/ComparativeGenomicsToolkit/sonLib/archive/ea0b939828ba24d998a7c1aa407ff5a016912f56.tar.gz sonLib.tar.gz -RUN tar --strip-components=1 -xzf sonLib.tar.gz -RUN make -j - -WORKDIR /hal -# 2020-07-08 snapshot -ADD https://github.com/ComparativeGenomicsToolkit/hal/archive/cb7c044731271ec41640db71f5694af53a0ead57.tar.gz hal.tar.gz -RUN tar --strip-components=1 -xzf hal.tar.gz \ - && make -j \ - && mkdir /binaries \ - && mv bin/hal2fasta bin/hal2maf bin/halStats bin/halLiftover /binaries \ - && strip /binaries/* \ - && rm -rf /hal - -######################################## - -FROM ubuntu:20.04 AS final - -RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \ - bamtools \ - bedtools \ - exonerate \ - libbamtools2.5.1 \ - libbigwig0 \ - libboost-iostreams1.71.0 \ - libcolamd2 \ - libcurl4 \ - libgsl23 \ - libhts3 \ - libsqlite3-0 \ - libsz2 \ - libsuitesparseconfig5 \ - python3-pip \ - samtools \ - && rm -rf /var/lib/apt/lists/* - -COPY --from=cat-python /usr/local /usr/local -COPY --from=cat-binaries /binaries /usr/local/bin -COPY --from=cat-hal /binaries /usr/local/bin -COPY --from=cat-wiggletools /build/bin/wiggletools /usr/local/bin -COPY --from=cat-augustus /opt/augustus /opt/augustus - -# (2020-06-04) augustus_cfgs/log_reg_parameters_default.cfg identical to Augustus config/cgp/log_reg_parameters_default.cfg -COPY ./augustus_cfgs/*extrinsic*.cfg /opt/augustus/config/extrinsic/ - -# luigi looks for luigi.cfg in /etc/luigi/luigi.cfg by default -COPY ./logging.cfg ./luigi.cfg /etc/luigi/ - -# but need to tell luigi to look for logging.cfg at /etc/luigi/logging.cfg -RUN sed -i'' '/logging_conf_file/s#.*#logging_conf_file=/etc/luigi/logging.cfg#' /etc/luigi/luigi.cfg - -ENV PATH=/opt/augustus/bin:/opt/augustus/scripts:${PATH} diff --git a/Dockerfile.old b/Dockerfile.old new file mode 100644 index 00000000..d043be0e --- /dev/null +++ b/Dockerfile.old @@ -0,0 +1,90 @@ +FROM ubuntu:18.04 AS builder +ARG AUGUSTUS_COMMIT=36ae43d +RUN apt-get update +RUN apt-get install -y build-essential libssl-dev libncurses5-dev libcurl4-openssl-dev liblzma-dev libbz2-dev \ +libboost-all-dev sqlite3 libsqlite3-0 libsqlite3-dev libgsl0-dev lp-solve liblpsolve55-dev libbamtools-dev wget git + +# htslib +RUN git clone git://github.com/samtools/htslib.git +RUN cd htslib && make install + +# bcftools +RUN git clone git://github.com/samtools/bcftools.git +RUN cd bcftools && make + +# samtools +RUN git clone git://github.com/samtools/samtools +RUN cd samtools && make && make install + +# MOVE Directories INTO $HOME/tool +RUN mkdir /root/tools +RUN mv samtools /root/tools +RUN mv htslib /root/tools +RUN mv bcftools /root/tools + +# Augustus +RUN git clone https://github.com/Gaius-Augustus/Augustus augustus +RUN cd augustus && git reset --hard ${AUGUSTUS_COMMIT} +RUN echo 'COMPGENEPRED = true' >> augustus/common.mk +RUN echo 'SQLITE = true' >> augustus/common.mk +RUN cd augustus/auxprogs/homGeneMapping/src && sed 's/# BOOST = true/BOOST = true/g' -i Makefile && sed 's/# SQLITE = true/SQLITE = true/g' -i Makefile +RUN cd augustus && make + +# HDF5 +RUN wget -q http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.1/src/hdf5-1.10.1.tar.gz +RUN tar xzf hdf5-1.10.1.tar.gz +RUN cd hdf5-1.10.1 && ./configure --enable-cxx --prefix=/usr +RUN cd hdf5-1.10.1 && make && make install + +# sonLib +RUN git clone git://github.com/ComparativeGenomicsToolkit/sonLib.git + +# HAL +RUN git clone 
git://github.com/ComparativeGenomicsToolkit/hal.git +RUN cd sonLib && make +RUN cd hal && make + +# LibBigWig +RUN git clone https://github.com/dpryan79/libBigWig.git +RUN cd libBigWig && make install + +# WiggleTools +RUN git clone https://github.com/dahlo/WiggleTools +# Their makefile now hardcodes /bin/cc as compiler :( +RUN ln -s /usr/bin/cc /bin/cc +RUN cd WiggleTools && make + +# sambamba +RUN wget -q https://github.com/biod/sambamba/releases/download/v0.6.7/sambamba_v0.6.7_linux.tar.bz2 +RUN tar xvjf sambamba_v0.6.7_linux.tar.bz2 + +# Slimmer final Docker image + +FROM ubuntu:18.04 +RUN apt-get update +RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev exonerate +RUN add-apt-repository -y ppa:deadsnakes/ppa +RUN apt-get install -y python3.7 python3-pip +# Kent +RUN for i in wigToBigWig faToTwoBit gff3ToGenePred genePredToBed genePredToFakePsl bamToPsl transMapPslToGenePred \ +pslPosTarget axtChain chainMergeSort pslMap pslRecalcMatch pslMapPostChain gtfToGenePred genePredToGtf bedtools \ +pslCheck pslCDnaFilter clusterGenes pslToBigPsl bedSort bedToBigBed; do \ +wget -q http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/$i -O /bin/$i ; chmod +x /bin/$i ; done + +COPY --from=builder /hal/bin/* /bin/ +COPY --from=builder /sambamba /bin/ +COPY --from=builder /augustus/bin/* /bin/ +COPY --from=builder /augustus/scripts/* /bin/ +COPY --from=builder /WiggleTools/bin/* /bin/ + +RUN mkdir -p /augustus +COPY --from=builder /augustus/config /augustus/config + +# Python deps +RUN pip3 install bd2k-python-lib toil[all] pyfasta numpy matplotlib + +# make Python 3 primary python +RUN rm /usr/bin/python +RUN ln -s /usr/bin/python3.7 /usr/bin/python + +ENV AUGUSTUS_CONFIG_PATH=/augustus/config/ diff --git a/README.md b/README.md index a301665c..7c47a023 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,53 @@ If you want to do the direct pip installation, you can grab the config files fro Either form of `pip` installation will install all of the python dependencies. +## Docker + +The Docker container for this repository will be automatically pulled from `quay.io/ucsc_cgl/cat:latest` if CAT is run with the default setting for `--binary-mode`. This mode of operation is suitable if you are using a cluster batch submission system such as gridEngine or Slurm. + +Alternatively, you can run CAT entirely self-contained using the Docker container, because the container contains the source code as well. This mode of operation is suitable if you are running on a single machine, such as a reserved node on a cluster. In this mode, the self-contained container can be run like this: + +~~~ +docker run -v $PWD/test_data/:/test_data/ quay.io/ucsc_cgl/cat:latest \ +luigi \ + --module cat RunCat \ + --hal=test_data/vertebrates.hal \ + --ref-genome=mm10 \ + --workers=10 \ + --config=test_data/test.config \ + --local-scheduler \ + --binary-mode local \ + --augustus \ + --augustus-cgp \ + --augustus-pb \ + --assembly-hub +~~~ + +## Singularity + +CAT also supports running from Singularity by converting the Docker container. If `--binary-mode` is set to `singularity`, CAT will automatically pull the Docker container and convert it to a Singularity image. The location where the image is built can be controlled by the environment variable `SINGULARITY_IMAGE`. If this variable is unset, the image will be built in the current working directory.
+ +The Singularity container can also be used in a self-contained fashion, for example: + + +~~~ +singularity exec --cleanenv cat_v2.1.0.sif \ +luigi \ + --module cat RunCat \ + --hal=test_data/vertebrates.hal \ + --ref-genome=mm10 \ + --workers=10 \ + --config=test_data/test.config \ + --local-scheduler \ + --binary-mode local \ + --augustus \ + --augustus-cgp \ + --augustus-pb \ + --assembly-hub +~~~ + +Thank you to @nathanweeks for building the Dockerfile for the standalone version! + ## Dependencies (if not using Docker) Below is a full breakdown of the required dependencies if you are not using Docker. Some of these can be challenging to get to all compile and work properly. In addition to the breakdown below, you may get guidance looking at the [Dockerfile](https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit/blob/master/Dockerfile) or looking at [this list of installation commands](https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit/issues/118#issuecomment-434910846) generated by a helpful user. diff --git a/cat/__init__.py b/cat/__init__.py index c951078e..430ddade 100644 --- a/cat/__init__.py +++ b/cat/__init__.py @@ -8,6 +8,7 @@ import logging import os import shutil +import pyfaidx import json import subprocess from collections import OrderedDict @@ -52,8 +53,8 @@ from .parent_gene_assignment import assign_parents from .exceptions import * -logger = logging.getLogger('cat') -logger.setLevel('INFO') +logger = logging.getLogger("cat") +logger.setLevel("INFO") ### @@ -72,25 +73,25 @@ class PipelineTask(luigi.Task): scheduler to know which parameters define a unique task ID. This would come into play if multiple instances of this pipeline are being run on the same scheduler at once.
""" + hal = luigi.Parameter() ref_genome = luigi.Parameter() config = luigi.Parameter() - out_dir = luigi.Parameter(default='./cat_output') - work_dir = luigi.Parameter(default='./cat_work') + out_dir = luigi.Parameter(default="./cat_output") + work_dir = luigi.Parameter(default="./cat_work") target_genomes = luigi.TupleParameter(default=None) annotate_ancestors = luigi.BoolParameter(default=False) - binary_mode = luigi.ChoiceParameter(choices=["docker", "local", "singularity"], default='docker', - significant=False) + binary_mode = luigi.ChoiceParameter(choices=["docker", "local", "singularity"], default="docker", significant=False) # AugustusTM(R) parameters augustus = luigi.BoolParameter(default=False) - augustus_species = luigi.Parameter(default='human', significant=False) - tm_cfg = luigi.Parameter(default='augustus_cfgs/extrinsic.ETM1.cfg', significant=False) - tmr_cfg = luigi.Parameter(default='augustus_cfgs/extrinsic.ETM2.cfg', significant=False) + augustus_species = luigi.Parameter(default="human", significant=False) + tm_cfg = luigi.Parameter(default="augustus_cfgs/extrinsic.ETM1.cfg", significant=False) + tmr_cfg = luigi.Parameter(default="augustus_cfgs/extrinsic.ETM2.cfg", significant=False) augustus_utr_off = luigi.BoolParameter(default=False, significant=False) # AugustusCGP parameters augustus_cgp = luigi.BoolParameter(default=False) cgp_param = luigi.Parameter(default=None, significant=False) - augustus_cgp_cfg_template = luigi.Parameter(default='augustus_cfgs/cgp_extrinsic_template.cfg', significant=False) + augustus_cgp_cfg_template = luigi.Parameter(default="augustus_cfgs/cgp_extrinsic_template.cfg", significant=False) maf_chunksize = luigi.IntParameter(default=2500000, significant=False) maf_overlap = luigi.IntParameter(default=500000, significant=False) cgp_train_num_exons = luigi.IntParameter(default=5000, significant=False) @@ -98,12 +99,12 @@ class PipelineTask(luigi.Task): augustus_pb = luigi.BoolParameter(default=False) pb_genome_chunksize = luigi.IntParameter(default=5000000, significant=False) pb_genome_overlap = luigi.IntParameter(default=500000, significant=False) - pb_cfg = luigi.Parameter(default='augustus_cfgs/extrinsic.M.RM.PB.E.W.cfg', significant=False) + pb_cfg = luigi.Parameter(default="augustus_cfgs/extrinsic.M.RM.PB.E.W.cfg", significant=False) # Hgm parameters hgm_cpu = luigi.IntParameter(default=4, significant=False) # assemblyHub parameters assembly_hub = luigi.BoolParameter(default=False) - hub_email = luigi.Parameter(default='NoEmail', significant=False) + hub_email = luigi.Parameter(default="NoEmail", significant=False) # Paralogy detection options global_near_best = luigi.FloatParameter(default=0.15, significant=False) filter_overlapping_genes = luigi.BoolParameter(default=False, significant=True) @@ -127,14 +128,14 @@ class PipelineTask(luigi.Task): in_species_rna_support_only = luigi.BoolParameter(default=False, significant=True) rebuild_consensus = luigi.BoolParameter(default=False, significant=True) # Toil options - batchSystem = luigi.Parameter(default='singleMachine', significant=False) + batchSystem = luigi.Parameter(default="singleMachine", significant=False) maxCores = luigi.IntParameter(default=8, significant=False) parasolCommand = luigi.Parameter(default=None, significant=False) - defaultMemory = luigi.Parameter(default='8G', significant=False) + defaultMemory = luigi.Parameter(default="8G", significant=False) disableCaching = luigi.BoolParameter(default=False, significant=False) workDir = luigi.Parameter(default=None, 
significant=False) - defaultDisk = luigi.Parameter(default='8G', significant=False) - cleanWorkDir = luigi.Parameter(default='onSuccess', significant=False) + defaultDisk = luigi.Parameter(default="8G", significant=False) + cleanWorkDir = luigi.Parameter(default="onSuccess", significant=False) provisioner = luigi.Parameter(default=None, significant=False) nodeTypes = luigi.Parameter(default=None, significant=False) maxNodes = luigi.Parameter(default=None, significant=False) @@ -145,98 +146,100 @@ class PipelineTask(luigi.Task): def __repr__(self): """override the repr to make logging cleaner""" - if hasattr(self, 'genome'): - return 'Task: {} for {}'.format(self.__class__.__name__, self.genome) - elif hasattr(self, 'mode'): - return 'Task: {} for {}'.format(self.__class__.__name__, self.mode) + if hasattr(self, "genome"): + return "Task: {} for {}".format(self.__class__.__name__, self.genome) + elif hasattr(self, "mode"): + return "Task: {} for {}".format(self.__class__.__name__, self.mode) else: - return 'Task: {}'.format(self.__class__.__name__) + return "Task: {}".format(self.__class__.__name__) def get_pipeline_args(self): """returns a namespace of all of the arguments to the pipeline. Resolves the target genomes variable""" args = tools.misc.PipelineNamespace() - args.set('binary_mode', self.binary_mode, False) - args.set('hal', os.path.abspath(self.hal), True) - args.set('ref_genome', self.ref_genome, True) - args.set('out_dir', os.path.abspath(self.out_dir), True) - args.set('work_dir', os.path.abspath(self.work_dir), True) - args.set('augustus', self.augustus, True) - args.set('augustus_cgp', self.augustus_cgp, True) - args.set('augustus_pb', self.augustus_pb, True) - args.set('augustus_species', self.augustus_species, True) - args.set('tm_cfg', os.path.abspath(self.tm_cfg), True) - args.set('tmr_cfg', os.path.abspath(self.tmr_cfg), True) - args.set('augustus_cgp', self.augustus_cgp, True) - args.set('maf_chunksize', self.maf_chunksize, True) - args.set('maf_overlap', self.maf_overlap, True) - args.set('pb_genome_chunksize', self.pb_genome_chunksize, True) - args.set('pb_genome_overlap', self.pb_genome_overlap, True) - args.set('pb_cfg', os.path.abspath(self.pb_cfg), True) - - args.set('augustus_cgp_cfg_template', os.path.abspath(self.augustus_cgp_cfg_template), True) - args.set('augustus_utr_off', self.augustus_utr_off, True) + args.set("binary_mode", self.binary_mode, False) + args.set("hal", os.path.abspath(self.hal), True) + args.set("ref_genome", self.ref_genome, True) + args.set("out_dir", os.path.abspath(self.out_dir), True) + args.set("work_dir", os.path.abspath(self.work_dir), True) + args.set("augustus", self.augustus, True) + args.set("augustus_cgp", self.augustus_cgp, True) + args.set("augustus_pb", self.augustus_pb, True) + args.set("augustus_species", self.augustus_species, True) + args.set("tm_cfg", os.path.abspath(self.tm_cfg), True) + args.set("tmr_cfg", os.path.abspath(self.tmr_cfg), True) + args.set("augustus_cgp", self.augustus_cgp, True) + args.set("maf_chunksize", self.maf_chunksize, True) + args.set("maf_overlap", self.maf_overlap, True) + args.set("pb_genome_chunksize", self.pb_genome_chunksize, True) + args.set("pb_genome_overlap", self.pb_genome_overlap, True) + args.set("pb_cfg", os.path.abspath(self.pb_cfg), True) + + args.set("augustus_cgp_cfg_template", os.path.abspath(self.augustus_cgp_cfg_template), True) + args.set("augustus_utr_off", self.augustus_utr_off, True) if self.cgp_param is not None: - args.set('cgp_param', os.path.abspath(self.cgp_param), 
True) + args.set("cgp_param", os.path.abspath(self.cgp_param), True) else: - args.set('cgp_param', None, True) - args.set('cgp_train_num_exons', self.cgp_train_num_exons, True) - args.set('hgm_cpu', self.hgm_cpu, False) + args.set("cgp_param", None, True) + args.set("cgp_train_num_exons", self.cgp_train_num_exons, True) + args.set("hgm_cpu", self.hgm_cpu, False) # user flags for paralog resolution - args.set('global_near_best', self.global_near_best, True) - args.set('filter_overlapping_genes', self.filter_overlapping_genes, True) - args.set('overlapping_gene_distance', self.overlapping_gene_distance, True) + args.set("global_near_best", self.global_near_best, True) + args.set("filter_overlapping_genes", self.filter_overlapping_genes, True) + args.set("overlapping_gene_distance", self.overlapping_gene_distance, True) # user specified flags for consensus finding - args.set('intron_rnaseq_support', self.intron_rnaseq_support, False) - args.set('exon_rnaseq_support', self.exon_rnaseq_support, False) - args.set('intron_annot_support', self.intron_annot_support, False) - args.set('exon_annot_support', self.exon_annot_support, False) - args.set('original_intron_support', self.original_intron_support, False) - args.set('denovo_num_introns', self.denovo_num_introns, False) - args.set('denovo_splice_support', self.denovo_splice_support, False) - args.set('denovo_exon_support', self.denovo_exon_support, False) - args.set('denovo_ignore_novel_genes', self.denovo_ignore_novel_genes, False) - args.set('denovo_only_novel_genes', self.denovo_only_novel_genes, False) - args.set('denovo_allow_novel_ends', self.denovo_allow_novel_ends, False) - args.set('denovo_novel_end_distance', self.denovo_novel_end_distance, False) - args.set('denovo_allow_unsupported', self.denovo_allow_unsupported, False) - args.set('denovo_allow_bad_annot_or_tm', self.denovo_allow_bad_annot_or_tm, False) - args.set('require_pacbio_support', self.require_pacbio_support, False) - args.set('in_species_rna_support_only', self.in_species_rna_support_only, False) - args.set('rebuild_consensus', self.rebuild_consensus, False) + args.set("intron_rnaseq_support", self.intron_rnaseq_support, False) + args.set("exon_rnaseq_support", self.exon_rnaseq_support, False) + args.set("intron_annot_support", self.intron_annot_support, False) + args.set("exon_annot_support", self.exon_annot_support, False) + args.set("original_intron_support", self.original_intron_support, False) + args.set("denovo_num_introns", self.denovo_num_introns, False) + args.set("denovo_splice_support", self.denovo_splice_support, False) + args.set("denovo_exon_support", self.denovo_exon_support, False) + args.set("denovo_ignore_novel_genes", self.denovo_ignore_novel_genes, False) + args.set("denovo_only_novel_genes", self.denovo_only_novel_genes, False) + args.set("denovo_allow_novel_ends", self.denovo_allow_novel_ends, False) + args.set("denovo_novel_end_distance", self.denovo_novel_end_distance, False) + args.set("denovo_allow_unsupported", self.denovo_allow_unsupported, False) + args.set("denovo_allow_bad_annot_or_tm", self.denovo_allow_bad_annot_or_tm, False) + args.set("require_pacbio_support", self.require_pacbio_support, False) + args.set("in_species_rna_support_only", self.in_species_rna_support_only, False) + args.set("rebuild_consensus", self.rebuild_consensus, False) # stats location - args.set('stats_db', os.path.join(args.out_dir, 'databases', 'timing_stats.db'), False) + args.set("stats_db", os.path.join(args.out_dir, "databases", "timing_stats.db"), False) # 
flags for assembly hub building - args.set('assembly_hub', self.assembly_hub, False) # assembly hub doesn't need to cause rebuild of gene sets - args.set('hub_email', self.hub_email, False) + args.set("assembly_hub", self.assembly_hub, False) # assembly hub doesn't need to cause rebuild of gene sets + args.set("hub_email", self.hub_email, False) # flags for figuring out which genomes we are going to annotate - args.set('annotate_ancestors', self.annotate_ancestors, True) + args.set("annotate_ancestors", self.annotate_ancestors, True) # halStats is run below, before any validate() methods are called. - if not tools.misc.is_exec('halStats'): - raise ToolMissingException('halStats from the HAL tools package not in global path') + if not tools.misc.is_exec("halStats"): + raise ToolMissingException("halStats from the HAL tools package not in global path") - args.set('hal_genomes', tools.hal.extract_genomes(args.hal, self.annotate_ancestors), True) + args.set("hal_genomes", tools.hal.extract_genomes(args.hal, self.annotate_ancestors), True) target_genomes = tools.hal.extract_genomes(args.hal, self.annotate_ancestors, self.target_genomes) target_genomes = tuple(x for x in target_genomes if x != self.ref_genome) - args.set('target_genomes', target_genomes, True) - args.set('cfg', self.parse_cfg(), True) - args.set('dbs', PipelineTask.get_databases(args), True) - args.set('annotation', args.cfg['ANNOTATION'][args.ref_genome], True) - args.set('hints_db', os.path.join(args.work_dir, 'hints_database', 'hints.db'), True) - args.set('rnaseq_genomes', frozenset(set(args.cfg['INTRONBAM'].keys()) | set(args.cfg['BAM'].keys())), True) - args.set('intron_only_genomes', frozenset(set(args.cfg['INTRONBAM'].keys()) - set(args.cfg['BAM'].keys())), True) - args.set('isoseq_genomes', frozenset(list(args.cfg['ISO_SEQ_BAM'].keys())), True) - args.set('annotation_genomes', frozenset(list(args.cfg['ANNOTATION'].keys())), True) - args.set('external_ref_genomes', args.annotation_genomes - {args.ref_genome}, True) - args.set('modes', self.get_modes(args), True) - args.set('augustus_tmr', True if 'augTMR' in args.modes else False, True) - - if self.__class__.__name__ in ['RunCat', 'Augustus', 'AugustusCgp', 'AugustusPb']: + args.set("target_genomes", target_genomes, True) + args.set("cfg", self.parse_cfg(), True) + args.set("dbs", PipelineTask.get_databases(args), True) + args.set("annotation", args.cfg["ANNOTATION"][args.ref_genome], True) + args.set("hints_db", os.path.join(args.work_dir, "hints_database", "hints.db"), True) + args.set("rnaseq_genomes", frozenset(set(args.cfg["INTRONBAM"].keys()) | set(args.cfg["BAM"].keys())), True) + args.set( + "intron_only_genomes", frozenset(set(args.cfg["INTRONBAM"].keys()) - set(args.cfg["BAM"].keys())), True + ) + args.set("isoseq_genomes", frozenset(list(args.cfg["ISO_SEQ_BAM"].keys())), True) + args.set("annotation_genomes", frozenset(list(args.cfg["ANNOTATION"].keys())), True) + args.set("external_ref_genomes", args.annotation_genomes - {args.ref_genome}, True) + args.set("modes", self.get_modes(args), True) + args.set("augustus_tmr", True if "augTMR" in args.modes else False, True) + + if self.__class__.__name__ in ["RunCat", "Augustus", "AugustusCgp", "AugustusPb"]: self.validate_cfg(args) return args @@ -269,32 +272,39 @@ def parse_cfg(self): """ if not os.path.exists(self.config): - raise MissingFileException('Config file {} not found.'.format(self.config)) + raise MissingFileException("Config file {} not found.".format(self.config)) # configspec validates the input 
config file - configspec = ['[ANNOTATION]', '__many__ = string', - '[INTRONBAM]', '__many__ = list', - '[BAM]', '__many__ = list', - '[ISO_SEQ_BAM]', '__many__ = list', - '[PROTEIN_FASTA]', '__many__ = list'] + configspec = [ + "[ANNOTATION]", + "__many__ = string", + "[INTRONBAM]", + "__many__ = list", + "[BAM]", + "__many__ = list", + "[ISO_SEQ_BAM]", + "__many__ = list", + "[PROTEIN_FASTA]", + "__many__ = list", + ] parser = ConfigObj(self.config, configspec=configspec) for key in parser: - if key not in ['ANNOTATION', 'INTRONBAM', 'BAM', 'ISO_SEQ_BAM', 'PROTEIN_FASTA']: - raise InvalidInputException('Invalid field {} in config file'.format(key)) + if key not in ["ANNOTATION", "INTRONBAM", "BAM", "ISO_SEQ_BAM", "PROTEIN_FASTA"]: + raise InvalidInputException("Invalid field {} in config file".format(key)) # convert the config into a new dict, parsing the fofns cfg = collections.defaultdict(dict) - for dtype in ['ANNOTATION', 'PROTEIN_FASTA']: + for dtype in ["ANNOTATION", "PROTEIN_FASTA"]: if dtype not in parser: cfg[dtype] = {} else: for genome, annot in parser[dtype].items(): annot = os.path.abspath(annot) if not os.path.exists(annot): - raise MissingFileException('Missing {} file {}.'.format(dtype.lower(), annot)) + raise MissingFileException("Missing {} file {}.".format(dtype.lower(), annot)) cfg[dtype][genome] = annot # if a given genome only has one BAM, it is a string. Fix this. Extract all paths from fofn files. - for dtype in ['BAM', 'INTRONBAM', 'ISO_SEQ_BAM']: + for dtype in ["BAM", "INTRONBAM", "ISO_SEQ_BAM"]: if dtype not in parser: # the user does not have to specify all field types cfg[dtype] = {} continue @@ -316,59 +326,68 @@ def parse_cfg(self): cfg[dtype][genome].extend([os.path.abspath(x.rstrip()) for x in open(p)]) # return a hashable version - return frozendict((key, frozendict((ikey, tuple(ival) if isinstance(ival, list) else ival) - for ikey, ival in val.items())) for key, val in cfg.items()) + return frozendict( + (key, frozendict((ikey, tuple(ival) if isinstance(ival, list) else ival) for ikey, ival in val.items())) + for key, val in cfg.items() + ) def validate_cfg(self, args): """Validate the input config file.""" - if len(args.cfg['BAM']) + len(args.cfg['INTRONBAM']) + \ - len(args.cfg['ISO_SEQ_BAM']) + len(args.cfg['ANNOTATION']) == 0: - logger.warning('No extrinsic data or annotations found in config. Will load genomes only.') - elif len(args.cfg['BAM']) + len(args.cfg['INTRONBAM']) + len(args.cfg['ISO_SEQ_BAM']) == 0: - logger.warning('No extrinsic data found in config. Will load genomes and annotation only.') - - for dtype in ['BAM', 'INTRONBAM', 'ISO_SEQ_BAM']: + if ( + len(args.cfg["BAM"]) + + len(args.cfg["INTRONBAM"]) + + len(args.cfg["ISO_SEQ_BAM"]) + + len(args.cfg["ANNOTATION"]) + == 0 + ): + logger.warning("No extrinsic data or annotations found in config. Will load genomes only.") + elif len(args.cfg["BAM"]) + len(args.cfg["INTRONBAM"]) + len(args.cfg["ISO_SEQ_BAM"]) == 0: + logger.warning("No extrinsic data found in config. 
Will load genomes and annotation only.") + + for dtype in ["BAM", "INTRONBAM", "ISO_SEQ_BAM"]: for genome in args.cfg[dtype]: for bam in args.cfg[dtype][genome]: if not os.path.exists(bam): - raise MissingFileException('Missing BAM {}.'.format(bam)) + raise MissingFileException("Missing BAM {}.".format(bam)) if not tools.misc.is_bam(bam): - raise InvalidInputException('BAM {} is not a valid BAM.'.format(bam)) - if not os.path.exists(bam + '.bai'): - raise MissingFileException('Missing BAM index {}.'.format(bam + '.bai')) + raise InvalidInputException("BAM {} is not a valid BAM.".format(bam)) + if not os.path.exists(bam + ".bai"): + raise MissingFileException("Missing BAM index {}.".format(bam + ".bai")) - for dtype in ['ANNOTATION', 'PROTEIN_FASTA']: + for dtype in ["ANNOTATION", "PROTEIN_FASTA"]: for genome, annot in args.cfg[dtype].items(): if not os.path.exists(annot): - raise MissingFileException('Missing {} file {}.'.format(dtype.lower(), annot)) + raise MissingFileException("Missing {} file {}.".format(dtype.lower(), annot)) + if annot.endswith("gz"): + raise InvalidInputException("Cannot use gzipped annotation/FASTA files.") if all(g in args.hal_genomes for g in args.target_genomes) is False: bad_genomes = set(args.hal_genomes) - set(args.target_genomes) - err_msg = 'Genomes {} present in configuration and not present in HAL.'.format(','.join(bad_genomes)) + err_msg = "Genomes {} present in configuration and not present in HAL.".format(",".join(bad_genomes)) raise UserException(err_msg) - if args.ref_genome not in args.cfg['ANNOTATION']: - raise UserException('Reference genome {} did not have a provided annotation.'.format(self.ref_genome)) + if args.ref_genome not in args.cfg["ANNOTATION"]: + raise UserException("Reference genome {} did not have a provided annotation.".format(self.ref_genome)) # raise if the user if the user is providing dubious inputs if args.augustus_cgp and len(args.rnaseq_genomes) == 0: - raise InvalidInputException('AugustusCGP is being ran without any RNA-seq hints!') + raise InvalidInputException("AugustusCGP is being ran without any RNA-seq hints!") if args.augustus_pb and len(args.isoseq_genomes) == 0: - raise InvalidInputException('AugustusPB is being ran without any IsoSeq hints!') + raise InvalidInputException("AugustusPB is being ran without any IsoSeq hints!") def get_modes(self, args): """returns a tuple of the execution modes being used here""" - modes = ['transMap'] + modes = ["transMap"] if args.augustus_cgp is True: - modes.append('augCGP') + modes.append("augCGP") if args.augustus is True: - modes.append('augTM') + modes.append("augTM") if len(set(args.rnaseq_genomes) & set(args.target_genomes)) > 0: - modes.append('augTMR') + modes.append("augTMR") if args.augustus_pb is True: - modes.append('augPB') + modes.append("augPB") if len(args.annotation_genomes) > 1: - modes.append('exRef') + modes.append("exRef") return tuple(modes) def get_module_args(self, module, **args): @@ -387,26 +406,30 @@ def load_docker(self): # We use this environment variable as a bit of global state, # to avoid threading this through in each of the hundreds of # command invocations. - os.environ['CAT_BINARY_MODE'] = self.binary_mode - if self.binary_mode == 'docker': - if not tools.misc.is_exec('docker'): - raise ToolMissingException('docker binary not found. 
' - 'Either install it or use a different option for --binary-mode.') + os.environ["CAT_BINARY_MODE"] = self.binary_mode + docker_image = os.getenv("DOCKER_IMAGE", "quay.io/ucsc_cgl/cat:latest") + if self.binary_mode == "docker": + if not tools.misc.is_exec("docker"): + raise ToolMissingException( + "docker binary not found. " "Either install it or use a different option for --binary-mode." + ) # Update docker container - subprocess.check_call(['docker', 'pull', 'quay.io/ucsc_cgl/cat:latest']) - elif self.binary_mode == 'singularity': - if not tools.misc.is_exec('singularity'): - raise ToolMissingException('singularity binary not found. ' - 'Either install it or use a different option for --binary-mode.') - os.environ['SINGULARITY_PULLFOLDER'] = os.path.abspath(self.work_dir) - os.environ['SINGULARITY_CACHEDIR'] = os.path.abspath(self.work_dir) - if os.environ.get('SINGULARITY_IMAGE'): + subprocess.check_call(["docker", "pull", docker_image]) + elif self.binary_mode == "singularity": + if not tools.misc.is_exec("singularity"): + raise ToolMissingException( + "singularity binary not found. " "Either install it or use a different option for --binary-mode." + ) + os.environ["SINGULARITY_PULLFOLDER"] = os.path.abspath(self.work_dir) + os.environ["SINGULARITY_CACHEDIR"] = os.path.abspath(self.work_dir) + if os.environ.get("SINGULARITY_IMAGE"): return tools.fileOps.ensure_dir(self.work_dir) - if not os.path.isfile(os.path.join(self.work_dir, 'cat.img')): - subprocess.check_call(['singularity', 'pull', '--name', 'cat.img', - 'docker://quay.io/ucsc_cgl/cat:latest']) - assert os.path.exists(os.path.join(self.work_dir, 'cat.img')) + if not os.path.isfile(os.path.join(self.work_dir, "cat.img")): + subprocess.check_call( + ["singularity", "pull", "--name", "cat.img", "docker://quay.io/ucsc_cgl/cat:latest"] + ) + assert os.path.exists(os.path.join(self.work_dir, "cat.img")) @staticmethod def get_databases(pipeline_args): @@ -417,26 +440,26 @@ def get_databases(pipeline_args): @staticmethod def get_database(pipeline_args, genome): """database paths must be resolved here to handle multiple programs accessing them""" - base_out_dir = os.path.join(pipeline_args.out_dir, 'databases') - return os.path.join(base_out_dir, '{}.db'.format(genome)) + base_out_dir = os.path.join(pipeline_args.out_dir, "databases") + return os.path.join(base_out_dir, "{}.db".format(genome)) @staticmethod def get_plot_dir(pipeline_args, genome): """plot base directories must be resolved here to handle multiple programs accessing them""" - base_out_dir = os.path.join(pipeline_args.out_dir, 'plots') + base_out_dir = os.path.join(pipeline_args.out_dir, "plots") return os.path.join(base_out_dir, genome) @staticmethod def get_metrics_dir(pipeline_args, genome): """plot data directories must be resolved here to handle multiple programs accessing them""" - base_out_dir = os.path.join(pipeline_args.work_dir, 'plot_data') + base_out_dir = os.path.join(pipeline_args.work_dir, "plot_data") return os.path.join(base_out_dir, genome) @staticmethod def write_metrics(metrics_dict, out_target): """write out a metrics dictionary to a path for later loading and plotting""" tools.fileOps.ensure_file_dir(out_target.path) - with out_target.open('w') as outf: + with out_target.open("w") as outf: json.dump(metrics_dict, outf) @@ -447,17 +470,17 @@ def processing_time(task, processing_time): """ pipeline_args = task.get_pipeline_args() stats_db = pipeline_args.stats_db - finish_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + finish_time = 
datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") with tools.sqlite.ExclusiveSqlConnection(stats_db) as engine: c = engine.cursor() - c.execute('create table if not exists stats ' - '(TaskId string unique, FinishTime string, ProcessingTime real)') - c.execute('insert or replace into stats values (?, ?, ?)', [task.task_id, finish_time, processing_time]) + c.execute("create table if not exists stats " "(TaskId string unique, FinishTime string, ProcessingTime real)") + c.execute("insert or replace into stats values (?, ?, ?)", [task.task_id, finish_time, processing_time]) engine.commit() class PipelineWrapperTask(PipelineTask, luigi.WrapperTask): """add WrapperTask functionality to PipelineTask""" + pass @@ -465,12 +488,13 @@ class AbstractAtomicFileTask(PipelineTask): """ Abstract Task for single files. """ + def run_cmd(self, cmd): """ Run a external command that will produce the output file for this task to stdout. Capture this to the file. """ # luigi localTargets guarantee atomicity if used as a context manager - with self.output().open('w') as outf: + with self.output().open("w") as outf: tools.procOps.run_proc(cmd, stdout=outf) @@ -478,12 +502,13 @@ class ToilTask(PipelineTask): """ Task for launching toil pipelines from within luigi. """ - resources = {'toil': 1} # all toil pipelines use 1 toil + + resources = {"toil": 1} # all toil pipelines use 1 toil def __repr__(self): """override the PipelineTask repr to report the batch system being used""" base_repr = super(ToilTask, self).__repr__() - return 'Toil' + base_repr + ' using batchSystem {}'.format(self.batchSystem) + return "Toil" + base_repr + " using batchSystem {}".format(self.batchSystem) def prepare_toil_options(self, work_dir): """ @@ -498,7 +523,7 @@ def prepare_toil_options(self, work_dir): toil_args.stats = True toil_args.defaultPreemptable = True if self.zone is not None: - job_dir = os.path.join(work_dir, 'jobStore') # Directory where the AWS directory file is + job_dir = os.path.join(work_dir, "jobStore") # Directory where the AWS directory file is if os.path.exists(job_dir): for i in os.listdir(job_dir): if os.path.isfile(os.path.join(job_dir, i)) and self.provisioner in i: @@ -506,22 +531,27 @@ def prepare_toil_options(self, work_dir): toil_args.restart = True break if toil_args.restart is not True: - job_store = self.provisioner + ':' + self.zone + ':' + ''.join( - random.choice(string.ascii_lowercase) for _ in range(7)) + job_store = ( + self.provisioner + + ":" + + self.zone + + ":" + + "".join(random.choice(string.ascii_lowercase) for _ in range(7)) + ) try: os.makedirs(job_dir) except OSError: pass tools.fileOps.touch(os.path.join(job_dir, job_store)) else: - job_store = os.path.join(work_dir, 'jobStore') + job_store = os.path.join(work_dir, "jobStore") tools.fileOps.ensure_file_dir(job_store) # this logic tries to determine if we should try and restart an existing jobStore if os.path.exists(job_store): try: - root_job = next(open(os.path.join(job_store, 'rootJobStoreID'))).rstrip() - if not os.path.exists(os.path.join(job_store, 'tmp', root_job)): + root_job = next(open(os.path.join(job_store, "rootJobStoreID"))).rstrip() + if not os.path.exists(os.path.join(job_store, "tmp", root_job)): shutil.rmtree(job_store) else: toil_args.restart = True @@ -533,11 +563,12 @@ def prepare_toil_options(self, work_dir): # container filesystems are transient overlays that don't # support hardlinking. 
toil_args.disableCaching = True - if toil_args.batchSystem == 'parasol' and toil_args.disableCaching is False: - raise RuntimeError('Running parasol without disabled caching is a very bad idea.') - if toil_args.batchSystem == 'parasol' and toil_args.workDir is None: - raise RuntimeError('Running parasol without setting a shared work directory will not work. Please specify ' - '--workDir.') + if toil_args.batchSystem == "parasol" and toil_args.disableCaching is False: + raise RuntimeError("Running parasol without disabled caching is a very bad idea.") + if toil_args.batchSystem == "parasol" and toil_args.workDir is None: + raise RuntimeError( + "Running parasol without setting a shared work directory will not work. Please specify " "--workDir." + ) if toil_args.workDir is not None: tools.fileOps.ensure_dir(toil_args.workDir) toil_args.jobStore = job_store @@ -550,7 +581,7 @@ def get_toil_defaults(self): :return: dict """ parser = Job.Runner.getDefaultArgumentParser() - namespace = parser.parse_args(['']) # empty jobStore attribute + namespace = parser.parse_args([""]) # empty jobStore attribute namespace.jobStore = None # jobStore attribute will be updated per-batch namespace.logLevel = self.logLevel return namespace @@ -564,23 +595,23 @@ def success(task): pipeline_args = task.get_pipeline_args() stats_db = pipeline_args.stats_db if task.zone is not None: - cmd = ['toil', 'stats', '--raw', task.job_store] + cmd = ["toil", "stats", "--raw", task.job_store] try: os.remove(os.path.abspath(task.job_store)) except OSError: pass else: - cmd = ['toil', 'stats', '--raw', os.path.abspath(task.job_store)] + cmd = ["toil", "stats", "--raw", os.path.abspath(task.job_store)] raw = tools.procOps.call_proc(cmd) - parsed = raw[raw.index('{'):raw.rfind('}') + 1] + parsed = raw[raw.index("{") : raw.rfind("}") + 1] stats = json.loads(parsed) with tools.sqlite.ExclusiveSqlConnection(stats_db) as engine: c = engine.cursor() - c.execute('create table if not exists toil_stats ' - '(TaskId string unique, TotalTime real, AverageTime real)') - c.execute('insert or replace into toil_stats values (?, ?, ?)', [task.task_id, - stats['jobs']['total_clock'], - stats['jobs']['average_clock']]) + c.execute("create table if not exists toil_stats " "(TaskId string unique, TotalTime real, AverageTime real)") + c.execute( + "insert or replace into toil_stats values (?, ?, ?)", + [task.task_id, stats["jobs"]["total_clock"], stats["jobs"]["average_clock"]], + ) engine.commit() @@ -598,6 +629,7 @@ def __init__(self, *args, **kwargs): class TrackTask(RebuildableTask): """Provides shared values for all of the track tasks""" + genome = luigi.Parameter() track_path = luigi.Parameter() trackdb_path = luigi.Parameter() @@ -624,29 +656,30 @@ class RunCat(PipelineWrapperTask): """ Task that executes the entire pipeline. 
""" + def validate(self, pipeline_args): """General input validation""" if not os.path.exists(pipeline_args.hal): - raise InputMissingException('HAL file not found at {}.'.format(pipeline_args.hal)) + raise InputMissingException("HAL file not found at {}.".format(pipeline_args.hal)) for d in [pipeline_args.out_dir, pipeline_args.work_dir]: if not os.path.exists(d): if not tools.fileOps.dir_is_writeable(os.path.dirname(d)): - raise UserException('Cannot create directory {}.'.format(d)) + raise UserException("Cannot create directory {}.".format(d)) else: if not tools.fileOps.dir_is_writeable(d): - raise UserException('Directory {} is not writeable.'.format(d)) + raise UserException("Directory {} is not writeable.".format(d)) if not os.path.exists(pipeline_args.annotation): - raise InputMissingException('Annotation file {} not found.'.format(pipeline_args.annotation)) + raise InputMissingException("Annotation file {} not found.".format(pipeline_args.annotation)) # TODO: validate augustus species, tm/tmr/cgp/param files. if pipeline_args.ref_genome not in pipeline_args.hal_genomes: - raise InvalidInputException('Reference genome {} not present in HAL.'.format(pipeline_args.ref_genome)) + raise InvalidInputException("Reference genome {} not present in HAL.".format(pipeline_args.ref_genome)) missing_genomes = {g for g in pipeline_args.target_genomes if g not in pipeline_args.hal_genomes} if len(missing_genomes) > 0: - missing_genomes = ','.join(missing_genomes) - raise InvalidInputException('Target genomes {} not present in HAL.'.format(missing_genomes)) + missing_genomes = ",".join(missing_genomes) + raise InvalidInputException("Target genomes {} not present in HAL.".format(missing_genomes)) if pipeline_args.ref_genome in pipeline_args.target_genomes: - raise InvalidInputException('A target genome cannot be the reference genome.') + raise InvalidInputException("A target genome cannot be the reference genome.") def requires(self): self.load_docker() @@ -662,10 +695,10 @@ def requires(self): yield self.clone(Augustus) if self.augustus_cgp is True: yield self.clone(AugustusCgp) - yield self.clone(FindDenovoParents, mode='augCGP') + yield self.clone(FindDenovoParents, mode="augCGP") if self.augustus_pb is True: yield self.clone(AugustusPb) - yield self.clone(FindDenovoParents, mode='augPB') + yield self.clone(FindDenovoParents, mode="augPB") yield self.clone(IsoSeqTranscripts) yield self.clone(Hgm) yield self.clone(AlignTranscripts) @@ -681,6 +714,7 @@ class PrepareFiles(PipelineWrapperTask): """ Wrapper for file preparation tasks GenomeFiles and ReferenceFiles """ + def requires(self): yield self.clone(GenomeFiles) yield self.clone(ReferenceFiles) @@ -691,29 +725,27 @@ class GenomeFiles(PipelineWrapperTask): """ WrapperTask for producing all genome files. 
- GenomeFiles -> GenomeFasta -> GenomeTwoBit -> GenomeFlatFasta -> GenomeFastaIndex + GenomeFiles -> GenomeFasta -> GenomeTwoBit -> GenomeFastaIndex -> GenomeSizes """ + @staticmethod def get_args(pipeline_args, genome): - base_dir = os.path.join(pipeline_args.work_dir, 'genome_files') + base_dir = os.path.join(pipeline_args.work_dir, "genome_files") args = tools.misc.HashableNamespace() args.genome = genome - args.fasta = os.path.join(base_dir, genome + '.fa') - args.two_bit = os.path.join(base_dir, genome + '.2bit') - args.sizes = os.path.join(base_dir, genome + '.chrom.sizes') - args.flat_fasta = os.path.join(base_dir, genome + '.fa.flat') + args.fasta = os.path.join(base_dir, genome + ".fa") + args.two_bit = os.path.join(base_dir, genome + ".2bit") + args.sizes = os.path.join(base_dir, genome + ".chrom.sizes") return args def validate(self): - for haltool in ['hal2fasta', 'halStats']: + for haltool in ["hal2fasta", "halStats"]: if not tools.misc.is_exec(haltool): - raise ToolMissingException('{} from the HAL tools package not in global path'.format(haltool)) - if not tools.misc.is_exec('faToTwoBit'): - raise ToolMissingException('faToTwoBit tool from the Kent tools package not in global path.') - if not tools.misc.is_exec('pyfasta'): - raise ToolMissingException('pyfasta wrapper not found in global path.') + raise ToolMissingException("{} from the HAL tools package not in global path".format(haltool)) + if not tools.misc.is_exec("faToTwoBit"): + raise ToolMissingException("faToTwoBit tool from the Kent tools package not in global path.") def requires(self): self.validate() @@ -723,13 +755,14 @@ def requires(self): yield self.clone(GenomeFasta, **vars(args)) yield self.clone(GenomeTwoBit, **vars(args)) yield self.clone(GenomeSizes, **vars(args)) - yield self.clone(GenomeFlatFasta, **vars(args)) + yield self.clone(GenomeFastaIndex, **vars(args)) class GenomeFasta(AbstractAtomicFileTask): """ Produce a fasta file from a hal file. Requires hal2fasta. """ + genome = luigi.Parameter() fasta = luigi.Parameter() @@ -737,25 +770,48 @@ def output(self): return luigi.LocalTarget(self.fasta) def run(self): - logger.info('Extracting fasta for {}.'.format(self.genome)) - cmd = ['hal2fasta', os.path.abspath(self.hal), self.genome] + logger.info("Extracting fasta for {}.".format(self.genome)) + cmd = ["hal2fasta", os.path.abspath(self.hal), self.genome] self.run_cmd(cmd) +@requires(GenomeFasta) +class GenomeFastaIndex(AbstractAtomicFileTask): + """ + Produce a fasta index file. Requires pyfaidx. + + Samtools seems to act very weirdly when the file is piped out of Docker. To avoid this, just use pyfaidx directly. + """ + + fasta = luigi.Parameter() + genome = luigi.Parameter() + + def output(self): + return luigi.LocalTarget(self.fasta + ".fai") + + def run(self): + logger.info("Building FASTA index for {}.".format(self.genome)) + try: + _ = pyfaidx.Faidx(self.fasta) + except Exception as e: + self.output().remove() + raise Exception(e) + + @requires(GenomeFasta) class GenomeTwoBit(AbstractAtomicFileTask): """ Produce a 2bit file from a fasta file. Requires kent tool faToTwoBit. - Needs to be done BEFORE we flatten.
""" + two_bit = luigi.Parameter() def output(self): return luigi.LocalTarget(self.two_bit) def run(self): - logger.info('Converting fasta for {} to 2bit.'.format(self.genome)) - cmd = ['faToTwoBit', self.fasta, '/dev/stdout'] + logger.info("Converting fasta for {} to 2bit.".format(self.genome)) + cmd = ["faToTwoBit", self.fasta, "/dev/stdout"] self.run_cmd(cmd) @@ -763,6 +819,7 @@ class GenomeSizes(AbstractAtomicFileTask): """ Produces a genome chromosome sizes file. Requires halStats. """ + genome = luigi.Parameter() sizes = luigi.Parameter() @@ -770,47 +827,32 @@ def output(self): return luigi.LocalTarget(self.sizes) def run(self): - logger.info('Extracting chromosome sizes for {}.'.format(self.genome)) - cmd = ['halStats', '--chromSizes', self.genome, os.path.abspath(self.hal)] + logger.info("Extracting chromosome sizes for {}.".format(self.genome)) + cmd = ["halStats", "--chromSizes", self.genome, os.path.abspath(self.hal)] self.run_cmd(cmd) -@requires(GenomeTwoBit) -class GenomeFlatFasta(AbstractAtomicFileTask): - """ - Flattens a genome fasta in-place using pyfasta. Requires the pyfasta package. - """ - flat_fasta = luigi.Parameter() - - def output(self): - return luigi.LocalTarget(self.flat_fasta) - - def run(self): - logger.info('Flattening fasta for {}.'.format(self.genome)) - cmd = ['pyfasta', 'flatten', self.fasta] - tools.procOps.run_proc(cmd) - - class ExternalReferenceFiles(PipelineWrapperTask): """ WrapperTask for running gff3ToGenePred and genePredToGtf for non-reference annotation files """ + @staticmethod def get_args(pipeline_args, genome): - base_dir = os.path.join(pipeline_args.work_dir, 'reference') + base_dir = os.path.join(pipeline_args.work_dir, "reference") args = tools.misc.HashableNamespace() args.genome = genome - args.annotation_gff3 = pipeline_args.cfg['ANNOTATION'][genome] - args.annotation_gp = os.path.join(base_dir, genome + '.external_reference.gp') - args.annotation_gtf = os.path.join(base_dir, genome + '.external_reference.gtf') - args.annotation_attrs = os.path.join(base_dir, genome + '.external_reference.gp_attrs') - args.duplicates = os.path.join(base_dir, genome + '.external_reference.duplicates.txt') + args.annotation_gff3 = pipeline_args.cfg["ANNOTATION"][genome] + args.annotation_gp = os.path.join(base_dir, genome + ".external_reference.gp") + args.annotation_gtf = os.path.join(base_dir, genome + ".external_reference.gtf") + args.annotation_attrs = os.path.join(base_dir, genome + ".external_reference.gp_attrs") + args.duplicates = os.path.join(base_dir, genome + ".external_reference.duplicates.txt") return args def validate(self): - for tool in ['gff3ToGenePred', 'genePredToBed']: + for tool in ["gff3ToGenePred", "genePredToBed"]: if not tools.misc.is_exec(tool): - raise ToolMissingException('{} from the Kent tools package not in global path'.format(tool)) + raise ToolMissingException("{} from the Kent tools package not in global path".format(tool)) def requires(self): self.validate() @@ -826,32 +868,32 @@ class ReferenceFiles(PipelineWrapperTask): """ WrapperTask for producing annotation files. 
- ReferenceFiles -> Gff3ToGenePred -> TranscriptBed -> TranscriptFasta -> FlatTranscriptFasta + ReferenceFiles -> Gff3ToGenePred -> TranscriptBed -> TranscriptFasta V FakePsl, TranscriptGtf """ + @staticmethod def get_args(pipeline_args): - base_dir = os.path.join(pipeline_args.work_dir, 'reference') + base_dir = os.path.join(pipeline_args.work_dir, "reference") annotation = os.path.splitext(os.path.basename(pipeline_args.annotation))[0] args = tools.misc.HashableNamespace() args.annotation_gff3 = pipeline_args.annotation - args.annotation_gp = os.path.join(base_dir, annotation + '.gp') - args.annotation_attrs = os.path.join(base_dir, annotation + '.gp_attrs') - args.annotation_gtf = os.path.join(base_dir, annotation + '.gtf') - args.transcript_fasta = os.path.join(base_dir, annotation + '.fa') - args.transcript_flat_fasta = os.path.join(base_dir, annotation + '.fa.flat') - args.transcript_bed = os.path.join(base_dir, annotation + '.bed') - args.duplicates = os.path.join(base_dir, annotation + '.duplicates.txt') - args.ref_psl = os.path.join(base_dir, annotation + '.psl') + args.annotation_gp = os.path.join(base_dir, annotation + ".gp") + args.annotation_attrs = os.path.join(base_dir, annotation + ".gp_attrs") + args.annotation_gtf = os.path.join(base_dir, annotation + ".gtf") + args.transcript_fasta = os.path.join(base_dir, annotation + ".fa") + args.transcript_bed = os.path.join(base_dir, annotation + ".bed") + args.duplicates = os.path.join(base_dir, annotation + ".duplicates.txt") + args.ref_psl = os.path.join(base_dir, annotation + ".psl") args.genome = pipeline_args.ref_genome args.__dict__.update(**vars(GenomeFiles.get_args(pipeline_args, pipeline_args.ref_genome))) return args def validate(self): - for tool in ['gff3ToGenePred', 'genePredToBed', 'genePredToFakePsl']: + for tool in ["gff3ToGenePred", "genePredToBed", "genePredToFakePsl"]: if not tools.misc.is_exec(tool): - raise ToolMissingException('{} from the Kent tools package not in global path'.format(tool)) + raise ToolMissingException("{} from the Kent tools package not in global path".format(tool)) def requires(self): self.validate() @@ -862,7 +904,6 @@ def requires(self): yield self.clone(TranscriptBed, **vars(args)) yield self.clone(TranscriptFasta, **vars(args)) yield self.clone(TranscriptGtf, **vars(args)) - yield self.clone(FlatTranscriptFasta, **vars(args)) yield self.clone(FakePsl, **vars(args)) @@ -870,6 +911,7 @@ class Gff3ToGenePred(PipelineTask): """ Generates a genePred from a gff3 file. """ + genome = luigi.Parameter() annotation_gff3 = luigi.Parameter() annotation_gp = luigi.Parameter() @@ -887,20 +929,32 @@ def validate(self): c[l[0]] += 1 duplicates = {x for x, y in c.items() if y > 1} if len(duplicates) > 0: - with open(self.duplicates, 'w') as outf: + with open(self.duplicates, "w") as outf: for l in duplicates: - outf.write(l + '\n') - raise InvalidInputException('Found {:,} duplicate transcript IDs after parsing input GFF3. ' - 'Please check your input. One possible cause is the lack of a transcript-level ' - 'identifier on a gene record. Duplicate IDs have been written to: ' - '{}'.format(len(duplicates), self.duplicates)) + outf.write(l + "\n") + raise InvalidInputException( + "Found {:,} duplicate transcript IDs after parsing input GFF3. " + "Please check your input. One possible cause is the lack of a transcript-level " + "identifier on a gene record. 
Duplicate IDs have been written to: " + "{}".format(len(duplicates), self.duplicates) + ) + tx_dict = tools.transcripts.get_gene_pred_dict(annotation_gp.path) + grouped_genes = tools.transcripts.group_transcripts_by_name2(tx_dict.values()) + multi_chrom_genes = [] + for gene_id, txs in grouped_genes.items(): + if len({x.chromosome for x in txs}) != 1: + multi_chrom_genes.append(gene_id) + if len(multi_chrom_genes) > 0: + raise InvalidInputException( + "Found {:,} genes on multiple chromosomes. " "This is not allowed.".format(len(multi_chrom_genes)) + ) def run(self): - logger.info('Converting annotation gff3 to genePred.') + logger.info("Converting annotation gff3 to genePred.") if self.genome == self.ref_genome: cmd = tools.gff3.convert_gff3_cmd(self.annotation_attrs, self.annotation_gff3) annotation_gp, annotation_attrs = self.output() - with annotation_gp.open('w') as outf: + with annotation_gp.open("w") as outf: tools.procOps.run_proc(cmd, stdout=outf) else: annotation_gp, annotation_attrs = self.output() @@ -909,13 +963,13 @@ def run(self): tools.procOps.run_proc(cmd, stdout=tmp_gp) recs = tools.transcripts.get_gene_pred_dict(tmp_gp) for rec in recs.values(): - rec.name = f'exRef-{rec.name}' - rec.name2 = f'exRef-{rec.name2}' - with annotation_gp.open('w') as outf: + rec.name = f"exRef-{rec.name}" + rec.name2 = f"exRef-{rec.name2}" + with annotation_gp.open("w") as outf: for rec in recs.values(): tools.fileOps.print_row(outf, rec.get_gene_pred()) - with annotation_attrs.open('w') as outf: - tools.procOps.run_proc(['sed', 's/^/exRef-/'], stdin=tmp_attrs, stdout=outf) + with annotation_attrs.open("w") as outf: + tools.procOps.run_proc(["sed", "s/^/exRef-/"], stdin=tmp_attrs, stdout=outf) self.validate() @@ -924,33 +978,36 @@ class Gff3ToAttrs(PipelineTask): """ Converts the attrs file from -attrsOut in gff3ToGenePred into a SQLite table. 
""" + table = tools.sqlInterface.Annotation.__tablename__ def output(self): pipeline_args = self.get_pipeline_args() database = pipeline_args.dbs[self.genome] tools.fileOps.ensure_file_dir(database) - conn_str = 'sqlite:///{}'.format(database) - digest = tools.fileOps.hashfile(pipeline_args.cfg['ANNOTATION'][self.genome]) - attrs_table = luigi.contrib.sqla.SQLAlchemyTarget(connection_string=conn_str, - target_table=self.table, - update_id='_'.join([self.table, str(digest)])) + conn_str = "sqlite:///{}".format(database) + digest = tools.fileOps.hashfile(pipeline_args.cfg["ANNOTATION"][self.genome]) + attrs_table = luigi.contrib.sqla.SQLAlchemyTarget( + connection_string=conn_str, target_table=self.table, update_id="_".join([self.table, str(digest)]) + ) return attrs_table def run(self): - logger.info('Extracting gff3 attributes to sqlite database.') + logger.info("Extracting gff3 attributes to sqlite database.") pipeline_args = self.get_pipeline_args() df = tools.gff3.parse_gff3(self.annotation_attrs, self.annotation_gp, self.genome != self.ref_genome) - if 'protein_coding' not in set(df.GeneBiotype) or 'protein_coding' not in set(df.TranscriptBiotype): - raise InvalidInputException('No genes or transcripts with biotype protein_coding found!') + if "protein_coding" not in set(df.GeneBiotype) or "protein_coding" not in set(df.TranscriptBiotype): + raise InvalidInputException("No genes or transcripts with biotype protein_coding found!") # validate number parsed tot_genes = len(open(self.annotation_gp).readlines()) if tot_genes != len(df): - raise InvalidInputException('The number of genes parsed from the attrs file is not the same number as ' - 'in the genePred. This is a parser failure. Contact Ian and make him fix it.') + raise InvalidInputException( + "The number of genes parsed from the attrs file is not the same number as " + "in the genePred. This is a parser failure. Contact Ian and make him fix it." + ) database = pipeline_args.dbs[self.genome] with tools.sqlite.ExclusiveSqlConnection(database) as engine: - df.to_sql(self.table, engine, if_exists='replace') + df.to_sql(self.table, engine, if_exists="replace") self.output().touch() @@ -959,6 +1016,7 @@ class TranscriptBed(AbstractAtomicFileTask): """ Produces a BED record from the input genePred annotation. Makes use of Kent tool genePredToBed """ + transcript_bed = luigi.Parameter() annotation_gp = luigi.Parameter() @@ -966,28 +1024,35 @@ def output(self): return luigi.LocalTarget(self.transcript_bed) def run(self): - logger.info('Converting annotation genePred to BED.') - cmd = ['genePredToBed', self.annotation_gp, '/dev/stdout'] + logger.info("Converting annotation genePred to BED.") + cmd = ["genePredToBed", self.annotation_gp, "/dev/stdout"] self.run_cmd(cmd) -@multiple_requires(GenomeFlatFasta, TranscriptBed) +@multiple_requires(GenomeFasta, GenomeFastaIndex, TranscriptBed) class TranscriptFasta(AbstractAtomicFileTask): """ Produces a fasta for each transcript. 
""" + transcript_fasta = luigi.Parameter() def output(self): - return luigi.LocalTarget(self.transcript_fasta) + return luigi.LocalTarget(self.transcript_fasta), luigi.LocalTarget(self.transcript_fasta + ".fai") def run(self): - logger.info('Extracting reference annotation fasta.') + logger.info("Extracting reference annotation fasta.") seq_dict = tools.bio.get_sequence_dict(self.fasta, upper=False) seqs = {tx.name: tx.get_mrna(seq_dict) for tx in tools.transcripts.transcript_iterator(self.transcript_bed)} - with self.output().open('w') as outf: + fa, fai = self.output() + with fa.open("w") as outf: for name, seq in seqs.items(): tools.bio.write_fasta(outf, name, seq) + try: + _ = pyfaidx.Faidx(self.transcript_fasta) + except Exception as e: + fai.remove() + raise Exception(e) @requires(Gff3ToGenePred) @@ -995,6 +1060,7 @@ class TranscriptGtf(AbstractAtomicFileTask): """ Produces a GTF out of the genePred for the reference """ + annotation_gtf = luigi.Parameter() annotation_gp = luigi.Parameter() @@ -1002,41 +1068,31 @@ def output(self): return luigi.LocalTarget(self.annotation_gtf) def run(self): - logger.info('Extracting reference annotation GTF.') + logger.info("Extracting reference annotation GTF.") tools.misc.convert_gp_gtf(self.output(), luigi.LocalTarget(self.annotation_gp)) -@requires(TranscriptFasta) -class FlatTranscriptFasta(AbstractAtomicFileTask): - """ - Flattens the transcript fasta for pyfasta. - """ - transcript_fasta = luigi.Parameter() - transcript_flat_fasta = luigi.Parameter() - - def output(self): - return luigi.LocalTarget(self.transcript_flat_fasta) - - def run(self): - logger.info('Flattening reference annotation fasta.') - cmd = ['pyfasta', 'flatten', self.transcript_fasta] - tools.procOps.run_proc(cmd) - - @multiple_requires(Gff3ToGenePred, GenomeSizes) class FakePsl(AbstractAtomicFileTask): """ Produces a fake PSL mapping transcripts to the genome, using the Kent tool genePredToFakePsl """ + ref_psl = luigi.Parameter() def output(self): return luigi.LocalTarget(self.ref_psl) def run(self): - logger.info('Generating annotation fake PSL.') - cmd = ['genePredToFakePsl', '-chromSize={}'.format(self.sizes), 'noDB', - self.annotation_gp, '/dev/stdout', '/dev/null'] + logger.info("Generating annotation fake PSL.") + cmd = [ + "genePredToFakePsl", + "-chromSize={}".format(self.sizes), + "noDB", + self.annotation_gp, + "/dev/stdout", + "/dev/null", + ] self.run_cmd(cmd) @@ -1046,9 +1102,10 @@ class BuildDb(PipelineTask): TODO: output() should be way smarter than this. Currently, it only checks if the indices have been created. 
""" + @staticmethod def get_args(pipeline_args, genome): - base_dir = os.path.join(pipeline_args.work_dir, 'hints_database') + base_dir = os.path.join(pipeline_args.work_dir, "hints_database") args = tools.misc.HashableNamespace() args.genome = genome args.fasta = GenomeFiles.get_args(pipeline_args, genome).fasta @@ -1060,17 +1117,29 @@ def get_args(pipeline_args, genome): args.annotation_gp = ExternalReferenceFiles.get_args(pipeline_args, genome).annotation_gp else: args.annotation_gp = None - args.protein_fasta = pipeline_args.cfg['PROTEIN_FASTA'].get(genome, None) - args.hints_path = os.path.join(base_dir, genome + '.extrinsic_hints.gff') + args.protein_fasta = pipeline_args.cfg["PROTEIN_FASTA"].get(genome, None) + args.hints_path = os.path.join(base_dir, genome + ".extrinsic_hints.gff") return args def validate(self): tools.misc.samtools_version() # validate samtools version - for tool in ['load2sqlitedb', 'samtools', 'filterBam', 'bam2hints', 'bam2wig', 'wig2hints.pl', 'bam2hints', - 'bamToPsl', 'exonerate2hints.pl', 'gff3ToGenePred', 'join_mult_hints.pl', 'sambamba', - 'exonerate']: + for tool in [ + "load2sqlitedb", + "samtools", + "filterBam", + "bam2hints", + "bam2wig", + "wig2hints.pl", + "bam2hints", + "bamToPsl", + "exonerate2hints.pl", + "gff3ToGenePred", + "join_mult_hints.pl", + "sambamba", + "exonerate", + ]: if not tools.misc.is_exec(tool): - raise ToolMissingException('Auxiliary program {} not found on path.'.format(tool)) + raise ToolMissingException("Auxiliary program {} not found on path.".format(tool)) def requires(self): pipeline_args = self.get_pipeline_args() @@ -1088,23 +1157,29 @@ def run(self): self.validate() for genome in list(pipeline_args.target_genomes) + [pipeline_args.ref_genome]: args = BuildDb.get_args(pipeline_args, genome) - logger.info('Loading sequence for {} into database.'.format(genome)) - base_cmd = ['load2sqlitedb', '--noIdx', '--clean', '--species={}'.format(genome), - '--dbaccess={}'.format(pipeline_args.hints_db)] - tools.procOps.run_proc(base_cmd + [args.fasta], stdout='/dev/null', stderr='/dev/null') + logger.info("Loading sequence for {} into database.".format(genome)) + base_cmd = [ + "load2sqlitedb", + "--noIdx", + "--clean", + "--species={}".format(genome), + "--dbaccess={}".format(pipeline_args.hints_db), + ] + tools.procOps.run_proc(base_cmd + [args.fasta], stdout="/dev/null", stderr="/dev/null") if os.path.getsize(args.hints_path) != 0: - logger.info('Loading hints for {} into database.'.format(genome)) - tools.procOps.run_proc(base_cmd + [args.hints_path], stderr='/dev/null') - logger.info('Indexing database.') - cmd = ['load2sqlitedb', '--makeIdx', '--clean', '--dbaccess={}'.format(pipeline_args.hints_db)] - tools.procOps.run_proc(cmd, stdout='/dev/null', stderr='/dev/null') - logger.info('Hints database completed.') + logger.info("Loading hints for {} into database.".format(genome)) + tools.procOps.run_proc(base_cmd + [args.hints_path], stderr="/dev/null") + logger.info("Indexing database.") + cmd = ["load2sqlitedb", "--makeIdx", "--clean", "--dbaccess={}".format(pipeline_args.hints_db)] + tools.procOps.run_proc(cmd, stdout="/dev/null", stderr="/dev/null") + logger.info("Hints database completed.") class GenerateHints(ToilTask): """ Generate hints for each genome as a separate Toil pipeline. 
""" + hints_args = luigi.Parameter() genome = luigi.Parameter() stats = luigi.BoolParameter() @@ -1116,24 +1191,31 @@ def requires(self): return self.clone(PrepareFiles), self.clone(ReferenceFiles) def validate(self): - for tool in ['samtools', 'sambamba']: + for tool in ["samtools", "sambamba"]: if not tools.misc.is_exec(tool): - raise ToolMissingException('{} is not in global path.'.format(tool)) - for tool in ['gff3ToGenePred', 'bamToPsl']: + raise ToolMissingException("{} is not in global path.".format(tool)) + for tool in ["gff3ToGenePred", "bamToPsl"]: if not tools.misc.is_exec(tool): - raise ToolMissingException('{} from the Kent tools package not in global path.'.format(tool)) - for tool in ['join_mult_hints.pl', 'exonerate2hints.pl', 'blat2hints.pl', - 'wig2hints.pl', 'bam2wig', 'bam2hints', 'filterBam']: + raise ToolMissingException("{} from the Kent tools package not in global path.".format(tool)) + for tool in [ + "join_mult_hints.pl", + "exonerate2hints.pl", + "blat2hints.pl", + "wig2hints.pl", + "bam2wig", + "bam2hints", + "filterBam", + ]: if not tools.misc.is_exec(tool): - raise ToolMissingException('{} from the augustus tool package not in global path.'.format(tool)) + raise ToolMissingException("{} from the augustus tool package not in global path.".format(tool)) def run(self): self.validate() - logger.info('Beginning GenerateHints Toil pipeline for {}.'.format(self.genome)) - work_dir = os.path.abspath(os.path.join(self.work_dir, 'toil', 'hints_db', self.genome)) + logger.info("Beginning GenerateHints Toil pipeline for {}.".format(self.genome)) + work_dir = os.path.abspath(os.path.join(self.work_dir, "toil", "hints_db", self.genome)) toil_options = self.prepare_toil_options(work_dir) hints_db(self.hints_args, toil_options) - logger.info('Finished GenerateHints Toil pipeline for {}.'.format(self.genome)) + logger.info("Finished GenerateHints Toil pipeline for {}.".format(self.genome)) class Chaining(ToilTask): @@ -1141,14 +1223,17 @@ class Chaining(ToilTask): Task that launches the Chaining toil pipeline. This pipeline operates on all genomes at once to reduce the repeated downloading of the HAL file. 
""" + @staticmethod def get_args(pipeline_args): - base_dir = os.path.join(pipeline_args.work_dir, 'chaining') + base_dir = os.path.join(pipeline_args.work_dir, "chaining") ref_files = GenomeFiles.get_args(pipeline_args, pipeline_args.ref_genome) tgt_files = {genome: GenomeFiles.get_args(pipeline_args, genome) for genome in pipeline_args.target_genomes} tgt_two_bits = {genome: tgt_files[genome].two_bit for genome in pipeline_args.target_genomes} - chain_files = {genome: os.path.join(base_dir, '{}-{}.chain'.format(pipeline_args.ref_genome, genome)) - for genome in pipeline_args.target_genomes} + chain_files = { + genome: os.path.join(base_dir, "{}-{}.chain".format(pipeline_args.ref_genome, genome)) + for genome in pipeline_args.target_genomes + } args = tools.misc.HashableNamespace() args.hal = pipeline_args.hal args.ref_genome = pipeline_args.ref_genome @@ -1165,11 +1250,11 @@ def output(self): yield luigi.LocalTarget(path) def validate(self): - if not tools.misc.is_exec('halLiftover'): - raise ToolMissingException('halLiftover from the halTools package not in global path.') - for tool in ['pslPosTarget', 'axtChain', 'chainMergeSort']: + if not tools.misc.is_exec("halLiftover"): + raise ToolMissingException("halLiftover from the halTools package not in global path.") + for tool in ["pslPosTarget", "axtChain", "chainMergeSort"]: if not tools.misc.is_exec(tool): - raise ToolMissingException('{} from the Kent tools package not in global path.'.format(tool)) + raise ToolMissingException("{} from the Kent tools package not in global path.".format(tool)) def requires(self): yield self.clone(PrepareFiles) @@ -1177,21 +1262,22 @@ def requires(self): def run(self): self.validate() pipeline_args = self.get_pipeline_args() - logger.info('Launching Pairwise Chaining toil pipeline.') - toil_work_dir = os.path.join(self.work_dir, 'toil', 'chaining') + logger.info("Launching Pairwise Chaining toil pipeline.") + toil_work_dir = os.path.join(self.work_dir, "toil", "chaining") toil_options = self.prepare_toil_options(toil_work_dir) chain_args = self.get_args(pipeline_args) chaining(chain_args, toil_options) - logger.info('Pairwise Chaining toil pipeline is complete.') + logger.info("Pairwise Chaining toil pipeline is complete.") class TransMap(PipelineWrapperTask): """ Runs transMap. 
""" + @staticmethod def get_args(pipeline_args, genome): - base_dir = os.path.join(pipeline_args.work_dir, 'transMap') + base_dir = os.path.join(pipeline_args.work_dir, "transMap") ref_files = ReferenceFiles.get_args(pipeline_args) args = tools.misc.HashableNamespace() args.two_bit = GenomeFiles.get_args(pipeline_args, genome).two_bit @@ -1199,12 +1285,12 @@ def get_args(pipeline_args, genome): args.transcript_fasta = ref_files.transcript_fasta args.ref_psl = ref_files.ref_psl args.annotation_gp = ref_files.annotation_gp - args.tm_psl = os.path.join(base_dir, genome + '.psl') - args.tm_gp = os.path.join(base_dir, genome + '.gp') - args.tm_gtf = os.path.join(base_dir, genome + '.gtf') - args.filtered_tm_psl = os.path.join(base_dir, genome + '.filtered.psl') - args.filtered_tm_gp = os.path.join(base_dir, genome + '.filtered.gp') - args.metrics_json = os.path.join(PipelineTask.get_metrics_dir(pipeline_args, genome), 'filter_tm_metrics.json') + args.tm_psl = os.path.join(base_dir, genome + ".psl") + args.tm_gp = os.path.join(base_dir, genome + ".gp") + args.tm_gtf = os.path.join(base_dir, genome + ".gtf") + args.filtered_tm_psl = os.path.join(base_dir, genome + ".filtered.psl") + args.filtered_tm_gp = os.path.join(base_dir, genome + ".filtered.gp") + args.metrics_json = os.path.join(PipelineTask.get_metrics_dir(pipeline_args, genome), "filter_tm_metrics.json") args.ref_db_path = pipeline_args.dbs[pipeline_args.ref_genome] args.db_path = pipeline_args.dbs[genome] args.global_near_best = pipeline_args.global_near_best @@ -1213,9 +1299,9 @@ def get_args(pipeline_args, genome): return args def validate(self): - for tool in ['pslMap', 'pslRecalcMatch', 'pslMapPostChain', 'pslCDnaFilter', 'clusterGenes']: + for tool in ["pslMap", "pslRecalcMatch", "pslMapPostChain", "pslCDnaFilter", "clusterGenes"]: if not tools.misc.is_exec(tool): - raise ToolMissingException('{} from the Kent tools package not in global path.'.format(tool)) + raise ToolMissingException("{} from the Kent tools package not in global path.".format(tool)) def requires(self): self.validate() @@ -1230,6 +1316,7 @@ class TransMapPsl(PipelineTask): """ Runs transMap. 
Requires Kent tools pslMap, pslMapPostChain, pslRecalcMatch, transMapPslToGenePred """ + genome = luigi.Parameter() def output(self): @@ -1241,23 +1328,31 @@ def requires(self): def run(self): tm_args = self.get_module_args(TransMap, genome=self.genome) - logger.info('Running transMap for {}.'.format(self.genome)) - cmd = [['pslMap', '-chainMapFile', tm_args.ref_psl, tm_args.chain_file, '/dev/stdout'], - ['pslMapPostChain', '/dev/stdin', '/dev/stdout'], - ['sort', '-k14,14', '-k16,16n'], - ['pslRecalcMatch', '/dev/stdin', tm_args.two_bit, tm_args.transcript_fasta, 'stdout'], - ['sort', '-k10,10']] # re-sort back to query name for filtering + logger.info("Running transMap for {}.".format(self.genome)) + cmd = [ + ["pslMap", "-chainMapFile", tm_args.ref_psl, tm_args.chain_file, "/dev/stdout"], + ["pslMapPostChain", "/dev/stdin", "/dev/stdout"], + ["sort", "-k14,14", "-k16,16n"], + ["pslRecalcMatch", "/dev/stdin", tm_args.two_bit, tm_args.transcript_fasta, "stdout"], + ["sort", "-k10,10"], + ] # re-sort back to query name for filtering tmp_file = luigi.LocalTarget(is_tmp=True) - with tmp_file.open('w') as tmp_fh: - tools.procOps.run_proc(cmd, stdout=tmp_fh, stderr='/dev/null') + with tmp_file.open("w") as tmp_fh: + tools.procOps.run_proc(cmd, stdout=tmp_fh, stderr="/dev/null") tm_psl_tgt, tm_gp_tgt = self.output() tools.fileOps.ensure_file_dir(tm_psl_tgt.path) - with tm_psl_tgt.open('w') as outf: + with tm_psl_tgt.open("w") as outf: for psl_rec in tools.psl.psl_iterator(tmp_file.path, make_unique=True): tools.fileOps.print_row(outf, psl_rec.psl_string()) - with tm_gp_tgt.open('w') as outf: - cmd = ['transMapPslToGenePred', '-nonCodingGapFillMax=80', '-codingGapFillMax=50', - tm_args.annotation_gp, tm_psl_tgt.path, '/dev/stdout'] + with tm_gp_tgt.open("w") as outf: + cmd = [ + "transMapPslToGenePred", + "-nonCodingGapFillMax=80", + "-codingGapFillMax=50", + tm_args.annotation_gp, + tm_psl_tgt.path, + "/dev/stdout", + ] tools.procOps.run_proc(cmd, stdout=outf) @@ -1266,35 +1361,53 @@ class FilterTransMap(PipelineTask): """ Filters transMap output using the localNearBest algorithm. 
""" + eval_table = tools.sqlInterface.TmFilterEval.__tablename__ def output(self): pipeline_args = self.get_pipeline_args() tm_args = self.get_module_args(TransMap, genome=self.genome) tools.fileOps.ensure_file_dir(tm_args.db_path) - conn_str = 'sqlite:///{}'.format(tm_args.db_path) + conn_str = "sqlite:///{}".format(tm_args.db_path) tm_args = self.get_module_args(TransMap, genome=self.genome) - return (luigi.contrib.sqla.SQLAlchemyTarget(connection_string=conn_str, - target_table=self.eval_table, - update_id='_'.join([self.eval_table, str(hash(pipeline_args))])), - luigi.LocalTarget(tm_args.filtered_tm_psl), - luigi.LocalTarget(tm_args.metrics_json), - luigi.LocalTarget(tm_args.filtered_tm_gp)) + return ( + luigi.contrib.sqla.SQLAlchemyTarget( + connection_string=conn_str, + target_table=self.eval_table, + update_id="_".join([self.eval_table, str(hash(pipeline_args))]), + ), + luigi.LocalTarget(tm_args.filtered_tm_psl), + luigi.LocalTarget(tm_args.metrics_json), + luigi.LocalTarget(tm_args.filtered_tm_gp), + ) def run(self): tm_args = self.get_module_args(TransMap, genome=self.genome) - logger.info('Filtering transMap PSL for {}.'.format(self.genome)) + logger.info("Filtering transMap PSL for {}.".format(self.genome)) table_target, psl_target, json_target, gp_target = self.output() - resolved_df = filter_transmap(tm_args.tm_psl, tm_args.ref_psl, tm_args.tm_gp, - tm_args.ref_db_path, psl_target, tm_args.global_near_best, - tm_args.filter_overlapping_genes, tm_args.overlapping_gene_distance, - json_target) + resolved_df = filter_transmap( + tm_args.tm_psl, + tm_args.ref_psl, + tm_args.tm_gp, + tm_args.ref_db_path, + psl_target, + tm_args.global_near_best, + tm_args.filter_overlapping_genes, + tm_args.overlapping_gene_distance, + json_target, + ) with tools.sqlite.ExclusiveSqlConnection(tm_args.db_path) as engine: - resolved_df.to_sql(self.eval_table, engine, if_exists='replace') + resolved_df.to_sql(self.eval_table, engine, if_exists="replace") table_target.touch() - with gp_target.open('w') as outf: - cmd = ['transMapPslToGenePred', '-nonCodingGapFillMax=80', '-codingGapFillMax=50', - tm_args.annotation_gp, psl_target.path, '/dev/stdout'] + with gp_target.open("w") as outf: + cmd = [ + "transMapPslToGenePred", + "-nonCodingGapFillMax=80", + "-codingGapFillMax=50", + tm_args.annotation_gp, + psl_target.path, + "/dev/stdout", + ] tools.procOps.run_proc(cmd, stdout=outf) @@ -1303,16 +1416,23 @@ class TransMapGtf(PipelineTask): """ Converts the unfiltered transMap PSL to GTF """ + def output(self): tm_args = self.get_module_args(TransMap, genome=self.genome) return luigi.LocalTarget(tm_args.tm_gtf) def run(self): tm_args = self.get_module_args(TransMap, genome=self.genome) - logger.info('Creating unfiltered transMap GTF for {}.'.format(self.genome)) + logger.info("Creating unfiltered transMap GTF for {}.".format(self.genome)) tmp_gp = luigi.LocalTarget(is_tmp=True) - cmd = ['transMapPslToGenePred', '-nonCodingGapFillMax=80', '-codingGapFillMax=50', - tm_args.annotation_gp, tm_args.tm_psl, tmp_gp.path] + cmd = [ + "transMapPslToGenePred", + "-nonCodingGapFillMax=80", + "-codingGapFillMax=50", + tm_args.annotation_gp, + tm_args.tm_psl, + tmp_gp.path, + ] tools.procOps.run_proc(cmd) tools.misc.convert_gp_gtf(self.output(), tmp_gp) @@ -1321,6 +1441,7 @@ class EvaluateTransMap(PipelineWrapperTask): """ Evaluates transMap derived transcripts (cat/classify.py) """ + @staticmethod def get_args(pipeline_args, genome): tm_args = TransMap.get_args(pipeline_args, genome) @@ -1351,6 +1472,7 @@ class 
EvaluateTransMapDriverTask(PipelineTask): """ Task for per-genome analysis of transMap derived transcripts (cat/classify.py) """ + genome = luigi.Parameter() tm_eval_args = luigi.Parameter() table = tools.sqlInterface.TmEval.__tablename__ @@ -1358,23 +1480,25 @@ class EvaluateTransMapDriverTask(PipelineTask): def write_to_sql(self, df): """Load the results into the SQLite database""" with tools.sqlite.ExclusiveSqlConnection(self.tm_eval_args.db_path) as engine: - df.to_sql(self.table, engine, if_exists='replace') + df.to_sql(self.table, engine, if_exists="replace") self.output().touch() - logger.info('Loaded table: {}.{}'.format(self.genome, self.table)) + logger.info("Loaded table: {}.{}".format(self.genome, self.table)) def output(self): pipeline_args = self.get_pipeline_args() tools.fileOps.ensure_file_dir(self.tm_eval_args.db_path) - conn_str = 'sqlite:///{}'.format(self.tm_eval_args.db_path) - return luigi.contrib.sqla.SQLAlchemyTarget(connection_string=conn_str, - target_table=self.table, - update_id='_'.join([self.table, str(hash(pipeline_args))])) + conn_str = "sqlite:///{}".format(self.tm_eval_args.db_path) + return luigi.contrib.sqla.SQLAlchemyTarget( + connection_string=conn_str, + target_table=self.table, + update_id="_".join([self.table, str(hash(pipeline_args))]), + ) def requires(self): return self.clone(TransMap), self.clone(ReferenceFiles) def run(self): - logger.info('Evaluating transMap results for {}.'.format(self.genome)) + logger.info("Evaluating transMap results for {}.".format(self.genome)) results = transmap_classify(self.tm_eval_args) self.write_to_sql(results) @@ -1383,9 +1507,10 @@ class Augustus(PipelineWrapperTask): """ Runs AugustusTM(R) on the coding output from transMap. """ + @staticmethod def get_args(pipeline_args, genome): - base_dir = os.path.join(pipeline_args.work_dir, 'augustus') + base_dir = os.path.join(pipeline_args.work_dir, "augustus") args = tools.misc.HashableNamespace() args.ref_genome = pipeline_args.ref_genome args.genome = genome @@ -1395,8 +1520,8 @@ def get_args(pipeline_args, genome): tm_args = TransMap.get_args(pipeline_args, genome) args.ref_psl = tm_args.ref_psl args.filtered_tm_psl = tm_args.filtered_tm_psl - args.augustus_tm_gp = os.path.join(base_dir, genome + '.augTM.gp') - args.augustus_tm_gtf = os.path.join(base_dir, genome + '.augTM.gtf') + args.augustus_tm_gp = os.path.join(base_dir, genome + ".augTM.gp") + args.augustus_tm_gtf = os.path.join(base_dir, genome + ".augTM.gtf") args.tm_cfg = pipeline_args.tm_cfg args.tmr_cfg = pipeline_args.tmr_cfg args.augustus_species = pipeline_args.augustus_species @@ -1405,14 +1530,14 @@ def get_args(pipeline_args, genome): args.augustus_hints_db = pipeline_args.hints_db args.augustus_tmr = genome in pipeline_args.rnaseq_genomes if args.augustus_tmr: - args.augustus_tmr_gp = os.path.join(base_dir, genome + '.augTMR.gp') - args.augustus_tmr_gtf = os.path.join(base_dir, genome + '.augTMR.gtf') + args.augustus_tmr_gp = os.path.join(base_dir, genome + ".augTMR.gp") + args.augustus_tmr_gtf = os.path.join(base_dir, genome + ".augTMR.gtf") return args def validate(self): - for tool in ['augustus', 'transMap2hints.pl']: + for tool in ["augustus", "transMap2hints.pl"]: if not tools.misc.is_exec(tool): - raise ToolMissingException('Auxiliary program {} from the Augustus package not in path.'.format(tool)) + raise ToolMissingException("Auxiliary program {} from the Augustus package not in path.".format(tool)) def requires(self): self.validate() @@ -1425,6 +1550,7 @@ class 
AugustusDriverTask(ToilTask): """ Task for per-genome launching of a toil pipeline for running Augustus. """ + genome = luigi.Parameter() def output(self): @@ -1442,22 +1568,22 @@ def requires(self): def extract_coding_genes(self, augustus_args): """extracts only coding genes from the input genePred, returning a path to a tmp file""" coding_gp = tools.fileOps.get_tmp_file() - with open(coding_gp, 'w') as outf: + with open(coding_gp, "w") as outf: for tx in tools.transcripts.gene_pred_iterator(augustus_args.filtered_tm_gp): if tx.cds_size > 0: tools.fileOps.print_row(outf, tx.get_gene_pred()) if os.path.getsize(coding_gp) == 0: - raise InvalidInputException('Unable to extract coding transcripts from the filtered transMap genePred.') + raise InvalidInputException("Unable to extract coding transcripts from the filtered transMap genePred.") return coding_gp def run(self): - toil_work_dir = os.path.join(self.work_dir, 'toil', 'augustus', self.genome) - logger.info('Launching AugustusTMR toil pipeline on {}.'.format(self.genome)) + toil_work_dir = os.path.join(self.work_dir, "toil", "augustus", self.genome) + logger.info("Launching AugustusTMR toil pipeline on {}.".format(self.genome)) toil_options = self.prepare_toil_options(toil_work_dir) augustus_args = self.get_module_args(Augustus, genome=self.genome) coding_gp = self.extract_coding_genes(augustus_args) augustus(augustus_args, coding_gp, toil_options) - logger.info('Augustus toil pipeline for {} completed.'.format(self.genome)) + logger.info("Augustus toil pipeline for {} completed.".format(self.genome)) os.remove(coding_gp) for out_gp, out_gtf in tools.misc.pairwise(self.output()): tools.misc.convert_gtf_gp(out_gp, out_gtf) @@ -1467,15 +1593,16 @@ class AugustusCgp(ToilTask): """ Task for launching the AugustusCGP toil pipeline """ + @staticmethod def get_args(pipeline_args): genomes = list(pipeline_args.target_genomes) + [pipeline_args.ref_genome] fasta_files = {genome: GenomeFiles.get_args(pipeline_args, genome).fasta for genome in genomes} - base_dir = os.path.join(pipeline_args.work_dir, 'augustus_cgp') + base_dir = os.path.join(pipeline_args.work_dir, "augustus_cgp") # output - output_gp_files = {genome: os.path.join(base_dir, genome + '.augCGP.gp') for genome in genomes} - output_gtf_files = {genome: os.path.join(base_dir, genome + '.augCGP.gtf') for genome in genomes} - raw_output_gtf_files = {genome: os.path.join(base_dir, genome + '.raw.augCGP.gtf') for genome in genomes} + output_gp_files = {genome: os.path.join(base_dir, genome + ".augCGP.gp") for genome in genomes} + output_gtf_files = {genome: os.path.join(base_dir, genome + ".augCGP.gtf") for genome in genomes} + raw_output_gtf_files = {genome: os.path.join(base_dir, genome + ".raw.augCGP.gtf") for genome in genomes} args = tools.misc.HashableNamespace() args.genomes = genomes args.annotate_ancestors = pipeline_args.annotate_ancestors @@ -1485,13 +1612,13 @@ def get_args(pipeline_args): args.augustus_cgp_gp = output_gp_files args.augustus_cgp_gtf = output_gtf_files args.augustus_cgp_raw_gtf = raw_output_gtf_files - args.stdout_file = os.path.join(base_dir, 'CGP_stdout.txt') + args.stdout_file = os.path.join(base_dir, "CGP_stdout.txt") args.species = pipeline_args.augustus_species args.chunksize = pipeline_args.maf_chunksize args.overlap = pipeline_args.maf_overlap args.cgp_param = pipeline_args.cgp_param if args.cgp_param is None: - args.param_out_path = os.path.join(base_dir, 'trained_parameters.cfg') + args.param_out_path = os.path.join(base_dir, "trained_parameters.cfg") 
args.num_exons = pipeline_args.cgp_train_num_exons args.hints_db = pipeline_args.hints_db args.query_sizes = GenomeFiles.get_args(pipeline_args, pipeline_args.ref_genome).sizes @@ -1506,9 +1633,9 @@ def output(self): yield luigi.LocalTarget(path) def validate(self): - for tool in ['joingenes', 'augustus', 'hal2maf']: + for tool in ["joingenes", "augustus", "hal2maf"]: if not tools.misc.is_exec(tool): - raise ToolMissingException('tool {} not in global path.'.format(tool)) + raise ToolMissingException("tool {} not in global path.".format(tool)) def requires(self): yield self.clone(TransMap), self.clone(ReferenceFiles), self.clone(BuildDb) @@ -1516,39 +1643,41 @@ def requires(self): def prepare_cgp_cfg(self, pipeline_args): """use the config template to create a config file""" # bam genomes have IsoSeq and/or at least one BAM - bam_genomes = (pipeline_args.rnaseq_genomes | pipeline_args.isoseq_genomes) - \ - (pipeline_args.annotation_genomes | pipeline_args.intron_only_genomes) + bam_genomes = (pipeline_args.rnaseq_genomes | pipeline_args.isoseq_genomes) - ( + pipeline_args.annotation_genomes | pipeline_args.intron_only_genomes + ) # intron only genomes have only intron hints intron_only_genomes = pipeline_args.intron_only_genomes - (bam_genomes | pipeline_args.annotation_genomes) if not tools.mathOps.all_disjoint([bam_genomes, intron_only_genomes, pipeline_args.annotation_genomes]): - raise UserException('Error in CGP configuration. Not all genome groups are disjoint.') + raise UserException("Error in CGP configuration. Not all genome groups are disjoint.") # if --target-genomes is set, remove these genomes from the groups target_genomes = set(pipeline_args.target_genomes) target_genomes.add(pipeline_args.ref_genome) annotation_genomes = pipeline_args.annotation_genomes & target_genomes bam_genomes = bam_genomes & target_genomes intron_only_genomes = intron_only_genomes & target_genomes - annotation_genomes = 'none' if len(pipeline_args.annotation_genomes) == 0 else ' '.join(annotation_genomes) - bam_genomes = 'none' if len(bam_genomes) == 0 else ' '.join(bam_genomes) - intron_only_genomes = 'none' if len(intron_only_genomes) == 0 else ' '.join(intron_only_genomes) + annotation_genomes = "none" if len(pipeline_args.annotation_genomes) == 0 else " ".join(annotation_genomes) + bam_genomes = "none" if len(bam_genomes) == 0 else " ".join(bam_genomes) + intron_only_genomes = "none" if len(intron_only_genomes) == 0 else " ".join(intron_only_genomes) template = open(pipeline_args.augustus_cgp_cfg_template).read() - cfg = template.format(annotation_genomes=annotation_genomes, target_genomes=bam_genomes, - intron_target_genomes=intron_only_genomes) + cfg = template.format( + annotation_genomes=annotation_genomes, target_genomes=bam_genomes, intron_target_genomes=intron_only_genomes + ) out_path = tools.fileOps.get_tmp_file() - with open(out_path, 'w') as outf: + with open(out_path, "w") as outf: outf.write(cfg) return out_path def run(self): self.validate() pipeline_args = self.get_pipeline_args() - logger.info('Launching AugustusCGP toil pipeline.') - toil_work_dir = os.path.join(self.work_dir, 'toil', 'augustus_cgp') + logger.info("Launching AugustusCGP toil pipeline.") + toil_work_dir = os.path.join(self.work_dir, "toil", "augustus_cgp") toil_options = self.prepare_toil_options(toil_work_dir) cgp_args = self.get_args(pipeline_args) cgp_args.cgp_cfg = self.prepare_cgp_cfg(pipeline_args) augustus_cgp(cgp_args, toil_options) - logger.info('Finished AugustusCGP toil pipeline.') + 
        logger.info("Finished AugustusCGP toil pipeline.")


 class AugustusPb(PipelineWrapperTask):
@@ -1556,9 +1685,10 @@ class AugustusPb(PipelineWrapperTask):
     Runs AugustusPB. This mode is done on a per-genome basis, but ignores transMap information
     and relies only on a combination of IsoSeq and RNA-seq
     """
+
     @staticmethod
     def get_args(pipeline_args, genome):
-        base_dir = os.path.join(pipeline_args.work_dir, 'augustus_pb')
+        base_dir = os.path.join(pipeline_args.work_dir, "augustus_pb")
         args = tools.misc.HashableNamespace()
         args.genome = genome
         genome_files = GenomeFiles.get_args(pipeline_args, genome)
@@ -1569,17 +1699,17 @@ def get_args(pipeline_args, genome):
         args.overlap = pipeline_args.pb_genome_overlap
         args.species = pipeline_args.augustus_species
         args.hints_gff = BuildDb.get_args(pipeline_args, genome).hints_path
-        args.augustus_pb_gtf = os.path.join(base_dir, genome + '.augPB.gtf')
-        args.augustus_pb_gp = os.path.join(base_dir, genome + '.augPB.gp')
-        args.augustus_pb_raw_gtf = os.path.join(base_dir, genome + '.raw.augPB.gtf')
+        args.augustus_pb_gtf = os.path.join(base_dir, genome + ".augPB.gtf")
+        args.augustus_pb_gp = os.path.join(base_dir, genome + ".augPB.gp")
+        args.augustus_pb_raw_gtf = os.path.join(base_dir, genome + ".raw.augPB.gtf")
         # invert the UTR flag
         args.utr = not pipeline_args.augustus_utr_off
         return args

     def validate(self):
-        for tool in ['augustus', 'joingenes']:
+        for tool in ["augustus", "joingenes"]:
             if not tools.misc.is_exec(tool):
-                raise ToolMissingException('Auxiliary program {} from the Augustus package not in path.'.format(tool))
+                raise ToolMissingException("Auxiliary program {} from the Augustus package not in path.".format(tool))

     def requires(self):
         self.validate()
@@ -1592,6 +1722,7 @@ class AugustusPbDriverTask(ToilTask):
     """
     Task for per-genome launching of a toil pipeline for running AugustusPB.
     """
+
     genome = luigi.Parameter()

     def output(self):
@@ -1605,90 +1736,117 @@ def requires(self):
         return self.clone(TransMap), self.clone(BuildDb)

     def run(self):
-        toil_work_dir = os.path.join(self.work_dir, 'toil', 'augustus_pb', self.genome)
-        logger.info('Launching AugustusPB toil pipeline on {}.'.format(self.genome))
+        toil_work_dir = os.path.join(self.work_dir, "toil", "augustus_pb", self.genome)
+        logger.info("Launching AugustusPB toil pipeline on {}.".format(self.genome))
         toil_options = self.prepare_toil_options(toil_work_dir)
         augustus_pb_args = self.get_module_args(AugustusPb, genome=self.genome)
         augustus_pb(augustus_pb_args, toil_options)
-        if 'stats_path' in augustus_pb_args:
+        if "stats_path" in augustus_pb_args:
             self.get_stats(toil_options, augustus_pb_args.stat_file)
-        logger.info('Finished AugustusPB toil pipeline on {}.'.format(self.genome))
+        logger.info("Finished AugustusPB toil pipeline on {}.".format(self.genome))


 class FindDenovoParents(PipelineTask):
     """Task for finding parental gene candidates for denovo predictions.
Flags possible fusions""" + mode = luigi.Parameter() @staticmethod def get_args(pipeline_args, mode): args = tools.misc.HashableNamespace() - if mode == 'augPB': + if mode == "augPB": args.tablename = tools.sqlInterface.AugPbAlternativeGenes.__tablename__ - args.gps = {genome: AugustusPb.get_args(pipeline_args, genome).augustus_pb_gp - for genome in set(pipeline_args.target_genomes) & pipeline_args.isoseq_genomes} - args.filtered_tm_gps = {genome: TransMap.get_args(pipeline_args, genome).filtered_tm_gp - for genome in set(pipeline_args.target_genomes) & pipeline_args.isoseq_genomes} - args.unfiltered_tm_gps = {genome: TransMap.get_args(pipeline_args, genome).tm_gp - for genome in set(pipeline_args.target_genomes) & pipeline_args.isoseq_genomes} - args.chrom_sizes = {genome: GenomeFiles.get_args(pipeline_args, genome).sizes - for genome in set(pipeline_args.target_genomes) & pipeline_args.isoseq_genomes} + args.gps = { + genome: AugustusPb.get_args(pipeline_args, genome).augustus_pb_gp + for genome in set(pipeline_args.target_genomes) & pipeline_args.isoseq_genomes + } + args.filtered_tm_gps = { + genome: TransMap.get_args(pipeline_args, genome).filtered_tm_gp + for genome in set(pipeline_args.target_genomes) & pipeline_args.isoseq_genomes + } + args.unfiltered_tm_gps = { + genome: TransMap.get_args(pipeline_args, genome).tm_gp + for genome in set(pipeline_args.target_genomes) & pipeline_args.isoseq_genomes + } + args.chrom_sizes = { + genome: GenomeFiles.get_args(pipeline_args, genome).sizes + for genome in set(pipeline_args.target_genomes) & pipeline_args.isoseq_genomes + } if pipeline_args.ref_genome in pipeline_args.isoseq_genomes: # add the reference annotation as a pseudo-transMap to assign parents in reference args.filtered_tm_gps[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp args.unfiltered_tm_gps[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp - args.chrom_sizes[pipeline_args.ref_genome] = GenomeFiles.get_args(pipeline_args, pipeline_args.ref_genome).sizes - args.gps[pipeline_args.ref_genome] = AugustusPb.get_args(pipeline_args, pipeline_args.ref_genome).augustus_pb_gp - elif mode == 'augCGP': + args.chrom_sizes[pipeline_args.ref_genome] = GenomeFiles.get_args( + pipeline_args, pipeline_args.ref_genome + ).sizes + args.gps[pipeline_args.ref_genome] = AugustusPb.get_args( + pipeline_args, pipeline_args.ref_genome + ).augustus_pb_gp + elif mode == "augCGP": args.tablename = tools.sqlInterface.AugCgpAlternativeGenes.__tablename__ args.gps = AugustusCgp.get_args(pipeline_args).augustus_cgp_gp - filtered_tm_gp_files = {genome: TransMap.get_args(pipeline_args, genome).filtered_tm_gp - for genome in pipeline_args.target_genomes} - unfiltered_tm_gp_files = {genome: TransMap.get_args(pipeline_args, genome).tm_gp - for genome in pipeline_args.target_genomes} + filtered_tm_gp_files = { + genome: TransMap.get_args(pipeline_args, genome).filtered_tm_gp + for genome in pipeline_args.target_genomes + } + unfiltered_tm_gp_files = { + genome: TransMap.get_args(pipeline_args, genome).tm_gp for genome in pipeline_args.target_genomes + } # add the reference annotation as a pseudo-transMap to assign parents in reference filtered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp unfiltered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp args.filtered_tm_gps = filtered_tm_gp_files args.unfiltered_tm_gps = unfiltered_tm_gp_files - args.chrom_sizes = 
{genome: GenomeFiles.get_args(pipeline_args, genome).sizes - for genome in list(pipeline_args.target_genomes) + [pipeline_args.ref_genome]} - elif mode == 'exRef': + args.chrom_sizes = { + genome: GenomeFiles.get_args(pipeline_args, genome).sizes + for genome in list(pipeline_args.target_genomes) + [pipeline_args.ref_genome] + } + elif mode == "exRef": args.tablename = tools.sqlInterface.ExRefAlternativeGenes.__tablename__ - args.gps = {genome: ExternalReferenceFiles.get_args(pipeline_args, genome).annotation_gp - for genome in pipeline_args.external_ref_genomes} - filtered_tm_gp_files = {genome: TransMap.get_args(pipeline_args, genome).filtered_tm_gp - for genome in pipeline_args.external_ref_genomes} - unfiltered_tm_gp_files = {genome: TransMap.get_args(pipeline_args, genome).tm_gp - for genome in pipeline_args.external_ref_genomes} + args.gps = { + genome: ExternalReferenceFiles.get_args(pipeline_args, genome).annotation_gp + for genome in pipeline_args.external_ref_genomes + } + filtered_tm_gp_files = { + genome: TransMap.get_args(pipeline_args, genome).filtered_tm_gp + for genome in pipeline_args.external_ref_genomes + } + unfiltered_tm_gp_files = { + genome: TransMap.get_args(pipeline_args, genome).tm_gp for genome in pipeline_args.external_ref_genomes + } # add the reference annotation as a pseudo-transMap to assign parents in reference filtered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp unfiltered_tm_gp_files[pipeline_args.ref_genome] = ReferenceFiles.get_args(pipeline_args).annotation_gp args.filtered_tm_gps = filtered_tm_gp_files args.unfiltered_tm_gps = unfiltered_tm_gp_files - args.chrom_sizes = {genome: GenomeFiles.get_args(pipeline_args, genome).sizes - for genome in list(pipeline_args.target_genomes) + [pipeline_args.ref_genome]} + args.chrom_sizes = { + genome: GenomeFiles.get_args(pipeline_args, genome).sizes + for genome in list(pipeline_args.target_genomes) + [pipeline_args.ref_genome] + } else: - raise Exception('Invalid mode passed to FindDenovoParents') + raise Exception("Invalid mode passed to FindDenovoParents") return args def requires(self): - if self.mode == 'augPB': + if self.mode == "augPB": yield self.clone(AugustusPb) - elif self.mode == 'augCGP': + elif self.mode == "augCGP": yield self.clone(AugustusCgp) - elif self.mode == 'exRef': + elif self.mode == "exRef": yield self.clone(PrepareFiles) else: - raise Exception('Invalid mode passed to FindDenovoParents') + raise Exception("Invalid mode passed to FindDenovoParents") yield self.clone(TransMap) def get_table_targets(self, genome, tablename, pipeline_args): db = pipeline_args.dbs[genome] tools.fileOps.ensure_file_dir(db) - conn_str = 'sqlite:///{}'.format(db) - return luigi.contrib.sqla.SQLAlchemyTarget(connection_string=conn_str, - target_table=tablename, - update_id='_'.join([tablename, str(hash(pipeline_args))])) + conn_str = "sqlite:///{}".format(db) + return luigi.contrib.sqla.SQLAlchemyTarget( + connection_string=conn_str, + target_table=tablename, + update_id="_".join([tablename, str(hash(pipeline_args))]), + ) def output(self): pipeline_args = self.get_pipeline_args() @@ -1707,16 +1865,15 @@ def run(self): df = assign_parents(filtered_tm_gp, unfiltered_tm_gp, chrom_sizes, denovo_gp) db = pipeline_args.dbs[genome] with tools.sqlite.ExclusiveSqlConnection(db) as engine: - df.to_sql(denovo_args.tablename, engine, if_exists='replace') + df.to_sql(denovo_args.tablename, engine, if_exists="replace") table_target.touch() counts = 
collections.Counter(df.ResolutionMethod) - log_msg = 'Loaded table: {}.{}. Results: {}' - assigned_str = '{}: {:,}'.format('assigned', counts[None]) + log_msg = "Loaded table: {}.{}. Results: {}" + assigned_str = "{}: {:,}".format("assigned", counts[None]) log_msg = log_msg.format(genome, denovo_args.tablename, assigned_str) - result_str = ', '.join(['{}: {:,}'.format(name, val) - for name, val in counts.items() if name is not None]) + result_str = ", ".join(["{}: {:,}".format(name, val) for name, val in counts.items() if name is not None]) if len(result_str) > 0: - log_msg += ', ' + result_str + '.' + log_msg += ", " + result_str + "." logger.info(log_msg) @@ -1726,45 +1883,50 @@ class Hgm(PipelineWrapperTask): support across all species. It will be launched once for each of transMap, AugustusTM, AugustusTMR, AugustusCGP """ + @staticmethod def get_args(pipeline_args, mode): - base_dir = os.path.join(pipeline_args.work_dir, 'hgm', mode) - if mode == 'augCGP': + base_dir = os.path.join(pipeline_args.work_dir, "hgm", mode) + if mode == "augCGP": # add reference to the target genomes tgt_genomes = list(pipeline_args.target_genomes) + [pipeline_args.ref_genome] - gtf_in_files = {genome: AugustusCgp.get_args(pipeline_args).augustus_cgp_gtf[genome] - for genome in tgt_genomes} - elif mode == 'augTM': + gtf_in_files = { + genome: AugustusCgp.get_args(pipeline_args).augustus_cgp_gtf[genome] for genome in tgt_genomes + } + elif mode == "augTM": tgt_genomes = pipeline_args.target_genomes - gtf_in_files = {genome: Augustus.get_args(pipeline_args, genome).augustus_tm_gtf - for genome in tgt_genomes} - elif mode == 'augTMR': + gtf_in_files = {genome: Augustus.get_args(pipeline_args, genome).augustus_tm_gtf for genome in tgt_genomes} + elif mode == "augTMR": # remove reference it may have RNA-seq - tgt_genomes = (pipeline_args.rnaseq_genomes & set(pipeline_args.target_genomes)) - {pipeline_args.ref_genome} - gtf_in_files = {genome: Augustus.get_args(pipeline_args, genome).augustus_tmr_gtf - for genome in tgt_genomes} - elif mode == 'augPB': + tgt_genomes = (pipeline_args.rnaseq_genomes & set(pipeline_args.target_genomes)) - { + pipeline_args.ref_genome + } + gtf_in_files = {genome: Augustus.get_args(pipeline_args, genome).augustus_tmr_gtf for genome in tgt_genomes} + elif mode == "augPB": # add reference genome to target_genomes, but then intersect with isoseq genomes - tgt_genomes = (set(pipeline_args.target_genomes) | {pipeline_args.ref_genome}) & pipeline_args.isoseq_genomes - gtf_in_files = {genome: AugustusPb.get_args(pipeline_args, genome).augustus_pb_gtf - for genome in tgt_genomes} - elif mode == 'transMap': + tgt_genomes = ( + set(pipeline_args.target_genomes) | {pipeline_args.ref_genome} + ) & pipeline_args.isoseq_genomes + gtf_in_files = { + genome: AugustusPb.get_args(pipeline_args, genome).augustus_pb_gtf for genome in tgt_genomes + } + elif mode == "transMap": tgt_genomes = pipeline_args.target_genomes - gtf_in_files = {genome: TransMap.get_args(pipeline_args, genome).tm_gtf - for genome in tgt_genomes} - elif mode == 'exRef': + gtf_in_files = {genome: TransMap.get_args(pipeline_args, genome).tm_gtf for genome in tgt_genomes} + elif mode == "exRef": tgt_genomes = pipeline_args.external_ref_genomes - gtf_in_files = {genome: ExternalReferenceFiles.get_args(pipeline_args, genome).annotation_gtf - for genome in tgt_genomes} + gtf_in_files = { + genome: ExternalReferenceFiles.get_args(pipeline_args, genome).annotation_gtf for genome in tgt_genomes + } else: - raise UserException('Invalid mode 
was passed to Hgm module: {}.'.format(mode)) + raise UserException("Invalid mode was passed to Hgm module: {}.".format(mode)) args = tools.misc.HashableNamespace() args.genomes = tgt_genomes args.ref_genome = pipeline_args.ref_genome args.hal = pipeline_args.hal args.in_gtf = gtf_in_files args.gtf_out_dir = base_dir - args.gtf_out_files = {genome: os.path.join(base_dir, genome + '.gtf') for genome in tgt_genomes} + args.gtf_out_files = {genome: os.path.join(base_dir, genome + ".gtf") for genome in tgt_genomes} args.hints_db = pipeline_args.hints_db args.annotation_gtf = ReferenceFiles.get_args(pipeline_args).annotation_gtf args.annotation_gp = ReferenceFiles.get_args(pipeline_args).annotation_gp @@ -1772,15 +1934,16 @@ def get_args(pipeline_args, mode): return args def validate(self): - for tool in ['homGeneMapping', 'join_mult_hints.pl']: + for tool in ["homGeneMapping", "join_mult_hints.pl"]: if not tools.misc.is_exec(tool): - raise ToolMissingException('auxiliary program {} from the Augustus ' - 'package not in global path.'.format(tool)) - if not tools.misc.is_exec('halLiftover'): - raise ToolMissingException('halLiftover from the halTools package not in global path.') - for tool in ['bedtools', 'bedSort']: + raise ToolMissingException( + "auxiliary program {} from the Augustus " "package not in global path.".format(tool) + ) + if not tools.misc.is_exec("halLiftover"): + raise ToolMissingException("halLiftover from the halTools package not in global path.") + for tool in ["bedtools", "bedSort"]: if not tools.misc.is_exec(tool): - raise ToolMissingException('{} is required for the homGeneMapping module.'.format(tool)) + raise ToolMissingException("{} is required for the homGeneMapping module.".format(tool)) def requires(self): pipeline_args = self.get_pipeline_args() @@ -1794,6 +1957,7 @@ class HgmDriverTask(PipelineTask): Task for running each individual instance of the Hgm pipeline. Dumps the results into a sqlite database Also produces a GTF file that is parsed into this database. 
""" + mode = luigi.Parameter() def output(self): @@ -1802,46 +1966,48 @@ def output(self): for genome in hgm_args.genomes: db = pipeline_args.dbs[genome] tools.fileOps.ensure_file_dir(db) - conn_str = 'sqlite:///{}'.format(db) - tablename = tools.sqlInterface.tables['hgm'][self.mode].__tablename__ - yield luigi.contrib.sqla.SQLAlchemyTarget(connection_string=conn_str, - target_table=tablename, - update_id='_'.join([tablename, str(hash(pipeline_args))])) + conn_str = "sqlite:///{}".format(db) + tablename = tools.sqlInterface.tables["hgm"][self.mode].__tablename__ + yield luigi.contrib.sqla.SQLAlchemyTarget( + connection_string=conn_str, + target_table=tablename, + update_id="_".join([tablename, str(hash(pipeline_args))]), + ) for f in hgm_args.gtf_out_files.values(): yield luigi.LocalTarget(f) def requires(self): - if self.mode == 'augCGP': + if self.mode == "augCGP": yield self.clone(AugustusCgp) - yield self.clone(FindDenovoParents, mode='augCGP') - elif self.mode == 'augTM' or self.mode == 'augTMR': + yield self.clone(FindDenovoParents, mode="augCGP") + elif self.mode == "augTM" or self.mode == "augTMR": yield self.clone(Augustus) - elif self.mode == 'transMap': + elif self.mode == "transMap": yield self.clone(TransMap) - elif self.mode == 'augPB': + elif self.mode == "augPB": yield self.clone(AugustusPb) - yield self.clone(FindDenovoParents, mode='augPB') - elif self.mode == 'exRef': - yield self.clone(FindDenovoParents, mode='exRef') + yield self.clone(FindDenovoParents, mode="augPB") + elif self.mode == "exRef": + yield self.clone(FindDenovoParents, mode="exRef") else: - raise UserException('Invalid mode passed to HgmDriverTask: {}.'.format(self.mode)) + raise UserException("Invalid mode passed to HgmDriverTask: {}.".format(self.mode)) yield self.clone(BuildDb) yield self.clone(ReferenceFiles) def run(self): - logger.info('Launching homGeneMapping for {}.'.format(self.mode)) + logger.info("Launching homGeneMapping for {}.".format(self.mode)) pipeline_args = self.get_pipeline_args() hgm_args = Hgm.get_args(pipeline_args, self.mode) hgm(hgm_args) # convert the output to a dataframe and write to the genome database databases = self.__class__.get_databases(pipeline_args) - tablename = tools.sqlInterface.tables['hgm'][self.mode].__tablename__ + tablename = tools.sqlInterface.tables["hgm"][self.mode].__tablename__ for genome, sqla_target in zip(*[hgm_args.genomes, self.output()]): df = parse_hgm_gtf(hgm_args.gtf_out_files[genome], genome) with tools.sqlite.ExclusiveSqlConnection(databases[genome]) as engine: - df.to_sql(tablename, engine, if_exists='replace') + df.to_sql(tablename, engine, if_exists="replace") sqla_target.touch() - logger.info('Loaded table: {}.{}'.format(genome, tablename)) + logger.info("Loaded table: {}.{}".format(genome, tablename)) class IsoSeqTranscripts(PipelineWrapperTask): @@ -1851,6 +2017,7 @@ class IsoSeqTranscripts(PipelineWrapperTask): These structures are analogous to Transcript objects. 
""" + @staticmethod def get_args(pipeline_args, genome): args = tools.misc.HashableNamespace() @@ -1868,6 +2035,7 @@ class IsoSeqTranscriptsDriverTask(PipelineTask): """ Driver task for IsoSeqTranscripts """ + genome = luigi.Parameter() tablename = tools.sqlInterface.IsoSeqExonStructures.__tablename__ @@ -1875,10 +2043,12 @@ def output(self): pipeline_args = self.get_pipeline_args() db = pipeline_args.dbs[self.genome] tools.fileOps.ensure_file_dir(db) - conn_str = 'sqlite:///{}'.format(db) - return luigi.contrib.sqla.SQLAlchemyTarget(connection_string=conn_str, - target_table=self.tablename, - update_id='_'.join([self.tablename, str(hash(pipeline_args))])) + conn_str = "sqlite:///{}".format(db) + return luigi.contrib.sqla.SQLAlchemyTarget( + connection_string=conn_str, + target_table=self.tablename, + update_id="_".join([self.tablename, str(hash(pipeline_args))]), + ) def requires(self): yield self.clone(BuildDb) @@ -1888,14 +2058,14 @@ def construct_intervals(self, hints): Converts hints derived from IsoSeq BAMs into discrete clusters of transcript objects. Merges all alignment gaps below 50bp and separates clusters over 100kb separated to avoid mega-transcripts for tandem gene families. """ - lines = [x.split() for x in open(hints) if 'PB' in x and '\texon\t' in x] + lines = [x.split() for x in open(hints) if "PB" in x and "\texon\t" in x] # group these exons by grp tag groups = collections.defaultdict(list) for l in lines: - attrs = dict([x.split('=') for x in l[-1].split(';')]) - if 'grp' not in attrs: # not all introns get confidently assigned a group + attrs = dict([x.split("=") for x in l[-1].split(";")]) + if "grp" not in attrs: # not all introns get confidently assigned a group continue - groups[attrs['grp']].append([l[0], int(l[3]) - 1, int(l[4])]) + groups[attrs["grp"]].append([l[0], int(l[3]) - 1, int(l[4])]) # for each grp, perform clustering with 100kb distance to separate contigs as well as disjoint mappings # to do this, we use the ClusterTree data structure from bx-python. 
@@ -1918,15 +2088,31 @@ def construct_intervals(self, hints): for grp, cluster_tree in cluster_trees[chrom].items(): for start, end, interval_indices in cluster_tree.getregions(): intervals = [interval_flat_list[i] for i in interval_indices] - intervals = {tools.intervals.ChromosomeInterval(chrom, start, stop, '.') - for chrom, start, stop in intervals} + intervals = { + tools.intervals.ChromosomeInterval(chrom, start, stop, ".") for chrom, start, stop in intervals + } intervals = tools.intervals.gap_merge_intervals(intervals, 50) txs.append(tools.transcripts.intervals_to_bed(intervals, name=grp)) # convert these to a dataframe for sql output txs = [x.get_bed() for x in txs] - df = pd.DataFrame(txs, columns=['chromosome', 'start', 'stop', 'name', 'score', 'strand', 'thickStart', - 'thickStop', 'rgb', 'blockCount', 'blockSizes', 'blockStarts']) + df = pd.DataFrame( + txs, + columns=[ + "chromosome", + "start", + "stop", + "name", + "score", + "strand", + "thickStart", + "thickStop", + "rgb", + "blockCount", + "blockSizes", + "blockStarts", + ], + ) return df def run(self): @@ -1934,18 +2120,19 @@ def run(self): intron_args = IsoSeqTranscripts.get_args(pipeline_args, self.genome) df = pd.DataFrame(self.construct_intervals(intron_args.hints_gff)) with tools.sqlite.ExclusiveSqlConnection(pipeline_args.dbs[self.genome]) as engine: - df.to_sql(self.tablename, engine, if_exists='replace') + df.to_sql(self.tablename, engine, if_exists="replace") self.output().touch() - logger.info('Loaded table {}.{}'.format(self.genome, self.tablename)) + logger.info("Loaded table {}.{}".format(self.genome, self.tablename)) class AlignTranscripts(PipelineWrapperTask): """ Aligns the transcripts from transMap/AugustusTMR to the parent transcript(s). """ + @staticmethod def get_args(pipeline_args, genome): - base_dir = os.path.join(pipeline_args.work_dir, 'transcript_alignment') + base_dir = os.path.join(pipeline_args.work_dir, "transcript_alignment") args = tools.misc.HashableNamespace() args.ref_genome = pipeline_args.ref_genome args.genome = genome @@ -1954,17 +2141,25 @@ def get_args(pipeline_args, genome): args.annotation_gp = ReferenceFiles.get_args(pipeline_args).annotation_gp args.ref_db_path = PipelineTask.get_database(pipeline_args, pipeline_args.ref_genome) # the alignment_modes members hold the input genePreds and the mRNA/CDS alignment output paths - args.transcript_modes = {'transMap': {'gp': TransMap.get_args(pipeline_args, genome).filtered_tm_gp, - 'mRNA': os.path.join(base_dir, genome + '.transMap.mRNA.psl'), - 'CDS': os.path.join(base_dir, genome + '.transMap.CDS.psl')}} + args.transcript_modes = { + "transMap": { + "gp": TransMap.get_args(pipeline_args, genome).filtered_tm_gp, + "mRNA": os.path.join(base_dir, genome + ".transMap.mRNA.psl"), + "CDS": os.path.join(base_dir, genome + ".transMap.CDS.psl"), + } + } if pipeline_args.augustus is True: - args.transcript_modes['augTM'] = {'gp': Augustus.get_args(pipeline_args, genome).augustus_tm_gp, - 'mRNA': os.path.join(base_dir, genome + '.augTM.mRNA.psl'), - 'CDS': os.path.join(base_dir, genome + '.augTM.CDS.psl')} + args.transcript_modes["augTM"] = { + "gp": Augustus.get_args(pipeline_args, genome).augustus_tm_gp, + "mRNA": os.path.join(base_dir, genome + ".augTM.mRNA.psl"), + "CDS": os.path.join(base_dir, genome + ".augTM.CDS.psl"), + } if pipeline_args.augustus is True and genome in pipeline_args.rnaseq_genomes: - args.transcript_modes['augTMR'] = {'gp': Augustus.get_args(pipeline_args, genome).augustus_tmr_gp, - 'mRNA': os.path.join(base_dir, 
genome + '.augTMR.mRNA.psl'), - 'CDS': os.path.join(base_dir, genome + '.augTMR.CDS.psl')} + args.transcript_modes["augTMR"] = { + "gp": Augustus.get_args(pipeline_args, genome).augustus_tmr_gp, + "mRNA": os.path.join(base_dir, genome + ".augTMR.mRNA.psl"), + "CDS": os.path.join(base_dir, genome + ".augTMR.CDS.psl"), + } return args def requires(self): @@ -1980,29 +2175,30 @@ class AlignTranscriptDriverTask(ToilTask): Each task returns a PSL of all alignments that will be analyzed next by EvaluateTranscripts. """ + genome = luigi.Parameter() def output(self): alignment_args = self.get_module_args(AlignTranscripts, genome=self.genome) for mode, paths in alignment_args.transcript_modes.items(): - for aln_type in ['CDS', 'mRNA']: + for aln_type in ["CDS", "mRNA"]: yield luigi.LocalTarget(paths[aln_type]) def requires(self): alignment_args = self.get_module_args(AlignTranscripts, genome=self.genome) - if 'augTM' in alignment_args.transcript_modes: + if "augTM" in alignment_args.transcript_modes: yield self.clone(Augustus) yield self.clone(TransMap) yield self.clone(ReferenceFiles) yield self.clone(GenomeFiles) def run(self): - logger.info('Launching Align Transcript toil pipeline for {} using {}.'.format(self.genome, self.batchSystem)) - toil_work_dir = os.path.join(self.work_dir, 'toil', 'transcript_alignment', self.genome) + logger.info("Launching Align Transcript toil pipeline for {} using {}.".format(self.genome, self.batchSystem)) + toil_work_dir = os.path.join(self.work_dir, "toil", "transcript_alignment", self.genome) toil_options = self.prepare_toil_options(toil_work_dir) alignment_args = self.get_module_args(AlignTranscripts, genome=self.genome) align_transcripts(alignment_args, toil_options) - logger.info('Align Transcript toil pipeline for {} completed.'.format(self.genome)) + logger.info("Align Transcript toil pipeline for {} completed.".format(self.genome)) class EvaluateTranscripts(PipelineWrapperTask): @@ -2011,6 +2207,7 @@ class EvaluateTranscripts(PipelineWrapperTask): Each task will generate a genome-specific sqlite database. See the classify.py docstring for details. """ + @staticmethod def get_args(pipeline_args, genome): args = tools.misc.HashableNamespace() @@ -2038,12 +2235,13 @@ class EvaluateDriverTask(PipelineTask): """ Task for per-genome launching of a toil pipeline for aligning transcripts to their parent. 
""" + genome = luigi.Parameter() def build_table_names(self, eval_args): """construct table names based on input arguments""" tables = [] - for aln_mode in ['mRNA', 'CDS']: + for aln_mode in ["mRNA", "CDS"]: for tx_mode in eval_args.transcript_modes.keys(): names = [x.__tablename__ for x in list(tools.sqlInterface.tables[aln_mode][tx_mode].values())] tables.extend(names) @@ -2058,25 +2256,25 @@ def write_to_sql(self, results, eval_args): with tools.sqlite.ExclusiveSqlConnection(eval_args.db_path) as engine: for table, target in self.pair_table_output(eval_args).items(): df = results[table] - df.to_sql(table, engine, if_exists='replace') + df.to_sql(table, engine, if_exists="replace") target.touch() - logger.info('Loaded table: {}.{}'.format(self.genome, table)) + logger.info("Loaded table: {}.{}".format(self.genome, table)) def output(self): pipeline_args = self.get_pipeline_args() eval_args = self.get_module_args(EvaluateTranscripts, genome=self.genome) tools.fileOps.ensure_file_dir(eval_args.db_path) - conn_str = 'sqlite:///{}'.format(eval_args.db_path) + conn_str = "sqlite:///{}".format(eval_args.db_path) for table in self.build_table_names(eval_args): - yield luigi.contrib.sqla.SQLAlchemyTarget(connection_string=conn_str, - target_table=table, - update_id='_'.join([table, str(hash(pipeline_args))])) + yield luigi.contrib.sqla.SQLAlchemyTarget( + connection_string=conn_str, target_table=table, update_id="_".join([table, str(hash(pipeline_args))]) + ) def requires(self): return self.clone(AlignTranscripts), self.clone(ReferenceFiles), self.clone(TransMap) def run(self): - logger.info('Evaluating transcript alignments for {}.'.format(self.genome)) + logger.info("Evaluating transcript alignments for {}.".format(self.genome)) eval_args = self.get_module_args(EvaluateTranscripts, genome=self.genome) results = classify(eval_args) # results should be a dictionary of {table: dataframe} @@ -2087,29 +2285,30 @@ class Consensus(PipelineWrapperTask): """ Construct the consensus gene sets making use of the classification databases. 
""" + @staticmethod def get_args(pipeline_args, genome): - base_dir = os.path.join(pipeline_args.out_dir, 'consensus_gene_set') + base_dir = os.path.join(pipeline_args.out_dir, "consensus_gene_set") # grab the genePred of every mode args = tools.misc.HashableNamespace() gp_list = [TransMap.get_args(pipeline_args, genome).filtered_tm_gp] - args.tx_modes = ['transMap'] + args.tx_modes = ["transMap"] args.denovo_tx_modes = [] if pipeline_args.augustus is True: gp_list.append(Augustus.get_args(pipeline_args, genome).augustus_tm_gp) - args.tx_modes.append('augTM') + args.tx_modes.append("augTM") if pipeline_args.augustus is True and genome in pipeline_args.rnaseq_genomes: gp_list.append(Augustus.get_args(pipeline_args, genome).augustus_tmr_gp) - args.tx_modes.append('augTMR') + args.tx_modes.append("augTMR") if pipeline_args.augustus_cgp is True: gp_list.append(AugustusCgp.get_args(pipeline_args).augustus_cgp_gp[genome]) - args.denovo_tx_modes.append('augCGP') + args.denovo_tx_modes.append("augCGP") if pipeline_args.augustus_pb is True and genome in pipeline_args.isoseq_genomes: gp_list.append(AugustusPb.get_args(pipeline_args, genome).augustus_pb_gp) - args.denovo_tx_modes.append('augPB') + args.denovo_tx_modes.append("augPB") if genome in pipeline_args.external_ref_genomes: gp_list.append(ExternalReferenceFiles.get_args(pipeline_args, genome).annotation_gp) - args.denovo_tx_modes.append('exRef') + args.denovo_tx_modes.append("exRef") args.gp_list = gp_list args.genome = genome args.transcript_modes = list(AlignTranscripts.get_args(pipeline_args, genome).transcript_modes.keys()) @@ -2119,12 +2318,12 @@ def get_args(pipeline_args, genome): args.hints_db_has_rnaseq = len(pipeline_args.rnaseq_genomes) > 0 args.annotation_gp = ReferenceFiles.get_args(pipeline_args).annotation_gp args.fasta = GenomeFiles.get_args(pipeline_args, genome).fasta - args.consensus_gp = os.path.join(base_dir, genome + '.gp') - args.consensus_gp_info = os.path.join(base_dir, genome + '.gp_info') - args.consensus_gff3 = os.path.join(base_dir, genome + '.gff3') - args.consensus_fasta = os.path.join(base_dir, genome + '.consensus.fasta') - args.consensus_protein_fasta = os.path.join(base_dir, genome + '.protein.consensus.fasta') - args.metrics_json = os.path.join(PipelineTask.get_metrics_dir(pipeline_args, genome), 'consensus.json') + args.consensus_gp = os.path.join(base_dir, genome + ".gp") + args.consensus_gp_info = os.path.join(base_dir, genome + ".gp_info") + args.consensus_gff3 = os.path.join(base_dir, genome + ".gff3") + args.consensus_fasta = os.path.join(base_dir, genome + ".consensus.fasta") + args.consensus_protein_fasta = os.path.join(base_dir, genome + ".protein.consensus.fasta") + args.metrics_json = os.path.join(PipelineTask.get_metrics_dir(pipeline_args, genome), "consensus.json") # user configurable options on how consensus finding should work args.intron_rnaseq_support = pipeline_args.intron_rnaseq_support args.exon_rnaseq_support = pipeline_args.exon_rnaseq_support @@ -2160,6 +2359,7 @@ class ConsensusDriverTask(RebuildableTask): """ Driver task for performing consensus finding. 
""" + genome = luigi.Parameter() def output(self): @@ -2183,16 +2383,16 @@ def requires(self): if pipeline_args.augustus_pb: yield self.clone(AugustusPb) yield self.clone(IsoSeqTranscripts) - yield self.clone(FindDenovoParents, mode='augPB') + yield self.clone(FindDenovoParents, mode="augPB") if pipeline_args.augustus_cgp: yield self.clone(AugustusCgp) - yield self.clone(FindDenovoParents, mode='augCGP') + yield self.clone(FindDenovoParents, mode="augCGP") if self.genome in pipeline_args.external_ref_genomes: - yield self.clone(FindDenovoParents, mode='exRef') + yield self.clone(FindDenovoParents, mode="exRef") def run(self): consensus_args = self.get_module_args(Consensus, genome=self.genome) - logger.info('Generating consensus gene set for {}.'.format(self.genome)) + logger.info("Generating consensus gene set for {}.".format(self.genome)) metrics_dict = generate_consensus(consensus_args) metrics_json = next(self.output()) PipelineTask.write_metrics(metrics_dict, metrics_json) @@ -2202,48 +2402,52 @@ class Plots(RebuildableTask): """ Produce final analysis plots """ + @staticmethod def get_args(pipeline_args): - base_dir = os.path.join(pipeline_args.out_dir, 'plots') + base_dir = os.path.join(pipeline_args.out_dir, "plots") tools.fileOps.ensure_dir(base_dir) - ordered_genomes = tools.hal.build_genome_order(pipeline_args.hal, pipeline_args.ref_genome, - pipeline_args.target_genomes, - pipeline_args.annotate_ancestors) + ordered_genomes = tools.hal.build_genome_order( + pipeline_args.hal, pipeline_args.ref_genome, pipeline_args.target_genomes, pipeline_args.annotate_ancestors + ) args = tools.misc.HashableNamespace() args.ordered_genomes = ordered_genomes # plots derived from transMap results - args.tm_coverage = luigi.LocalTarget(os.path.join(base_dir, 'transmap_coverage.pdf')) - args.tm_identity = luigi.LocalTarget(os.path.join(base_dir, 'transmap_identity.pdf')) + args.tm_coverage = luigi.LocalTarget(os.path.join(base_dir, "transmap_coverage.pdf")) + args.tm_identity = luigi.LocalTarget(os.path.join(base_dir, "transmap_identity.pdf")) # plots derived from transMap filtering - args.paralogy = luigi.LocalTarget(os.path.join(base_dir, 'paralogy.pdf')) - args.unfiltered_paralogy = luigi.LocalTarget(os.path.join(base_dir, 'unfiltered_paralogy.pdf')) - args.gene_collapse = luigi.LocalTarget(os.path.join(base_dir, 'gene_family_collapse.pdf')) + args.paralogy = luigi.LocalTarget(os.path.join(base_dir, "paralogy.pdf")) + args.unfiltered_paralogy = luigi.LocalTarget(os.path.join(base_dir, "unfiltered_paralogy.pdf")) + args.gene_collapse = luigi.LocalTarget(os.path.join(base_dir, "gene_family_collapse.pdf")) # plots derived from transcript alignment / consensus finding - args.coverage = luigi.LocalTarget(os.path.join(base_dir, 'coverage.pdf')) - args.identity = luigi.LocalTarget(os.path.join(base_dir, 'identity.pdf')) - args.completeness = luigi.LocalTarget(os.path.join(base_dir, 'completeness.pdf')) - args.consensus_extrinsic_support = luigi.LocalTarget(os.path.join(base_dir, 'consensus_extrinsic_support.pdf')) - args.consensus_annot_support = luigi.LocalTarget(os.path.join(base_dir, 'consensus_annotation_support.pdf')) - args.tx_modes = luigi.LocalTarget(os.path.join(base_dir, 'transcript_modes.pdf')) - args.indel = luigi.LocalTarget(os.path.join(base_dir, 'coding_indels.pdf')) - args.missing = luigi.LocalTarget(os.path.join(base_dir, 'missing_genes_transcripts.pdf')) + args.coverage = luigi.LocalTarget(os.path.join(base_dir, "coverage.pdf")) + args.identity = 
luigi.LocalTarget(os.path.join(base_dir, "identity.pdf")) + args.completeness = luigi.LocalTarget(os.path.join(base_dir, "completeness.pdf")) + args.consensus_extrinsic_support = luigi.LocalTarget(os.path.join(base_dir, "consensus_extrinsic_support.pdf")) + args.consensus_annot_support = luigi.LocalTarget(os.path.join(base_dir, "consensus_annotation_support.pdf")) + args.tx_modes = luigi.LocalTarget(os.path.join(base_dir, "transcript_modes.pdf")) + args.indel = luigi.LocalTarget(os.path.join(base_dir, "coding_indels.pdf")) + args.missing = luigi.LocalTarget(os.path.join(base_dir, "missing_genes_transcripts.pdf")) # plots that depend on execution mode if pipeline_args.augustus is True: - args.improvement = luigi.LocalTarget(os.path.join(base_dir, 'augustus_improvement.pdf')) - if 'augCGP' in pipeline_args.modes or 'augPB' in pipeline_args.modes: - args.denovo = luigi.LocalTarget(os.path.join(base_dir, 'denovo.pdf')) - if 'augPB' in pipeline_args.modes: - args.pb_support = luigi.LocalTarget(os.path.join(base_dir, 'IsoSeq_isoform_validation.pdf')) + args.improvement = luigi.LocalTarget(os.path.join(base_dir, "augustus_improvement.pdf")) + if "augCGP" in pipeline_args.modes or "augPB" in pipeline_args.modes: + args.denovo = luigi.LocalTarget(os.path.join(base_dir, "denovo.pdf")) + if "augPB" in pipeline_args.modes: + args.pb_support = luigi.LocalTarget(os.path.join(base_dir, "IsoSeq_isoform_validation.pdf")) args.pb_genomes = pipeline_args.isoseq_genomes - args.split_genes = luigi.LocalTarget(os.path.join(base_dir, 'split_genes.pdf')) + args.split_genes = luigi.LocalTarget(os.path.join(base_dir, "split_genes.pdf")) # input data - args.metrics_jsons = OrderedDict([[genome, Consensus.get_args(pipeline_args, genome).metrics_json] - for genome in ordered_genomes]) - args.tm_jsons = OrderedDict([[genome, TransMap.get_args(pipeline_args, genome).metrics_json] - for genome in ordered_genomes]) + args.metrics_jsons = OrderedDict( + [[genome, Consensus.get_args(pipeline_args, genome).metrics_json] for genome in ordered_genomes] + ) + args.tm_jsons = OrderedDict( + [[genome, TransMap.get_args(pipeline_args, genome).metrics_json] for genome in ordered_genomes] + ) args.annotation_db = PipelineTask.get_database(pipeline_args, pipeline_args.ref_genome) - args.dbs = OrderedDict([[genome, PipelineTask.get_database(pipeline_args, genome)] - for genome in ordered_genomes]) + args.dbs = OrderedDict( + [[genome, PipelineTask.get_database(pipeline_args, genome)] for genome in ordered_genomes] + ) args.in_species_rna_support_only = pipeline_args.in_species_rna_support_only return args @@ -2263,7 +2467,7 @@ def requires(self): def run(self): pipeline_args = self.get_pipeline_args() - logger.info('Generating plots.') + logger.info("Generating plots.") generate_plots(Plots.get_args(pipeline_args)) @@ -2271,6 +2475,7 @@ class ReportStats(PipelineTask): """ Reports all the stats at the end of the pipeline """ + def requires(self): yield self.clone(PrepareFiles) yield self.clone(BuildDb) @@ -2281,10 +2486,10 @@ def requires(self): yield self.clone(Augustus) if self.augustus_cgp is True: yield self.clone(AugustusCgp) - yield self.clone(FindDenovoParents, mode='augCGP') + yield self.clone(FindDenovoParents, mode="augCGP") if self.augustus_pb is True: yield self.clone(AugustusPb) - yield self.clone(FindDenovoParents, mode='augPB') + yield self.clone(FindDenovoParents, mode="augPB") yield self.clone(IsoSeqTranscripts) yield self.clone(Hgm) yield self.clone(AlignTranscripts) @@ -2298,25 +2503,27 @@ def output(self): # dumb 
-- need it to be something pipeline_args = self.get_pipeline_args() tools.fileOps.ensure_file_dir(pipeline_args.stats_db) - conn_str = 'sqlite:///{}'.format(pipeline_args.stats_db) - return luigi.contrib.sqla.SQLAlchemyTarget(connection_string=conn_str, - target_table='stats', - update_id='_'.join(['stats', str(hash(pipeline_args))])) + conn_str = "sqlite:///{}".format(pipeline_args.stats_db) + return luigi.contrib.sqla.SQLAlchemyTarget( + connection_string=conn_str, target_table="stats", update_id="_".join(["stats", str(hash(pipeline_args))]) + ) def run(self): pipeline_args = self.get_pipeline_args() - luigi_stats = tools.sqlInterface.load_luigi_stats(pipeline_args.stats_db, 'stats') + luigi_stats = tools.sqlInterface.load_luigi_stats(pipeline_args.stats_db, "stats") try: - toil_stats = tools.sqlInterface.load_luigi_stats(pipeline_args.stats_db, 'toil_stats') + toil_stats = tools.sqlInterface.load_luigi_stats(pipeline_args.stats_db, "toil_stats") except ValueError: - logger.warning('Toil task already ran, therefore no stats') + logger.warning("Toil task already ran, therefore no stats") else: core_time = round(sum(luigi_stats.ProcessingTime) / 3600, 1) toil_core_time = round(sum(toil_stats.TotalTime) / 3600, 1) total = core_time + toil_core_time - logger.info('Local core time: {:,} hours. Toil core time: {:,} hours. ' - 'Total computation time: {:,} hours.'.format(core_time, toil_core_time, total)) + logger.info( + "Local core time: {:,} hours. Toil core time: {:,} hours. " + "Total computation time: {:,} hours.".format(core_time, toil_core_time, total) + ) self.output().touch() @@ -2324,6 +2531,7 @@ class AssemblyHub(PipelineWrapperTask): """ Construct an assembly hub out of all the results """ + def requires(self): tools.fileOps.ensure_dir(self.out_dir) yield self.clone(CreateDirectoryStructure) @@ -2334,22 +2542,23 @@ class CreateDirectoryStructure(RebuildableTask): """ Constructs the directory structure. Creates symlinks for all relevant files. 
""" + @staticmethod def get_args(pipeline_args): args = tools.misc.HashableNamespace() args.genomes = list(pipeline_args.target_genomes) + [pipeline_args.ref_genome] - args.out_dir = os.path.join(pipeline_args.out_dir, 'assemblyHub') - args.hub_txt = os.path.join(args.out_dir, 'hub.txt') - args.genomes_txt = os.path.join(args.out_dir, 'genomes.txt') - args.groups_txt = os.path.join(args.out_dir, 'groups.txt') + args.out_dir = os.path.join(pipeline_args.out_dir, "assemblyHub") + args.hub_txt = os.path.join(args.out_dir, "hub.txt") + args.genomes_txt = os.path.join(args.out_dir, "genomes.txt") + args.groups_txt = os.path.join(args.out_dir, "groups.txt") genome_files = frozendict({genome: GenomeFiles.get_args(pipeline_args, genome) for genome in args.genomes}) sizes = {} twobits = {} trackdbs = {} for genome, genome_file in genome_files.items(): - sizes[genome] = (genome_file.sizes, os.path.join(args.out_dir, genome, 'chrom.sizes')) - twobits[genome] = (genome_file.two_bit, os.path.join(args.out_dir, genome, '{}.2bit'.format(genome))) - trackdbs[genome] = os.path.join(args.out_dir, genome, 'trackDb.txt') + sizes[genome] = (genome_file.sizes, os.path.join(args.out_dir, genome, "chrom.sizes")) + twobits[genome] = (genome_file.two_bit, os.path.join(args.out_dir, genome, "{}.2bit".format(genome))) + trackdbs[genome] = os.path.join(args.out_dir, genome, "trackDb.txt") args.sizes = frozendict(sizes) args.twobits = frozendict(twobits) args.trackdbs = frozendict(trackdbs) @@ -2383,16 +2592,19 @@ def run(self): tools.fileOps.ensure_file_dir(args.out_dir) # write the hub.txt file - with luigi.LocalTarget(args.hub_txt).open('w') as outf: - outf.write(hub_str.format(hal=os.path.splitext(os.path.basename(pipeline_args.hal))[0], - email=pipeline_args.hub_email)) + with luigi.LocalTarget(args.hub_txt).open("w") as outf: + outf.write( + hub_str.format( + hal=os.path.splitext(os.path.basename(pipeline_args.hal))[0], email=pipeline_args.hub_email + ) + ) # write the groups.txt file - with luigi.LocalTarget(args.groups_txt).open('w') as outf: + with luigi.LocalTarget(args.groups_txt).open("w") as outf: outf.write(groups_str) # write the genomes.txt file, construct a dir - with luigi.LocalTarget(args.genomes_txt).open('w') as outf: + with luigi.LocalTarget(args.genomes_txt).open("w") as outf: for genome, (sizes_local_path, sizes_hub_path) in args.sizes.items(): outf.write(genome_str.format(genome=genome, default_pos=find_default_pos(sizes_local_path))) @@ -2411,10 +2623,11 @@ class CreateTracks(PipelineWrapperTask): """ Wrapper task for track creation. """ + def validate(self): - for tool in ['bedSort', 'pslToBigPsl', 'wiggletools', 'wigToBigWig']:#, 'bamCoverage']: + for tool in ["bedSort", "pslToBigPsl", "wiggletools", "wigToBigWig"]: # , 'bamCoverage']: if not tools.misc.is_exec(tool): - raise ToolMissingException('Tool {} not in global path.'.format(tool)) + raise ToolMissingException("Tool {} not in global path.".format(tool)) def requires(self): self.validate() @@ -2430,6 +2643,7 @@ class CreateTracksDriverTask(PipelineWrapperTask): """ Dynamically generates each track task, the combines the results into the final trackDb. 
""" + genome = luigi.Parameter() def requires(self): @@ -2439,11 +2653,19 @@ def requires(self): directory_args = CreateDirectoryStructure.get_args(pipeline_args) out_dir = os.path.join(directory_args.out_dir, self.genome) if pipeline_args.augustus_cgp is True and self.genome in pipeline_args.target_genomes: - yield self.clone(DenovoTrack, track_path=os.path.join(out_dir, 'augustus_cgp.bb'), - trackdb_path=os.path.join(out_dir, 'augustus_cgp.txt'), mode='augCGP') + yield self.clone( + DenovoTrack, + track_path=os.path.join(out_dir, "augustus_cgp.bb"), + trackdb_path=os.path.join(out_dir, "augustus_cgp.txt"), + mode="augCGP", + ) if pipeline_args.augustus_pb is True and self.genome in pipeline_args.isoseq_genomes: - yield self.clone(DenovoTrack, track_path=os.path.join(out_dir, 'augustus_pb.bb'), - trackdb_path=os.path.join(out_dir, 'augustus_pb.txt'), mode='augPB') + yield self.clone( + DenovoTrack, + track_path=os.path.join(out_dir, "augustus_pb.bb"), + trackdb_path=os.path.join(out_dir, "augustus_pb.txt"), + mode="augPB", + ) if self.genome in pipeline_args.annotation_genomes: if self.genome == pipeline_args.ref_genome: @@ -2452,52 +2674,77 @@ def requires(self): else: annotation_gp = ExternalReferenceFiles.get_args(pipeline_args, self.genome).annotation_gp annotation_genome = self.genome - yield self.clone(BgpTrack, track_path=os.path.join(out_dir, 'annotation.bb'), - trackdb_path=os.path.join(out_dir, 'annotation.txt'), - genepred_path=annotation_gp, label=os.path.splitext(os.path.basename(annotation_gp))[0], - annotation_genome=annotation_genome, - mode='annot') + yield self.clone( + BgpTrack, + track_path=os.path.join(out_dir, "annotation.bb"), + trackdb_path=os.path.join(out_dir, "annotation.txt"), + genepred_path=annotation_gp, + label=os.path.splitext(os.path.basename(annotation_gp))[0], + annotation_genome=annotation_genome, + mode="annot", + ) if self.genome in pipeline_args.target_genomes: - yield self.clone(ConsensusTrack, track_path=os.path.join(out_dir, 'consensus.bb'), - trackdb_path=os.path.join(out_dir, 'consensus.txt')) + yield self.clone( + ConsensusTrack, + track_path=os.path.join(out_dir, "consensus.bb"), + trackdb_path=os.path.join(out_dir, "consensus.txt"), + ) - tx_modes = ['transMap'] + tx_modes = ["transMap"] if pipeline_args.augustus is True: - tx_modes.append('augTM') + tx_modes.append("augTM") if self.genome in pipeline_args.rnaseq_genomes: - tx_modes.append('augTMR') - yield self.clone(EvaluationTrack, track_path=os.path.join(out_dir, 'evaluation.bb'), - trackdb_path=os.path.join(out_dir, 'evaluation.txt'), - tx_modes=tuple(tx_modes)) + tx_modes.append("augTMR") + yield self.clone( + EvaluationTrack, + track_path=os.path.join(out_dir, "evaluation.bb"), + trackdb_path=os.path.join(out_dir, "evaluation.txt"), + tx_modes=tuple(tx_modes), + ) tm_args = TransMap.get_args(pipeline_args, self.genome) - yield self.clone(TransMapTrack, track_path=os.path.join(out_dir, 'transmap.bb'), - trackdb_path=os.path.join(out_dir, 'transmap.txt')) - yield self.clone(BgpTrack, track_path=os.path.join(out_dir, 'filtered_transmap.bb'), - trackdb_path=os.path.join(out_dir, 'filtered_transmap.txt'), - genepred_path=tm_args.filtered_tm_gp, label='Filtered transMap', visibility='hide', - annotation_genome=pipeline_args.ref_genome, - mode='tm') + yield self.clone( + TransMapTrack, + track_path=os.path.join(out_dir, "transmap.bb"), + trackdb_path=os.path.join(out_dir, "transmap.txt"), + ) + yield self.clone( + BgpTrack, + track_path=os.path.join(out_dir, "filtered_transmap.bb"), + 
trackdb_path=os.path.join(out_dir, "filtered_transmap.txt"), + genepred_path=tm_args.filtered_tm_gp, + label="Filtered transMap", + visibility="hide", + annotation_genome=pipeline_args.ref_genome, + mode="tm", + ) if pipeline_args.augustus is True and self.genome in pipeline_args.rnaseq_genomes: - yield self.clone(AugustusTrack, track_path=os.path.join(out_dir, 'augustus.bb'), - trackdb_path=os.path.join(out_dir, 'augustus.txt')) + yield self.clone( + AugustusTrack, + track_path=os.path.join(out_dir, "augustus.bb"), + trackdb_path=os.path.join(out_dir, "augustus.txt"), + ) if self.genome in pipeline_args.isoseq_genomes: isoseq_bams = [] # add a number to make names unique - for i, bam in enumerate(pipeline_args.cfg['ISO_SEQ_BAM'][self.genome]): - new_bam = os.path.join(out_dir, '{}_{}'.format(i, os.path.basename(bam))) + for i, bam in enumerate(pipeline_args.cfg["ISO_SEQ_BAM"][self.genome]): + new_bam = os.path.join(out_dir, "{}_{}".format(i, os.path.basename(bam))) isoseq_bams.append((bam, new_bam)) - yield self.clone(IsoSeqBamTrack, trackdb_path=os.path.join(out_dir, 'isoseq_bams.txt'), - isoseq_bams=tuple(isoseq_bams)) + yield self.clone( + IsoSeqBamTrack, trackdb_path=os.path.join(out_dir, "isoseq_bams.txt"), isoseq_bams=tuple(isoseq_bams) + ) if self.genome in pipeline_args.rnaseq_genomes: - yield self.clone(SpliceTrack, track_path=os.path.join(out_dir, 'splices.bb'), - trackdb_path=os.path.join(out_dir, 'splices.txt')) + yield self.clone( + SpliceTrack, + track_path=os.path.join(out_dir, "splices.bb"), + trackdb_path=os.path.join(out_dir, "splices.txt"), + ) # expression is disabled until I fix wiggletools (bamCoverage is needed) - #if self.genome not in pipeline_args.intron_only_genomes: + # if self.genome not in pipeline_args.intron_only_genomes: # yield self.clone(ExpressionTracks, max_track_path=os.path.join(out_dir, 'max_expression.bw'), # median_track_path=os.path.join(out_dir, 'median_expression.bw'), # trackdb_path=os.path.join(out_dir, 'expression.txt')) @@ -2505,6 +2752,7 @@ def requires(self): class CreateTrackDbs(RebuildableTask): """Create the final trackDb entries""" + genome = luigi.Parameter() def requires(self): @@ -2520,44 +2768,46 @@ def run(self): directory_args = CreateDirectoryStructure.get_args(pipeline_args) out_dir = os.path.join(directory_args.out_dir, self.genome) org_str = construct_org_str(directory_args.genomes) - with self.output().open('w') as outf: + with self.output().open("w") as outf: for f in os.listdir(out_dir): - if f.endswith('.txt'): - outf.write('include {}\n'.format(f)) - outf.write('\n\n') + if f.endswith(".txt"): + outf.write("include {}\n".format(f)) + outf.write("\n\n") outf.write(snake_composite.format(org_str=org_str)) for genome in directory_args.genomes: # by default, only the reference genome is visible unless we are on the reference, then all are if self.genome == pipeline_args.ref_genome: if genome == pipeline_args.ref_genome: - visibility = 'hide' + visibility = "hide" else: - visibility = 'full' + visibility = "full" else: - visibility = 'hide' if genome != pipeline_args.ref_genome else 'full' - hal_path = '../{}'.format(os.path.basename(pipeline_args.hal)) + visibility = "hide" if genome != pipeline_args.ref_genome else "full" + hal_path = "../{}".format(os.path.basename(pipeline_args.hal)) outf.write(snake_template.format(genome=genome, hal_path=hal_path, visibility=visibility)) class DenovoTrack(TrackTask): """Constructs a denovo track""" + mode = luigi.Parameter() def run(self): def find_rgb(s): if s.AssignedGeneId is None 
and s.AlternativeGeneIds is None: - return '175,87,207' # both null -> purple (denovo) + return "175,87,207" # both null -> purple (denovo) elif s.AssignedGeneId is None and s.AlternativeGeneIds is not None: - return '87,207,175' # no assigned -> teal (possible_paralog) - return '0' + return "87,207,175" # no assigned -> teal (possible_paralog) + return "0" def find_alternative_gene_names(s, annotation_info): if s.AlternativeGeneIds is None: - return 'N/A' - r = {tools.misc.slice_df(annotation_info, gene).iloc[0].GeneName for - gene in s.AlternativeGeneIds.split(',')} - return ','.join(r) + return "N/A" + r = { + tools.misc.slice_df(annotation_info, gene).iloc[0].GeneName for gene in s.AlternativeGeneIds.split(",") + } + return ",".join(r) pipeline_args = self.get_pipeline_args() track, trackdb = self.output() @@ -2565,12 +2815,12 @@ def find_alternative_gene_names(s, annotation_info): # load database information db_path = pipeline_args.dbs[self.genome] alt_names = load_alt_names(db_path, [self.mode]) - denovo_hgm_df = load_hgm_vectors(db_path, self.mode).drop(['GeneId', 'TranscriptId'], axis=1) - denovo_df = pd.merge(denovo_hgm_df, alt_names, on='AlignmentId').set_index('AlignmentId') + denovo_hgm_df = load_hgm_vectors(db_path, self.mode).drop(["GeneId", "TranscriptId"], axis=1) + denovo_df = pd.merge(denovo_hgm_df, alt_names, on="AlignmentId").set_index("AlignmentId") annotation_info = tools.sqlInterface.load_annotation(pipeline_args.dbs[pipeline_args.ref_genome]) - annotation_info = annotation_info.set_index('GeneId') + annotation_info = annotation_info.set_index("GeneId") - if self.mode == 'augCGP': + if self.mode == "augCGP": augustus_gp = AugustusCgp.get_args(pipeline_args).augustus_cgp_gp[self.genome] else: augustus_gp = AugustusPb.get_args(pipeline_args, self.genome).augustus_pb_gp @@ -2578,19 +2828,19 @@ def find_alternative_gene_names(s, annotation_info): tmp = luigi.LocalTarget(is_tmp=True) as_file = luigi.LocalTarget(is_tmp=True) - with as_file.open('w') as outf: + with as_file.open("w") as outf: outf.write(denovo_as) - with tmp.open('w') as outf: + with tmp.open("w") as outf: for tx in tools.transcripts.gene_pred_iterator(augustus_gp): s = denovo_df.loc[tx.name] - alternative_gene_ids = 'N/A' if s.AlternativeGeneIds is None else s.AlternativeGeneIds - intron_rna = ','.join(map(str, s.IntronRnaSupport)) - exon_rna = ','.join(map(str, s.ExonRnaSupport)) - intron_annot = ','.join(map(str, s.IntronAnnotSupport)) - exon_annot = ','.join(map(str, s.ExonAnnotSupport)) + alternative_gene_ids = "N/A" if s.AlternativeGeneIds is None else s.AlternativeGeneIds + intron_rna = ",".join(map(str, s.IntronRnaSupport)) + exon_rna = ",".join(map(str, s.ExonRnaSupport)) + intron_annot = ",".join(map(str, s.IntronAnnotSupport)) + exon_annot = ",".join(map(str, s.ExonAnnotSupport)) if s.AssignedGeneId is None: - assigned_gene_id = gene_name = gene_type = alternative_gene_names = 'N/A' + assigned_gene_id = gene_name = gene_type = alternative_gene_names = "N/A" else: a = tools.misc.slice_df(annotation_info, s.AssignedGeneId).iloc[0] gene_name = a.GeneName @@ -2598,194 +2848,301 @@ def find_alternative_gene_names(s, annotation_info): assigned_gene_id = s.AssignedGeneId alternative_gene_names = find_alternative_gene_names(s, annotation_info) block_starts, block_sizes, exon_frames = tools.transcripts.create_bed_info_gp(tx) - row = [tx.chromosome, tx.start, tx.stop, tx.name, tx.score, tx.strand, tx.thick_start, - tx.thick_stop, find_rgb(s), tx.block_count, block_sizes, block_starts, - gene_name, 
tx.cds_start_stat, tx.cds_end_stat, exon_frames, - gene_type, assigned_gene_id, alternative_gene_ids, alternative_gene_names, - exon_annot, exon_rna, intron_annot, intron_rna] + row = [ + tx.chromosome, + tx.start, + tx.stop, + tx.name, + tx.score, + tx.strand, + tx.thick_start, + tx.thick_stop, + find_rgb(s), + tx.block_count, + block_sizes, + block_starts, + gene_name, + tx.cds_start_stat, + tx.cds_end_stat, + exon_frames, + gene_type, + assigned_gene_id, + alternative_gene_ids, + alternative_gene_names, + exon_annot, + exon_rna, + intron_annot, + intron_rna, + ] tools.fileOps.print_row(outf, row) - tools.procOps.run_proc(['bedSort', tmp.path, tmp.path]) + tools.procOps.run_proc(["bedSort", tmp.path, tmp.path]) with tools.fileOps.TemporaryFilePath() as out_path: - cmd = ['bedToBigBed', '-extraIndex=assignedGeneId,name,name2', - '-type=bed12+8', '-tab', '-as={}'.format(as_file.path), tmp.path, chrom_sizes, out_path] - tools.procOps.run_proc(cmd, stderr='/dev/null') + cmd = [ + "bedToBigBed", + "-extraIndex=assignedGeneId,name,name2", + "-type=bed12+8", + "-tab", + "-as={}".format(as_file.path), + tmp.path, + chrom_sizes, + out_path, + ] + tools.procOps.run_proc(cmd, stderr="/dev/null") tools.fileOps.atomic_install(out_path, track.path) - label = 'AugustusCGP' if self.mode == 'augCGP' else 'AugustusPB' - description = 'Comparative Augustus' if self.mode == 'augCGP' else 'PacBio Augustus' - with trackdb.open('w') as outf: - outf.write(denovo_template.format(name='augustus_{}_{}'.format(self.mode, self.genome), - short_label=label, long_label=label, description=description, - path=os.path.basename(track.path))) + label = "AugustusCGP" if self.mode == "augCGP" else "AugustusPB" + description = "Comparative Augustus" if self.mode == "augCGP" else "PacBio Augustus" + with trackdb.open("w") as outf: + outf.write( + denovo_template.format( + name="augustus_{}_{}".format(self.mode, self.genome), + short_label=label, + long_label=label, + description=description, + path=os.path.basename(track.path), + ) + ) class BgpTrack(TrackTask): """Constructs a standard modified bigGenePred track""" + genepred_path = luigi.Parameter() label = luigi.Parameter() annotation_genome = luigi.Parameter() - visibility = luigi.Parameter(default='pack') + visibility = luigi.Parameter(default="pack") mode = luigi.Parameter() def run(self): def find_rgb(info): """blue for coding, green for non-coding""" - if info.TranscriptBiotype == 'protein_coding': - return '76,85,212' - return '85,212,76' + if info.TranscriptBiotype == "protein_coding": + return "76,85,212" + return "85,212,76" pipeline_args = self.get_pipeline_args() track, trackdb = self.output() chrom_sizes = GenomeFiles.get_args(pipeline_args, self.genome).sizes annotation_info = tools.sqlInterface.load_annotation(pipeline_args.dbs[self.annotation_genome]) - annotation_info = annotation_info.set_index('TranscriptId') + annotation_info = annotation_info.set_index("TranscriptId") tmp = luigi.LocalTarget(is_tmp=True) as_file = luigi.LocalTarget(is_tmp=True) - with as_file.open('w') as outf: + with as_file.open("w") as outf: outf.write(modified_bgp_as) - with tmp.open('w') as outf: + with tmp.open("w") as outf: for tx in tools.transcripts.gene_pred_iterator(self.genepred_path): - if self.mode == 'tm': + if self.mode == "tm": s = annotation_info.loc[tools.nameConversions.strip_alignment_numbers(tx.name)] else: s = annotation_info.loc[tx.name] block_starts, block_sizes, exon_frames = tools.transcripts.create_bed_info_gp(tx) - row = [tx.chromosome, tx.start, tx.stop, 
s.TranscriptName, tx.score, tx.strand, tx.thick_start, - tx.thick_stop, find_rgb(s), tx.block_count, block_sizes, block_starts, - s.GeneName, tx.cds_start_stat, tx.cds_end_stat, exon_frames, - tx.name, s.GeneId, s.TranscriptBiotype, s.GeneBiotype] + row = [ + tx.chromosome, + tx.start, + tx.stop, + s.TranscriptName, + tx.score, + tx.strand, + tx.thick_start, + tx.thick_stop, + find_rgb(s), + tx.block_count, + block_sizes, + block_starts, + s.GeneName, + tx.cds_start_stat, + tx.cds_end_stat, + exon_frames, + tx.name, + s.GeneId, + s.TranscriptBiotype, + s.GeneBiotype, + ] tools.fileOps.print_row(outf, row) - tools.procOps.run_proc(['bedSort', tmp.path, tmp.path]) + tools.procOps.run_proc(["bedSort", tmp.path, tmp.path]) with tools.fileOps.TemporaryFilePath() as out_path: - cmd = ['bedToBigBed', '-extraIndex=name,name2,geneId,transcriptId', - '-type=bed12+8', '-tab', '-as={}'.format(as_file.path), tmp.path, chrom_sizes, out_path] - tools.procOps.run_proc(cmd, stderr='/dev/null') + cmd = [ + "bedToBigBed", + "-extraIndex=name,name2,geneId,transcriptId", + "-type=bed12+8", + "-tab", + "-as={}".format(as_file.path), + tmp.path, + chrom_sizes, + out_path, + ] + tools.procOps.run_proc(cmd, stderr="/dev/null") tools.fileOps.atomic_install(out_path, track.path) - with trackdb.open('w') as outf: - sanitized_label = self.label.replace(' ', '_').replace('.', '_') - outf.write(bgp_template.format(name='{}_{}'.format(sanitized_label, self.genome), - label=self.label, visibility=self.visibility, - path=os.path.basename(track.path))) + with trackdb.open("w") as outf: + sanitized_label = self.label.replace(" ", "_").replace(".", "_") + outf.write( + bgp_template.format( + name="{}_{}".format(sanitized_label, self.genome), + label=self.label, + visibility=self.visibility, + path=os.path.basename(track.path), + ) + ) class ConsensusTrack(TrackTask): """Constructs a modified bigGenePred for consensus gene sets""" + def run(self): def find_rgb(info): """red for failed, blue for coding, green for non-coding, purple for denovo""" - if info.transcript_biotype == 'unknown_likely_coding': - return '135,76,212' - elif info.transcript_biotype == 'protein_coding': - return '76,85,212' - return '85,212,76' + if info.transcript_biotype == "unknown_likely_coding": + return "135,76,212" + elif info.transcript_biotype == "protein_coding": + return "76,85,212" + return "85,212,76" pipeline_args = self.get_pipeline_args() track, trackdb = self.output() chrom_sizes = GenomeFiles.get_args(pipeline_args, self.genome).sizes consensus_args = Consensus.get_args(pipeline_args, self.genome) - consensus_gp_info = pd.read_csv(consensus_args.consensus_gp_info, sep='\t', - header=0, na_filter=False).set_index('transcript_id') + consensus_gp_info = pd.read_csv( + consensus_args.consensus_gp_info, sep="\t", header=0, na_filter=False + ).set_index("transcript_id") has_rnaseq = len(pipeline_args.rnaseq_genomes) > 0 - has_pb = 'pacbio_isoform_supported' in consensus_gp_info.columns + has_pb = "pacbio_isoform_supported" in consensus_gp_info.columns tmp_gp = luigi.LocalTarget(is_tmp=True) as_file = luigi.LocalTarget(is_tmp=True) - with tmp_gp.open('w') as outf: + with tmp_gp.open("w") as outf: for tx in tools.transcripts.gene_pred_iterator(consensus_args.consensus_gp): info = consensus_gp_info.loc[tx.name] block_starts, block_sizes, exon_frames = tools.transcripts.create_bed_info_gp(tx) - tx_name = info.source_transcript_name if info.source_transcript_name != 'N/A' else tx.name - row = [tx.chromosome, tx.start, tx.stop, tx_name, tx.score, 
tx.strand, - tx.thick_start, tx.thick_stop, find_rgb(info), tx.block_count, block_sizes, block_starts, - info.source_gene_common_name, tx.cds_start_stat, tx.cds_end_stat, exon_frames, - tx.name, info.transcript_biotype, tx.name2, info.gene_biotype, info.source_gene, - info.source_transcript, info.alignment_id, info.alternative_source_transcripts, - info.paralogy, info.unfiltered_paralogy, - info.get('collapsed_gene_ids'), info.get('collapsed_gene_names'), - info.frameshift, info.exon_annotation_support, - info.intron_annotation_support, info.transcript_class, info.transcript_modes, - info.valid_start, info.valid_stop, info.proper_orf] + tx_name = info.source_transcript_name if info.source_transcript_name != "N/A" else tx.name + row = [ + tx.chromosome, + tx.start, + tx.stop, + tx_name, + tx.score, + tx.strand, + tx.thick_start, + tx.thick_stop, + find_rgb(info), + tx.block_count, + block_sizes, + block_starts, + info.source_gene_common_name, + tx.cds_start_stat, + tx.cds_end_stat, + exon_frames, + tx.name, + info.transcript_biotype, + tx.name2, + info.gene_biotype, + info.source_gene, + info.source_transcript, + info.alignment_id, + info.alternative_source_transcripts, + info.paralogy, + info.unfiltered_paralogy, + info.get("collapsed_gene_ids"), + info.get("collapsed_gene_names"), + info.frameshift, + info.exon_annotation_support, + info.intron_annotation_support, + info.transcript_class, + info.transcript_modes, + info.valid_start, + info.valid_stop, + info.proper_orf, + ] if has_rnaseq: row.extend([info.intron_rna_support, info.exon_rna_support]) if has_pb: row.append(info.pacbio_isoform_supported) tools.fileOps.print_row(outf, row) - with as_file.open('w') as outf: + with as_file.open("w") as outf: as_str = construct_consensus_gp_as(has_rnaseq, has_pb) outf.write(as_str) - tools.procOps.run_proc(['bedSort', tmp_gp.path, tmp_gp.path]) + tools.procOps.run_proc(["bedSort", tmp_gp.path, tmp_gp.path]) with tools.fileOps.TemporaryFilePath() as out_path: - cmd = ['bedToBigBed', '-extraIndex=name,name2,txId,geneName,sourceGene,sourceTranscript,alignmentId', - '-type=bed12+23', '-tab', '-as={}'.format(as_file.path), tmp_gp.path, chrom_sizes, out_path] - tools.procOps.run_proc(cmd, stderr='/dev/null') + cmd = [ + "bedToBigBed", + "-extraIndex=name,name2,txId,geneName,sourceGene,sourceTranscript,alignmentId", + "-type=bed12+23", + "-tab", + "-as={}".format(as_file.path), + tmp_gp.path, + chrom_sizes, + out_path, + ] + tools.procOps.run_proc(cmd, stderr="/dev/null") tools.fileOps.atomic_install(out_path, track.path) - with trackdb.open('w') as outf: + with trackdb.open("w") as outf: outf.write(consensus_template.format(genome=self.genome, path=os.path.basename(track.path))) class EvaluationTrack(TrackTask): """Constructs the consensus evaluation track""" + tx_modes = luigi.TupleParameter() def run(self): def load_evals(tx_mode): """Loads the error tracks from the database""" - cds_table = tools.sqlInterface.tables['CDS'][tx_mode]['evaluation'] - mrna_table = tools.sqlInterface.tables['mRNA'][tx_mode]['evaluation'] - cds_df = pd.read_sql_table(cds_table.__tablename__, engine).set_index('AlignmentId') - mrna_df = pd.read_sql_table(mrna_table.__tablename__, engine).set_index('AlignmentId') - return {'CDS': cds_df, 'mRNA': mrna_df} + cds_table = tools.sqlInterface.tables["CDS"][tx_mode]["evaluation"] + mrna_table = tools.sqlInterface.tables["mRNA"][tx_mode]["evaluation"] + cds_df = pd.read_sql_table(cds_table.__tablename__, engine).set_index("AlignmentId") + mrna_df = 
pd.read_sql_table(mrna_table.__tablename__, engine).set_index("AlignmentId") + return {"CDS": cds_df, "mRNA": mrna_df} pipeline_args = self.get_pipeline_args() track, trackdb = self.output() chrom_sizes = GenomeFiles.get_args(pipeline_args, self.genome).sizes - engine = tools.sqlInterface.create_engine('sqlite:///' + pipeline_args.dbs[self.genome]) + engine = tools.sqlInterface.create_engine("sqlite:///" + pipeline_args.dbs[self.genome]) evals = {tx_mode: load_evals(tx_mode) for tx_mode in self.tx_modes} consensus_args = Consensus.get_args(pipeline_args, self.genome) - consensus_gp_info = pd.read_csv(consensus_args.consensus_gp_info, sep='\t', - header=0, na_filter=False).set_index('transcript_id') + consensus_gp_info = pd.read_csv( + consensus_args.consensus_gp_info, sep="\t", header=0, na_filter=False + ).set_index("transcript_id") aln_ids = set(consensus_gp_info.alignment_id) rows = [] for aln_id in aln_ids: tx_mode = tools.nameConversions.alignment_type(aln_id) - if tx_mode not in ['transMap', 'augTM', 'augTMR']: + if tx_mode not in ["transMap", "augTM", "augTMR"]: continue - mode = 'CDS' + mode = "CDS" df = tools.misc.slice_df(evals[tx_mode][mode], aln_id) if len(df) == 0: - mode = 'mRNA' + mode = "mRNA" df = tools.misc.slice_df(evals[tx_mode][mode], aln_id) for tx_id, s in df.iterrows(): bed = s.tolist() - bed[3] = '/'.join([tx_id, bed[3], mode]) + bed[3] = "/".join([tx_id, bed[3], mode]) rows.append(bed) tmp = luigi.LocalTarget(is_tmp=True) - with tmp.open('w') as tmp_handle: + with tmp.open("w") as tmp_handle: tools.fileOps.print_rows(tmp_handle, rows) - tools.procOps.run_proc(['bedSort', tmp.path, tmp.path]) + tools.procOps.run_proc(["bedSort", tmp.path, tmp.path]) with tools.fileOps.TemporaryFilePath() as out_path: - cmd = ['bedToBigBed', '-type=bed12', '-tab', tmp.path, chrom_sizes, out_path] - tools.procOps.run_proc(cmd, stderr='/dev/null') + cmd = ["bedToBigBed", "-type=bed12", "-tab", tmp.path, chrom_sizes, out_path] + tools.procOps.run_proc(cmd, stderr="/dev/null") tools.fileOps.atomic_install(out_path, track.path) - - with trackdb.open('w') as outf: + with trackdb.open("w") as outf: outf.write(error_template.format(genome=self.genome, path=os.path.basename(track.path))) class TransMapTrack(TrackTask): """Constructs the transMap bigPsl""" + def run(self): pipeline_args = self.get_pipeline_args() track, trackdb = self.output() @@ -2800,7 +3157,7 @@ def run(self): as_file = luigi.LocalTarget(is_tmp=True) seq_dict = tools.bio.get_sequence_dict(fasta) ref_tx_dict = tools.transcripts.get_gene_pred_dict(ReferenceFiles.get_args(pipeline_args).annotation_gp) - with cds.open('w') as cds_handle, mrna.open('w') as mrna_handle: + with cds.open("w") as cds_handle, mrna.open("w") as mrna_handle: for tx in tools.transcripts.gene_pred_iterator(tm_args.tm_gp): ref_tx = ref_tx_dict[tools.nameConversions.strip_alignment_numbers(tx.name)] tools.bio.write_fasta(mrna_handle, tx.name, str(seq_dict[ref_tx.name])) @@ -2809,36 +3166,53 @@ def run(self): else: start = ref_tx.cds_coordinate_to_mrna(0) + ref_tx.offset stop = start + ref_tx.cds_size - ((ref_tx.cds_size - ref_tx.offset) % 3) - cds_handle.write('{}\t{}..{}\n'.format(tx.name, start + 1, stop)) + cds_handle.write("{}\t{}..{}\n".format(tx.name, start + 1, stop)) - with tmp.open('w') as outf: - cmd = [['pslToBigPsl', '-cds={}'.format(cds.path), '-fa={}'.format(mrna.path), tm_args.tm_psl, 'stdout'], - ['bedSort', '/dev/stdin', '/dev/stdout']] - tools.procOps.run_proc(cmd, stdout=outf, stderr='/dev/null') + with tmp.open("w") as outf: + cmd = [ 
+ ["pslToBigPsl", "-cds={}".format(cds.path), "-fa={}".format(mrna.path), tm_args.tm_psl, "stdout"], + ["bedSort", "/dev/stdin", "/dev/stdout"], + ] + tools.procOps.run_proc(cmd, stdout=outf, stderr="/dev/null") - with as_file.open('w') as outf: + with as_file.open("w") as outf: outf.write(bigpsl) with tools.fileOps.TemporaryFilePath() as out_path: - cmd = ['bedToBigBed', '-type=bed12+13', '-tab', '-extraIndex=name', - '-as={}'.format(as_file.path), tmp.path, chrom_sizes, out_path] - tools.procOps.run_proc(cmd, stderr='/dev/null') + cmd = [ + "bedToBigBed", + "-type=bed12+13", + "-tab", + "-extraIndex=name", + "-as={}".format(as_file.path), + tmp.path, + chrom_sizes, + out_path, + ] + tools.procOps.run_proc(cmd, stderr="/dev/null") tools.fileOps.atomic_install(out_path, track.path) - with trackdb.open('w') as outf: - outf.write(bigpsl_template.format(name='transmap_{}'.format(self.genome), short_label='transMap', - long_label='transMap', path=os.path.basename(track.path), - visibility='pack')) + with trackdb.open("w") as outf: + outf.write( + bigpsl_template.format( + name="transmap_{}".format(self.genome), + short_label="transMap", + long_label="transMap", + path=os.path.basename(track.path), + visibility="pack", + ) + ) class AugustusTrack(TrackTask): """Constructs a combined TM(R) track""" + def run(self): pipeline_args = self.get_pipeline_args() track, trackdb = self.output() chrom_sizes = GenomeFiles.get_args(pipeline_args, self.genome).sizes annotation_info = tools.sqlInterface.load_annotation(pipeline_args.dbs[pipeline_args.ref_genome]) - annotation_info = annotation_info.set_index('TranscriptId') + annotation_info = annotation_info.set_index("TranscriptId") aug_args = Augustus.get_args(pipeline_args, self.genome) tm_gp = aug_args.augustus_tm_gp if self.genome in pipeline_args.rnaseq_genomes: @@ -2847,43 +3221,76 @@ def run(self): tmr_gp = None with tools.fileOps.TemporaryFilePath() as tmp, tools.fileOps.TemporaryFilePath() as as_file: - with open(as_file, 'w') as outf: + with open(as_file, "w") as outf: outf.write(modified_bgp_as) - with open(tmp, 'w') as outf: - for gp, color in zip(*[[tm_gp, tmr_gp], ['38,112,75', '112,38,75']]): + with open(tmp, "w") as outf: + for gp, color in zip(*[[tm_gp, tmr_gp], ["38,112,75", "112,38,75"]]): if gp is None: continue gp = tools.transcripts.gene_pred_iterator(gp) for tx in gp: s = annotation_info.loc[tools.nameConversions.strip_alignment_numbers(tx.name)] block_starts, block_sizes, exon_frames = tools.transcripts.create_bed_info_gp(tx) - row = [tx.chromosome, tx.start, tx.stop, s.TranscriptName, tx.score, tx.strand, tx.thick_start, - tx.thick_stop, color, tx.block_count, block_sizes, block_starts, - s.GeneName, tx.cds_start_stat, tx.cds_end_stat, exon_frames, - tx.name, s.GeneId, s.TranscriptBiotype, s.GeneBiotype] + row = [ + tx.chromosome, + tx.start, + tx.stop, + s.TranscriptName, + tx.score, + tx.strand, + tx.thick_start, + tx.thick_stop, + color, + tx.block_count, + block_sizes, + block_starts, + s.GeneName, + tx.cds_start_stat, + tx.cds_end_stat, + exon_frames, + tx.name, + s.GeneId, + s.TranscriptBiotype, + s.GeneBiotype, + ] tools.fileOps.print_row(outf, row) - tools.procOps.run_proc(['bedSort', tmp, tmp]) + tools.procOps.run_proc(["bedSort", tmp, tmp]) with tools.fileOps.TemporaryFilePath() as out_path: - cmd = ['bedToBigBed', '-extraIndex=name,name2,geneId,transcriptId', - '-type=bed12+8', '-tab', '-as={}'.format(as_file), tmp, chrom_sizes, out_path] - tools.procOps.run_proc(cmd, stderr='/dev/null') + cmd = [ + "bedToBigBed", + 
"-extraIndex=name,name2,geneId,transcriptId", + "-type=bed12+8", + "-tab", + "-as={}".format(as_file), + tmp, + chrom_sizes, + out_path, + ] + tools.procOps.run_proc(cmd, stderr="/dev/null") tools.fileOps.atomic_install(out_path, track.path) - with trackdb.open('w') as outf: - outf.write(bgp_template.format(name='augustus_{}'.format(self.genome), label='AugustusTM(R)', - path=os.path.basename(track.path), visibility='hide')) + with trackdb.open("w") as outf: + outf.write( + bgp_template.format( + name="augustus_{}".format(self.genome), + label="AugustusTM(R)", + path=os.path.basename(track.path), + visibility="hide", + ) + ) class IsoSeqBamTrack(RebuildableTask): """Symlinks over IsoSeq bams""" + genome = luigi.Parameter() trackdb_path = luigi.Parameter() isoseq_bams = luigi.TupleParameter() def output(self): r = [luigi.LocalTarget(new_bam) for bam, new_bam in self.isoseq_bams] - r.extend([luigi.LocalTarget(x.path + '.bai') for x in r]) + r.extend([luigi.LocalTarget(x.path + ".bai") for x in r]) return r, luigi.LocalTarget(self.trackdb_path) def requires(self): @@ -2891,26 +3298,39 @@ def requires(self): def run(self): bams, trackdb = self.output() - with trackdb.open('w') as outf: + with trackdb.open("w") as outf: outf.write(bam_composite_template.format(genome=self.genome)) for bam, new_bam in self.isoseq_bams: shutil.copy(bam, new_bam) - shutil.copy(bam + '.bai', new_bam + '.bai') - name = os.path.splitext(os.path.basename(bam))[0].split('_', 1)[0] + shutil.copy(bam + ".bai", new_bam + ".bai") + name = os.path.splitext(os.path.basename(bam))[0].split("_", 1)[0] outf.write(bam_template.format(bam=os.path.basename(new_bam), name=name, genome=self.genome)) class SpliceTrack(TrackTask): """Constructs the splice junction track""" + def run(self): def parse_entry(entry): """Converts a GFF entry to BED12""" start = int(entry[3]) - 1 stop = int(entry[4]) - block_starts = '0,{}'.format(stop - start - 2) - mult = int(tools.misc.parse_gff_attr_line(entry[-1])['mult']) - return [entry[0], start, stop, 'SpliceJunction', mult, '.', start, stop, '204,124,45', - '2', '2,2', block_starts] + block_starts = "0,{}".format(stop - start - 2) + mult = int(tools.misc.parse_gff_attr_line(entry[-1])["mult"]) + return [ + entry[0], + start, + stop, + "SpliceJunction", + mult, + ".", + start, + stop, + "204,124,45", + "2", + "2,2", + block_starts, + ] pipeline_args = self.get_pipeline_args() track, trackdb = self.output() @@ -2919,8 +3339,8 @@ def parse_entry(entry): entries = [] for line in open(hints_gff): - if '\tintron\t' in line and 'src=E' in line and 'mult' in line: - parsed = parse_entry(line.split('\t')) + if "\tintron\t" in line and "src=E" in line and "mult" in line: + parsed = parse_entry(line.split("\t")) if parsed[4] > 2: entries.append(parsed) @@ -2933,55 +3353,62 @@ def parse_entry(entry): # load to file tmp = luigi.LocalTarget(is_tmp=True) - with tmp.open('w') as tmp_handle: + with tmp.open("w") as tmp_handle: tools.fileOps.print_rows(tmp_handle, entries) - tools.procOps.run_proc(['bedSort', tmp.path, tmp.path]) + tools.procOps.run_proc(["bedSort", tmp.path, tmp.path]) with tools.fileOps.TemporaryFilePath() as out_path: - cmd = ['bedToBigBed', '-tab', tmp.path, chrom_sizes, out_path] - tools.procOps.run_proc(cmd, stderr='/dev/null') + cmd = ["bedToBigBed", "-tab", tmp.path, chrom_sizes, out_path] + tools.procOps.run_proc(cmd, stderr="/dev/null") tools.fileOps.atomic_install(out_path, track.path) - with trackdb.open('w') as outf: + with trackdb.open("w") as outf: 
outf.write(splice_template.format(genome=self.genome, path=os.path.basename(track.path))) class ExpressionTracks(RebuildableTask): """Constructs the maximum and median expression tracks""" + genome = luigi.Parameter() trackdb_path = luigi.Parameter() max_track_path = luigi.Parameter() median_track_path = luigi.Parameter() def output(self): - return [luigi.LocalTarget(self.max_track_path), luigi.LocalTarget(self.median_track_path)], \ - luigi.LocalTarget(self.trackdb_path) + return ( + [luigi.LocalTarget(self.max_track_path), luigi.LocalTarget(self.median_track_path)], + luigi.LocalTarget(self.trackdb_path), + ) def requires(self): return self.clone(CreateDirectoryStructure), self.clone(Consensus) def run(self): pipeline_args = self.get_pipeline_args() - bams = list(pipeline_args.cfg['BAM'][self.genome]) + bams = list(pipeline_args.cfg["BAM"][self.genome]) (max_track, median_track), trackdb = self.output() chrom_sizes = GenomeFiles.get_args(pipeline_args, self.genome).sizes - with median_track.open('w') as outf: - cmd = [['wiggletools', 'median'] + bams, - ['wigToBigWig', '-clip', '/dev/stdin', chrom_sizes, '/dev/stdout']] - tools.procOps.run_proc(cmd, stdout=outf, stderr='/dev/null') + with median_track.open("w") as outf: + cmd = [["wiggletools", "median"] + bams, ["wigToBigWig", "-clip", "/dev/stdin", chrom_sizes, "/dev/stdout"]] + tools.procOps.run_proc(cmd, stdout=outf, stderr="/dev/null") - with max_track.open('w') as outf: - cmd = [['wiggletools', 'max'] + bams, - ['wigToBigWig', '-clip', '/dev/stdin', chrom_sizes, '/dev/stdout']] - tools.procOps.run_proc(cmd, stdout=outf, stderr='/dev/null') + with max_track.open("w") as outf: + cmd = [["wiggletools", "max"] + bams, ["wigToBigWig", "-clip", "/dev/stdin", chrom_sizes, "/dev/stdout"]] + tools.procOps.run_proc(cmd, stdout=outf, stderr="/dev/null") - with trackdb.open('w') as outf: - outf.write(wiggle_template.format(genome=self.genome, mode='Median', - path=os.path.basename(median_track.path), color='151,189,68')) - outf.write(wiggle_template.format(genome=self.genome, mode='Maximum', - path=os.path.basename(max_track.path), color='106,68,189')) + with trackdb.open("w") as outf: + outf.write( + wiggle_template.format( + genome=self.genome, mode="Median", path=os.path.basename(median_track.path), color="151,189,68" + ) + ) + outf.write( + wiggle_template.format( + genome=self.genome, mode="Maximum", path=os.path.basename(max_track.path), color="106,68,189" + ) + ) ### @@ -2998,17 +3425,17 @@ def find_default_pos(chrom_sizes, window_size=200000): """ sizes = [x.split() for x in open(chrom_sizes)] sorted_sizes = sorted(sizes, key=lambda chrom_size: -int(chrom_size[1])) - return '{}:{}-{}'.format(sorted_sizes[0][0], 1, window_size) + return "{}:{}-{}".format(sorted_sizes[0][0], 1, window_size) def construct_org_str(genomes): """Constructs the organism string for the hal snakes. 
format is genome=genome space separated""" - return ' '.join(['{0}={0}'.format(genome) for genome in genomes]) + return " ".join(["{0}={0}".format(genome) for genome in genomes]) def construct_consensus_gp_as(has_rna, has_pb): """Dynamically generate an autosql file for consensus""" - consensus_gp_as = '''table bigCat + consensus_gp_as = """table bigCat "bigCat gene models" ( string chrom; "Reference sequence chromosome or scaffold" @@ -3047,17 +3474,17 @@ def construct_consensus_gp_as(has_rna, has_pb): string validStart; "Valid start codon" string validStop; "Valid stop codon" string properOrf; "Proper multiple of 3 ORF" -''' +""" if has_rna: consensus_gp_as += ' lstring intronRnaSupport; "RNA intron support"\n' consensus_gp_as += ' lstring exonRnaSupport; "RNA exon support"\n' if has_pb: consensus_gp_as += ' string pbIsoformSupported; "Is this transcript supported by IsoSeq?"' - consensus_gp_as += '\n)\n' + consensus_gp_as += "\n)\n" return consensus_gp_as -modified_bgp_as = '''table bigGenePred +modified_bgp_as = """table bigGenePred "bigGenePred gene models" ( string chrom; "Reference sequence chromosome or scaffold" @@ -3082,10 +3509,10 @@ def construct_consensus_gp_as(has_rna, has_pb): string geneType; "Gene type" ) -''' +""" -denovo_as = '''table denovo +denovo_as = """table denovo "denovo gene models" ( string chrom; "Reference sequence chromosome or scaffold" @@ -3113,10 +3540,10 @@ def construct_consensus_gp_as(has_rna, has_pb): lstring intronAnnotationSupport; "Intron support in reference annotation" lstring intronRnaSupport; "RNA intron support" ) -''' +""" -bigpsl = '''table bigPsl +bigpsl = """table bigPsl "bigPsl pairwise alignment" ( string chrom; "Reference sequence chromosome or scaffold" @@ -3146,7 +3573,7 @@ def construct_consensus_gp_as(has_rna, has_pb): uint seqType; "0=empty, 1=nucleotide, 2=amino_acid" ) -''' +""" ### @@ -3154,15 +3581,15 @@ def construct_consensus_gp_as(has_rna, has_pb): ### -hub_str = '''hub {hal} +hub_str = """hub {hal} shortLabel {hal} longLabel {hal} genomesFile genomes.txt email {email} -''' +""" -genome_str = '''genome {genome} +genome_str = """genome {genome} twoBitPath {genome}/{genome}.2bit trackDb {genome}/trackDb.txt organism {genome} @@ -3171,10 +3598,10 @@ def construct_consensus_gp_as(has_rna, has_pb): defaultPos {default_pos} groups groups.txt -''' +""" -groups_str = '''name cat_tracks +groups_str = """name cat_tracks label Comparative Annotation Toolkit priority 1 defaultIsClosed 0 @@ -3189,10 +3616,10 @@ def construct_consensus_gp_as(has_rna, has_pb): priority 3 defaultIsClosed 0 -''' +""" -snake_composite = '''track hubCentral +snake_composite = """track hubCentral compositeTrack on shortLabel Cactus longLabel Cactus Alignment Tracks @@ -3213,9 +3640,9 @@ def construct_consensus_gp_as(has_rna, has_pb): visibility full subTrack hubCentral -''' +""" -snake_template = ''' track snake{genome} +snake_template = """ track snake{genome} longLabel {genome} shortLabel {genome} otherSpecies {genome} @@ -3227,10 +3654,10 @@ def construct_consensus_gp_as(has_rna, has_pb): group snake subGroups view=Snake orgs={genome} -''' +""" -consensus_template = '''track consensus_{genome} +consensus_template = """track consensus_{genome} shortLabel CAT Annotation longLabel CAT Annotation description CAT Annotation @@ -3245,10 +3672,10 @@ def construct_consensus_gp_as(has_rna, has_pb): defaultLabelFields name labelSeperator " " -''' +""" -bgp_template = '''track {name} +bgp_template = """track {name} shortLabel {label} longLabel {label} description 
{label} @@ -3263,10 +3690,10 @@ def construct_consensus_gp_as(has_rna, has_pb): defaultLabelFields name labelSeperator " " -''' +""" -bigpsl_template = '''track {name} +bigpsl_template = """track {name} shortLabel {short_label} longLabel {long_label} bigDataUrl {path} @@ -3286,10 +3713,10 @@ def construct_consensus_gp_as(has_rna, has_pb): #showCdsMaxZoom 10000.0 searchIndex name -''' +""" -denovo_template = '''track {name} +denovo_template = """track {name} shortLabel {short_label} longLabel {long_label} description {description} @@ -3304,10 +3731,10 @@ def construct_consensus_gp_as(has_rna, has_pb): labelSeperator " " -''' +""" -bam_composite_template = '''track bams_{genome} +bam_composite_template = """track bams_{genome} group expression compositeTrack on shortLabel IsoSeq BAMs @@ -3322,9 +3749,9 @@ def construct_consensus_gp_as(has_rna, has_pb): bamGrayMode aliQual pairEndsByName on -''' +""" -bam_template = ''' track {bam}_{genome} +bam_template = """ track {bam}_{genome} parent bams_{genome} bigDataUrl {bam} shortLabel {name} @@ -3332,10 +3759,10 @@ def construct_consensus_gp_as(has_rna, has_pb): type bam priority 10 -''' +""" -wiggle_template = '''track {mode}_{genome} +wiggle_template = """track {mode}_{genome} shortLabel {mode} expression longLabel {mode} expression type bigWig @@ -3346,10 +3773,10 @@ def construct_consensus_gp_as(has_rna, has_pb): priority 11 spectrum on -''' +""" -splice_template = '''track splices_{genome} +splice_template = """track splices_{genome} type bigBed 12 group expression shortLabel RNA-seq splices @@ -3361,10 +3788,10 @@ def construct_consensus_gp_as(has_rna, has_pb): spectrum on minGrayLevel 4 -''' +""" -error_template = '''track error_{genome} +error_template = """track error_{genome} type bigBed 6 group cat_tracks shortLabel Consensus indels @@ -3373,4 +3800,4 @@ def construct_consensus_gp_as(has_rna, has_pb): visibility hide priority 5 -''' +""" diff --git a/cat/align_transcripts.py b/cat/align_transcripts.py index 58a6226d..e40102ad 100644 --- a/cat/align_transcripts.py +++ b/cat/align_transcripts.py @@ -40,23 +40,28 @@ def align_transcripts(args, toil_options): input_file_ids = argparse.Namespace() input_file_ids.ref_genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.ref_genome_fasta) input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta) - input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp), - args.annotation_gp) - input_file_ids.ref_db = FileID.forPath(t.importFile('file://' + args.ref_db_path), args.ref_db_path) + input_file_ids.annotation_gp = FileID.forPath( + t.importFile("file://" + args.annotation_gp), args.annotation_gp + ) + input_file_ids.ref_db = FileID.forPath(t.importFile("file://" + args.ref_db_path), args.ref_db_path) input_file_ids.modes = {} - file_ids = [input_file_ids.ref_genome_fasta, input_file_ids.genome_fasta, input_file_ids.annotation_gp, - input_file_ids.ref_db] + file_ids = [ + input_file_ids.ref_genome_fasta, + input_file_ids.genome_fasta, + input_file_ids.annotation_gp, + input_file_ids.ref_db, + ] for mode in args.transcript_modes: - input_file_ids.modes[mode] = t.importFile('file://' + args.transcript_modes[mode]['gp']) + input_file_ids.modes[mode] = t.importFile("file://" + args.transcript_modes[mode]["gp"]) file_ids.append(input_file_ids.modes[mode]) disk_usage = tools.toilInterface.find_total_disk_usage(file_ids) - job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk=disk_usage) + job = 
Job.wrapJobFn(setup, args, input_file_ids, memory="16G", disk=disk_usage) results_file_ids = t.start(job) else: results_file_ids = t.restart() for file_path, file_id in results_file_ids.items(): tools.fileOps.ensure_file_dir(file_path) - t.exportFile(file_id, 'file://' + file_path) + t.exportFile(file_id, "file://" + file_path) def setup(job, args, input_file_ids): @@ -66,52 +71,58 @@ def setup(job, args, input_file_ids): :param args: dictionary of arguments from CAT :param input_file_ids: dictionary of fileStore file IDs for the inputs to this pipeline """ - job.fileStore.logToMaster('Beginning Align Transcripts run on {}'.format(args.genome), level=logging.INFO) + job.fileStore.logToMaster("Beginning Align Transcripts run on {}".format(args.genome), level=logging.INFO) # load all fileStore files necessary annotation_gp = job.fileStore.readGlobalFile(input_file_ids.annotation_gp) ref_genome_db = job.fileStore.readGlobalFile(input_file_ids.ref_db) - genome_fasta = tools.toilInterface.load_fasta_from_filestore(job, input_file_ids.genome_fasta, - prefix='genome', upper=False) - ref_genome_fasta = tools.toilInterface.load_fasta_from_filestore(job, input_file_ids.ref_genome_fasta, - prefix='ref_genome', upper=False) + genome_fasta = tools.toilInterface.load_fasta_from_filestore( + job, input_file_ids.genome_fasta, prefix="genome", upper=False + ) + ref_genome_fasta = tools.toilInterface.load_fasta_from_filestore( + job, input_file_ids.ref_genome_fasta, prefix="ref_genome", upper=False + ) # load required reference data into memory tx_biotype_map = tools.sqlInterface.get_transcript_biotype_map(ref_genome_db) ref_transcript_dict = tools.transcripts.get_gene_pred_dict(annotation_gp) # will hold a mapping of output file paths to lists of Promise objects containing output results = collections.defaultdict(list) - for tx_mode in ['transMap', 'augTM', 'augTMR']: + for tx_mode in ["transMap", "augTM", "augTMR"]: if tx_mode not in args.transcript_modes: continue # output file paths - mrna_path = args.transcript_modes[tx_mode]['mRNA'] - cds_path = args.transcript_modes[tx_mode]['CDS'] + mrna_path = args.transcript_modes[tx_mode]["mRNA"] + cds_path = args.transcript_modes[tx_mode]["CDS"] # begin loading transcripts and sequences gp_path = job.fileStore.readGlobalFile(input_file_ids.modes[tx_mode]) transcript_dict = tools.transcripts.get_gene_pred_dict(gp_path) - transcript_dict = {aln_id: tx for aln_id, tx in transcript_dict.items() if - tx_biotype_map[tools.nameConversions.strip_alignment_numbers(aln_id)] == 'protein_coding'} - for aln_mode, out_path in zip(*[['mRNA', 'CDS'], [mrna_path, cds_path]]): - seq_iter = get_alignment_sequences(transcript_dict, ref_transcript_dict, genome_fasta, - ref_genome_fasta, aln_mode) + transcript_dict = { + aln_id: tx + for aln_id, tx in transcript_dict.items() + if tx_biotype_map[tools.nameConversions.strip_alignment_numbers(aln_id)] == "protein_coding" + } + for aln_mode, out_path in zip(*[["mRNA", "CDS"], [mrna_path, cds_path]]): + seq_iter = get_alignment_sequences( + transcript_dict, ref_transcript_dict, genome_fasta, ref_genome_fasta, aln_mode + ) for chunk in group_transcripts(seq_iter): - j = job.addChildJobFn(run_aln_chunk, chunk, memory='8G', disk='2G') + j = job.addChildJobFn(run_aln_chunk, chunk, memory="8G", disk="2G") results[out_path].append(j.rv()) if len(results) == 0: - err_msg = 'Align Transcripts pipeline did not detect any input genePreds for {}'.format(args.genome) + err_msg = "Align Transcripts pipeline did not detect any input genePreds for 
{}".format(args.genome) raise RuntimeError(err_msg) # convert the results Promises into resolved values - return job.addFollowOnJobFn(merge, results, args, memory='8G', disk='4G').rv() + return job.addFollowOnJobFn(merge, results, args, memory="8G", disk="4G").rv() def get_alignment_sequences(transcript_dict, ref_transcript_dict, genome_fasta, ref_genome_fasta, mode): """Generator that yields a tuple of (tx_id, tx_seq, ref_tx_id, ref_tx_seq)""" - assert mode in ['mRNA', 'CDS'] + assert mode in ["mRNA", "CDS"] for tx_id, tx in transcript_dict.items(): ref_tx_id = tools.nameConversions.strip_alignment_numbers(tx_id) ref_tx = ref_transcript_dict[ref_tx_id] - tx_seq = tx.get_mrna(genome_fasta) if mode == 'mRNA' else tx.get_cds(genome_fasta) - ref_tx_seq = ref_tx.get_mrna(ref_genome_fasta) if mode == 'mRNA' else ref_tx.get_cds(ref_genome_fasta) + tx_seq = tx.get_mrna(genome_fasta) if mode == "mRNA" else tx.get_cds(genome_fasta) + ref_tx_seq = ref_tx.get_mrna(ref_genome_fasta) if mode == "mRNA" else ref_tx.get_cds(ref_genome_fasta) if len(ref_tx_seq) > 50 and len(tx_seq) > 50: yield tx_id, tx_seq, ref_tx_id, ref_tx_seq @@ -127,7 +138,7 @@ def run_aln_chunk(job, chunk): results = [] for tx_id, tx_seq, ref_tx_id, ref_tx_seq in chunk: p = tools.parasail_wrapper.aln_nucleotides(tx_seq, tx_id, ref_tx_seq, ref_tx_id) - psl_str = '\t'.join(p.psl_string()) + psl_str = "\t".join(p.psl_string()) results.append(psl_str) return results @@ -139,14 +150,14 @@ def merge(job, results, args): :param args: arguments to the pipeline :return: """ - job.fileStore.logToMaster('Merging Alignment output for {}'.format(args.genome), level=logging.INFO) + job.fileStore.logToMaster("Merging Alignment output for {}".format(args.genome), level=logging.INFO) results_file_ids = {} for gp_category, result_list in results.items(): tmp_results_file = tools.fileOps.get_tmp_toil_file() - with open(tmp_results_file, 'w') as outf: + with open(tmp_results_file, "w") as outf: for line in itertools.chain.from_iterable(result_list): # results is list of lists if line is not None: - outf.write(line + '\n') + outf.write(line + "\n") results_file_ids[gp_category] = job.fileStore.writeGlobalFile(tmp_results_file) return results_file_ids diff --git a/cat/augustus.py b/cat/augustus.py index 3f3464b2..b4bf7f2e 100644 --- a/cat/augustus.py +++ b/cat/augustus.py @@ -42,18 +42,25 @@ def augustus(args, coding_gp, toil_options): if not t.options.restart: input_file_ids = argparse.Namespace() input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta) - input_file_ids.tm_cfg = FileID.forPath(t.importFile('file://' + args.tm_cfg), args.tm_cfg) - input_file_ids.coding_gp = FileID.forPath(t.importFile('file://' + coding_gp), coding_gp) - input_file_ids.ref_psl = FileID.forPath(t.importFile('file://' + args.ref_psl), args.ref_psl) - input_file_ids.tm_psl = FileID.forPath(t.importFile('file://' + args.filtered_tm_psl), args.filtered_tm_psl) - input_file_ids.annotation_gp = FileID.forPath(t.importFile('file://' + args.annotation_gp), - args.annotation_gp) - file_ids = [input_file_ids.genome_fasta, input_file_ids.coding_gp, input_file_ids.ref_psl, - input_file_ids.tm_psl, input_file_ids.annotation_gp] + input_file_ids.tm_cfg = FileID.forPath(t.importFile("file://" + args.tm_cfg), args.tm_cfg) + input_file_ids.coding_gp = FileID.forPath(t.importFile("file://" + coding_gp), coding_gp) + input_file_ids.ref_psl = FileID.forPath(t.importFile("file://" + args.ref_psl), args.ref_psl) + input_file_ids.tm_psl = 
FileID.forPath(t.importFile("file://" + args.filtered_tm_psl), args.filtered_tm_psl) + input_file_ids.annotation_gp = FileID.forPath( + t.importFile("file://" + args.annotation_gp), args.annotation_gp + ) + file_ids = [ + input_file_ids.genome_fasta, + input_file_ids.coding_gp, + input_file_ids.ref_psl, + input_file_ids.tm_psl, + input_file_ids.annotation_gp, + ] if args.augustus_tmr: - input_file_ids.augustus_hints_db = FileID.forPath(t.importFile('file://' + args.augustus_hints_db), - args.augustus_hints_db) - input_file_ids.tmr_cfg = FileID.forPath(t.importFile('file://' + args.tmr_cfg), args.tmr_cfg) + input_file_ids.augustus_hints_db = FileID.forPath( + t.importFile("file://" + args.augustus_hints_db), args.augustus_hints_db + ) + input_file_ids.tmr_cfg = FileID.forPath(t.importFile("file://" + args.tmr_cfg), args.tmr_cfg) file_ids.append(args.augustus_hints_db) disk_usage = tools.toilInterface.find_total_disk_usage(file_ids) job = Job.wrapJobFn(setup, args, input_file_ids, disk_usage, disk=disk_usage) @@ -61,10 +68,10 @@ def augustus(args, coding_gp, toil_options): else: tm_file_id, tmr_file_id = t.restart() tools.fileOps.ensure_file_dir(args.augustus_tm_gtf) - t.exportFile(tm_file_id, 'file://' + args.augustus_tm_gtf) + t.exportFile(tm_file_id, "file://" + args.augustus_tm_gtf) if tmr_file_id is not None: tools.fileOps.ensure_file_dir(args.augustus_tmr_gtf) - t.exportFile(tmr_file_id, 'file://' + args.augustus_tmr_gtf) + t.exportFile(tmr_file_id, "file://" + args.augustus_tmr_gtf) def setup(job, args, input_file_ids, disk_usage): @@ -76,20 +83,25 @@ def setup(job, args, input_file_ids, disk_usage): :param disk_usage: Disk Usage to pass along to AUGUSTUS jobs. Important when a hints DB is involved. :return: completed GTF format results for all jobs """ + def start_jobs(mode, chunk_size, cfg_file_id): """loop wrapper that starts jobs for both TM and TMR modes""" results = [] for chunk in tools.dataOps.grouper(iter(tx_dict.items()), chunk_size): grouped_recs = {} for tx_id, tx in chunk: - grouped_recs[tx_id] = [tx, - ref_tx_dict[tools.nameConversions.remove_alignment_number(tx_id)], - tm_psl_dict[tx_id], - ref_psl_dict[tools.nameConversions.remove_alignment_number(tx_id)]] - j = job.addChildJobFn(run_augustus_chunk, args, grouped_recs, input_file_ids, mode, cfg_file_id, - disk=disk_usage) + grouped_recs[tx_id] = [ + tx, + ref_tx_dict[tools.nameConversions.remove_alignment_number(tx_id)], + tm_psl_dict[tx_id], + ref_psl_dict[tools.nameConversions.remove_alignment_number(tx_id)], + ] + j = job.addChildJobFn( + run_augustus_chunk, args, grouped_recs, input_file_ids, mode, cfg_file_id, disk=disk_usage + ) results.append(j.rv()) return results + # load all fileStore files necessary ref_psl = job.fileStore.readGlobalFile(input_file_ids.ref_psl) tm_psl = job.fileStore.readGlobalFile(input_file_ids.tm_psl) @@ -100,9 +112,9 @@ def start_jobs(mode, chunk_size, cfg_file_id): tm_psl_dict = tools.psl.get_alignment_dict(tm_psl) ref_tx_dict = tools.transcripts.get_gene_pred_dict(annotation_gp) tx_dict = tools.transcripts.get_gene_pred_dict(coding_gp) - tm_results = start_jobs('TM', 25, input_file_ids.tm_cfg) + tm_results = start_jobs("TM", 25, input_file_ids.tm_cfg) if args.augustus_tmr: - tmr_results = start_jobs('TMR', 15, input_file_ids.tmr_cfg) + tmr_results = start_jobs("TMR", 15, input_file_ids.tmr_cfg) else: tmr_results = None return job.addFollowOnJobFn(merge, tm_results, tmr_results).rv() @@ -119,8 +131,9 @@ def run_augustus_chunk(job, args, grouped_recs, input_file_ids, mode, cfg_file_i 
:param cfg_file_id: File ID for the Augustus cfg file based on if we are in TM or TMR mode :return: Augustus output for this chunk """ - genome_fasta = tools.toilInterface.load_fasta_from_filestore(job, input_file_ids.genome_fasta, - prefix='genome', upper=False) + genome_fasta = tools.toilInterface.load_fasta_from_filestore( + job, input_file_ids.genome_fasta, prefix="genome", upper=False + ) cfg_file = job.fileStore.readGlobalFile(cfg_file_id) if args.augustus_tmr: hints_db_file = job.fileStore.readGlobalFile(input_file_ids.augustus_hints_db) @@ -135,13 +148,15 @@ def run_augustus_chunk(job, args, grouped_recs, input_file_ids, mode, cfg_file_i stop = min(tm_tx.stop + padding, len(genome_fasta[chromosome])) tm_hints = tools.tm2hints.tm_to_hints(tm_tx, tm_psl, ref_psl) if args.augustus_tmr: - rnaseq_hints = get_rnaseq_hints(args.genome, chromosome, start, stop, speciesnames, seqnames, hints, - featuretypes, session) - hint = ''.join([tm_hints, rnaseq_hints]) + rnaseq_hints = get_rnaseq_hints( + args.genome, chromosome, start, stop, speciesnames, seqnames, hints, featuretypes, session + ) + hint = "".join([tm_hints, rnaseq_hints]) else: hint = tm_hints - transcript = run_augustus(hint, genome_fasta, tm_tx, cfg_file, start, stop, args.augustus_species, mode, - args.utr) + transcript = run_augustus( + hint, genome_fasta, tm_tx, cfg_file, start, stop, args.augustus_species, mode, args.utr + ) if transcript is not None: results.extend(transcript) if args.augustus_tmr: @@ -164,12 +179,23 @@ def run_augustus(hint, fasta, tm_tx, cfg_file, start, stop, species, mode, utr): tmp_fasta = tools.fileOps.get_tmp_toil_file() tools.bio.write_fasta(tmp_fasta, tm_tx.chromosome, fasta[tm_tx.chromosome][start:stop]) hints_out = tools.fileOps.get_tmp_toil_file() - with open(hints_out, 'w') as outf: + with open(hints_out, "w") as outf: outf.write(hint) - cmd = ['augustus', tmp_fasta, '--predictionStart=-{}'.format(start), '--predictionEnd=-{}'.format(start), - '--extrinsicCfgFile={}'.format(cfg_file), '--hintsfile={}'.format(hints_out), '--UTR={}'.format(int(utr)), - '--alternatives-from-evidence=0', '--species={}'.format(species), '--allow_hinted_splicesites=atac', - '--protein=0', '--softmasking=1', '--/augustus/verbosity=0'] + cmd = [ + "augustus", + tmp_fasta, + "--predictionStart=-{}".format(start), + "--predictionEnd=-{}".format(start), + "--extrinsicCfgFile={}".format(cfg_file), + "--hintsfile={}".format(hints_out), + "--UTR={}".format(int(utr)), + "--alternatives-from-evidence=0", + "--species={}".format(species), + "--allow_hinted_splicesites=atac", + "--protein=0", + "--softmasking=1", + "--/augustus/verbosity=0", + ] aug_output = tools.procOps.call_proc_lines(cmd) transcript = munge_augustus_output(aug_output, mode, tm_tx) return transcript @@ -204,13 +230,14 @@ def munge_augustus_output(aug_output, mode, tm_tx): # extract the transcript lines tx_entries = [x.split() for x in aug_output if "\ttranscript\t" in x] # filter out transcripts that do not overlap the alignment range - valid_txs = [x[-1] for x in tx_entries if tm_tx.interval.overlap(tools.intervals.ChromosomeInterval(x[0], x[3], - x[4], x[6]))] + valid_txs = [ + x[-1] for x in tx_entries if tm_tx.interval.overlap(tools.intervals.ChromosomeInterval(x[0], x[3], x[4], x[6])) + ] if len(valid_txs) != 1: return None valid_tx = valid_txs[0] - tx_id = 'aug{}-{}'.format(mode, tm_tx.name) - tx_lines = [x.split('\t') for x in aug_output if valid_tx in x and not x.startswith('#')] + tx_id = "aug{}-{}".format(mode, tm_tx.name) + tx_lines = [x.split("\t") 
for x in aug_output if valid_tx in x and not x.startswith("#")] features = {"exon", "CDS", "start_codon", "stop_codon", "tts", "tss"} gtf = [] for chrom, source, feature, start, stop, score, strand, frame, attributes in tx_lines: diff --git a/cat/augustus_cgp.py b/cat/augustus_cgp.py index 1849384e..3eb05d98 100644 --- a/cat/augustus_cgp.py +++ b/cat/augustus_cgp.py @@ -44,36 +44,38 @@ def augustus_cgp(args, toil_options): with Toil(toil_options) as t: if not t.options.restart: input_file_ids = argparse.Namespace() - input_file_ids.hal = FileID.forPath(t.importFile('file://' + args.hal), args.hal) - input_file_ids.chrom_sizes = FileID.forPath(t.importFile('file://' + args.query_sizes), args.query_sizes) - input_file_ids.hints_db = FileID.forPath(t.importFile('file://' + args.hints_db), args.hints_db) + input_file_ids.hal = FileID.forPath(t.importFile("file://" + args.hal), args.hal) + input_file_ids.chrom_sizes = FileID.forPath(t.importFile("file://" + args.query_sizes), args.query_sizes) + input_file_ids.hints_db = FileID.forPath(t.importFile("file://" + args.hints_db), args.hints_db) if args.cgp_param is not None: - input_file_ids.cgp_param = FileID.forPath(t.importFile('file://' + args.cgp_param), args.cgp_param) + input_file_ids.cgp_param = FileID.forPath(t.importFile("file://" + args.cgp_param), args.cgp_param) else: input_file_ids.cgp_param = None - input_file_ids.gtf = FileID.forPath(t.importFile('file://' + args.gtf), args.gtf) - input_file_ids.cgp_cfg = FileID.forPath(t.importFile('file://' + args.cgp_cfg), args.cgp_cfg) - input_file_ids.fasta = {genome: FileID.forPath(t.importFile('file://' + fasta), fasta) - for genome, fasta in args.fasta_files.items()} - du = tools.toilInterface.find_total_disk_usage([input_file_ids.hints_db], buffer='4G') - job = Job.wrapJobFn(setup, args, input_file_ids, memory='8G', disk=du) + input_file_ids.gtf = FileID.forPath(t.importFile("file://" + args.gtf), args.gtf) + input_file_ids.cgp_cfg = FileID.forPath(t.importFile("file://" + args.cgp_cfg), args.cgp_cfg) + input_file_ids.fasta = { + genome: FileID.forPath(t.importFile("file://" + fasta), fasta) + for genome, fasta in args.fasta_files.items() + } + du = tools.toilInterface.find_total_disk_usage([input_file_ids.hints_db], buffer="4G") + job = Job.wrapJobFn(setup, args, input_file_ids, memory="8G", disk=du) results, stdout_file_ids, param_file_id = t.start(job) else: results, stdout_file_ids, param_file_id = t.restart() tools.fileOps.ensure_file_dir(args.stdout_file) - with open(args.stdout_file, 'w') as outf, tools.fileOps.TemporaryFilePath() as tmp: + with open(args.stdout_file, "w") as outf, tools.fileOps.TemporaryFilePath() as tmp: for (chrom, start, chunksize), stdout_file in stdout_file_ids.items(): - outf.write('## BEGIN CHUNK chrom: {} start: {} chunksize: {}\n'.format(chrom, start, chunksize)) - t.exportFile(stdout_file, 'file://' + tmp) + outf.write("## BEGIN CHUNK chrom: {} start: {} chunksize: {}\n".format(chrom, start, chunksize)) + t.exportFile(stdout_file, "file://" + tmp) for l in open(tmp): outf.write(l) for genome, (raw_gtf_file_id, joined_gtf_file_id, joined_gp_file_id) in results.items(): tools.fileOps.ensure_file_dir(args.augustus_cgp_raw_gtf[genome]) - t.exportFile(raw_gtf_file_id, 'file://' + args.augustus_cgp_raw_gtf[genome]) - t.exportFile(joined_gtf_file_id, 'file://' + args.augustus_cgp_gtf[genome]) - t.exportFile(joined_gp_file_id, 'file://' + args.augustus_cgp_gp[genome]) + t.exportFile(raw_gtf_file_id, "file://" + args.augustus_cgp_raw_gtf[genome]) + 
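# Illustrative sketch, not from the CAT source: augustus_cgp.py follows the same Toil pattern
# as the other cat/ modules -- a driver wraps a setup job, setup fans out child jobs whose
# .rv() promises are collected, and a follow-on job receives the resolved values and merges
# them. A minimal self-contained version of that wiring (job names, inputs, and resource
# strings here are hypothetical, chosen only to show the pattern) would be:
from toil.common import Toil
from toil.job import Job


def process_chunk(job, chunk):
    # stands in for hal2maf / cgp style per-chunk workers
    return sum(chunk)


def merge(job, partial_results):
    # by the time this runs, Toil has resolved every promise in the list
    return sum(partial_results)


def setup(job, chunks):
    promises = [job.addChildJobFn(process_chunk, c, memory="1G", disk="1G").rv() for c in chunks]
    return job.addFollowOnJobFn(merge, promises).rv()


if __name__ == "__main__":
    options = Job.Runner.getDefaultOptions("./toil-jobstore")
    with Toil(options) as t:
        result = t.start(Job.wrapJobFn(setup, [[1, 2], [3, 4], [5]]))
        print(result)  # 15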
t.exportFile(joined_gtf_file_id, "file://" + args.augustus_cgp_gtf[genome]) + t.exportFile(joined_gp_file_id, "file://" + args.augustus_cgp_gp[genome]) if args.cgp_param is None: - t.exportFile(param_file_id, 'file://' + args.param_out_path) + t.exportFile(param_file_id, "file://" + args.param_out_path) def setup(job, args, input_file_ids): @@ -98,17 +100,28 @@ def setup(job, args, input_file_ids): chrom_size = int(chrom_size) for start in range(0, chrom_size, args.chunksize - args.overlap): chunksize = args.chunksize if start + args.chunksize <= chrom_size else chrom_size - start - j = job.addChildJobFn(hal2maf, input_file_ids, args.genomes, args.ref_genome, args.annotate_ancestors, - chrom, start, chunksize, memory='8G', disk=hal2maf_usage) + j = job.addChildJobFn( + hal2maf, + input_file_ids, + args.genomes, + args.ref_genome, + args.annotate_ancestors, + chrom, + start, + chunksize, + memory="8G", + disk=hal2maf_usage, + ) maf_chunks.append([chrom, start, chunksize, j.rv()]) # if we have no params, time to train if input_file_ids.cgp_param is None: - du = tools.toilInterface.find_total_disk_usage([input_file_ids.hints_db], buffer='40G') - results = job.addFollowOnJobFn(cgp_training_wrapper, maf_chunks, tree, args, input_file_ids, memory='8G', - disk=du).rv() + du = tools.toilInterface.find_total_disk_usage([input_file_ids.hints_db], buffer="40G") + results = job.addFollowOnJobFn( + cgp_training_wrapper, maf_chunks, tree, args, input_file_ids, memory="8G", disk=du + ).rv() else: - results = job.addFollowOnJobFn(cgp_wrapper, maf_chunks, tree, args, input_file_ids, disk='4G').rv() + results = job.addFollowOnJobFn(cgp_wrapper, maf_chunks, tree, args, input_file_ids, disk="4G").rv() return results @@ -134,8 +147,14 @@ def cgp_training_wrapper(job, maf_chunks, tree, args, input_file_ids): for chrom, start, chunksize, maf_chunk in random.sample(maf_chunks, len(maf_chunks)): seqnr = seqs[chrom] query = session.query(hints).filter( - sqlalchemy.and_(hints.speciesid.in_(speciesid), hints.source == 'a2h', hints.seqnr == seqnr, - hints.start >= start, hints.end <= start + chunksize)) + sqlalchemy.and_( + hints.speciesid.in_(speciesid), + hints.source == "a2h", + hints.seqnr == seqnr, + hints.start >= start, + hints.end <= start + chunksize, + ) + ) exon_count = query.count() selected_intervals.append(maf_chunk) seen_exons += exon_count @@ -143,10 +162,10 @@ def cgp_training_wrapper(job, maf_chunks, tree, args, input_file_ids): break # run each chunk through augustus in training mode - cgp_usage = tools.toilInterface.find_total_disk_usage([input_file_ids.fasta, input_file_ids.hints_db], buffer='4G') + cgp_usage = tools.toilInterface.find_total_disk_usage([input_file_ids.fasta, input_file_ids.hints_db], buffer="4G") training_gffs = [] for maf_chunk in selected_intervals: - j = job.addChildJobFn(cgp, tree, maf_chunk, args, input_file_ids, training=True, memory='8G', disk=cgp_usage) + j = job.addChildJobFn(cgp, tree, maf_chunk, args, input_file_ids, training=True, memory="8G", disk=cgp_usage) training_gffs.append(j.rv()) return job.addFollowOnJobFn(train_cgp, maf_chunks, tree, args, input_file_ids, training_gffs).rv() @@ -162,19 +181,21 @@ def train_cgp(job, maf_chunks, tree, args, input_file_ids, training_gffs): :return: output of merge_results() """ training_gff_files = [job.fileStore.readGlobalFile(x) for x in training_gffs] - cmd = ['cat'] + training_gff_files + cmd = ["cat"] + training_gff_files combined = tools.fileOps.get_tmp_toil_file() tools.procOps.run_proc(cmd, stdout=combined) # run the 
training process params = tools.fileOps.get_tmp_toil_file() - cmd = ['augustus', - '--species={}'.format(args.species), - '--treefile={}'.format(job.fileStore.readGlobalFile(tree)), - '--refSpecies={}'.format(args.ref_genome), - '--referenceFile={}'.format(job.fileStore.readGlobalFile(input_file_ids.gtf)), - '--trainFeatureFile={}'.format(combined), - '--param_outfile={}'.format(params)] + cmd = [ + "augustus", + "--species={}".format(args.species), + "--treefile={}".format(job.fileStore.readGlobalFile(tree)), + "--refSpecies={}".format(args.ref_genome), + "--referenceFile={}".format(job.fileStore.readGlobalFile(input_file_ids.gtf)), + "--trainFeatureFile={}".format(combined), + "--param_outfile={}".format(params), + ] tools.procOps.run_proc(cmd) input_file_ids.cgp_param = job.fileStore.writeGlobalFile(params) return job.addFollowOnJobFn(cgp_wrapper, maf_chunks, tree, args, input_file_ids).rv() @@ -193,11 +214,11 @@ def cgp_wrapper(job, maf_chunks, tree, args, input_file_ids): # each Promise object will resolve to a tuple of gff_chunk_dict, stdout_file_id # cgp_job.rv(): key: genome, value: file handle to gff results = [] - cgp_usage = tools.toilInterface.find_total_disk_usage([input_file_ids.fasta, input_file_ids.hints_db], buffer='4G') + cgp_usage = tools.toilInterface.find_total_disk_usage([input_file_ids.fasta, input_file_ids.hints_db], buffer="4G") for chrom, start, chunksize, maf_chunk in maf_chunks: # run AugustusCGP on alignment chunk - cgp_job = job.addChildJobFn(cgp, tree, maf_chunk, args, input_file_ids, memory='8G', disk=cgp_usage) + cgp_job = job.addChildJobFn(cgp, tree, maf_chunk, args, input_file_ids, memory="8G", disk=cgp_usage) results.append([chrom, start, chunksize, cgp_job.rv()]) # merge all gff files for alignment chunks to one gff for each species @@ -205,7 +226,7 @@ def cgp_wrapper(job, maf_chunks, tree, args, input_file_ids): # stdout_file_id dict is keyed by (chromosome, start, chunksize) tuples # for the joined genes dict its a dict keyed by genome and values are a 3 member tuple of: # [raw_gtf_file_id, joined_gtf_file_id, joined_gp_file_id] - results = job.addFollowOnJobFn(merge_results, results, input_file_ids, memory='8G', disk='8G').rv() + results = job.addFollowOnJobFn(merge_results, results, input_file_ids, memory="8G", disk="8G").rv() return results @@ -215,11 +236,25 @@ def hal2maf(job, input_file_ids, genomes, ref_genome, annotate_ancestors, chrom, """ hal = job.fileStore.readGlobalFile(input_file_ids.hal) maf_chunk = tools.fileOps.get_tmp_toil_file() - genomes = ','.join(genomes) - cmd = ['hal2maf', '--onlyOrthologs', '--refGenome', ref_genome, '--targetGenomes', genomes, - '--refSequence', chrom, '--start', str(start), '--length', str(chunk_size), hal, maf_chunk] + genomes = ",".join(genomes) + cmd = [ + "hal2maf", + "--onlyOrthologs", + "--refGenome", + ref_genome, + "--targetGenomes", + genomes, + "--refSequence", + chrom, + "--start", + str(start), + "--length", + str(chunk_size), + hal, + maf_chunk, + ] if not annotate_ancestors: - cmd.append('--noAncestors') + cmd.append("--noAncestors") tools.procOps.run_proc(cmd) return job.fileStore.writeGlobalFile(maf_chunk) @@ -232,34 +267,41 @@ def cgp(job, tree, maf_chunk, args, input_file_ids, training=False): cgp_cfg = job.fileStore.readGlobalFile(input_file_ids.cgp_cfg) stdout = tools.fileOps.get_tmp_toil_file() - cmd = ['augustus', '--dbhints=1', '--allow_hinted_splicesites=atac', - '--extrinsicCfgFile={}'.format(cgp_cfg), - '--species={}'.format(args.species), - 
'--treefile={}'.format(job.fileStore.readGlobalFile(tree)), - '--alnfile={}'.format(job.fileStore.readGlobalFile(maf_chunk)), - '--dbaccess={}'.format(job.fileStore.readGlobalFile(input_file_ids.hints_db)), - '--speciesfilenames={}'.format(genome_fofn), - '--softmasking=1', - '--exoncands={}'.format(1 if training else 0), - '--alternatives-from-evidence=0', - '--/CompPred/logreg=on', - '--printOEs={}'.format(1 if training else 0), - '--/CompPred/outdir={}'.format(os.getcwd())] + cmd = [ + "augustus", + "--dbhints=1", + "--allow_hinted_splicesites=atac", + "--extrinsicCfgFile={}".format(cgp_cfg), + "--species={}".format(args.species), + "--treefile={}".format(job.fileStore.readGlobalFile(tree)), + "--alnfile={}".format(job.fileStore.readGlobalFile(maf_chunk)), + "--dbaccess={}".format(job.fileStore.readGlobalFile(input_file_ids.hints_db)), + "--speciesfilenames={}".format(genome_fofn), + "--softmasking=1", + "--exoncands={}".format(1 if training else 0), + "--alternatives-from-evidence=0", + "--/CompPred/logreg=on", + "--printOEs={}".format(1 if training else 0), + "--/CompPred/outdir={}".format(os.getcwd()), + ] if training is False: - cmd.append('--optCfgFile={}'.format(job.fileStore.readGlobalFile(input_file_ids.cgp_param))) + cmd.append("--optCfgFile={}".format(job.fileStore.readGlobalFile(input_file_ids.cgp_param))) else: - cmd.append('--printSampled=true') + cmd.append("--printSampled=true") tools.procOps.run_proc(cmd, stdout=stdout) if training is True: - cmd = ['cat', os.path.abspath('{}.sampled_GFs.gff'.format(args.ref_genome)), - os.path.abspath('exonCands.{}.gff3'.format(args.ref_genome)), - os.path.abspath('orthoExons.{}.gff3'.format(args.ref_genome))] + cmd = [ + "cat", + os.path.abspath("{}.sampled_GFs.gff".format(args.ref_genome)), + os.path.abspath("exonCands.{}.gff3".format(args.ref_genome)), + os.path.abspath("orthoExons.{}.gff3".format(args.ref_genome)), + ] combined_file = tools.fileOps.get_tmp_toil_file() tools.procOps.run_proc(cmd, stdout=combined_file) return job.fileStore.writeGlobalFile(combined_file) else: stdout_file_id = job.fileStore.writeGlobalFile(stdout) - return {genome: job.fileStore.writeGlobalFile(genome + '.cgp.gff') for genome in args.genomes}, stdout_file_id + return {genome: job.fileStore.writeGlobalFile(genome + ".cgp.gff") for genome in args.genomes}, stdout_file_id def merge_results(job, results, input_file_ids): @@ -278,7 +320,7 @@ def merge_results(job, results, input_file_ids): gff_chunks_by_genome[genome][(chrom, start, chunksize)] = gff_file_id results = {} for genome in gff_chunks_by_genome: - j = job.addChildJobFn(join_genes, gff_chunks_by_genome[genome], memory='8G', disk='8G') + j = job.addChildJobFn(join_genes, gff_chunks_by_genome[genome], memory="8G", disk="8G") results[genome] = j.rv() return results, stdout_file_ids, input_file_ids.cgp_param @@ -295,45 +337,51 @@ def join_genes(job, gff_chunks): raw_gtf_fofn = tools.fileOps.get_tmp_toil_file() useful_lines = 0 files = [] - with open(raw_gtf_file, 'w') as raw_handle, open(raw_gtf_fofn, 'w') as fofn_handle: + with open(raw_gtf_file, "w") as raw_handle, open(raw_gtf_fofn, "w") as fofn_handle: for (chrom, start, chunksize), chunk in gff_chunks.items(): local_path = job.fileStore.readGlobalFile(chunk) - raw_handle.write('## BEGIN CHUNK chrom: {} start: {} chunksize: {}\n'.format(chrom, start, chunksize)) + raw_handle.write("## BEGIN CHUNK chrom: {} start: {} chunksize: {}\n".format(chrom, start, chunksize)) for line in open(local_path): - if not line.startswith('#'): + if not 
line.startswith("#"): useful_lines += 1 raw_handle.write(line) - if os.environ.get('CAT_BINARY_MODE') == 'singularity': + if os.environ.get("CAT_BINARY_MODE") == "singularity": local_path = tools.procOps.singularify_arg(local_path) files.append(local_path) else: files.append(os.path.basename(local_path)) - fofn_handle.write(local_path + '\n') + fofn_handle.write(local_path + "\n") # make sure CGP didn't fail entirely if useful_lines == 0: - raise Exception('After running AugustusCGP, no gene predictions were made. Did you set `--augustus-species` ' - 'to a species with a trained model similar to your reference species? Please consult the ' - 'AUGUSTUS manual for more about the species flag.') + raise Exception( + "After running AugustusCGP, no gene predictions were made. Did you set `--augustus-species` " + "to a species with a trained model similar to your reference species? Please consult the " + "AUGUSTUS manual for more about the species flag." + ) join_genes_file = tools.fileOps.get_tmp_toil_file() join_genes_gp = tools.fileOps.get_tmp_toil_file() # TODO: figure out why this fails on certain filesystems try: - cmd = [['joingenes', '-f', raw_gtf_fofn, '-o', '/dev/stdout'], - ['grep', '-P', '\tAUGUSTUS\t(exon|CDS|start_codon|stop_codon|tts|tss)\t'], - ['sed', ' s/jg/augCGP-/g']] + cmd = [ + ["joingenes", "-f", raw_gtf_fofn, "-o", "/dev/stdout"], + ["grep", "-P", "\tAUGUSTUS\t(exon|CDS|start_codon|stop_codon|tts|tss)\t"], + ["sed", " s/jg/augCGP-/g"], + ] tools.procOps.run_proc(cmd, stdout=join_genes_file) except: - cmd = [['joingenes', '-g', ','.join(files), '-o', '/dev/stdout'], - ['grep', '-P', '\tAUGUSTUS\t(exon|CDS|start_codon|stop_codon|tts|tss)\t'], - ['sed', ' s/jg/augCGP-/g']] + cmd = [ + ["joingenes", "-g", ",".join(files), "-o", "/dev/stdout"], + ["grep", "-P", "\tAUGUSTUS\t(exon|CDS|start_codon|stop_codon|tts|tss)\t"], + ["sed", " s/jg/augCGP-/g"], + ] tools.procOps.run_proc(cmd, stdout=join_genes_file) # passing the joingenes output through gtfToGenePred then genePredToGtf fixes the sort order for homGeneMapping - cmd = ['gtfToGenePred', '-genePredExt', join_genes_file, join_genes_gp] + cmd = ["gtfToGenePred", "-genePredExt", join_genes_file, join_genes_gp] tools.procOps.run_proc(cmd) - cmd = ['genePredToGtf', 'file', join_genes_gp, '-utr', '-honorCdsStat', '-source=augustusCGP', join_genes_file] + cmd = ["genePredToGtf", "file", join_genes_gp, "-utr", "-honorCdsStat", "-source=augustusCGP", join_genes_file] tools.procOps.run_proc(cmd) joined_gtf_file_id = job.fileStore.writeGlobalFile(join_genes_file) @@ -347,12 +395,12 @@ def join_genes(job, gff_chunks): ### -def write_tree(job,input_file_ids): +def write_tree(job, input_file_ids): """ writes a file with the phylogenetic tree in NEWICK format """ hal = job.fileStore.readGlobalFile(input_file_ids.hal) - cmd = ['halStats', '--tree', hal] + cmd = ["halStats", "--tree", hal] tree = tools.fileOps.get_tmp_toil_file() tools.procOps.run_proc(cmd, stdout=tree) return job.fileStore.writeGlobalFile(tree) @@ -371,12 +419,10 @@ def write_genome_fofn(job, fasta_file_ids): These files are loaded from the fileStore """ genome_fofn = tools.fileOps.get_tmp_toil_file() - with open(genome_fofn, 'w') as outf: + with open(genome_fofn, "w") as outf: for genome, file_id in fasta_file_ids.items(): local_path = job.fileStore.readGlobalFile(file_id) - if os.environ.get('CAT_BINARY_MODE') == 'singularity': + if os.environ.get("CAT_BINARY_MODE") == "singularity": local_path = tools.procOps.singularify_arg(local_path) tools.fileOps.print_row(outf, 
[genome, local_path]) return genome_fofn - - diff --git a/cat/augustus_pb.py b/cat/augustus_pb.py index 00ca690f..71cb7bc4 100644 --- a/cat/augustus_pb.py +++ b/cat/augustus_pb.py @@ -39,17 +39,17 @@ def augustus_pb(args, toil_options): if not t.options.restart: input_file_ids = argparse.Namespace() input_file_ids.genome_fasta = tools.toilInterface.write_fasta_to_filestore(t, args.genome_fasta) - input_file_ids.chrom_sizes = FileID.forPath(t.importFile('file://' + args.chrom_sizes), args.chrom_sizes) - input_file_ids.pb_cfg = FileID.forPath(t.importFile('file://' + args.pb_cfg), args.pb_cfg) - input_file_ids.hints_gff = FileID.forPath(t.importFile('file://' + args.hints_gff), args.hints_gff) - job = Job.wrapJobFn(setup, args, input_file_ids, memory='16G', disk='32G') + input_file_ids.chrom_sizes = FileID.forPath(t.importFile("file://" + args.chrom_sizes), args.chrom_sizes) + input_file_ids.pb_cfg = FileID.forPath(t.importFile("file://" + args.pb_cfg), args.pb_cfg) + input_file_ids.hints_gff = FileID.forPath(t.importFile("file://" + args.hints_gff), args.hints_gff) + job = Job.wrapJobFn(setup, args, input_file_ids, memory="16G", disk="32G") raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.start(job) else: raw_gtf_file_id, gtf_file_id, joined_gp_file_id = t.restart() tools.fileOps.ensure_file_dir(args.augustus_pb_raw_gtf) - t.exportFile(raw_gtf_file_id, 'file://' + args.augustus_pb_raw_gtf) - t.exportFile(gtf_file_id, 'file://' + args.augustus_pb_gtf) - t.exportFile(joined_gp_file_id, 'file://' + args.augustus_pb_gp) + t.exportFile(raw_gtf_file_id, "file://" + args.augustus_pb_raw_gtf) + t.exportFile(gtf_file_id, "file://" + args.augustus_pb_gtf) + t.exportFile(joined_gp_file_id, "file://" + args.augustus_pb_gp) def setup(job, args, input_file_ids): @@ -57,15 +57,16 @@ def setup(job, args, input_file_ids): Entry function for running AugustusPB. The genome is chunked up and the resulting gene sets merged using joingenes. """ - genome_fasta = tools.toilInterface.load_fasta_from_filestore(job, input_file_ids.genome_fasta, - prefix='genome', upper=False) + genome_fasta = tools.toilInterface.load_fasta_from_filestore( + job, input_file_ids.genome_fasta, prefix="genome", upper=False + ) # load only PB hints hints_file = job.fileStore.readGlobalFile(input_file_ids.hints_gff) - hints = [x.split('\t') for x in open(hints_file) if 'src=PB' in x] + hints = [x.split("\t") for x in open(hints_file) if "src=PB" in x] if len(hints) == 0: - raise RuntimeError('No PB hints found.') + raise RuntimeError("No PB hints found.") # convert the start/stops to ints # break up by chromosome @@ -77,7 +78,7 @@ def setup(job, args, input_file_ids): # calculate overlapping intervals. 
If the final interval is small (<= 50% of total interval size), merge it intervals = collections.defaultdict(list) - for chrom in genome_fasta: + for chrom in genome_fasta.keys(): chrom_size = len(genome_fasta[chrom]) for start in range(0, chrom_size, args.chunksize - args.overlap): stop = min(start + args.chunksize, chrom_size) @@ -98,16 +99,17 @@ def setup(job, args, input_file_ids): if len(hints) == 0: continue # no reason to compute an empty chunk tmp_hints = tools.fileOps.get_tmp_toil_file() - with open(tmp_hints, 'w') as outf: + with open(tmp_hints, "w") as outf: for h in hints: tools.fileOps.print_row(outf, h) hints_file_id = job.fileStore.writeGlobalFile(tmp_hints) - j = job.addChildJobFn(augustus_pb_chunk, args, input_file_ids, hints_file_id, chrom, start, stop, - memory='8G', disk='8G') + j = job.addChildJobFn( + augustus_pb_chunk, args, input_file_ids, hints_file_id, chrom, start, stop, memory="8G", disk="8G" + ) predictions.append(j.rv()) # results contains a 3 member tuple of [raw_gtf_file_id, gtf_file_id, joined_gp_file_id] - results = job.addFollowOnJobFn(join_genes, predictions, memory='8G', disk='8G').rv() + results = job.addFollowOnJobFn(join_genes, predictions, memory="8G", disk="8G").rv() return results @@ -115,22 +117,29 @@ def augustus_pb_chunk(job, args, input_file_ids, hints_file_id, chrom, start, st """ core function that runs AugustusPB on one genome chunk """ - genome_fasta = tools.toilInterface.load_fasta_from_filestore(job, input_file_ids.genome_fasta, - prefix='genome', upper=False) + genome_fasta = tools.toilInterface.load_fasta_from_filestore( + job, input_file_ids.genome_fasta, prefix="genome", upper=False + ) hints = job.fileStore.readGlobalFile(hints_file_id) pb_cfg = job.fileStore.readGlobalFile(input_file_ids.pb_cfg) tmp_fasta = tools.fileOps.get_tmp_toil_file() tools.bio.write_fasta(tmp_fasta, chrom, genome_fasta[chrom][start:stop]) results = tools.fileOps.get_tmp_toil_file() - cmd = ['augustus', '--softmasking=1', '--allow_hinted_splicesites=atac', - '--alternatives-from-evidence=1', '--UTR={}'.format(int(args.utr)), - '--hintsfile={}'.format(hints), - '--extrinsicCfgFile={}'.format(pb_cfg), - '--species={}'.format(args.species), - '--/augustus/verbosity=0', - '--predictionStart=-{}'.format(start), '--predictionEnd=-{}'.format(start), - tmp_fasta] + cmd = [ + "augustus", + "--softmasking=1", + "--allow_hinted_splicesites=atac", + "--alternatives-from-evidence=1", + "--UTR={}".format(int(args.utr)), + "--hintsfile={}".format(hints), + "--extrinsicCfgFile={}".format(pb_cfg), + "--species={}".format(args.species), + "--/augustus/verbosity=0", + "--predictionStart=-{}".format(start), + "--predictionEnd=-{}".format(start), + tmp_fasta, + ] tools.procOps.run_proc(cmd, stdout=results) return job.fileStore.writeGlobalFile(results) @@ -146,36 +155,40 @@ def join_genes(job, gff_chunks): raw_gtf_file = tools.fileOps.get_tmp_toil_file() raw_gtf_fofn = tools.fileOps.get_tmp_toil_file() files = [] - with open(raw_gtf_file, 'w') as raw_handle, open(raw_gtf_fofn, 'w') as fofn_handle: + with open(raw_gtf_file, "w") as raw_handle, open(raw_gtf_fofn, "w") as fofn_handle: for chunk in gff_chunks: local_path = job.fileStore.readGlobalFile(chunk) for line in open(local_path): raw_handle.write(line) - if os.environ.get('CAT_BINARY_MODE') == 'singularity': + if os.environ.get("CAT_BINARY_MODE") == "singularity": local_path = tools.procOps.singularify_arg(local_path) files.append(local_path) else: files.append(os.path.basename(local_path)) - fofn_handle.write(local_path + 
'\n') + fofn_handle.write(local_path + "\n") join_genes_file = tools.fileOps.get_tmp_toil_file() join_genes_gp = tools.fileOps.get_tmp_toil_file() # TODO: figure out why this fails on certain filesystems try: - cmd = [['joingenes', '-f', raw_gtf_fofn, '-o', '/dev/stdout'], - ['grep', '-P', '\tAUGUSTUS\t(exon|CDS|start_codon|stop_codon|tts|tss)\t'], - ['sed', ' s/jg/augPB-/g']] + cmd = [ + ["joingenes", "-f", raw_gtf_fofn, "-o", "/dev/stdout"], + ["grep", "-P", "\tAUGUSTUS\t(exon|CDS|start_codon|stop_codon|tts|tss)\t"], + ["sed", " s/jg/augPB-/g"], + ] tools.procOps.run_proc(cmd, stdout=join_genes_file) except: - cmd = [['joingenes', '-g', ','.join(files), '-o', '/dev/stdout'], - ['grep', '-P', '\tAUGUSTUS\t(exon|CDS|start_codon|stop_codon|tts|tss)\t'], - ['sed', ' s/jg/augPB-/g']] + cmd = [ + ["joingenes", "-g", ",".join(files), "-o", "/dev/stdout"], + ["grep", "-P", "\tAUGUSTUS\t(exon|CDS|start_codon|stop_codon|tts|tss)\t"], + ["sed", " s/jg/augPB-/g"], + ] tools.procOps.run_proc(cmd, stdout=join_genes_file) # passing the joingenes output through gtfToGenePred then genePredToGtf fixes the sort order for homGeneMapping - cmd = ['gtfToGenePred', '-genePredExt', join_genes_file, join_genes_gp] + cmd = ["gtfToGenePred", "-genePredExt", join_genes_file, join_genes_gp] tools.procOps.run_proc(cmd) - cmd = ['genePredToGtf', 'file', join_genes_gp, '-utr', '-honorCdsStat', '-source=augustusPB', join_genes_file] + cmd = ["genePredToGtf", "file", join_genes_gp, "-utr", "-honorCdsStat", "-source=augustusPB", join_genes_file] tools.procOps.run_proc(cmd) joined_gtf_file_id = job.fileStore.writeGlobalFile(join_genes_file) diff --git a/cat/chaining.py b/cat/chaining.py index e9a00ff3..0404a29d 100644 --- a/cat/chaining.py +++ b/cat/chaining.py @@ -24,12 +24,14 @@ def chaining(args, toil_options): with Toil(toil_options) as t: if not t.options.restart: input_file_ids = argparse.Namespace() - input_file_ids.hal = FileID.forPath(t.importFile('file://' + args.hal), args.hal) - input_file_ids.query_sizes = FileID.forPath(t.importFile('file://' + args.query_sizes), args.query_sizes) - input_file_ids.query_two_bit = FileID.forPath(t.importFile('file://' + args.query_two_bit), - args.query_two_bit) - target_two_bit_file_ids = {genome: FileID.forPath(t.importFile('file://' + f), f) - for genome, f in args.target_two_bits.items()} + input_file_ids.hal = FileID.forPath(t.importFile("file://" + args.hal), args.hal) + input_file_ids.query_sizes = FileID.forPath(t.importFile("file://" + args.query_sizes), args.query_sizes) + input_file_ids.query_two_bit = FileID.forPath( + t.importFile("file://" + args.query_two_bit), args.query_two_bit + ) + target_two_bit_file_ids = { + genome: FileID.forPath(t.importFile("file://" + f), f) for genome, f in args.target_two_bits.items() + } input_file_ids.target_two_bits = target_two_bit_file_ids job = Job.wrapJobFn(setup, args, input_file_ids) chain_file_ids = t.start(job) @@ -37,7 +39,7 @@ def chaining(args, toil_options): chain_file_ids = t.restart() for chain_file, chain_file_id in chain_file_ids.items(): tools.fileOps.ensure_file_dir(chain_file) - t.exportFile(chain_file_id, 'file://' + chain_file) + t.exportFile(chain_file_id, "file://" + chain_file) def setup(job, args, input_file_ids): @@ -53,20 +55,30 @@ def setup(job, args, input_file_ids): chrom, size = l.split() size = int(size) for target_genome, target_two_bit_file_id in input_file_ids.target_two_bits.items(): - disk_usage = tools.toilInterface.find_total_disk_usage([input_file_ids.hal, target_two_bit_file_id, - 
input_file_ids.query_two_bit]) + disk_usage = tools.toilInterface.find_total_disk_usage( + [input_file_ids.hal, target_two_bit_file_id, input_file_ids.query_two_bit] + ) # silly heuristic for chaining -- if the chrom is over 10mb, use 32G, otherwise use 8G if size >= 10000000: - memory = '32G' + memory = "32G" else: - memory = '8G' - j = job.addChildJobFn(chain_by_chromosome, args, chrom, size, input_file_ids, target_genome, - target_two_bit_file_id, memory=memory, disk=disk_usage) + memory = "8G" + j = job.addChildJobFn( + chain_by_chromosome, + args, + chrom, + size, + input_file_ids, + target_genome, + target_two_bit_file_id, + memory=memory, + disk=disk_usage, + ) tmp_chain_file_ids[target_genome].append(j.rv()) return_file_ids = {} for genome, chain_file in args.chain_files.items(): chain_files = tmp_chain_file_ids[genome] - j = job.addFollowOnJobFn(merge, chain_files, genome, memory='8G', disk='8G') + j = job.addFollowOnJobFn(merge, chain_files, genome, memory="8G", disk="8G") return_file_ids[chain_file] = j.rv() return return_file_ids @@ -82,10 +94,9 @@ def chain_by_chromosome(job, args, chrom, size, input_file_ids, target_genome, t :param target_two_bit_file_id: the file ID for the twobit file for target_genome :return: chain file for this chromosome """ - job.fileStore.logToMaster('Beginning to chain chromosome {}-{}'.format(target_genome, chrom), - level=logging.INFO) + job.fileStore.logToMaster("Beginning to chain chromosome {}-{}".format(target_genome, chrom), level=logging.INFO) bed_path = tools.fileOps.get_tmp_toil_file() - with open(bed_path, 'w') as outf: + with open(bed_path, "w") as outf: tools.fileOps.print_row(outf, [chrom, 0, size]) chain = tools.fileOps.get_tmp_toil_file() # load files from jobStore @@ -93,9 +104,11 @@ def chain_by_chromosome(job, args, chrom, size, input_file_ids, target_genome, t target_two_bit = job.fileStore.readGlobalFile(target_two_bit_file_id) query_two_bit = job.fileStore.readGlobalFile(input_file_ids.query_two_bit) # execute liftover - cmd = [['halLiftover', '--outPSL', hal, args.ref_genome, bed_path, target_genome, '/dev/stdout'], - ['pslPosTarget', '/dev/stdin', '/dev/stdout'], - ['axtChain', '-psl', '-verbose=0', '-linearGap=medium', '/dev/stdin', target_two_bit, query_two_bit, chain]] + cmd = [ + ["halLiftover", "--outPSL", hal, args.ref_genome, bed_path, target_genome, "/dev/stdout"], + ["pslPosTarget", "/dev/stdin", "/dev/stdout"], + ["axtChain", "-psl", "-verbose=0", "-linearGap=medium", "/dev/stdin", target_two_bit, query_two_bit, chain], + ] tools.procOps.run_proc(cmd) return job.fileStore.writeGlobalFile(chain) @@ -107,15 +120,15 @@ def merge(job, chain_files, genome): :param genome: genome being combined :return: """ - job.fileStore.logToMaster('Merging chains for {}'.format(genome), level=logging.INFO) + job.fileStore.logToMaster("Merging chains for {}".format(genome), level=logging.INFO) fofn = tools.fileOps.get_tmp_toil_file() - with open(fofn, 'w') as outf: + with open(fofn, "w") as outf: for i, file_id in enumerate(chain_files): - local_path = job.fileStore.readGlobalFile(file_id, userPath='{}.chain'.format(i)) - if os.environ.get('CAT_BINARY_MODE') == 'singularity': + local_path = job.fileStore.readGlobalFile(file_id, userPath="{}.chain".format(i)) + if os.environ.get("CAT_BINARY_MODE") == "singularity": local_path = tools.procOps.singularify_arg(local_path) - outf.write(local_path + '\n') - cmd = ['chainMergeSort', '-inputList={}'.format(fofn), '-tempDir={}/'.format(job.fileStore.getLocalTempDir())] + outf.write(local_path + 
"\n") + cmd = ["chainMergeSort", "-inputList={}".format(fofn), "-tempDir={}/".format(job.fileStore.getLocalTempDir())] tmp_chain_file = tools.fileOps.get_tmp_toil_file() tools.procOps.run_proc(cmd, stdout=tmp_chain_file) tmp_chain_file_id = job.fileStore.writeGlobalFile(tmp_chain_file) diff --git a/cat/classify.py b/cat/classify.py index 3bc003c2..102c46d5 100644 --- a/cat/classify.py +++ b/cat/classify.py @@ -75,14 +75,14 @@ def classify(eval_args): # results stores the final dataframes results = {} for tx_mode, path_dict in eval_args.transcript_modes.items(): - tx_dict = tools.transcripts.get_gene_pred_dict(path_dict['gp']) - aln_modes = ['CDS', 'mRNA'] if tx_mode != 'augCGP' else ['CDS'] + tx_dict = tools.transcripts.get_gene_pred_dict(path_dict["gp"]) + aln_modes = ["CDS", "mRNA"] if tx_mode != "augCGP" else ["CDS"] for aln_mode in aln_modes: psl_iter = list(tools.psl.psl_iterator(path_dict[aln_mode])) mc_df = metrics_classify(aln_mode, ref_tx_dict, tx_dict, tx_biotype_map, psl_iter, seq_dict) ec_df = evaluation_classify(aln_mode, ref_tx_dict, tx_dict, tx_biotype_map, psl_iter, seq_dict) - results[tools.sqlInterface.tables[aln_mode][tx_mode]['metrics'].__tablename__] = mc_df - results[tools.sqlInterface.tables[aln_mode][tx_mode]['evaluation'].__tablename__] = ec_df + results[tools.sqlInterface.tables[aln_mode][tx_mode]["metrics"].__tablename__] = mc_df + results[tools.sqlInterface.tables[aln_mode][tx_mode]["evaluation"].__tablename__] = ec_df return results @@ -95,20 +95,20 @@ def metrics_classify(aln_mode, ref_tx_dict, tx_dict, tx_biotype_map, psl_iter, s for ref_tx, tx, psl, biotype in tx_iter(psl_iter, ref_tx_dict, tx_dict, tx_biotype_map): original_intron_vector = calculate_original_intron_vector(ref_tx, tx, psl, aln_mode) adj_start, adj_stop = find_adj_start_stop(tx, seq_dict) - r.append([ref_tx.name2, ref_tx.name, tx.name, 'AlnCoverage', 100 * psl.target_coverage]) - r.append([ref_tx.name2, ref_tx.name, tx.name, 'AlnIdentity', 100 * psl.identity]) - r.append([ref_tx.name2, ref_tx.name, tx.name, 'AlnGoodness', 100 * (1 - psl.badness)]) - r.append([ref_tx.name2, ref_tx.name, tx.name, 'PercentUnknownBases', psl.percent_n]) - r.append([ref_tx.name2, ref_tx.name, tx.name, 'OriginalIntrons', original_intron_vector]) - r.append([ref_tx.name2, ref_tx.name, tx.name, 'ValidStart', tools.transcripts.has_start_codon(seq_dict, tx)]) - r.append([ref_tx.name2, ref_tx.name, tx.name, 'ValidStop', tools.transcripts.has_stop_codon(seq_dict, tx)]) - r.append([ref_tx.name2, ref_tx.name, tx.name, 'ProperOrf', tx.cds_size % 3 == 0]) - r.append([ref_tx.name2, ref_tx.name, tx.name, 'AdjStart', adj_start]) - r.append([ref_tx.name2, ref_tx.name, tx.name, 'AdjStop', adj_stop]) - columns = ['GeneId', 'TranscriptId', 'AlignmentId', 'classifier', 'value'] + r.append([ref_tx.name2, ref_tx.name, tx.name, "AlnCoverage", 100 * psl.target_coverage]) + r.append([ref_tx.name2, ref_tx.name, tx.name, "AlnIdentity", 100 * psl.identity]) + r.append([ref_tx.name2, ref_tx.name, tx.name, "AlnGoodness", 100 * (1 - psl.badness)]) + r.append([ref_tx.name2, ref_tx.name, tx.name, "PercentUnknownBases", psl.percent_n]) + r.append([ref_tx.name2, ref_tx.name, tx.name, "OriginalIntrons", original_intron_vector]) + r.append([ref_tx.name2, ref_tx.name, tx.name, "ValidStart", tools.transcripts.has_start_codon(seq_dict, tx)]) + r.append([ref_tx.name2, ref_tx.name, tx.name, "ValidStop", tools.transcripts.has_stop_codon(seq_dict, tx)]) + r.append([ref_tx.name2, ref_tx.name, tx.name, "ProperOrf", tx.cds_size % 3 == 0]) + 
r.append([ref_tx.name2, ref_tx.name, tx.name, "AdjStart", adj_start]) + r.append([ref_tx.name2, ref_tx.name, tx.name, "AdjStop", adj_stop]) + columns = ["GeneId", "TranscriptId", "AlignmentId", "classifier", "value"] df = pd.DataFrame(r, columns=columns) df = df.sort_values(columns) - df = df.set_index('AlignmentId') + df = df.set_index("AlignmentId") assert len(r) == len(df) return df @@ -121,15 +121,28 @@ def evaluation_classify(aln_mode, ref_tx_dict, tx_dict, tx_biotype_map, psl_iter r = [] for ref_tx, tx, psl, biotype in tx_iter(psl_iter, ref_tx_dict, tx_dict, tx_biotype_map): r.extend(find_indels(tx, psl, aln_mode)) - if biotype == 'protein_coding': + if biotype == "protein_coding": line = in_frame_stop(tx, seq_dict) if line is not None: r.append(line) - columns = ['AlignmentId', 'chromosome', 'start', 'stop', 'name', 'score', 'strand', 'thickStart', - 'thickStop', 'rgb', 'blockCount', 'blockSizes', 'blockStarts'] + columns = [ + "AlignmentId", + "chromosome", + "start", + "stop", + "name", + "score", + "strand", + "thickStart", + "thickStop", + "rgb", + "blockCount", + "blockSizes", + "blockStarts", + ] df = pd.DataFrame(r, columns=columns) df = df.sort_values(columns) - df = df.set_index('AlignmentId') + df = df.set_index("AlignmentId") assert len(r) == len(df) return df @@ -171,7 +184,7 @@ def calculate_original_intron_vector(ref_tx, tx, psl, aln_mode): # if we lost all introns due to CDS filtering, return a vector of all 0s if len(tgt_introns) == 0: - return ','.join(['0'] * len(ref_tx.intron_intervals)) + return ",".join(["0"] * len(ref_tx.intron_intervals)) # count the number of introns within wiggle distance of each other intron_vector = [] @@ -181,7 +194,7 @@ def calculate_original_intron_vector(ref_tx, tx, psl, aln_mode): intron_vector.append(1) else: intron_vector.append(0) - return ','.join(map(str, intron_vector)) + return ",".join(map(str, intron_vector)) def in_frame_stop(tx, fasta): @@ -189,12 +202,12 @@ def in_frame_stop(tx, fasta): Finds the first in frame stop of this transcript, if there are any :param tx: Target GenePredTranscript object - :param fasta: pyfasta Fasta object mapping the genome fasta for this analysis + :param fasta: pyfaidx Fasta object mapping the genome fasta for this analysis :return: A BED string if an in frame stop was found otherwise None """ for start_pos, stop_pos, codon in tx.codon_iterator(fasta): - if tools.bio.translate_sequence(codon) == '*': - bed = tx.get_bed(new_start=start_pos, new_stop=stop_pos, rgb='135,78,191', name='InFrameStop') + if tools.bio.translate_sequence(codon) == "*": + bed = tx.get_bed(new_start=start_pos, new_stop=stop_pos, rgb="135,78,191", name="InFrameStop") return [tx.name] + bed @@ -207,12 +220,12 @@ def find_adj_start_stop(tx, fasta): """ Finds the adjusted start/stop positions that define the ORF of this transcript, dealing with in-frame stops :param tx: Target GenePredTranscript object - :param fasta: pyfasta Fasta object mapping the genome fasta for this analysis + :param fasta: pyfaidx Fasta object mapping the genome fasta for this analysis :return: two integers for start/stop in genomic coordinates """ for start_pos, stop_pos, codon in tx.codon_iterator(fasta): - if tools.bio.translate_sequence(codon) == '*': - if tx.strand == '-': + if tools.bio.translate_sequence(codon) == "*": + if tx.strand == "-": start = start_pos stop = tx.thick_stop else: @@ -242,6 +255,7 @@ def find_indels(tx, psl, aln_mode): :param aln_mode: One of ('CDS', 'mRNA'). Determines if we aligned CDS or mRNA. 
:return: list of bed12-format lists """ + def convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, strand): """convert alignment coordinates to target chromosome coordinates, inverting if negative strand""" left_chrom_pos = coordinate_fn(left_pos) @@ -249,36 +263,38 @@ def convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, strand right_chrom_pos = coordinate_fn(right_pos) if right_chrom_pos is None: right_chrom_pos = coordinate_fn(right_pos - 1) - if strand == '-': + if strand == "-": left_chrom_pos += 1 else: left_chrom_pos -= 1 assert right_chrom_pos is not None - if strand == '-': + if strand == "-": left_chrom_pos, right_chrom_pos = right_chrom_pos, left_chrom_pos assert right_chrom_pos >= left_chrom_pos return left_chrom_pos, right_chrom_pos def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type): """Converts either an insertion or a deletion into a output transcript""" - left_chrom_pos, right_chrom_pos = convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, - tx.strand) + left_chrom_pos, right_chrom_pos = convert_coordinates_to_chromosome( + left_pos, right_pos, coordinate_fn, tx.strand + ) if left_chrom_pos is None or right_chrom_pos is None: - assert aln_mode == 'CDS' + assert aln_mode == "CDS" return None if left_chrom_pos > tx.thick_start and right_chrom_pos < tx.thick_stop: - indel_type = 'CodingMult3' if offset % 3 == 0 else 'Coding' + indel_type = "CodingMult3" if offset % 3 == 0 else "Coding" else: - indel_type = 'NonCoding' + indel_type = "NonCoding" - new_bed = tx.get_bed(new_start=left_chrom_pos, new_stop=right_chrom_pos, rgb=offset, - name=''.join([indel_type, gap_type])) + new_bed = tx.get_bed( + new_start=left_chrom_pos, new_stop=right_chrom_pos, rgb=offset, name="".join([indel_type, gap_type]) + ) return [tx.name] + new_bed # depending on mode, we convert the coordinates from either CDS or mRNA # we also have a different position cutoff to make sure we are not evaluating terminal gaps - if aln_mode == 'CDS': + if aln_mode == "CDS": coordinate_fn = tx.cds_coordinate_to_chromosome else: coordinate_fn = tx.mrna_coordinate_to_chromosome @@ -293,16 +309,16 @@ def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type): for block_size, q_start, t_start in zip(*[psl.block_sizes, psl.q_starts[1:], psl.t_starts[1:]]): q_offset = q_start - block_size - q_pos t_offset = t_start - block_size - t_pos - assert (q_offset >= 0 and t_offset >= 0) + assert q_offset >= 0 and t_offset >= 0 if q_offset != 0: # query insertion -> insertion in target sequence left_pos = q_start - q_offset right_pos = q_start - row = parse_indel(left_pos, right_pos, coordinate_fn, tx, q_offset, 'Insertion') + row = parse_indel(left_pos, right_pos, coordinate_fn, tx, q_offset, "Insertion") if row is not None: r.append(row) if t_offset != 0: # target insertion -> insertion in reference sequence left_pos = right_pos = q_start - row = parse_indel(left_pos, right_pos, coordinate_fn, tx, t_offset, 'Deletion') + row = parse_indel(left_pos, right_pos, coordinate_fn, tx, t_offset, "Deletion") if row is not None: r.append(row) q_pos = q_start @@ -337,7 +353,7 @@ def convert_cds_frames(ref_tx, tx, aln_mode): :param aln_mode: If we are in CDS mode, we need to convert the transcripts to a CDS-framed object. 
:return: tuple of GenePredTranscript objects (ref_tx, tx) """ - if aln_mode == 'CDS': + if aln_mode == "CDS": if ref_tx.offset != 0: ref_tx = convert_cds_frame(ref_tx) if tx.offset != 0: @@ -355,7 +371,7 @@ def convert_cds_frame(tx): """ offset = tx.offset mod3 = (tx.cds_size - offset) % 3 - if tx.strand == '+': + if tx.strand == "+": b = tx.get_bed(new_start=tx.thick_start + offset, new_stop=tx.thick_stop - mod3) else: b = tx.get_bed(new_start=tx.thick_start + mod3, new_stop=tx.thick_stop - offset) @@ -370,7 +386,7 @@ def get_intron_coordinates(tx, aln_mode): :param aln_mode: One of ('CDS', 'mRNA'). Used to determine if we aligned in CDS space or mRNA space :return: list of integers """ - if aln_mode == 'CDS': + if aln_mode == "CDS": tx = convert_cds_frame(tx) introns = [tx.chromosome_coordinate_to_cds(tx.start + x) for x in tx.block_starts[1:]] else: @@ -390,14 +406,14 @@ def get_exon_intervals(tx, aln_mode): :param aln_mode: One of ('CDS', 'mRNA'). Used to determine if we aligned in CDS space or mRNA space :return: dict of ChromosomeInterval objects {reference:converted} """ - if aln_mode == 'CDS': + if aln_mode == "CDS": tx = convert_cds_frame(tx) exons = {} for exon in tx.exon_intervals: start = tx.chromosome_coordinate_to_mrna(exon.start) stop = tx.chromosome_coordinate_to_mrna(exon.stop - 1) # zero based, half open - if tx.strand == '-': + if tx.strand == "-": start, stop = stop, start - i = tools.intervals.ChromosomeInterval(None, start, stop + 1, '.') + i = tools.intervals.ChromosomeInterval(None, start, stop + 1, ".") exons[exon] = i return exons diff --git a/cat/consensus.py b/cat/consensus.py index 48dd7beb..39b885db 100644 --- a/cat/consensus.py +++ b/cat/consensus.py @@ -37,9 +37,9 @@ import tools.procOps from tools.defaultOrderedDict import DefaultOrderedDict -logger = logging.getLogger('cat') +logger = logging.getLogger("cat") -id_template = '{genome:.10}_{tag_type}{unique_id:07d}' +id_template = "{genome:.10}_{tag_type}{unique_id:07d}" def generate_consensus(args): @@ -53,112 +53,154 @@ def generate_consensus(args): # load reference annotation information ref_df = tools.sqlInterface.load_annotation(args.ref_db_path) ref_biotype_counts = collections.Counter(ref_df.TranscriptBiotype) - coding_count = ref_biotype_counts['protein_coding'] - non_coding_count = sum(y for x, y in ref_biotype_counts.items() if x != 'protein_coding') + coding_count = ref_biotype_counts["protein_coding"] + non_coding_count = sum(y for x, y in ref_biotype_counts.items() if x != "protein_coding") # gene transcript map to iterate over so that we capture missing gene information gene_biotype_map = tools.sqlInterface.get_gene_biotype_map(args.ref_db_path) transcript_biotype_map = tools.sqlInterface.get_transcript_biotype_map(args.ref_db_path) # load transMap evaluation data tm_eval_df = load_transmap_evals(args.db_path) # load the homGeneMapping data for transMap/augTM/augTMR - tx_modes = [x for x in args.tx_modes if x in ['transMap', 'augTM', 'augTMR']] + tx_modes = [x for x in args.tx_modes if x in ["transMap", "augTM", "augTMR"]] hgm_df = pd.concat([load_hgm_vectors(args.db_path, tx_mode) for tx_mode in tx_modes]) # load the alignment metrics data - mrna_metrics_df = pd.concat([load_metrics_from_db(args.db_path, tx_mode, 'mRNA') for tx_mode in tx_modes]) - cds_metrics_df = pd.concat([load_metrics_from_db(args.db_path, tx_mode, 'CDS') for tx_mode in tx_modes]) + mrna_metrics_df = pd.concat([load_metrics_from_db(args.db_path, tx_mode, "mRNA") for tx_mode in tx_modes]) + cds_metrics_df = 
pd.concat([load_metrics_from_db(args.db_path, tx_mode, "CDS") for tx_mode in tx_modes]) eval_df = pd.concat([load_evaluations_from_db(args.db_path, tx_mode) for tx_mode in tx_modes]).reset_index() - coding_df, non_coding_df = combine_and_filter_dfs(tx_dict, hgm_df, mrna_metrics_df, cds_metrics_df, tm_eval_df, - ref_df, eval_df, args.intron_rnaseq_support, - args.exon_rnaseq_support, args.intron_annot_support, - args.exon_annot_support, args.original_intron_support, - args.in_species_rna_support_only) + coding_df, non_coding_df = combine_and_filter_dfs( + tx_dict, + hgm_df, + mrna_metrics_df, + cds_metrics_df, + tm_eval_df, + ref_df, + eval_df, + args.intron_rnaseq_support, + args.exon_rnaseq_support, + args.intron_annot_support, + args.exon_annot_support, + args.original_intron_support, + args.in_species_rna_support_only, + ) if len(coding_df) + len(non_coding_df) == 0: - raise RuntimeError('No transcripts pass filtering for species {}. ' - 'Consider lowering requirements. Please see the manual.'.format(args.genome)) + raise RuntimeError( + "No transcripts pass filtering for species {}. " + "Consider lowering requirements. Please see the manual.".format(args.genome) + ) elif len(coding_df) == 0 and coding_count > 0: - logger.warning('No protein coding transcripts pass filtering for species {}. ' - 'Consider lowering requirements. Please see the manual.'.format(args.genome)) + logger.warning( + "No protein coding transcripts pass filtering for species {}. " + "Consider lowering requirements. Please see the manual.".format(args.genome) + ) elif len(non_coding_df) == 0 and non_coding_count > 0: - logger.warning('No non-coding transcripts pass filtering for species {}. ' - 'Consider lowering requirements. Please see the manual.'.format(args.genome)) - scored_coding_df, scored_non_coding_df = score_filtered_dfs(coding_df, non_coding_df, - args.in_species_rna_support_only) + logger.warning( + "No non-coding transcripts pass filtering for species {}. " + "Consider lowering requirements. 
Please see the manual.".format(args.genome) + ) + scored_coding_df, scored_non_coding_df = score_filtered_dfs( + coding_df, non_coding_df, args.in_species_rna_support_only + ) scored_df = merge_scored_dfs(scored_coding_df, scored_non_coding_df) - best_alignments = scored_df.groupby('TranscriptId')['TranscriptScore'].transform(max) == scored_df['TranscriptScore'] + best_alignments = ( + scored_df.groupby("TranscriptId")["TranscriptScore"].transform(max) == scored_df["TranscriptScore"] + ) best_df = scored_df[best_alignments].reset_index() # store some metrics for plotting - metrics = {'Transcript Missing': collections.Counter(), - 'Gene Missing': collections.Counter(), - 'Transcript Modes': collections.Counter(), # coding only - 'Duplicate transcripts': collections.Counter(), - 'Discarded by strand resolution': 0, - 'Coverage': collections.defaultdict(list), - 'Identity': collections.defaultdict(list), - 'Splice Support': collections.defaultdict(list), - 'Exon Support': collections.defaultdict(list), - 'Original Introns': collections.defaultdict(list), - 'Splice Annotation Support': collections.defaultdict(list), - 'Exon Annotation Support': collections.defaultdict(list), - 'IsoSeq Transcript Validation': collections.Counter()} + metrics = { + "Transcript Missing": collections.Counter(), + "Gene Missing": collections.Counter(), + "Transcript Modes": collections.Counter(), # coding only + "Duplicate transcripts": collections.Counter(), + "Discarded by strand resolution": 0, + "Coverage": collections.defaultdict(list), + "Identity": collections.defaultdict(list), + "Splice Support": collections.defaultdict(list), + "Exon Support": collections.defaultdict(list), + "Original Introns": collections.defaultdict(list), + "Splice Annotation Support": collections.defaultdict(list), + "Exon Annotation Support": collections.defaultdict(list), + "IsoSeq Transcript Validation": collections.Counter(), + } # we can keep track of missing stuff now - for gene_biotype, tx_df in best_df.groupby('GeneBiotype'): + for gene_biotype, tx_df in best_df.groupby("GeneBiotype"): biotype_genes = {gene_id for gene_id, b in gene_biotype_map.items() if b == gene_biotype} - metrics['Gene Missing'][gene_biotype] = len(biotype_genes) - len(set(tx_df.GeneId)) - for tx_biotype, tx_df in best_df.groupby('TranscriptBiotype'): + metrics["Gene Missing"][gene_biotype] = len(biotype_genes) - len(set(tx_df.GeneId)) + for tx_biotype, tx_df in best_df.groupby("TranscriptBiotype"): biotype_txs = {gene_id for gene_id, b in transcript_biotype_map.items() if b == tx_biotype} - metrics['Transcript Missing'][tx_biotype] = len(biotype_txs) - len(set(tx_df.TranscriptId)) + metrics["Transcript Missing"][tx_biotype] = len(biotype_txs) - len(set(tx_df.TranscriptId)) # main consensus finding -- using incorporate_tx to transform best scoring transcripts # stores a mapping of alignment IDs to tags for the final consensus set consensus_dict = {} - for (gene_id, tx_id), s in best_df.groupby(['GeneId', 'TranscriptId']): + for (gene_id, tx_id), s in best_df.groupby(["GeneId", "TranscriptId"]): aln_id, m = incorporate_tx(s, gene_id, metrics, args.hints_db_has_rnaseq) consensus_dict[aln_id] = m # if we ran in either denovo mode, load those data and detect novel genes if len(args.denovo_tx_modes) > 0: - metrics['denovo'] = {} + metrics["denovo"] = {} for tx_mode in args.denovo_tx_modes: - metrics['denovo'][tx_mode] = {'Possible paralog': 0, 'Poor alignment': 0, 'Putative novel': 0, - 'Possible fusion': 0, 'Putative novel isoform': 0} - denovo_dict = 
find_novel(args.db_path, tx_dict, consensus_dict, ref_df, metrics, gene_biotype_map, - args.denovo_num_introns, args.in_species_rna_support_only, - args.denovo_tx_modes, args.denovo_splice_support, args.denovo_exon_support, - args.denovo_ignore_novel_genes, args.denovo_novel_end_distance, - args.denovo_allow_unsupported, args.denovo_allow_bad_annot_or_tm, - args.denovo_only_novel_genes, args.denovo_allow_novel_ends) + metrics["denovo"][tx_mode] = { + "Possible paralog": 0, + "Poor alignment": 0, + "Putative novel": 0, + "Possible fusion": 0, + "Putative novel isoform": 0, + } + denovo_dict = find_novel( + args.db_path, + tx_dict, + consensus_dict, + ref_df, + metrics, + gene_biotype_map, + args.denovo_num_introns, + args.in_species_rna_support_only, + args.denovo_tx_modes, + args.denovo_splice_support, + args.denovo_exon_support, + args.denovo_ignore_novel_genes, + args.denovo_novel_end_distance, + args.denovo_allow_unsupported, + args.denovo_allow_bad_annot_or_tm, + args.denovo_only_novel_genes, + args.denovo_allow_novel_ends, + ) consensus_dict.update(denovo_dict) # perform final filtering steps deduplicated_consensus = deduplicate_consensus(consensus_dict, tx_dict, metrics) deduplicated_strand_resolved_consensus = resolve_opposite_strand(deduplicated_consensus, tx_dict, metrics) - if 'augPB' in args.denovo_tx_modes: - deduplicated_strand_resolved_consensus = validate_pacbio_splices(deduplicated_strand_resolved_consensus, - args.db_path, tx_dict, metrics, - args.require_pacbio_support) + if "augPB" in args.denovo_tx_modes: + deduplicated_strand_resolved_consensus = validate_pacbio_splices( + deduplicated_strand_resolved_consensus, args.db_path, tx_dict, metrics, args.require_pacbio_support + ) if args.filter_overlapping_genes is True: - gene_resolved_consensus = resolve_overlapping_cds_intervals(args.overlapping_gene_distance, - deduplicated_strand_resolved_consensus, tx_dict) + gene_resolved_consensus = resolve_overlapping_cds_intervals( + args.overlapping_gene_distance, deduplicated_strand_resolved_consensus, tx_dict + ) else: gene_resolved_consensus = deduplicated_strand_resolved_consensus # sort by genomic interval for prettily increasing numbers - final_consensus = sorted(gene_resolved_consensus, - key=lambda tx_attrs: (tx_dict[tx_attrs[0]].chromosome, tx_dict[tx_attrs[0]].start)) + final_consensus = sorted( + gene_resolved_consensus, key=lambda tx_attrs: (tx_dict[tx_attrs[0]].chromosome, tx_dict[tx_attrs[0]].start) + ) # calculate final gene set completeness calculate_completeness(final_consensus, metrics) # add some interesting metrics on how much using Augustus modes improved our results - if 'augTM' in tx_modes or 'augTMR' in tx_modes: + if "augTM" in tx_modes or "augTMR" in tx_modes: calculate_improvement_metrics(final_consensus, scored_df, tm_eval_df, hgm_df, metrics) calculate_indel_metrics(final_consensus, eval_df, metrics) # write out results. 
consensus tx dict has the unique names - consensus_gene_dict = write_consensus_gps(args.consensus_gp, args.consensus_gp_info, - final_consensus, tx_dict, args.genome) + consensus_gene_dict = write_consensus_gps( + args.consensus_gp, args.consensus_gp_info, final_consensus, tx_dict, args.genome + ) write_consensus_gff3(consensus_gene_dict, args.consensus_gff3) write_consensus_fastas(consensus_gene_dict, args.consensus_fasta, args.consensus_protein_fasta, args.fasta) @@ -177,14 +219,15 @@ def load_transmap_evals(db_path): # combine transMap evaluation and transMap filtering into one table # the transMap filtering columns are used for tags in the output - tm_eval_df = pd.merge(tm_eval, tm_filter_eval, on=['TranscriptId', 'AlignmentId']) - return tm_eval_df.drop('AlignmentId', axis=1) + tm_eval_df = pd.merge(tm_eval, tm_filter_eval, on=["TranscriptId", "AlignmentId"]) + return tm_eval_df.drop("AlignmentId", axis=1) def calculate_vector_support(s, resolve_nan=None, num_digits=4): """For vectors parsed by parse_text_vector(), convert to a percentage between 0 and 100""" - return 100 * tools.mathOps.format_ratio(len([x for x in s if x > 0]), len(s), resolve_nan=resolve_nan, - num_digits=num_digits) + return 100 * tools.mathOps.format_ratio( + len([x for x in s if x > 0]), len(s), resolve_nan=resolve_nan, num_digits=num_digits + ) def load_hgm_vectors(db_path, tx_mode): @@ -194,16 +237,22 @@ def load_hgm_vectors(db_path, tx_mode): exon score because CGP has a coding-only model. """ session = tools.sqlInterface.start_session(db_path) - intron_table = tools.sqlInterface.tables['hgm'][tx_mode] + intron_table = tools.sqlInterface.tables["hgm"][tx_mode] hgm_df = tools.sqlInterface.load_intron_vector(intron_table, session) # start calculating support levels for consensus finding - cols = ['IntronAnnotSupport', 'ExonAnnotSupport', 'CdsAnnotSupport', - 'ExonRnaSupport', 'IntronRnaSupport', - 'AllSpeciesExonRnaSupport', 'AllSpeciesIntronRnaSupport'] + cols = [ + "IntronAnnotSupport", + "ExonAnnotSupport", + "CdsAnnotSupport", + "ExonRnaSupport", + "IntronRnaSupport", + "AllSpeciesExonRnaSupport", + "AllSpeciesIntronRnaSupport", + ] for col in cols: - hgm_df[col] = [list(map(int, x)) if len(x[0]) > 0 else [] for x in hgm_df[col].str.split(',').tolist()] - hgm_df[col + 'Percent'] = hgm_df[col].apply(calculate_vector_support, resolve_nan=1) + hgm_df[col] = [list(map(int, x)) if len(x[0]) > 0 else [] for x in hgm_df[col].str.split(",").tolist()] + hgm_df[col + "Percent"] = hgm_df[col].apply(calculate_vector_support, resolve_nan=1) return hgm_df @@ -212,18 +261,19 @@ def load_metrics_from_db(db_path, tx_mode, aln_mode): Loads the alignment metrics for the mRNA/CDS alignments of transMap/AugustusTM/TMR """ session = tools.sqlInterface.start_session(db_path) - metrics_table = tools.sqlInterface.tables[aln_mode][tx_mode]['metrics'] + metrics_table = tools.sqlInterface.tables[aln_mode][tx_mode]["metrics"] metrics_df = tools.sqlInterface.load_metrics(metrics_table, session) # unstack flattens the long-form data structure - metrics_df = metrics_df.set_index(['AlignmentId', 'classifier']).unstack('classifier') + metrics_df = metrics_df.set_index(["AlignmentId", "classifier"]).unstack("classifier") metrics_df.columns = [col[1] for col in metrics_df.columns] metrics_df = metrics_df.reset_index() - cols = ['AlnCoverage', 'AlnGoodness', 'AlnIdentity', 'PercentUnknownBases'] + cols = ["AlnCoverage", "AlnGoodness", "AlnIdentity", "PercentUnknownBases"] metrics_df[cols] = metrics_df[cols].apply(pd.to_numeric) - 
metrics_df['OriginalIntrons'] = metrics_df['OriginalIntrons'].fillna('') - metrics_df['OriginalIntrons'] = [list(map(int, x)) if len(x[0]) > 0 else [] for x in - metrics_df['OriginalIntrons'].str.split(',').tolist()] - metrics_df['OriginalIntronsPercent'] = metrics_df['OriginalIntrons'].apply(calculate_vector_support, resolve_nan=1) + metrics_df["OriginalIntrons"] = metrics_df["OriginalIntrons"].fillna("") + metrics_df["OriginalIntrons"] = [ + list(map(int, x)) if len(x[0]) > 0 else [] for x in metrics_df["OriginalIntrons"].str.split(",").tolist() + ] + metrics_df["OriginalIntronsPercent"] = metrics_df["OriginalIntrons"].apply(calculate_vector_support, resolve_nan=1) session.close() return metrics_df @@ -233,6 +283,7 @@ def load_evaluations_from_db(db_path, tx_mode): Loads the indel information from the evaluation database. We give preference to CDS alignments, but fall back to mRNA alignments. """ + def aggfunc(s): """ Preferentially pick CDS stats over mRNA stats, if they exist @@ -242,21 +293,28 @@ def aggfunc(s): c = set(s[s.value_CDS > 0].name) else: c = set(s[s.value_mRNA > 0].name) - cols = ['Frameshift', 'CodingInsertion', 'CodingDeletion', 'CodingMult3Indel'] - return pd.Series(('CodingDeletion' in c or 'CodingInsertion' in c, - 'CodingInsertion' in c, 'CodingDeletion' in c, - 'CodingMult3Deletion' in c or 'CodingMult3Insertion' in c), index=cols) + cols = ["Frameshift", "CodingInsertion", "CodingDeletion", "CodingMult3Indel"] + return pd.Series( + ( + "CodingDeletion" in c or "CodingInsertion" in c, + "CodingInsertion" in c, + "CodingDeletion" in c, + "CodingMult3Deletion" in c or "CodingMult3Insertion" in c, + ), + index=cols, + ) session = tools.sqlInterface.start_session(db_path) - cds_table = tools.sqlInterface.tables['CDS'][tx_mode]['evaluation'] - mrna_table = tools.sqlInterface.tables['mRNA'][tx_mode]['evaluation'] + cds_table = tools.sqlInterface.tables["CDS"][tx_mode]["evaluation"] + mrna_table = tools.sqlInterface.tables["mRNA"][tx_mode]["evaluation"] cds_df = tools.sqlInterface.load_evaluation(cds_table, session) mrna_df = tools.sqlInterface.load_evaluation(mrna_table, session) - cds_df = cds_df.set_index('AlignmentId') - mrna_df = mrna_df.set_index('AlignmentId') - merged = mrna_df.reset_index().merge(cds_df.reset_index(), how='outer', on=['AlignmentId', 'name'], - suffixes=['_mRNA', '_CDS']) - eval_df = merged.groupby('AlignmentId').apply(aggfunc) + cds_df = cds_df.set_index("AlignmentId") + mrna_df = mrna_df.set_index("AlignmentId") + merged = mrna_df.reset_index().merge( + cds_df.reset_index(), how="outer", on=["AlignmentId", "name"], suffixes=["_mRNA", "_CDS"] + ) + eval_df = merged.groupby("AlignmentId").apply(aggfunc) return eval_df @@ -265,18 +323,30 @@ def load_alt_names(db_path, denovo_tx_modes): session = tools.sqlInterface.start_session(db_path) r = [] for tx_mode in denovo_tx_modes: - table = tools.sqlInterface.tables['alt_names'][tx_mode] + table = tools.sqlInterface.tables["alt_names"][tx_mode] r.append(tools.sqlInterface.load_alternatives(table, session)) df = pd.concat(r) # rename TranscriptId to AlignmentId. 
This is all super confusing and silly # the reason is that homGeneMapping has the gene -> tx -> aln ID hierarchy we inherit to simplify things - df.columns = [x if x != 'TranscriptId' else 'AlignmentId' for x in df.columns] + df.columns = [x if x != "TranscriptId" else "AlignmentId" for x in df.columns] return df -def combine_and_filter_dfs(tx_dict, hgm_df, mrna_metrics_df, cds_metrics_df, tm_eval_df, ref_df, eval_df, - intron_rnaseq_support, exon_rnaseq_support, intron_annot_support, exon_annot_support, - original_intron_support, in_species_rna_support_only): +def combine_and_filter_dfs( + tx_dict, + hgm_df, + mrna_metrics_df, + cds_metrics_df, + tm_eval_df, + ref_df, + eval_df, + intron_rnaseq_support, + exon_rnaseq_support, + intron_annot_support, + exon_annot_support, + original_intron_support, + in_species_rna_support_only, +): """ Updates the DataFrame based on support levels. Filters based on user-tunable flags for support levels. :param tx_dict: dictionary of genePredTranscript objects. Used to remove things filtered out by transMap @@ -295,52 +365,60 @@ def combine_and_filter_dfs(tx_dict, hgm_df, mrna_metrics_df, cds_metrics_df, tm_ :return: filtered and merged dataframe """ # add the reference information to gain biotype information - hgm_ref_df = pd.merge(hgm_df, ref_df, on=['GeneId', 'TranscriptId']) + hgm_ref_df = pd.merge(hgm_df, ref_df, on=["GeneId", "TranscriptId"]) # combine in homGeneMapping results - hgm_ref_tm_df = pd.merge(hgm_ref_df, tm_eval_df, on=['GeneId', 'TranscriptId']) + hgm_ref_tm_df = pd.merge(hgm_ref_df, tm_eval_df, on=["GeneId", "TranscriptId"]) # remove filtered transMap hgm_ref_tm_df = hgm_ref_tm_df[hgm_ref_tm_df.AlignmentId.isin(tx_dict.keys())] # split merged_df into coding and noncoding - coding_df = hgm_ref_tm_df[hgm_ref_tm_df.TranscriptBiotype == 'protein_coding'] - non_coding_df = hgm_ref_tm_df[hgm_ref_tm_df.TranscriptBiotype != 'protein_coding'] + coding_df = hgm_ref_tm_df[hgm_ref_tm_df.TranscriptBiotype == "protein_coding"] + non_coding_df = hgm_ref_tm_df[hgm_ref_tm_df.TranscriptBiotype != "protein_coding"] # add metrics information to coding df - metrics_df = pd.merge(mrna_metrics_df, cds_metrics_df, on='AlignmentId', suffixes=['_mRNA', '_CDS']) - coding_df = pd.merge(coding_df, metrics_df, on='AlignmentId') + metrics_df = pd.merge(mrna_metrics_df, cds_metrics_df, on="AlignmentId", suffixes=["_mRNA", "_CDS"]) + coding_df = pd.merge(coding_df, metrics_df, on="AlignmentId") # add evaluation information to coding df, where possible. This adds information on frame shifts. 
- coding_df = pd.merge(coding_df, eval_df, on='AlignmentId', how='left') + coding_df = pd.merge(coding_df, eval_df, on="AlignmentId", how="left") # fill the original intron values to 100 so we don't filter them out -- means a no-intron gene - coding_df['OriginalIntronsPercent_mRNA'] = coding_df.OriginalIntronsPercent_mRNA.fillna(100) - coding_df['OriginalIntronsPercent_CDS'] = coding_df.OriginalIntronsPercent_CDS.fillna(100) - non_coding_df['TransMapOriginalIntronsPercent'] = non_coding_df.TransMapOriginalIntronsPercent.fillna(100) + coding_df["OriginalIntronsPercent_mRNA"] = coding_df.OriginalIntronsPercent_mRNA.fillna(100) + coding_df["OriginalIntronsPercent_CDS"] = coding_df.OriginalIntronsPercent_CDS.fillna(100) + non_coding_df["TransMapOriginalIntronsPercent"] = non_coding_df.TransMapOriginalIntronsPercent.fillna(100) # huge ugly filtering expression for coding transcripts if in_species_rna_support_only is True: - filt = ((coding_df.OriginalIntronsPercent_mRNA >= original_intron_support) & - (coding_df.IntronAnnotSupportPercent >= intron_annot_support) & - (coding_df.IntronRnaSupportPercent >= intron_rnaseq_support) & - (coding_df.ExonAnnotSupportPercent >= exon_annot_support) & - (coding_df.ExonRnaSupportPercent >= exon_rnaseq_support)) + filt = ( + (coding_df.OriginalIntronsPercent_mRNA >= original_intron_support) + & (coding_df.IntronAnnotSupportPercent >= intron_annot_support) + & (coding_df.IntronRnaSupportPercent >= intron_rnaseq_support) + & (coding_df.ExonAnnotSupportPercent >= exon_annot_support) + & (coding_df.ExonRnaSupportPercent >= exon_rnaseq_support) + ) else: - filt = ((coding_df.OriginalIntronsPercent_mRNA >= original_intron_support) & - (coding_df.IntronAnnotSupportPercent >= intron_annot_support) & - (coding_df.AllSpeciesIntronRnaSupportPercent >= intron_rnaseq_support) & - (coding_df.ExonAnnotSupportPercent >= exon_annot_support) & - (coding_df.AllSpeciesExonRnaSupportPercent >= exon_rnaseq_support)) + filt = ( + (coding_df.OriginalIntronsPercent_mRNA >= original_intron_support) + & (coding_df.IntronAnnotSupportPercent >= intron_annot_support) + & (coding_df.AllSpeciesIntronRnaSupportPercent >= intron_rnaseq_support) + & (coding_df.ExonAnnotSupportPercent >= exon_annot_support) + & (coding_df.AllSpeciesExonRnaSupportPercent >= exon_rnaseq_support) + ) coding_df = coding_df[filt] # huge ugly filtering expression for non coding transcripts if in_species_rna_support_only is True: - filt = ((non_coding_df.TransMapOriginalIntronsPercent >= original_intron_support) & - (non_coding_df.IntronAnnotSupportPercent >= intron_annot_support) & - (non_coding_df.IntronRnaSupportPercent >= intron_rnaseq_support) & - (non_coding_df.ExonAnnotSupportPercent >= exon_annot_support) & - (non_coding_df.ExonRnaSupportPercent >= exon_rnaseq_support)) + filt = ( + (non_coding_df.TransMapOriginalIntronsPercent >= original_intron_support) + & (non_coding_df.IntronAnnotSupportPercent >= intron_annot_support) + & (non_coding_df.IntronRnaSupportPercent >= intron_rnaseq_support) + & (non_coding_df.ExonAnnotSupportPercent >= exon_annot_support) + & (non_coding_df.ExonRnaSupportPercent >= exon_rnaseq_support) + ) else: - filt = ((non_coding_df.TransMapOriginalIntronsPercent >= original_intron_support) & - (non_coding_df.IntronAnnotSupportPercent >= intron_annot_support) & - (non_coding_df.AllSpeciesIntronRnaSupportPercent >= intron_rnaseq_support) & - (non_coding_df.ExonAnnotSupportPercent >= exon_annot_support) & - (non_coding_df.AllSpeciesExonRnaSupportPercent >= exon_rnaseq_support)) + filt = ( + 
(non_coding_df.TransMapOriginalIntronsPercent >= original_intron_support) + & (non_coding_df.IntronAnnotSupportPercent >= intron_annot_support) + & (non_coding_df.AllSpeciesIntronRnaSupportPercent >= intron_rnaseq_support) + & (non_coding_df.ExonAnnotSupportPercent >= exon_annot_support) + & (non_coding_df.AllSpeciesExonRnaSupportPercent >= exon_rnaseq_support) + ) non_coding_df = non_coding_df[filt] return coding_df, non_coding_df @@ -362,10 +440,15 @@ def score_filtered_dfs(coding_df, non_coding_df, in_species_rna_support_only): Returns the dataframe sorted by scores after indexing. """ + def score(s): - aln_id = s.AlnIdentity_CDS if s.TranscriptBiotype == 'protein_coding' else s.TransMapIdentity - aln_cov = s.AlnCoverage_CDS if s.TranscriptBiotype == 'protein_coding' else s.TransMapCoverage - orig_intron = s.OriginalIntronsPercent_mRNA if s.TranscriptBiotype == 'protein_coding' else s.TransMapOriginalIntronsPercent + aln_id = s.AlnIdentity_CDS if s.TranscriptBiotype == "protein_coding" else s.TransMapIdentity + aln_cov = s.AlnCoverage_CDS if s.TranscriptBiotype == "protein_coding" else s.TransMapCoverage + orig_intron = ( + s.OriginalIntronsPercent_mRNA + if s.TranscriptBiotype == "protein_coding" + else s.TransMapOriginalIntronsPercent + ) if in_species_rna_support_only: rna_support = s.ExonRnaSupportPercent + s.IntronRnaSupportPercent else: @@ -374,15 +457,15 @@ def score(s): for df in [coding_df, non_coding_df]: if len(df) > 0: - df['TranscriptScore'] = df.apply(score, axis=1) + df["TranscriptScore"] = df.apply(score, axis=1) return coding_df, non_coding_df def merge_scored_dfs(scored_coding_df, scored_non_coding_df): """Merges the scored dataframes by changing some names around""" # for every non-coding TransMap metric, copy it to the other name - for m in ['Coverage', 'Identity', 'Goodness']: - scored_non_coding_df['Aln' + m + '_mRNA'] = scored_non_coding_df['TransMap' + m] + for m in ["Coverage", "Identity", "Goodness"]: + scored_non_coding_df["Aln" + m + "_mRNA"] = scored_non_coding_df["TransMap" + m] merged_df = pd.concat([scored_non_coding_df, scored_coding_df]) return merged_df @@ -407,12 +490,12 @@ def validate_pacbio_splices(deduplicated_strand_resolved_consensus, db_path, tx_ pb_resolved_consensus = [] for tx_id, d in deduplicated_strand_resolved_consensus: if tx_id in validated_ids: - d['pacbio_isoform_supported'] = True - metrics['IsoSeq Transcript Validation'][True] += 1 + d["pacbio_isoform_supported"] = True + metrics["IsoSeq Transcript Validation"][True] += 1 pb_resolved_consensus.append([tx_id, d]) elif require_pacbio_support is False: - d['pacbio_isoform_supported'] = False - metrics['IsoSeq Transcript Validation'][False] += 1 + d["pacbio_isoform_supported"] = False + metrics["IsoSeq Transcript Validation"][False] += 1 pb_resolved_consensus.append([tx_id, d]) # if require_pacbio_support is True, then we don't save this transcript return pb_resolved_consensus @@ -423,65 +506,82 @@ def incorporate_tx(best_rows, gene_id, metrics, hints_db_has_rnaseq): best_series = best_rows.iloc[0] transcript_modes = evaluate_ties(best_rows) # construct the tags for this transcript - d = {'source_transcript': best_series.TranscriptId, - 'source_transcript_name': best_series.TranscriptName, - 'source_gene': gene_id, - 'score': int(10 * round(best_series.AlnGoodness_mRNA, 3)), - 'transcript_modes': transcript_modes, - 'gene_biotype': best_series.GeneBiotype, - 'transcript_biotype': best_series.TranscriptBiotype, - 'alignment_id': str(best_series.AlignmentId), - 'frameshift': 
str(best_series.get('Frameshift', None)), - 'exon_annotation_support': ','.join(map(str, best_series.ExonAnnotSupport)), - 'intron_annotation_support': ','.join(map(str, best_series.IntronAnnotSupport)), - 'transcript_class': 'ortholog', - 'valid_start': bool(best_series.ValidStart), - 'valid_stop': bool(best_series.ValidStop), - 'adj_start': best_series.AdjStart_mRNA, - 'adj_stop': best_series.AdjStop_mRNA, - 'proper_orf': bool(best_series.ProperOrf)} + d = { + "source_transcript": best_series.TranscriptId, + "source_transcript_name": best_series.TranscriptName, + "source_gene": gene_id, + "score": int(10 * round(best_series.AlnGoodness_mRNA, 3)), + "transcript_modes": transcript_modes, + "gene_biotype": best_series.GeneBiotype, + "transcript_biotype": best_series.TranscriptBiotype, + "alignment_id": str(best_series.AlignmentId), + "frameshift": str(best_series.get("Frameshift", None)), + "exon_annotation_support": ",".join(map(str, best_series.ExonAnnotSupport)), + "intron_annotation_support": ",".join(map(str, best_series.IntronAnnotSupport)), + "transcript_class": "ortholog", + "valid_start": bool(best_series.ValidStart), + "valid_stop": bool(best_series.ValidStop), + "adj_start": best_series.AdjStart_mRNA, + "adj_stop": best_series.AdjStop_mRNA, + "proper_orf": bool(best_series.ProperOrf), + } # incorporate any extra tags for key, val in tools.misc.parse_gff_attr_line(best_series.ExtraTags).items(): d[key] = val if hints_db_has_rnaseq is True: - d['exon_rna_support'] = ','.join(map(str, best_series.ExonRnaSupport)) - d['intron_rna_support'] = ','.join(map(str, best_series.IntronRnaSupport)) + d["exon_rna_support"] = ",".join(map(str, best_series.ExonRnaSupport)) + d["intron_rna_support"] = ",".join(map(str, best_series.IntronRnaSupport)) if best_series.Paralogy is not None: - d['paralogy'] = best_series.Paralogy + d["paralogy"] = best_series.Paralogy if best_series.UnfilteredParalogy is not None: - d['unfiltered_paralogy'] = best_series.UnfilteredParalogy + d["unfiltered_paralogy"] = best_series.UnfilteredParalogy if best_series.GeneAlternateLoci is not None: - d['gene_alternate_contigs'] = best_series.GeneAlternateLoci + d["gene_alternate_contigs"] = best_series.GeneAlternateLoci if best_series.CollapsedGeneIds is not None: - d['collapsed_gene_ids'] = best_series.CollapsedGeneIds + d["collapsed_gene_ids"] = best_series.CollapsedGeneIds if best_series.CollapsedGeneNames is not None: - d['collapsed_gene_names'] = best_series.CollapsedGeneNames + d["collapsed_gene_names"] = best_series.CollapsedGeneNames if best_series.PossibleSplitGeneLocations is not None: - d['possible_split_gene_locations'] = best_series.PossibleSplitGeneLocations + d["possible_split_gene_locations"] = best_series.PossibleSplitGeneLocations if best_series.GeneName is not None: - d['source_gene_common_name'] = best_series.GeneName + d["source_gene_common_name"] = best_series.GeneName # add information to the overall metrics - if best_series.TranscriptBiotype == 'protein_coding': - metrics['Transcript Modes'][transcript_modes] += 1 - metrics['Coverage'][best_series.TranscriptBiotype].append(best_series.AlnCoverage_mRNA) - metrics['Identity'][best_series.TranscriptBiotype].append(best_series.AlnIdentity_mRNA) - metrics['Splice Support'][best_series.TranscriptBiotype].append(best_series.IntronRnaSupportPercent) - metrics['Exon Support'][best_series.TranscriptBiotype].append(best_series.ExonRnaSupportPercent) - metrics['Splice Annotation 
Support'][best_series.TranscriptBiotype].append(best_series.IntronAnnotSupportPercent) - metrics['Exon Annotation Support'][best_series.TranscriptBiotype].append(best_series.ExonAnnotSupportPercent) - metrics['Original Introns'][best_series.TranscriptBiotype].append(best_series.OriginalIntronsPercent_mRNA) + if best_series.TranscriptBiotype == "protein_coding": + metrics["Transcript Modes"][transcript_modes] += 1 + metrics["Coverage"][best_series.TranscriptBiotype].append(best_series.AlnCoverage_mRNA) + metrics["Identity"][best_series.TranscriptBiotype].append(best_series.AlnIdentity_mRNA) + metrics["Splice Support"][best_series.TranscriptBiotype].append(best_series.IntronRnaSupportPercent) + metrics["Exon Support"][best_series.TranscriptBiotype].append(best_series.ExonRnaSupportPercent) + metrics["Splice Annotation Support"][best_series.TranscriptBiotype].append(best_series.IntronAnnotSupportPercent) + metrics["Exon Annotation Support"][best_series.TranscriptBiotype].append(best_series.ExonAnnotSupportPercent) + metrics["Original Introns"][best_series.TranscriptBiotype].append(best_series.OriginalIntronsPercent_mRNA) return best_series.AlignmentId, d def evaluate_ties(best_rows): """Find out how many transcript modes agreed on this""" - return ','.join(sorted(set([tools.nameConversions.alignment_type(x) for x in best_rows.AlignmentId]))) - - -def find_novel(db_path, tx_dict, consensus_dict, ref_df, metrics, gene_biotype_map, denovo_num_introns, - in_species_rna_support_only, denovo_tx_modes, denovo_splice_support, denovo_exon_support, - denovo_ignore_novel_genes, denovo_novel_end_distance, denovo_allow_unsupported, - denovo_allow_bad_annot_or_tm, denovo_only_novel_genes, denovo_allow_novel_ends): + return ",".join(sorted(set([tools.nameConversions.alignment_type(x) for x in best_rows.AlignmentId]))) + + +def find_novel( + db_path, + tx_dict, + consensus_dict, + ref_df, + metrics, + gene_biotype_map, + denovo_num_introns, + in_species_rna_support_only, + denovo_tx_modes, + denovo_splice_support, + denovo_exon_support, + denovo_ignore_novel_genes, + denovo_novel_end_distance, + denovo_allow_unsupported, + denovo_allow_bad_annot_or_tm, + denovo_only_novel_genes, + denovo_allow_novel_ends, +): """ Finds novel loci, builds their attributes. Only calls novel loci if they have sufficient intron and splice support as defined by the user. @@ -497,6 +597,7 @@ def find_novel(db_path, tx_dict, consensus_dict, ref_df, metrics, gene_biotype_m Also finds novel splice junctions in CGP/PB transcripts. A novel splice junction is defined as a splice which homGeneMapping did not map over and which is supported by RNA-seq. """ + def is_novel(s): """ Determine if this transcript is possibly novel. 
If it is assigned a gene ID, pass this off to @@ -504,9 +605,9 @@ def is_novel(s): """ if s.AssignedGeneId is not None: return is_novel_supported(s) - if denovo_allow_bad_annot_or_tm is False and s.ResolutionMethod == 'badAnnotOrTm': + if denovo_allow_bad_annot_or_tm is False and s.ResolutionMethod == "badAnnotOrTm": return None - elif s.ResolutionMethod == 'ambiguousOrFusion' and s.IntronRnaSupportPercent != 100: + elif s.ResolutionMethod == "ambiguousOrFusion" and s.IntronRnaSupportPercent != 100: return None # validate the support level intron = s.IntronRnaSupportPercent if in_species_rna_support_only else s.AllSpeciesIntronRnaSupportPercent @@ -514,30 +615,36 @@ def is_novel(s): if intron < denovo_splice_support or exon < denovo_exon_support: return None # if we previously flagged this as ambiguousOrFusion, propagate this tag - if s.ResolutionMethod == 'ambiguousOrFusion': - return 'possible_fusion' - elif s.ResolutionMethod == 'badAnnotOrTm': - return 'bad_annot_or_tm' + if s.ResolutionMethod == "ambiguousOrFusion": + return "possible_fusion" + elif s.ResolutionMethod == "badAnnotOrTm": + return "bad_annot_or_tm" # if we have alternatives, this is not novel but could be a gene family expansion elif s.AlternativeGeneIds is not None: - return 'possible_paralog' + return "possible_paralog" # this may be a poor mapping elif bool(s.ExonAnnotSupportPercent > 0 or s.CdsAnnotSupportPercent > 0 or s.IntronAnnotSupportPercent > 0): - return 'poor_alignment' + return "poor_alignment" # this is looking pretty novel, could still be a mapping problem in a complex region though else: - return 'putative_novel' + return "putative_novel" def is_novel_supported(s): """Is this CGP/PB transcript with an assigned gene ID supported and have a novel splice?""" denovo_tx_obj = tx_dict[s.AlignmentId] if len(denovo_tx_obj.intron_intervals) < denovo_num_introns: return None - elif in_species_rna_support_only and s.ExonRnaSupportPercent <= denovo_exon_support or \ - s.IntronRnaSupportPercent <= denovo_splice_support: + elif ( + in_species_rna_support_only + and s.ExonRnaSupportPercent <= denovo_exon_support + or s.IntronRnaSupportPercent <= denovo_splice_support + ): return None - elif in_species_rna_support_only is False and s.AllSpeciesExonRnaSupportPercent <= denovo_exon_support or \ - s.AllSpeciesIntronRnaSupportPercent <= denovo_splice_support: + elif ( + in_species_rna_support_only is False + and s.AllSpeciesExonRnaSupportPercent <= denovo_exon_support + or s.AllSpeciesIntronRnaSupportPercent <= denovo_splice_support + ): return None # look for splices that are not supported by the reference annotation # these splices may or may not be supported by RNA-seq based on the denovo_allow_unsupported flag @@ -550,13 +657,15 @@ def is_novel_supported(s): if len(new_supported_splices) == 0: return None # if any splices are both not supported by annotation and supported by RNA, call this as novel - if any(annot == 0 and i in new_supported_splices for i, annot in zip(*[denovo_tx_obj.intron_intervals, - s.IntronAnnotSupport])): - metrics['Transcript Modes'][tools.nameConversions.alignment_type(s.AlignmentId)] += 1 - tx_class = 'putative_novel_isoform' + if any( + annot == 0 and i in new_supported_splices + for i, annot in zip(*[denovo_tx_obj.intron_intervals, s.IntronAnnotSupport]) + ): + metrics["Transcript Modes"][tools.nameConversions.alignment_type(s.AlignmentId)] += 1 + tx_class = "putative_novel_isoform" # if any splices are new, and supported by RNA-seq call this poor alignment else: - tx_class = 
'poor_alignment' + tx_class = "poor_alignment" return tx_class def has_novel_ends(s): @@ -566,29 +675,36 @@ def has_novel_ends(s): denovo_tx_obj = tx_dict[s.AlignmentId] five_p = denovo_tx_obj.get_5p_interval() three_p = denovo_tx_obj.get_3p_interval() - five_p_matches = tools.intervals.interval_not_within_wiggle_room_intervals(existing_5p[denovo_tx_obj.chromosome], - five_p, denovo_novel_end_distance) - three_p_matches = tools.intervals.interval_not_within_wiggle_room_intervals(existing_5p[denovo_tx_obj.chromosome], - three_p, denovo_novel_end_distance) + five_p_matches = tools.intervals.interval_not_within_wiggle_room_intervals( + existing_5p[denovo_tx_obj.chromosome], five_p, denovo_novel_end_distance + ) + three_p_matches = tools.intervals.interval_not_within_wiggle_room_intervals( + existing_5p[denovo_tx_obj.chromosome], three_p, denovo_novel_end_distance + ) if denovo_allow_novel_ends is False: tx_class = s.TranscriptClass else: - tx_class = 'putative_novel_isoform' if s.TranscriptClass is None and (five_p_matches or three_p_matches) else s.TranscriptClass + tx_class = ( + "putative_novel_isoform" + if s.TranscriptClass is None and (five_p_matches or three_p_matches) + else s.TranscriptClass + ) return pd.Series([five_p_matches, three_p_matches, tx_class]) denovo_hgm_df = pd.concat([load_hgm_vectors(db_path, tx_mode) for tx_mode in denovo_tx_modes]) # remove the TranscriptId and GeneId columns so they can be populated by others - denovo_hgm_df = denovo_hgm_df.drop(['GeneId', 'TranscriptId'], axis=1) + denovo_hgm_df = denovo_hgm_df.drop(["GeneId", "TranscriptId"], axis=1) # load the alignment metrics data denovo_alt_names = load_alt_names(db_path, denovo_tx_modes) - denovo_df = pd.merge(denovo_hgm_df, denovo_alt_names, on='AlignmentId') + denovo_df = pd.merge(denovo_hgm_df, denovo_alt_names, on="AlignmentId") common_name_map = dict(list(zip(ref_df.GeneId, ref_df.GeneName))) - denovo_df['CommonName'] = [common_name_map.get(x, None) for x in denovo_df.AssignedGeneId] - denovo_df['GeneBiotype'] = [gene_biotype_map.get(x, None) for x in denovo_df.AssignedGeneId] + denovo_df["CommonName"] = [common_name_map.get(x, None) for x in denovo_df.AssignedGeneId] + denovo_df["GeneBiotype"] = [gene_biotype_map.get(x, None) for x in denovo_df.AssignedGeneId] # if we have an external reference, try to incorporate those names as well - if 'exRef' in denovo_tx_modes: + if "exRef" in denovo_tx_modes: + def add_exref_ids(s): if s.AlignmentId in exref_common_name_map: # if we have an assigned gene ID, defer the gene biotype to that but retain transcript biotype @@ -598,11 +714,12 @@ def add_exref_ids(s): return pd.Series([exref_common_name_map[s.AlignmentId], s.GeneBiotype]) # pass along the original data return pd.Series([s.CommonName, s.GeneBiotype]) + exref_annot = tools.sqlInterface.load_annotation(db_path) exref_common_name_map = dict(list(zip(exref_annot.TranscriptId, exref_annot.GeneName))) exref_gene_biotype_map = dict(list(zip(exref_annot.TranscriptId, exref_annot.GeneBiotype))) - denovo_df[['CommonName', 'GeneBiotype']] = denovo_df.apply(add_exref_ids, axis=1) - exref_annot = exref_annot.set_index('TranscriptId') + denovo_df[["CommonName", "GeneBiotype"]] = denovo_df.apply(add_exref_ids, axis=1) + exref_annot = exref_annot.set_index("TranscriptId") # extract all splices, 5' and 3' ends we have already seen existing_splices = set() @@ -615,41 +732,49 @@ def add_exref_ids(s): existing_3p[tx_obj.chromosome].add(tx_obj.get_3p_interval()) # apply the novel finding functions - 
denovo_df['TranscriptClass'] = denovo_df.apply(is_novel, axis=1) - denovo_df[['Novel5pCap', 'NovelPolyA', 'TranscriptClass']] = denovo_df.apply(has_novel_ends, axis=1) + denovo_df["TranscriptClass"] = denovo_df.apply(is_novel, axis=1) + denovo_df[["Novel5pCap", "NovelPolyA", "TranscriptClass"]] = denovo_df.apply(has_novel_ends, axis=1) # types of transcripts for later - denovo_df['TranscriptMode'] = [tools.nameConversions.alignment_type(aln_id) for aln_id in denovo_df.AlignmentId] + denovo_df["TranscriptMode"] = [tools.nameConversions.alignment_type(aln_id) for aln_id in denovo_df.AlignmentId] # filter out non-novel as well as fusions filtered_denovo_df = denovo_df[(~denovo_df.TranscriptClass.isnull())] - filtered_denovo_df = filtered_denovo_df[filtered_denovo_df.TranscriptClass != 'possible_fusion'] + filtered_denovo_df = filtered_denovo_df[filtered_denovo_df.TranscriptClass != "possible_fusion"] # fill in missing fields for novel loci - filtered_denovo_df['GeneBiotype'] = filtered_denovo_df['GeneBiotype'].fillna('unknown_likely_coding') + filtered_denovo_df["GeneBiotype"] = filtered_denovo_df["GeneBiotype"].fillna("unknown_likely_coding") # filter out novel if requested by user if denovo_ignore_novel_genes is True: - filtered_denovo_df = filtered_denovo_df[(filtered_denovo_df.TranscriptClass == 'possible_paralog') | - (filtered_denovo_df.TranscriptClass == 'putative_novel_isoform')] + filtered_denovo_df = filtered_denovo_df[ + (filtered_denovo_df.TranscriptClass == "possible_paralog") + | (filtered_denovo_df.TranscriptClass == "putative_novel_isoform") + ] elif denovo_only_novel_genes is True: - filtered_denovo_df = filtered_denovo_df[~((filtered_denovo_df.TranscriptClass == 'possible_paralog') | - (filtered_denovo_df.TranscriptClass == 'putative_novel_isoform'))] + filtered_denovo_df = filtered_denovo_df[ + ~( + (filtered_denovo_df.TranscriptClass == "possible_paralog") + | (filtered_denovo_df.TranscriptClass == "putative_novel_isoform") + ) + ] # construct aln_id -> features map to return denovo_tx_dict = {} for _, s in filtered_denovo_df.iterrows(): aln_id = s.AlignmentId tx_mode = s.TranscriptMode - denovo_tx_dict[aln_id] = {'source_gene': s.AssignedGeneId, - 'transcript_class': s.TranscriptClass, - 'novel_5p_cap': s.Novel5pCap, - 'novel_poly_a': s.NovelPolyA, - 'transcript_biotype': s.GeneBiotype, - 'gene_biotype': s.GeneBiotype, - 'intron_rna_support': ','.join(map(str, s.IntronRnaSupport)), - 'exon_rna_support': ','.join(map(str, s.ExonRnaSupport)), - 'transcript_modes': tx_mode, - 'exon_annotation_support': ','.join(map(str, s.ExonAnnotSupport)), - 'intron_annotation_support': ','.join(map(str, s.IntronAnnotSupport)), - 'alignment_id': aln_id, - 'source_gene_common_name': s.CommonName} + denovo_tx_dict[aln_id] = { + "source_gene": s.AssignedGeneId, + "transcript_class": s.TranscriptClass, + "novel_5p_cap": s.Novel5pCap, + "novel_poly_a": s.NovelPolyA, + "transcript_biotype": s.GeneBiotype, + "gene_biotype": s.GeneBiotype, + "intron_rna_support": ",".join(map(str, s.IntronRnaSupport)), + "exon_rna_support": ",".join(map(str, s.ExonRnaSupport)), + "transcript_modes": tx_mode, + "exon_annotation_support": ",".join(map(str, s.ExonAnnotSupport)), + "intron_annotation_support": ",".join(map(str, s.IntronAnnotSupport)), + "alignment_id": aln_id, + "source_gene_common_name": s.CommonName, + } # bring in extra tags for exRef if tools.nameConversions.aln_id_is_exref(aln_id): @@ -659,15 +784,16 @@ def add_exref_ids(s): denovo_tx_dict[aln_id][key] = val # record some metrics - 
metrics['denovo'][tx_mode][s.TranscriptClass.replace('_', ' ').capitalize()] += 1 - metrics['Transcript Modes'][tx_mode] += 1 - metrics['Splice Support']['unknown_likely_coding'].append(s.IntronRnaSupportPercent) - metrics['Exon Support']['unknown_likely_coding'].append(s.ExonRnaSupportPercent) + metrics["denovo"][tx_mode][s.TranscriptClass.replace("_", " ").capitalize()] += 1 + metrics["Transcript Modes"][tx_mode] += 1 + metrics["Splice Support"]["unknown_likely_coding"].append(s.IntronRnaSupportPercent) + metrics["Exon Support"]["unknown_likely_coding"].append(s.ExonRnaSupportPercent) # record how many of each type we threw out - for tx_mode, df in denovo_df.groupby('TranscriptMode'): - metrics['denovo'][tx_mode]['Discarded'] = len(df[(~df.TranscriptClass.isnull()) | (~df.Novel5pCap.isnull()) - | (~df.NovelPolyA.isnull())]) + for tx_mode, df in denovo_df.groupby("TranscriptMode"): + metrics["denovo"][tx_mode]["Discarded"] = len( + df[(~df.TranscriptClass.isnull()) | (~df.Novel5pCap.isnull()) | (~df.NovelPolyA.isnull())] + ) return denovo_tx_dict @@ -677,21 +803,25 @@ def deduplicate_consensus(consensus_dict, tx_dict, metrics): that are actually identical. Remove these, picking the best based on their score, favoring the transcript whose biotype matches the parent. """ + def resolve_duplicate(tx_list, consensus_dict): - biotype_txs = [tx for tx in tx_list if - consensus_dict[tx].get('gene_biotype', None) == consensus_dict[tx].get('transcript_biotype', - None)] + biotype_txs = [ + tx + for tx in tx_list + if consensus_dict[tx].get("gene_biotype", None) == consensus_dict[tx].get("transcript_biotype", None) + ] if len(biotype_txs) > 0: tx_list = biotype_txs - sorted_scores = sorted([[tx, consensus_dict[tx].get('score', 0)] for tx in tx_list], - key=lambda tx_s1: -tx_s1[1]) + sorted_scores = sorted( + [[tx, consensus_dict[tx].get("score", 0)] for tx in tx_list], key=lambda tx_s1: -tx_s1[1] + ) return sorted_scores[0][0] def add_duplicate_field(best_tx, tx_list, consensus_dict, deduplicated_consensus): deduplicated_consensus[best_tx] = consensus_dict[best_tx] tx_list = [tools.nameConversions.strip_alignment_numbers(aln_id) for aln_id in tx_list] best_tx_base = tools.nameConversions.strip_alignment_numbers(best_tx) - deduplicated_consensus[best_tx]['alternative_source_transcripts'] = ','.join(set(tx_list) - {best_tx_base}) + deduplicated_consensus[best_tx]["alternative_source_transcripts"] = ",".join(set(tx_list) - {best_tx_base}) # build a dictionary mapping duplicates making use of hashing intervals duplicates = collections.defaultdict(list) @@ -703,7 +833,7 @@ def add_duplicate_field(best_tx, tx_list, consensus_dict, deduplicated_consensus deduplicated_consensus = {} for tx_list in duplicates.values(): if len(tx_list) > 1: - metrics['Duplicate transcripts'][len(tx_list)] += 1 + metrics["Duplicate transcripts"][len(tx_list)] += 1 best_tx = resolve_duplicate(tx_list, consensus_dict) add_duplicate_field(best_tx, tx_list, consensus_dict, deduplicated_consensus) else: @@ -723,7 +853,7 @@ def resolve_opposite_strand(deduplicated_consensus, tx_dict, metrics): for tx_id, attrs in deduplicated_consensus.items(): tx_obj = tx_dict[tx_id] # don't try to resolve novel genes - source_gene = attrs['source_gene'] + source_gene = attrs["source_gene"] if source_gene is not None: gene_dict[source_gene].append([tx_obj, attrs]) else: @@ -734,13 +864,13 @@ def resolve_opposite_strand(deduplicated_consensus, tx_dict, metrics): if len(set(tx_obj.strand for tx_obj in tx_objs)) > 1: strand_scores = 
collections.Counter() for tx_obj, attrs in gene_dict[gene]: - strand_scores[tx_obj.strand] += attrs.get('score', 0) + strand_scores[tx_obj.strand] += attrs.get("score", 0) best_strand = sorted(strand_scores.items())[1][0] for tx_obj, attrs in gene_dict[gene]: if tx_obj.strand == best_strand: deduplicated_strand_resolved_consensus.append([tx_obj.name, attrs]) else: - metrics['Discarded by strand resolution'] += 1 + metrics["Discarded by strand resolution"] += 1 else: deduplicated_strand_resolved_consensus.extend([[tx_obj.name, attrs] for tx_obj, attrs in gene_dict[gene]]) return deduplicated_strand_resolved_consensus @@ -758,31 +888,37 @@ def resolve_overlapping_cds_intervals(overlapping_gene_distance, deduplicated_st # first, write genePred attr_df = [] with tools.fileOps.TemporaryFilePath() as tmp_gp, tools.fileOps.TemporaryFilePath() as tmp_clustered: - with open(tmp_gp, 'w') as outf: + with open(tmp_gp, "w") as outf: for tx_id, attrs in deduplicated_strand_resolved_consensus: tx_obj = tx_dict[tx_id] tools.fileOps.print_row(outf, tx_obj.get_gene_pred()) - attr_df.append([tx_id, attrs['transcript_class'], attrs['gene_biotype'], - attrs.get('source_gene', tx_obj.name2), attrs.get('score', None)]) + attr_df.append( + [ + tx_id, + attrs["transcript_class"], + attrs["gene_biotype"], + attrs.get("source_gene", tx_obj.name2), + attrs.get("score", None), + ] + ) # cluster - cmd = ['clusterGenes', '-cds', f'-minOverlappingBases={overlapping_gene_distance}', - tmp_clustered, 'no', tmp_gp] + cmd = ["clusterGenes", "-cds", f"-minOverlappingBases={overlapping_gene_distance}", tmp_clustered, "no", tmp_gp] tools.procOps.run_proc(cmd) - cluster_df = pd.read_csv(tmp_clustered, sep='\t') - attr_df = pd.DataFrame(attr_df, columns=['transcript_id', 'transcript_class', 'gene_biotype', 'gene_id', 'score']) - m = attr_df.merge(cluster_df, left_on='transcript_id', right_on='gene') # gene is transcript ID + cluster_df = pd.read_csv(tmp_clustered, sep="\t") + attr_df = pd.DataFrame(attr_df, columns=["transcript_id", "transcript_class", "gene_biotype", "gene_id", "score"]) + m = attr_df.merge(cluster_df, left_on="transcript_id", right_on="gene") # gene is transcript ID to_remove = set() # list of transcript IDs to remove - for cluster_id, group in m.groupby('#cluster'): - if len(set(group['gene_id'])) > 1: - if 'unknown_likely_coding' in set(group['gene_biotype']): # pick longest ORF - orfs = {tx_id: tx_dict[tx_id].cds_size for tx_id in group['transcript_id']} + for cluster_id, group in m.groupby("#cluster"): + if len(set(group["gene_id"])) > 1: + if "unknown_likely_coding" in set(group["gene_biotype"]): # pick longest ORF + orfs = {tx_id: tx_dict[tx_id].cds_size for tx_id in group["transcript_id"]} best_tx = sorted(iter(orfs.items()), key=lambda x: x[1])[-1][0] tx_df = group[group.transcript_id == best_tx].iloc[0] best_gene = tx_df.gene_id else: # pick highest average score - avg_scores = group[['gene_id', 'score']].groupby('gene_id', as_index=False).mean() - best_gene = avg_scores.sort_values('score', ascending=False).iloc[0]['gene_id'] + avg_scores = group[["gene_id", "score"]].groupby("gene_id", as_index=False).mean() + best_gene = avg_scores.sort_values("score", ascending=False).iloc[0]["gene_id"] to_remove.update(set(group[group.gene_id != best_gene].transcript_id)) return [[tx_id, attrs] for tx_id, attrs in deduplicated_strand_resolved_consensus if tx_id not in to_remove] @@ -801,55 +937,59 @@ def calculate_completeness(final_consensus, metrics): # don't count novel transcripts towards completeness if 
tools.nameConversions.aln_id_is_cgp(aln_id) or tools.nameConversions.aln_id_is_pb(aln_id): continue - genes[c['gene_biotype']].add(c['source_gene']) - txs[c['transcript_biotype']] += 1 + genes[c["gene_biotype"]].add(c["source_gene"]) + txs[c["transcript_biotype"]] += 1 genes = {biotype: len(gene_list) for biotype, gene_list in genes.items()} - metrics['Completeness'] = {'Gene': genes, 'Transcript': txs} + metrics["Completeness"] = {"Gene": genes, "Transcript": txs} def calculate_improvement_metrics(final_consensus, scored_df, tm_eval_df, hgm_df, metrics): """For coding transcripts, how much did we improve the metrics?""" - tm_df = tm_eval_df.reset_index()[['TransMapOriginalIntronsPercent', 'TranscriptId']] - hgm_df_subset = hgm_df[hgm_df['AlignmentId'].apply(tools.nameConversions.aln_id_is_transmap)] - hgm_df_subset = hgm_df_subset[['TranscriptId', 'IntronAnnotSupportPercent', 'IntronRnaSupportPercent']] - tm_df = pd.merge(tm_df, hgm_df_subset, on='TranscriptId') - df = pd.merge(tm_df, scored_df.reset_index(), on='TranscriptId', suffixes=['TransMap', '']) - df = df.drop_duplicates(subset='AlignmentId') # why do I need to do this? - df = df.set_index('AlignmentId') - metrics['Evaluation Improvement'] = {'changes': [], 'unchanged': 0} + tm_df = tm_eval_df.reset_index()[["TransMapOriginalIntronsPercent", "TranscriptId"]] + hgm_df_subset = hgm_df[hgm_df["AlignmentId"].apply(tools.nameConversions.aln_id_is_transmap)] + hgm_df_subset = hgm_df_subset[["TranscriptId", "IntronAnnotSupportPercent", "IntronRnaSupportPercent"]] + tm_df = pd.merge(tm_df, hgm_df_subset, on="TranscriptId") + df = pd.merge(tm_df, scored_df.reset_index(), on="TranscriptId", suffixes=["TransMap", ""]) + df = df.drop_duplicates(subset="AlignmentId") # why do I need to do this? + df = df.set_index("AlignmentId") + metrics["Evaluation Improvement"] = {"changes": [], "unchanged": 0} for aln_id, c in final_consensus: - if c['transcript_biotype'] != 'protein_coding': + if c["transcript_biotype"] != "protein_coding": continue - elif 'exRef' in c['transcript_modes'] or 'augPB' in c['transcript_modes'] or 'augCGP' in c['transcript_modes']: + elif "exRef" in c["transcript_modes"] or "augPB" in c["transcript_modes"] or "augCGP" in c["transcript_modes"]: continue - elif 'transMap' in c['transcript_modes']: - metrics['Evaluation Improvement']['unchanged'] += 1 + elif "transMap" in c["transcript_modes"]: + metrics["Evaluation Improvement"]["unchanged"] += 1 continue tx_s = df.loc[aln_id] - metrics['Evaluation Improvement']['changes'].append([tx_s.TransMapOriginalIntronsPercent, - tx_s.IntronAnnotSupportPercentTransMap, - tx_s.IntronRnaSupportPercentTransMap, - tx_s.OriginalIntronsPercent_mRNA, - tx_s.IntronAnnotSupportPercent, - tx_s.IntronRnaSupportPercent, - tx_s.TransMapGoodness, - tx_s.AlnGoodness_mRNA]) + metrics["Evaluation Improvement"]["changes"].append( + [ + tx_s.TransMapOriginalIntronsPercent, + tx_s.IntronAnnotSupportPercentTransMap, + tx_s.IntronRnaSupportPercentTransMap, + tx_s.OriginalIntronsPercent_mRNA, + tx_s.IntronAnnotSupportPercent, + tx_s.IntronRnaSupportPercent, + tx_s.TransMapGoodness, + tx_s.AlnGoodness_mRNA, + ] + ) def calculate_indel_metrics(final_consensus, eval_df, metrics): """How many transcripts in the final consensus have indels? 
How many did we have in transMap?""" if len(eval_df) == 0: # edge case where no transcripts hit indel filters - metrics['transMap Indels'] = {} - metrics ['Consensus Indels'] = {} + metrics["transMap Indels"] = {} + metrics["Consensus Indels"] = {} return - eval_df_transmap = eval_df[eval_df['AlignmentId'].apply(tools.nameConversions.aln_id_is_transmap)] - tm_vals = eval_df_transmap.set_index('AlignmentId').sum(axis=0) + eval_df_transmap = eval_df[eval_df["AlignmentId"].apply(tools.nameConversions.aln_id_is_transmap)] + tm_vals = eval_df_transmap.set_index("AlignmentId").sum(axis=0) tm_vals = 100.0 * tm_vals / len(set(eval_df_transmap.index)) - metrics['transMap Indels'] = tm_vals.to_dict() + metrics["transMap Indels"] = tm_vals.to_dict() consensus_ids = set(list(zip(*final_consensus))[0]) - consensus_vals = eval_df[eval_df['AlignmentId'].isin(consensus_ids)].set_index('AlignmentId').sum(axis=0) + consensus_vals = eval_df[eval_df["AlignmentId"].isin(consensus_ids)].set_index("AlignmentId").sum(axis=0) consensus_vals = 100.0 * consensus_vals / len(final_consensus) - metrics['Consensus Indels'] = consensus_vals.to_dict() + metrics["Consensus Indels"] = consensus_vals.to_dict() ### @@ -867,32 +1007,32 @@ def write_consensus_gps(consensus_gp, consensus_gp_info, final_consensus, tx_dic consensus_gene_dict = DefaultOrderedDict(lambda: DefaultOrderedDict(list)) # used to make gff3 next gp_infos = [] consensus_gp_target = luigi.LocalTarget(consensus_gp) - with consensus_gp_target.open('w') as out_gp: + with consensus_gp_target.open("w") as out_gp: for tx_count, (tx, attrs) in enumerate(final_consensus, 1): attrs = attrs.copy() tx_obj = tx_dict[tx] - name = id_template.format(genome=genome, tag_type='T', unique_id=tx_count) - score = int(round(attrs.get('score', 0))) - source_gene = attrs['source_gene'] + name = id_template.format(genome=genome, tag_type="T", unique_id=tx_count) + score = int(round(attrs.get("score", 0))) + source_gene = attrs["source_gene"] if source_gene is None: source_gene = tx_obj.name2 if source_gene not in genes_seen[tx_obj.chromosome]: gene_count += 1 genes_seen[tx_obj.chromosome][source_gene] = gene_count gene_id = genes_seen[tx_obj.chromosome][source_gene] - name2 = id_template.format(genome=genome, tag_type='G', unique_id=gene_id) - out_gp.write('\t'.join(tx_obj.get_gene_pred(name=name, name2=name2, score=score)) + '\n') - attrs['transcript_id'] = name - attrs['gene_id'] = name2 + name2 = id_template.format(genome=genome, tag_type="G", unique_id=gene_id) + out_gp.write("\t".join(tx_obj.get_gene_pred(name=name, name2=name2, score=score)) + "\n") + attrs["transcript_id"] = name + attrs["gene_id"] = name2 gp_infos.append(attrs) consensus_gene_dict[tx_obj.chromosome][name2].append([tx_obj, attrs]) gp_info_df = pd.DataFrame(gp_infos) - gp_info_df = gp_info_df.set_index(['gene_id', 'transcript_id']) + gp_info_df = gp_info_df.set_index(["gene_id", "transcript_id"]) # its possible alternative_source_transcripts did not end up in the final result, so add it - if 'alternative_source_transcripts' not in gp_info_df.columns: - gp_info_df['alternative_source_transcripts'] = ['N/A'] * len(gp_info_df) - with luigi.LocalTarget(consensus_gp_info).open('w') as outf: - gp_info_df.to_csv(outf, sep='\t', na_rep='N/A') + if "alternative_source_transcripts" not in gp_info_df.columns: + gp_info_df["alternative_source_transcripts"] = ["N/A"] * len(gp_info_df) + with luigi.LocalTarget(consensus_gp_info).open("w") as outf: + gp_info_df.to_csv(outf, sep="\t", na_rep="N/A") return consensus_gene_dict 
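[Note on the hunk above] write_consensus_gps emits the consensus genePred plus a companion gp_info table: tab-separated, indexed on gene_id and transcript_id, missing values serialized as "N/A", and an alternative_source_transcripts column guaranteed to exist. A minimal reader-side sketch of that format follows; the path consensus.gp_info and the source_gene grouping are illustrative assumptions only, not anything this patch sets:

import pandas as pd

# Hypothetical output path; the real target is whatever the caller passes as consensus_gp_info.
gp_info = pd.read_csv("consensus.gp_info", sep="\t", na_values="N/A")

# The writer sets a (gene_id, transcript_id) index before to_csv, so restore it on read.
gp_info = gp_info.set_index(["gene_id", "transcript_id"])

# Example use: count consensus transcripts per source gene
# (rows with a missing source_gene are novel loci and are dropped by groupby).
print(gp_info.groupby("source_gene").size().head())

This is only an illustration of the existing on-disk format; the surrounding hunks requote and reflow the code without changing what gets written.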
@@ -900,44 +1040,46 @@ def write_consensus_gff3(consensus_gene_dict, consensus_gff3): """ Write the consensus set in gff3 format """ + def convert_attrs(attrs, id_field): """converts the attrs dict to a attributes field. assigns name to the gene common name for display""" - attrs['ID'] = id_field - if 'score' in attrs: - score = 10 * attrs['score'] - del attrs['score'] + attrs["ID"] = id_field + if "score" in attrs: + score = 10 * attrs["score"] + del attrs["score"] else: - score = '.' - if 'source_gene_common_name' in attrs and isinstance(attrs['source_gene_common_name'], str): - attrs['Name'] = attrs['source_gene_common_name'] + score = "." + if "source_gene_common_name" in attrs and isinstance(attrs["source_gene_common_name"], str): + attrs["Name"] = attrs["source_gene_common_name"] else: - attrs['Name'] = attrs['gene_id'] + attrs["Name"] = attrs["gene_id"] # convert empty strings into nan attrs_str = [] for key, val in attrs.items(): val = str(val) if len(val) == 0: - val = 'nan' - val = str(val).replace('=', '%3D').replace(';', '%3B') - key = key.replace('=', '%3D').replace(';', '%3B') + val = "nan" + val = str(val).replace("=", "%3D").replace(";", "%3B") + key = key.replace("=", "%3D").replace(";", "%3B") attrs_str.append(f"{key}={val}") return score, ";".join(attrs_str) def find_feature_support(attrs, feature, i): """Extracts the boolean value from the comma delimited string""" try: - vals = list(map(bool, attrs[feature].split(','))) + vals = list(map(bool, attrs[feature].split(","))) except KeyError: - return 'N/A' + return "N/A" return vals[i] def generate_gene_record(chrom, tx_objs, gene_id, attrs_list): """calculates the gene interval for this list of tx""" + def find_all_tx_modes(attrs_list): tx_modes = set() for attrs in attrs_list: - tx_modes.update(attrs['transcript_modes'].split(',')) - return ','.join(tx_modes) + tx_modes.update(attrs["transcript_modes"].split(",")) + return ",".join(tx_modes) intervals = set() for tx in tx_objs: @@ -946,24 +1088,30 @@ def find_all_tx_modes(attrs_list): strand = tx_objs[0].strand # subset the attrs to gene fields attrs = attrs_list[0] - useful_keys = ['source_gene_common_name', 'source_gene', 'gene_biotype', - 'alternative_source_transcripts', 'gene_alternate_contigs', - 'gene_name', 'gene_id'] + useful_keys = [ + "source_gene_common_name", + "source_gene", + "gene_biotype", + "alternative_source_transcripts", + "gene_alternate_contigs", + "gene_name", + "gene_id", + ] attrs = {key: attrs[key] for key in useful_keys if key in attrs} - attrs['transcript_modes'] = find_all_tx_modes(attrs_list) + attrs["transcript_modes"] = find_all_tx_modes(attrs_list) score, attrs_field = convert_attrs(attrs, gene_id) - return [chrom, 'CAT', 'gene', intervals[0].start + 1, intervals[-1].stop, score, strand, '.', attrs_field] + return [chrom, "CAT", "gene", intervals[0].start + 1, intervals[-1].stop, score, strand, ".", attrs_field] def generate_transcript_record(chrom, tx_obj, attrs): """generates transcript records, calls generate_exon_records to generate those too""" - tx_id = attrs['transcript_id'] - attrs['Parent'] = attrs['gene_id'] + tx_id = attrs["transcript_id"] + attrs["Parent"] = attrs["gene_id"] score, attrs_field = convert_attrs(attrs, tx_id) - yield [chrom, 'CAT', 'transcript', tx_obj.start + 1, tx_obj.stop, score, tx_obj.strand, '.', attrs_field] + yield [chrom, "CAT", "transcript", tx_obj.start + 1, tx_obj.stop, score, tx_obj.strand, ".", attrs_field] # hack to remove the frameshift field from lower objects # TODO: record the actual exon with 
the frameshift. - if 'frameshift' in attrs: - del attrs['frameshift'] + if "frameshift" in attrs: + del attrs["frameshift"] for line in generate_intron_exon_records(chrom, tx_obj, tx_id, attrs): yield line if tx_obj.cds_size > 3: @@ -972,47 +1120,74 @@ def generate_transcript_record(chrom, tx_obj, attrs): def generate_intron_exon_records(chrom, tx_obj, tx_id, attrs): """generates intron and exon records""" - attrs['Parent'] = tx_id + attrs["Parent"] = tx_id # exon records cds_i = 0 # keep track of position of CDS in case of entirely non-coding exons for i, (exon, exon_frame) in enumerate(zip(*[tx_obj.exon_intervals, tx_obj.exon_frames])): - attrs['rna_support'] = find_feature_support(attrs, 'exon_rna_support', i) - attrs['reference_support'] = find_feature_support(attrs, 'exon_annotation_support', i) - score, attrs_field = convert_attrs(attrs, 'exon:{}:{}'.format(tx_id, i)) - yield [chrom, 'CAT', 'exon', exon.start + 1, exon.stop, score, exon.strand, '.', attrs_field] + attrs["rna_support"] = find_feature_support(attrs, "exon_rna_support", i) + attrs["reference_support"] = find_feature_support(attrs, "exon_annotation_support", i) + score, attrs_field = convert_attrs(attrs, "exon:{}:{}".format(tx_id, i)) + yield [chrom, "CAT", "exon", exon.start + 1, exon.stop, score, exon.strand, ".", attrs_field] cds_interval = exon.intersection(tx_obj.coding_interval) if cds_interval is not None: - score, attrs_field = convert_attrs(attrs, 'CDS:{}:{}'.format(tx_id, cds_i)) + score, attrs_field = convert_attrs(attrs, "CDS:{}:{}".format(tx_id, cds_i)) cds_i += 1 - yield [chrom, 'CAT', 'CDS', cds_interval.start + 1, cds_interval.stop, score, exon.strand, - tools.transcripts.convert_frame(exon_frame), attrs_field] + yield [ + chrom, + "CAT", + "CDS", + cds_interval.start + 1, + cds_interval.stop, + score, + exon.strand, + tools.transcripts.convert_frame(exon_frame), + attrs_field, + ] # intron records for i, intron in enumerate(tx_obj.intron_intervals): if len(intron) == 0: continue - attrs['rna_support'] = find_feature_support(attrs, 'intron_rna_support', i) - attrs['reference_support'] = find_feature_support(attrs, 'intron_annotation_support', i) - score, attrs_field = convert_attrs(attrs, 'intron:{}:{}'.format(tx_id, i)) - yield [chrom, 'CAT', 'intron', intron.start + 1, intron.stop, score, intron.strand, '.', attrs_field] + attrs["rna_support"] = find_feature_support(attrs, "intron_rna_support", i) + attrs["reference_support"] = find_feature_support(attrs, "intron_annotation_support", i) + score, attrs_field = convert_attrs(attrs, "intron:{}:{}".format(tx_id, i)) + yield [chrom, "CAT", "intron", intron.start + 1, intron.stop, score, intron.strand, ".", attrs_field] def generate_start_stop_codon_records(chrom, tx_obj, tx_id, attrs): """generate start/stop codon GFF3 records, handling frame appropriately""" - if attrs.get('valid_start') is True: - score, attrs_field = convert_attrs(attrs, 'start_codon:{}'.format(tx_id)) + if attrs.get("valid_start") is True: + score, attrs_field = convert_attrs(attrs, "start_codon:{}".format(tx_id)) for interval in tx_obj.get_start_intervals(): - yield [chrom, 'CAT', 'start_codon', interval.start + 1, interval.stop, score, tx_obj.strand, - interval.data, attrs_field] - if attrs.get('valid_stop') is True: - score, attrs_field = convert_attrs(attrs, 'stop_codon:{}'.format(tx_id)) + yield [ + chrom, + "CAT", + "start_codon", + interval.start + 1, + interval.stop, + score, + tx_obj.strand, + interval.data, + attrs_field, + ] + if attrs.get("valid_stop") is True: + score, 
attrs_field = convert_attrs(attrs, "stop_codon:{}".format(tx_id)) for interval in tx_obj.get_stop_intervals(): - yield [chrom, 'CAT', 'stop_codon', interval.start + 1, interval.stop, score, tx_obj.strand, - interval.data, attrs_field] + yield [ + chrom, + "CAT", + "stop_codon", + interval.start + 1, + interval.stop, + score, + tx_obj.strand, + interval.data, + attrs_field, + ] # main gff3 writing logic consensus_gff3 = luigi.LocalTarget(consensus_gff3) - with consensus_gff3.open('w') as out_gff3: - out_gff3.write('##gff-version 3\n') + with consensus_gff3.open("w") as out_gff3: + out_gff3.write("##gff-version 3\n") for chrom in sorted(consensus_gene_dict): for gene_id, tx_list in consensus_gene_dict[chrom].items(): tx_objs, attrs_list = list(zip(*tx_list)) @@ -1028,10 +1203,10 @@ def write_consensus_fastas(consensus_gene_dict, consensus_fasta, consensus_prote seq_dict = tools.bio.get_sequence_dict(fasta) consensus_fasta = luigi.LocalTarget(consensus_fasta) consensus_protein_fasta = luigi.LocalTarget(consensus_protein_fasta) - with consensus_fasta.open('w') as cfa, consensus_protein_fasta.open('w') as cpfa: + with consensus_fasta.open("w") as cfa, consensus_protein_fasta.open("w") as cpfa: for chrom in sorted(consensus_gene_dict): for gene_id, tx_list in consensus_gene_dict[chrom].items(): for tx_obj, attrs in tx_list: - tools.bio.write_fasta(cfa, attrs['transcript_id'], tx_obj.get_mrna(seq_dict)) + tools.bio.write_fasta(cfa, attrs["transcript_id"], tx_obj.get_mrna(seq_dict)) if tx_obj.cds_size > 0: - tools.bio.write_fasta(cpfa, attrs['transcript_id'], tx_obj.get_protein_sequence(seq_dict)) + tools.bio.write_fasta(cpfa, attrs["transcript_id"], tx_obj.get_protein_sequence(seq_dict)) diff --git a/cat/exceptions.py b/cat/exceptions.py index 44ff3d82..51ff60f8 100644 --- a/cat/exceptions.py +++ b/cat/exceptions.py @@ -1,23 +1,26 @@ class UserException(Exception): """generic exception to use when a user makes a mistake""" + pass class ToolMissingException(UserException): """exception to use when a tool is missing, usually checked in a task validate() method""" + pass class InputMissingException(UserException): """exception to use when input data are missing""" + pass class InvalidInputException(UserException): """exception to use when something about the input is invalid""" + pass class MissingFileException(UserException): """exception to use when a input file is missing""" - diff --git a/cat/filter_transmap.py b/cat/filter_transmap.py index 2cac571f..5ac696ce 100644 --- a/cat/filter_transmap.py +++ b/cat/filter_transmap.py @@ -39,8 +39,17 @@ pd.options.mode.chained_assignment = None -def filter_transmap(tm_psl, ref_psl, tm_gp, db_path, psl_tgt, global_near_best, filter_overlapping_genes, - overlapping_gene_distance, json_tgt): +def filter_transmap( + tm_psl, + ref_psl, + tm_gp, + db_path, + psl_tgt, + global_near_best, + filter_overlapping_genes, + overlapping_gene_distance, + json_tgt, +): """ Entry point for transMap filtering. 
:param tm_psl: input PSL @@ -62,7 +71,7 @@ def filter_transmap(tm_psl, ref_psl, tm_gp, db_path, psl_tgt, global_near_best, # pre-filter out suspiciously large spans size_filtered, num_too_long = ref_span(unfiltered, ref_psl_dict) tmp_size_filtered = tools.fileOps.get_tmp_file() - with open(tmp_size_filtered, 'w') as outf: + with open(tmp_size_filtered, "w") as outf: for aln in size_filtered.values(): tools.fileOps.print_row(outf, aln.psl_string()) @@ -81,9 +90,19 @@ def filter_transmap(tm_psl, ref_psl, tm_gp, db_path, psl_tgt, global_near_best, def hash_aln(aln, aln_id): """Hacky way to hash an alignment""" m = hashlib.sha256() - for l in [aln.t_name, aln.t_start, aln.t_end, aln.matches, aln.mismatches, aln.block_count, - tuple(aln.t_starts), tuple(aln.q_starts), tuple(aln.block_sizes), aln_id]: - m.update(str(l).encode('utf-8')) + for l in [ + aln.t_name, + aln.t_start, + aln.t_end, + aln.matches, + aln.mismatches, + aln.block_count, + tuple(aln.t_starts), + tuple(aln.q_starts), + tuple(aln.block_sizes), + aln_id, + ]: + m.update(str(l).encode("utf-8")) return m.hexdigest() unfiltered_hash_table = {} @@ -93,14 +112,20 @@ def hash_aln(aln, aln_id): assert len(unfiltered_hash_table) == len(size_filtered) with tools.fileOps.TemporaryFilePath() as local_tmp, tools.fileOps.TemporaryFilePath() as strip_tmp: - with open(strip_tmp, 'w') as outf: + with open(strip_tmp, "w") as outf: for rec in size_filtered.values(): rec = deepcopy(rec) rec.q_name = tools.nameConversions.strip_alignment_numbers(rec.q_name) tools.fileOps.print_row(outf, rec.psl_string()) - cmd = ['pslCDnaFilter', '-globalNearBest={}'.format(global_near_best), - '-minCover=0.1', '-verbose=0', - '-minSpan=0.2', strip_tmp, '/dev/stdout'] + cmd = [ + "pslCDnaFilter", + "-globalNearBest={}".format(global_near_best), + "-minCover=0.1", + "-verbose=0", + "-minSpan=0.2", + strip_tmp, + "/dev/stdout", + ] tools.procOps.run_proc(cmd, stdout=local_tmp) filtered_alns = list(tools.psl.psl_iterator(local_tmp)) @@ -112,23 +137,25 @@ def hash_aln(aln, aln_id): # report counts by biotype grouped = tools.psl.group_alignments_by_qname(global_best) unfiltered_grouped = tools.psl.group_alignments_by_qname(iter(unfiltered.values())) - metrics = {'Paralogy': collections.defaultdict(lambda: collections.Counter()), - 'UnfilteredParalogy': collections.defaultdict(lambda: collections.Counter())} + metrics = { + "Paralogy": collections.defaultdict(lambda: collections.Counter()), + "UnfilteredParalogy": collections.defaultdict(lambda: collections.Counter()), + } paralogy_df = [] for tx_id, alns in grouped.items(): biotype = transcript_biotype_map[tx_id] - putative_paralogs = ','.join(sorted([x.q_name for x in alns if x.q_name not in global_best_ids])) - all_alns = ','.join(sorted([x.q_name for x in unfiltered_grouped[tx_id] if x.q_name not in global_best_ids])) + putative_paralogs = ",".join(sorted([x.q_name for x in alns if x.q_name not in global_best_ids])) + all_alns = ",".join(sorted([x.q_name for x in unfiltered_grouped[tx_id] if x.q_name not in global_best_ids])) paralogy_df.append([tx_id, putative_paralogs, all_alns]) - metrics['Paralogy'][biotype][len(alns)] += 1 - metrics['UnfilteredParalogy'][biotype][len(unfiltered_grouped[tx_id])] += 1 + metrics["Paralogy"][biotype][len(alns)] += 1 + metrics["UnfilteredParalogy"][biotype][len(unfiltered_grouped[tx_id])] += 1 - paralogy_df = pd.DataFrame(paralogy_df, columns=['TranscriptId', 'Paralogy', 'UnfilteredParalogy']) + paralogy_df = pd.DataFrame(paralogy_df, columns=["TranscriptId", "Paralogy", 
"UnfilteredParalogy"]) # run pslCDnaFilter again, with no options, to get scores with tools.fileOps.TemporaryFilePath() as tmp_verbose: - cmd = ['pslCDnaFilter', '-verbose=5', tmp_size_filtered, '/dev/stdout'] - tools.procOps.run_proc(cmd, stderr=tmp_verbose, stdout='/dev/null') + cmd = ["pslCDnaFilter", "-verbose=5", tmp_size_filtered, "/dev/stdout"] + tools.procOps.run_proc(cmd, stderr=tmp_verbose, stdout="/dev/null") scores = parse_verbose(tmp_verbose) # now coding and non-coding genes are split up. Coding genes are any genes who have a transcript with an ORF @@ -137,33 +164,54 @@ def hash_aln(aln, aln_id): # identify all genes that are non-coding. Non-coding is defined as genes who have no ORFs global_best_by_gene = tools.transcripts.group_transcripts_by_name2(global_best_txs) - coding_genes = {gene_id for gene_id, tx_list in global_best_by_gene.items() - if any(x.cds_size > 0 for x in tx_list)} + coding_genes = {gene_id for gene_id, tx_list in global_best_by_gene.items() if any(x.cds_size > 0 for x in tx_list)} - with tools.fileOps.TemporaryFilePath() as coding_tmp, tools.fileOps.TemporaryFilePath() as noncoding_tmp, \ - tools.fileOps.TemporaryFilePath() as coding_clusters, tools.fileOps.TemporaryFilePath() as noncoding_clusters: - with open(coding_clusters, 'w') as out_coding, open(noncoding_clusters, 'w') as out_noncoding: + with tools.fileOps.TemporaryFilePath() as coding_tmp, tools.fileOps.TemporaryFilePath() as noncoding_tmp, tools.fileOps.TemporaryFilePath() as coding_clusters, tools.fileOps.TemporaryFilePath() as noncoding_clusters: + with open(coding_clusters, "w") as out_coding, open(noncoding_clusters, "w") as out_noncoding: for tx in global_best_txs: if tx.name2 in coding_genes: tools.fileOps.print_row(out_coding, tx.get_gene_pred()) else: tools.fileOps.print_row(out_noncoding, tx.get_gene_pred()) - cmd = ['clusterGenes', '-cds', f'-minOverlappingBases={overlapping_gene_distance}', - coding_tmp, 'no', coding_clusters] + cmd = [ + "clusterGenes", + "-cds", + f"-minOverlappingBases={overlapping_gene_distance}", + coding_tmp, + "no", + coding_clusters, + ] tools.procOps.run_proc(cmd) - cmd = ['clusterGenes', f'-minOverlappingBases={overlapping_gene_distance}', - noncoding_tmp, 'no', noncoding_clusters] + cmd = [ + "clusterGenes", + f"-minOverlappingBases={overlapping_gene_distance}", + noncoding_tmp, + "no", + noncoding_clusters, + ] tools.procOps.run_proc(cmd) - coding_clustered = pd.read_csv(coding_tmp, sep='\t') - noncoding_clustered = pd.read_csv(noncoding_tmp, sep='\t') - - metrics['Gene Family Collapse'] = collections.defaultdict(lambda: collections.Counter()) - coding_merged_df, coding_collapse_filtered = filter_clusters(coding_clustered, transcript_gene_map, - gene_name_map, scores, metrics, gene_biotype_map, - filter_overlapping_genes) - noncoding_merged_df, noncoding_collapse_filtered = filter_clusters(noncoding_clustered, transcript_gene_map, - gene_name_map, scores, metrics, gene_biotype_map, - filter_overlapping_genes) + coding_clustered = pd.read_csv(coding_tmp, sep="\t") + noncoding_clustered = pd.read_csv(noncoding_tmp, sep="\t") + + metrics["Gene Family Collapse"] = collections.defaultdict(lambda: collections.Counter()) + coding_merged_df, coding_collapse_filtered = filter_clusters( + coding_clustered, + transcript_gene_map, + gene_name_map, + scores, + metrics, + gene_biotype_map, + filter_overlapping_genes, + ) + noncoding_merged_df, noncoding_collapse_filtered = filter_clusters( + noncoding_clustered, + transcript_gene_map, + gene_name_map, + scores, + 
metrics, + gene_biotype_map, + filter_overlapping_genes, + ) merged_collapse_filtered = pd.concat([coding_collapse_filtered, noncoding_collapse_filtered]) merged_df = pd.concat([coding_merged_df, noncoding_merged_df]) @@ -181,8 +229,8 @@ def hash_aln(aln, aln_id): rescued_txs = [] # for each gene ID that survived filtering, find their interval - for gene_id, group in merged_collapse_filtered.groupby('gene_id'): - assert len(set(group['#cluster'])) == 1 + for gene_id, group in merged_collapse_filtered.groupby("gene_id"): + assert len(set(group["#cluster"])) == 1 tx_intervals = [] for _, s in group.iterrows(): tx_intervals.append(tools.intervals.ChromosomeInterval(s.chrom, s.txStart, s.txEnd, s.strand)) @@ -196,38 +244,41 @@ def hash_aln(aln, aln_id): # the final step is filtering for duplicates. Duplicates here means that we have multiple transMap # mapping to the same locus. Pick the highest scores combined_txs = rescued_txs + list(merged_collapse_filtered.gene) - combined_tx_df = pd.DataFrame(combined_txs, columns=['AlignmentId']) - combined_tx_df['score'] = [scores[x] for x in combined_tx_df.AlignmentId] - combined_tx_df['TranscriptId'] = [tools.nameConversions.strip_alignment_numbers(x) for x in combined_tx_df.AlignmentId] - combined_tx_df['GeneId'] = [transcript_gene_map[x] for x in combined_tx_df.TranscriptId] - combined_tx_df = combined_tx_df.sort_values('score') - combined_tx_df = combined_tx_df.groupby('TranscriptId', as_index=False).first() + combined_tx_df = pd.DataFrame(combined_txs, columns=["AlignmentId"]) + combined_tx_df["score"] = [scores[x] for x in combined_tx_df.AlignmentId] + combined_tx_df["TranscriptId"] = [ + tools.nameConversions.strip_alignment_numbers(x) for x in combined_tx_df.AlignmentId + ] + combined_tx_df["GeneId"] = [transcript_gene_map[x] for x in combined_tx_df.TranscriptId] + combined_tx_df = combined_tx_df.sort_values("score") + combined_tx_df = combined_tx_df.groupby("TranscriptId", as_index=False).first() # construct the output DataFrame - resolved_df = combined_tx_df.merge(merged_df, on='GeneId', how='left') - resolved_df = resolved_df.drop('score', axis=1) + resolved_df = combined_tx_df.merge(merged_df, on="GeneId", how="left") + resolved_df = resolved_df.drop("score", axis=1) # write the paralog resolved PSL - with psl_tgt.open('w') as outf: + with psl_tgt.open("w") as outf: for aln_id in resolved_df.AlignmentId: aln = unfiltered[aln_id] tools.fileOps.print_row(outf, aln.psl_string()) # resolve split genes using the scores and the best IDs - resolved_df, split_gene_metrics = resolve_split_genes(tmp_size_filtered, transcript_gene_map, - resolved_df, unfiltered_tx_dict) + resolved_df, split_gene_metrics = resolve_split_genes( + tmp_size_filtered, transcript_gene_map, resolved_df, unfiltered_tx_dict + ) # add in paralogy calls from before - resolved_df = resolved_df.merge(paralogy_df, on='TranscriptId') - metrics['Split Genes'] = split_gene_metrics + resolved_df = resolved_df.merge(paralogy_df, on="TranscriptId") + metrics["Split Genes"] = split_gene_metrics os.remove(tmp_size_filtered) # write the JSON tools.fileOps.ensure_file_dir(json_tgt.path) - with json_tgt.open('w') as outf: + with json_tgt.open("w") as outf: json.dump(metrics, outf) - return resolved_df.set_index(['GeneId', 'TranscriptId']) + return resolved_df.set_index(["GeneId", "TranscriptId"]) def ref_span(aln_dict, ref_aln_dict, max_span=5): @@ -255,23 +306,23 @@ def ref_span(aln_dict, ref_aln_dict, max_span=5): def parse_stats(stats): """Parse the stats output, provide summary statistics 
to log""" - stats = pd.read_csv(stats, sep='\t', names=['mode', 'seqs', 'alns'], index_col=0) + stats = pd.read_csv(stats, sep="\t", names=["mode", "seqs", "alns"], index_col=0) # munge the stats and report them - stats.index = [x.replace(' ', '') for x in stats.index] + stats.index = [x.replace(" ", "") for x in stats.index] stats = stats.T stats_dict = {} - if 'dropminCover:' in stats: - stats_dict['Coverage Filter'] = int(stats['dropminCover:'].alns) + if "dropminCover:" in stats: + stats_dict["Coverage Filter"] = int(stats["dropminCover:"].alns) else: - stats_dict['Coverage Filter'] = 0 - if 'dropminSpan:' in stats: - stats_dict['Min Span Distance'] = int(stats['dropminSpan:'].alns) + stats_dict["Coverage Filter"] = 0 + if "dropminSpan:" in stats: + stats_dict["Min Span Distance"] = int(stats["dropminSpan:"].alns) else: - stats_dict['Min Span Distance'] = 0 - if 'dropglobalBest:' in stats: - stats_dict['Paralog Filter'] = int(stats['dropglobalBest:'].alns) + stats_dict["Min Span Distance"] = 0 + if "dropglobalBest:" in stats: + stats_dict["Paralog Filter"] = int(stats["dropglobalBest:"].alns) else: - stats_dict['Paralog Filter'] = 0 + stats_dict["Paralog Filter"] = 0 return stats_dict @@ -279,11 +330,11 @@ def parse_verbose(verbose): """Parse the verbose output to retain score information for resolution""" scores = {} for l in open(verbose): - if l.startswith('align'): + if l.startswith("align"): l = l.split() - aln_id = l[-3].rsplit(':', 1)[0].split(']')[1] + aln_id = l[-3].rsplit(":", 1)[0].split("]")[1] score = l[5] - score = float(score.split('=')[1]) + score = float(score.split("=")[1]) scores[aln_id] = score return scores @@ -293,8 +344,8 @@ def find_best_group(group, key): Resolve a cluster by finding the highest average score. Key determines if we are currently resolving cluster or gene_id """ - avg_scores = group[[key, 'scores']].groupby(key, as_index=False).mean() - return avg_scores.sort_values('scores', ascending=False).iloc[0][key] + avg_scores = group[[key, "scores"]].groupby(key, as_index=False).mean() + return avg_scores.sort_values("scores", ascending=False).iloc[0][key] def construct_alt_loci(group, best_cluster): @@ -302,55 +353,58 @@ def construct_alt_loci(group, best_cluster): For paralogous genes, find the locations of alt loci """ intervals = collections.defaultdict(list) - for cluster_id, x in group.set_index('#cluster').iterrows(): + for cluster_id, x in group.set_index("#cluster").iterrows(): if cluster_id != best_cluster: - intervals[x.chrom].append(tools.intervals.ChromosomeInterval(x.chrom, x.txStart, x.txEnd, '.')) + intervals[x.chrom].append(tools.intervals.ChromosomeInterval(x.chrom, x.txStart, x.txEnd, ".")) merged_intervals = [] for chrom, i in intervals.items(): merged_intervals.extend(tools.intervals.gap_merge_intervals(i, 1000)) - return ','.join('{}:{}-{}'.format(x.chromosome, x.start, x.stop) for x in merged_intervals) + return ",".join("{}:{}-{}".format(x.chromosome, x.start, x.stop) for x in merged_intervals) -def filter_clusters(clustered, transcript_gene_map, gene_name_map, scores, metrics, gene_biotype_map, - filter_overlapping_genes): +def filter_clusters( + clustered, transcript_gene_map, gene_name_map, scores, metrics, gene_biotype_map, filter_overlapping_genes +): """ Wrapper for taking the output of clusterGenes and filtering it """ # add gene IDs and scores. 
clustered.gene is actually AlignmentId fields - clustered['gene_id'] = [transcript_gene_map[tools.nameConversions.strip_alignment_numbers(x)] for x in clustered.gene] - clustered['scores'] = [scores[x] for x in clustered.gene] + clustered["gene_id"] = [ + transcript_gene_map[tools.nameConversions.strip_alignment_numbers(x)] for x in clustered.gene + ] + clustered["scores"] = [scores[x] for x in clustered.gene] to_remove = set() # set of cluster IDs to remove alt_loci = [] # will become a DataFrame of alt loci to populate that field # any gene IDs with multiple clusters need to be resolved to resolve paralogies - for gene_id, group in clustered.groupby('gene_id'): - if len(set(group['#cluster'])) > 1: + for gene_id, group in clustered.groupby("gene_id"): + if len(set(group["#cluster"])) > 1: # pick the highest average scoring cluster - best_cluster = find_best_group(group, '#cluster') + best_cluster = find_best_group(group, "#cluster") best_cluster = int(best_cluster) alt_loci.append([gene_id, construct_alt_loci(group, best_cluster)]) - to_remove.update(set(group['#cluster']) - {best_cluster}) - paralog_filtered = clustered[~clustered['#cluster'].isin(to_remove)] - paralog_df = pd.DataFrame(alt_loci, columns=['GeneId', 'GeneAlternateLoci']) + to_remove.update(set(group["#cluster"]) - {best_cluster}) + paralog_filtered = clustered[~clustered["#cluster"].isin(to_remove)] + paralog_df = pd.DataFrame(alt_loci, columns=["GeneId", "GeneAlternateLoci"]) # group by cluster ID to identify gene family collapse genes_to_remove = set() # set of gene IDs to collapse collapsed_genes = [] # will become a DataFrame of collapsed genes - for cluster_id, group in paralog_filtered.groupby('#cluster'): - if len(set(group['gene_id'])) > 1: - best_gene = find_best_group(group, 'gene_id') + for cluster_id, group in paralog_filtered.groupby("#cluster"): + if len(set(group["gene_id"])) > 1: + best_gene = find_best_group(group, "gene_id") collapsed_gene_ids = set(group.gene_id) - {best_gene} gene_biotype = gene_biotype_map[best_gene] - metrics['Gene Family Collapse'][gene_biotype][len(collapsed_gene_ids)] += 1 + metrics["Gene Family Collapse"][gene_biotype][len(collapsed_gene_ids)] += 1 collapsed_gene_names = {gene_name_map[x] for x in collapsed_gene_ids} genes_to_remove.update(collapsed_gene_ids) - collapsed_genes.append([best_gene, ','.join(collapsed_gene_ids), ','.join(collapsed_gene_names)]) + collapsed_genes.append([best_gene, ",".join(collapsed_gene_ids), ",".join(collapsed_gene_names)]) if filter_overlapping_genes is True: - collapse_filtered = paralog_filtered[~paralog_filtered['gene_id'].isin(genes_to_remove)] + collapse_filtered = paralog_filtered[~paralog_filtered["gene_id"].isin(genes_to_remove)] else: collapse_filtered = paralog_filtered - collapsed_df = pd.DataFrame(collapsed_genes, columns=['GeneId', 'CollapsedGeneIds', 'CollapsedGeneNames']) - merged_df = collapsed_df.merge(paralog_df, how='outer', on='GeneId') + collapsed_df = pd.DataFrame(collapsed_genes, columns=["GeneId", "CollapsedGeneIds", "CollapsedGeneNames"]) + merged_df = collapsed_df.merge(paralog_df, how="outer", on="GeneId") return merged_df, collapse_filtered @@ -360,8 +414,9 @@ def find_split_genes(gene_id, g, resolved_interval, split_gene_data): """ intervals = collections.defaultdict(list) for aln in g: - ref_i = tools.intervals.ChromosomeInterval(tools.nameConversions.strip_alignment_numbers(aln.q_name), - aln.q_start, aln.q_end, '.') + ref_i = tools.intervals.ChromosomeInterval( + 
tools.nameConversions.strip_alignment_numbers(aln.q_name), aln.q_start, aln.q_end, "." + ) tgt_i = tools.intervals.ChromosomeInterval(aln.t_name, aln.t_start, aln.t_end, aln.strand) intervals[ref_i].append(tgt_i) merged_intervals = tools.intervals.union_of_intervals(list(intervals.keys())) @@ -377,12 +432,12 @@ def find_split_genes(gene_id, g, resolved_interval, split_gene_data): r.extend(tools.intervals.gap_merge_intervals(interval_list, 0)) # write metrics if len(alt_intervals) == 1 and list(alt_intervals.keys())[0] == resolved_interval.chromosome: - split_gene_data['intra'].add(gene_id) + split_gene_data["intra"].add(gene_id) else: - split_gene_data['contig'].add(gene_id) + split_gene_data["contig"].add(gene_id) if len(r) == 0: return None - return ','.join(['{}:{}-{}'.format(i.chromosome, i.start, i.stop) for i in r]) + return ",".join(["{}:{}-{}".format(i.chromosome, i.start, i.stop) for i in r]) else: return None @@ -392,13 +447,19 @@ def resolve_split_genes(tmp_size_filtered, transcript_gene_map, resolved_df, unf Use localNearBest algorithm to determine split genes and populate that field """ with tools.fileOps.TemporaryFilePath() as local_tmp, tools.fileOps.TemporaryFilePath() as stripped_tmp: - with open(stripped_tmp, 'w') as outf: + with open(stripped_tmp, "w") as outf: for rec in tools.psl.psl_iterator(tmp_size_filtered): rec.q_name = tools.nameConversions.strip_alignment_numbers(rec.q_name) tools.fileOps.print_row(outf, rec.psl_string()) - cmd = ['pslCDnaFilter', '-localNearBest=0.05', - '-minCover=0.1', '-verbose=0', - '-minSpan=0.2', stripped_tmp, '/dev/stdout'] + cmd = [ + "pslCDnaFilter", + "-localNearBest=0.05", + "-minCover=0.1", + "-verbose=0", + "-minSpan=0.2", + stripped_tmp, + "/dev/stdout", + ] tools.procOps.run_proc(cmd, stdout=local_tmp) filtered_alns = list(tools.psl.psl_iterator(local_tmp)) @@ -408,21 +469,24 @@ def resolve_split_genes(tmp_size_filtered, transcript_gene_map, resolved_df, unf grouped = tools.psl.group_alignments_by_qname(filtered_alns, strip=False) # construct the transcript interval for resolved transcripts - tx_intervals = {tx_id: unfiltered_tx_dict[aln_id].interval for - tx_id, aln_id in zip(resolved_df.TranscriptId, resolved_df.AlignmentId)} + tx_intervals = { + tx_id: unfiltered_tx_dict[aln_id].interval + for tx_id, aln_id in zip(resolved_df.TranscriptId, resolved_df.AlignmentId) + } split_r = [] # keep track of transcripts which have to be resolved and if they are on the same contig or different contigs - split_gene_data = {'contig': set(), 'intra': set()} + split_gene_data = {"contig": set(), "intra": set()} for tx_id, g in grouped.items(): gene_id = transcript_gene_map[tx_id] split_r.append([tx_id, find_split_genes(gene_id, g, tx_intervals[tx_id], split_gene_data)]) - split_df = pd.DataFrame(split_r, columns=['TranscriptId', 'PossibleSplitGeneLocations']) - merged = split_df.merge(resolved_df, on='TranscriptId') + split_df = pd.DataFrame(split_r, columns=["TranscriptId", "PossibleSplitGeneLocations"]) + merged = split_df.merge(resolved_df, on="TranscriptId") # calculate the number of genes for metrics - split_gene_metrics = {'Number of contig split genes': len(split_gene_data['contig']), - 'Number of intra-contig split genes': len(split_gene_data['intra'])} + split_gene_metrics = { + "Number of contig split genes": len(split_gene_data["contig"]), + "Number of intra-contig split genes": len(split_gene_data["intra"]), + } return merged, split_gene_metrics - diff --git a/cat/hgm.py b/cat/hgm.py index da0fecdc..22137d8a 100644 --- a/cat/hgm.py 
+++ b/cat/hgm.py @@ -47,38 +47,46 @@ def hgm(args): supplementary_gffs = [] with tools.fileOps.TemporaryFilePath() as gtf_fofn, tools.fileOps.TemporaryDirectoryPath() as temp_dir: - with open(gtf_fofn, 'w') as outf: + with open(gtf_fofn, "w") as outf: for genome, gtf in args.in_gtf.items(): if genome != args.ref_genome: supplementary_gff = create_supplementary_gff(args.hints_db, gtf, genome) else: supplementary_gff = create_supplementary_gff(args.hints_db, gtf, genome, args.annotation_gp) - if os.environ.get('CAT_BINARY_MODE') == 'singularity': - tools.fileOps.print_row(outf, [genome] + list(map(tools.procOps.singularify_arg, [gtf, supplementary_gff]))) + if os.environ.get("CAT_BINARY_MODE") == "singularity": + tools.fileOps.print_row( + outf, [genome] + list(map(tools.procOps.singularify_arg, [gtf, supplementary_gff])) + ) else: tools.fileOps.print_row(outf, [genome, gtf, supplementary_gff]) supplementary_gffs.append(supplementary_gff) if args.ref_genome not in args.in_gtf: # we are not running CGP, and so have no GTF for the reference dummy_gtf = tools.fileOps.get_tmp_file() tools.fileOps.touch(dummy_gtf) - supplementary_gff = create_supplementary_gff(args.hints_db, args.annotation_gtf, args.ref_genome, - args.annotation_gp) - if os.environ.get('CAT_BINARY_MODE') == 'singularity': - tools.fileOps.print_row(outf, [args.ref_genome] + list(map(tools.procOps.singularify_arg, [dummy_gtf, supplementary_gff]))) + supplementary_gff = create_supplementary_gff( + args.hints_db, args.annotation_gtf, args.ref_genome, args.annotation_gp + ) + if os.environ.get("CAT_BINARY_MODE") == "singularity": + tools.fileOps.print_row( + outf, + [args.ref_genome] + list(map(tools.procOps.singularify_arg, [dummy_gtf, supplementary_gff])), + ) else: tools.fileOps.print_row(outf, [args.ref_genome, dummy_gtf, supplementary_gff]) supplementary_gffs.append(supplementary_gff) else: dummy_gtf = None - cmd = ['homGeneMapping', - '--halfile={}'.format(args.hal), - '--dbaccess={}'.format(args.hints_db), - '--gtfs={}'.format(gtf_fofn), - '--outdir={}'.format(args.gtf_out_dir), - '--tmpdir={}'.format(temp_dir), - '--cpu={}'.format(args.hgm_cpu)] - tools.procOps.run_proc(cmd, stdout='/dev/null') + cmd = [ + "homGeneMapping", + "--halfile={}".format(args.hal), + "--dbaccess={}".format(args.hints_db), + "--gtfs={}".format(gtf_fofn), + "--outdir={}".format(args.gtf_out_dir), + "--tmpdir={}".format(temp_dir), + "--cpu={}".format(args.hgm_cpu), + ] + tools.procOps.run_proc(cmd, stdout="/dev/null") # cleanup for gff in supplementary_gffs: @@ -102,15 +110,17 @@ def create_supplementary_gff(hints_db, in_gtf, genome, annotation_gp=None): if annotation_gp is not None: hints.extend(extract_exons_non_coding_introns(annotation_gp)) tmp_path = tools.fileOps.get_tmp_file() - with open(tmp_path, 'w') as outf: + with open(tmp_path, "w") as outf: tools.fileOps.print_rows(outf, hints) # sort and merge hints on the same intervals - cmd = [['sort', '-n', '-k4,4', tmp_path], - ['sort', '-s', '-n', '-k5,5'], - ['sort', '-s', '-k3,3'], - ['sort', '-s', '-k1,1'], - ['join_mult_hints.pl']] - supplementary_gff_path = tools.fileOps.get_tmp_file(suffix='gff') + cmd = [ + ["sort", "-n", "-k4,4", tmp_path], + ["sort", "-s", "-n", "-k5,5"], + ["sort", "-s", "-k3,3"], + ["sort", "-s", "-k1,1"], + ["join_mult_hints.pl"], + ] + supplementary_gff_path = tools.fileOps.get_tmp_file(suffix="gff") tools.procOps.run_proc(cmd, stdout=supplementary_gff_path) os.remove(tmp_path) return supplementary_gff_path @@ -128,10 +138,10 @@ def 
extract_exons_non_coding_introns(annotation_gp): for tx in tools.transcripts.gene_pred_iterator(annotation_gp): for intron in tx.intron_intervals: if not intron.subset(tx.coding_interval): - r = [tx.chromosome, 'tmp', 'intron', intron.start + 1, intron.stop, '.', tx.strand, '.', 'source=N'] + r = [tx.chromosome, "tmp", "intron", intron.start + 1, intron.stop, ".", tx.strand, ".", "source=N"] hints.append(r) for exon in tx.exon_intervals: - r = [tx.chromosome, 'tmp', 'exon', exon.start + 1, exon.stop, '.', tx.strand, '.', 'source=M'] + r = [tx.chromosome, "tmp", "exon", exon.start + 1, exon.stop, ".", tx.strand, ".", "source=M"] hints.append(r) return hints @@ -156,33 +166,37 @@ def extract_exon_hints(hints_db, in_gtf, genome): # Either way, we don't want to attempt to extract exon hints from genomes without exon (wiggle) hints anyways if tools.hintsDatabaseInterface.genome_has_no_wiggle_hints(hints_db, genome): return [] - with open(hints_file, 'w') as outf_h: + with open(hints_file, "w") as outf_h: wiggle_iter = tools.hintsDatabaseInterface.get_wiggle_hints(genome, speciesnames, seqnames, hints, session) for seqname, start, end, score in wiggle_iter: - outf_h.write('\t'.join(map(str, [seqname, start, end, score])) + '\n') + outf_h.write("\t".join(map(str, [seqname, start, end, score])) + "\n") # merge exonpart hints, averaging the coverage merged_hints_file = tools.fileOps.get_tmp_file() - cmd = ['bedtools', 'merge', '-i', hints_file, '-c', '4', '-o', 'mean'] - tools.procOps.run_proc(cmd, stdout=merged_hints_file, stderr='/dev/null') + cmd = ["bedtools", "merge", "-i", hints_file, "-c", "4", "-o", "mean"] + tools.procOps.run_proc(cmd, stdout=merged_hints_file, stderr="/dev/null") # overlap the merged exons with the given GTF, producing a final set. 
tmp_bed = tools.fileOps.get_tmp_file() - cmd = [['grep', '-P', '(\texon\t|\tCDS\t)', in_gtf], # exons or CDS only - ['cut', '-d', '\t', '-f', '1,4,5']] # slice into BED-like format with GTF intervals + cmd = [ + ["grep", "-P", "(\texon\t|\tCDS\t)", in_gtf], # exons or CDS only + ["cut", "-d", "\t", "-f", "1,4,5"], + ] # slice into BED-like format with GTF intervals tools.procOps.run_proc(cmd, stdout=tmp_bed) # sort the BED - tools.procOps.run_proc(['bedSort', tmp_bed, tmp_bed]) + tools.procOps.run_proc(["bedSort", tmp_bed, tmp_bed]) # intersect with hints and retain scores - cmd = [['bedtools', 'intersect', '-a', tmp_bed, '-b', merged_hints_file, '-f', '0.8', '-wa', '-wb'], - # bedtools reports both entire A and entire B if at least 80% of A overlaps a B - ['cut', '-d', '\t', '-f', '1,2,3,7']] # retain the A positions with the B score + cmd = [ + ["bedtools", "intersect", "-a", tmp_bed, "-b", merged_hints_file, "-f", "0.8", "-wa", "-wb"], + # bedtools reports both entire A and entire B if at least 80% of A overlaps a B + ["cut", "-d", "\t", "-f", "1,2,3,7"], + ] # retain the A positions with the B score # these BED-like records are actually GFF intervals with 1-based starts and closed intervals bed_plus_1 = tools.procOps.call_proc_lines(cmd) hints = [] for line in bed_plus_1: chrom, start, end, score = line.split() - tags = 'pri=3;source=E;mult={}'.format(int(round(float(score)))) - hints.append([chrom, 'tmp', 'exon', start, end, '.', '.', '.', tags]) + tags = "pri=3;source=E;mult={}".format(int(round(float(score)))) + hints.append([chrom, "tmp", "exon", start, end, ".", ".", ".", tags]) os.remove(hints_file) os.remove(merged_hints_file) return hints @@ -198,31 +212,42 @@ def parse_hgm_gtf(hgm_out, genome): We calculate this as both the in-species and all-species vectors. 
""" + def calculate_annot_support(intron_info, cds_info, exon_info): - intron_annot = ','.join(map(str, [x.count('M') + x.count('N') for x in intron_info])) - cds_annot = ','.join(map(str, [x.count('M') for x in cds_info])) - exon_annot = ','.join(map(str, [x.count('M') for x in exon_info])) - assert len(intron_annot) + 2 == len(exon_annot) or len(intron_annot) == 0 and len(exon_annot) == 1, \ - (len(intron_annot), len(exon_annot), aln_id) + intron_annot = ",".join(map(str, [x.count("M") + x.count("N") for x in intron_info])) + cds_annot = ",".join(map(str, [x.count("M") for x in cds_info])) + exon_annot = ",".join(map(str, [x.count("M") for x in exon_info])) + assert len(intron_annot) + 2 == len(exon_annot) or len(intron_annot) == 0 and len(exon_annot) == 1, ( + len(intron_annot), + len(exon_annot), + aln_id, + ) return [intron_annot, cds_annot, exon_annot] def calculate_all_species(intron_info, exon_info): - intron_rna = ','.join(map(str, [x.count('E') + x.count('PB') for x in intron_info])) - exon_rna = ','.join(map(str, [x.count('E') + x.count('PB') for x in exon_info])) + intron_rna = ",".join(map(str, [x.count("E") + x.count("PB") for x in intron_info])) + exon_rna = ",".join(map(str, [x.count("E") + x.count("PB") for x in exon_info])) return [intron_rna, exon_rna] def calculate_in_species(intron_info, exon_info, species_id): def parse_entry(entry, species_id): - recs = entry.split(',') + recs = entry.split(",") for x in recs: if x.startswith(species_id): - return x[len(species_id):] - return '' + return x[len(species_id) :] + return "" - intron_rna = ','.join(map(str, [parse_entry(x, species_id).count('E') + - parse_entry(x, species_id).count('PB') for x in intron_info])) - exon_rna = ','.join(map(str, [parse_entry(x, species_id).count('E') + - parse_entry(x, species_id).count('PB') for x in exon_info])) + intron_rna = ",".join( + map( + str, + [parse_entry(x, species_id).count("E") + parse_entry(x, species_id).count("PB") for x in intron_info], + ) + ) + exon_rna = ",".join( + map( + str, [parse_entry(x, species_id).count("E") + parse_entry(x, species_id).count("PB") for x in exon_info] + ) + ) return [intron_rna, exon_rna] intron_lines = [] @@ -235,31 +260,31 @@ def parse_entry(entry, species_id): if line in seen_lines: continue seen_lines.add(line) - if line.startswith('#') and line != '###\n': + if line.startswith("#") and line != "###\n": _, species_id, species = line.split() species_map[species] = species_id - if '\tintron\t' in line: - intron_lines.append(line.rstrip().split('\t')[-1]) - elif '\tCDS\t' in line: - cds_lines.append(line.rstrip().split('\t')[-1]) - elif '\texon\t' in line: - exon_lines.append(line.rstrip().split('\t')[-1]) + if "\tintron\t" in line: + intron_lines.append(line.rstrip().split("\t")[-1]) + elif "\tCDS\t" in line: + cds_lines.append(line.rstrip().split("\t")[-1]) + elif "\texon\t" in line: + exon_lines.append(line.rstrip().split("\t")[-1]) species_id = species_map[genome] # make use of the sorted nature of the input GTFs to create a ordered vector d = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(list))) - for mode, group in zip(*[['intron', 'cds', 'exon'], [intron_lines, cds_lines, exon_lines]]): + for mode, group in zip(*[["intron", "cds", "exon"], [intron_lines, cds_lines, exon_lines]]): for attr_line in group: attributes = tools.misc.parse_gtf_attr_line(attr_line) - d[attributes['gene_id']][attributes['transcript_id']][mode].append(attributes['hgm_info']) + 
d[attributes["gene_id"]][attributes["transcript_id"]][mode].append(attributes["hgm_info"]) # convert to dataframe, switching the list to a comma separated string dd = [] for gene_id in d: for aln_id in d[gene_id]: - intron_info = d[gene_id][aln_id]['intron'] - cds_info = d[gene_id][aln_id]['cds'] - exon_info = d[gene_id][aln_id]['exon'] + intron_info = d[gene_id][aln_id]["intron"] + cds_info = d[gene_id][aln_id]["cds"] + exon_info = d[gene_id][aln_id]["exon"] if tools.nameConversions.aln_id_is_denovo(aln_id): tx_id = aln_id else: @@ -270,9 +295,17 @@ def parse_entry(entry, species_id): dd.append([gene_id, tx_id, aln_id] + all_species_vectors + in_species_vectors + annot_support_vectors) df = pd.DataFrame(dd) - df.columns = ['GeneId', 'TranscriptId', 'AlignmentId', - 'AllSpeciesIntronRnaSupport', 'AllSpeciesExonRnaSupport', - 'IntronRnaSupport', 'ExonRnaSupport', - 'IntronAnnotSupport', 'CdsAnnotSupport', 'ExonAnnotSupport'] - df = df.set_index(['GeneId', 'TranscriptId', 'AlignmentId']) + df.columns = [ + "GeneId", + "TranscriptId", + "AlignmentId", + "AllSpeciesIntronRnaSupport", + "AllSpeciesExonRnaSupport", + "IntronRnaSupport", + "ExonRnaSupport", + "IntronAnnotSupport", + "CdsAnnotSupport", + "ExonAnnotSupport", + ] + df = df.set_index(["GeneId", "TranscriptId", "AlignmentId"]) return df diff --git a/cat/hints_db.py b/cat/hints_db.py index 3c049c94..ffd3ef1b 100644 --- a/cat/hints_db.py +++ b/cat/hints_db.py @@ -6,7 +6,7 @@ import os import logging -import pyfasta +import pyfaidx import pysam try: @@ -26,67 +26,74 @@ import tools.bio from .exceptions import UserException -logger = logging.getLogger('cat') +logger = logging.getLogger("cat") def hints_db(hints_args, toil_options): """ Entry point for hints database Toil pipeline. """ + def validate_import_bam(t, bam_path, fasta_sequences, genome): validate_bam_fasta_pairs(bam_path, fasta_sequences, genome) - return [FileID.forPath(t.importFile('file://' + bam_path), bam_path), - FileID.forPath(t.importFile('file://' + bam_path + '.bai'), bam_path + '.bai')] + return [ + FileID.forPath(t.importFile("file://" + bam_path), bam_path), + FileID.forPath(t.importFile("file://" + bam_path + ".bai"), bam_path + ".bai"), + ] - fasta = pyfasta.Fasta(hints_args.fasta) + fasta = pyfaidx.Fasta(hints_args.fasta) fasta_sequences = {(x.split()[0], len(fasta[x])) for x in fasta.keys()} with Toil(toil_options) as t: if not t.options.restart: # load the RNA-seq data, if we have any - bam_file_ids = {'BAM': {}, 'INTRONBAM': {}} - for dtype in ['BAM', 'INTRONBAM']: + bam_file_ids = {"BAM": {}, "INTRONBAM": {}} + for dtype in ["BAM", "INTRONBAM"]: if hints_args.genome not in hints_args.cfg[dtype]: continue for bam_path in hints_args.cfg[dtype][hints_args.genome]: - bam_file_ids[dtype][os.path.basename(bam_path)] = validate_import_bam(t, bam_path, - fasta_sequences, - hints_args.genome) + bam_file_ids[dtype][os.path.basename(bam_path)] = validate_import_bam( + t, bam_path, fasta_sequences, hints_args.genome + ) # load the IsoSeq data, if we have any iso_seq_file_ids = [] - if hints_args.genome in hints_args.cfg['ISO_SEQ_BAM']: - for bam_path in hints_args.cfg['ISO_SEQ_BAM'][hints_args.genome]: + if hints_args.genome in hints_args.cfg["ISO_SEQ_BAM"]: + for bam_path in hints_args.cfg["ISO_SEQ_BAM"][hints_args.genome]: validate_bam_fasta_pairs(bam_path, fasta_sequences, hints_args.genome) iso_seq_file_ids.append(validate_import_bam(t, bam_path, fasta_sequences, hints_args.genome)) if hints_args.annotation_gp is None: annotation_file_id = None else: - 
annotation_file_id = FileID.forPath(t.importFile('file://' + hints_args.annotation_gp), - hints_args.annotation_gp) + annotation_file_id = FileID.forPath( + t.importFile("file://" + hints_args.annotation_gp), hints_args.annotation_gp + ) if hints_args.protein_fasta is None: protein_fasta_file_id = genome_fasta_file_id = None else: - protein_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.protein_fasta), - hints_args.protein_fasta) - genome_fasta_file_id = FileID.forPath(t.importFile('file://' + hints_args.fasta), hints_args.fasta) - - input_file_ids = {'bams': bam_file_ids, - 'iso_seq_bams': iso_seq_file_ids, - 'annotation': annotation_file_id, - 'protein_fasta': protein_fasta_file_id, - 'genome_fasta': genome_fasta_file_id} - if len(input_file_ids['bams']) + len(input_file_ids['iso_seq_bams']) > 0: - logger.info('All BAMs validated for {}. Beginning Toil hints pipeline'.format(hints_args.genome)) + protein_fasta_file_id = FileID.forPath( + t.importFile("file://" + hints_args.protein_fasta), hints_args.protein_fasta + ) + genome_fasta_file_id = FileID.forPath(t.importFile("file://" + hints_args.fasta), hints_args.fasta) + + input_file_ids = { + "bams": bam_file_ids, + "iso_seq_bams": iso_seq_file_ids, + "annotation": annotation_file_id, + "protein_fasta": protein_fasta_file_id, + "genome_fasta": genome_fasta_file_id, + } + if len(input_file_ids["bams"]) + len(input_file_ids["iso_seq_bams"]) > 0: + logger.info("All BAMs validated for {}. Beginning Toil hints pipeline".format(hints_args.genome)) disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids) job = Job.wrapJobFn(setup_hints, input_file_ids, disk=disk_usage) combined_hints = t.start(job) else: - logger.info('Restarting Toil hints pipeline for {}.'.format(hints_args.genome)) + logger.info("Restarting Toil hints pipeline for {}.".format(hints_args.genome)) combined_hints = t.restart() tools.fileOps.ensure_file_dir(hints_args.hints_path) - t.exportFile(combined_hints, 'file://' + hints_args.hints_path) + t.exportFile(combined_hints, "file://" + hints_args.hints_path) def setup_hints(job, input_file_ids): @@ -94,8 +101,8 @@ def setup_hints(job, input_file_ids): Generates hints for a given genome with a list of BAMs. Will add annotation if it exists. 
""" # RNA-seq hints - filtered_bam_file_ids = {'BAM': collections.defaultdict(list), 'INTRONBAM': collections.defaultdict(list)} - for dtype, bam_dict in input_file_ids['bams'].items(): + filtered_bam_file_ids = {"BAM": collections.defaultdict(list), "INTRONBAM": collections.defaultdict(list)} + for dtype, bam_dict in input_file_ids["bams"].items(): if len(bam_dict) == 0: continue # Since BAMs are valid, we can assume that they all share the same header @@ -108,13 +115,21 @@ def setup_hints(job, input_file_ids): grouped_references = [tuple(x) for x in group_references(sam_handle)] for original_path, (bam_file_id, bai_file_id) in bam_dict.items(): for reference_subset in grouped_references: - j = job.addChildJobFn(namesort_bam, bam_file_id, bai_file_id, reference_subset, disk_usage, - disk=disk_usage, cores=4, memory='16G') + j = job.addChildJobFn( + namesort_bam, + bam_file_id, + bai_file_id, + reference_subset, + disk_usage, + disk=disk_usage, + cores=4, + memory="16G", + ) filtered_bam_file_ids[dtype][reference_subset].append(j.rv()) # IsoSeq hints iso_seq_hints_file_ids = [] - iso_seq_file_ids = input_file_ids['iso_seq_bams'] + iso_seq_file_ids = input_file_ids["iso_seq_bams"] if len(iso_seq_file_ids) > 0: for bam_file_id, bai_file_id in iso_seq_file_ids: disk_usage = tools.toilInterface.find_total_disk_usage([bam_file_id, bai_file_id]) @@ -122,23 +137,25 @@ def setup_hints(job, input_file_ids): iso_seq_hints_file_ids.append(j.rv()) # protein hints - if input_file_ids['protein_fasta'] is not None: - disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids['protein_fasta']) - j = job.addChildJobFn(generate_protein_hints, input_file_ids['protein_fasta'], input_file_ids['genome_fasta'], - disk=disk_usage) + if input_file_ids["protein_fasta"] is not None: + disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids["protein_fasta"]) + j = job.addChildJobFn( + generate_protein_hints, input_file_ids["protein_fasta"], input_file_ids["genome_fasta"], disk=disk_usage + ) protein_hints_file_id = j.rv() else: protein_hints_file_id = None # annotation hints - if input_file_ids['annotation'] is not None: - disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids['annotation']) - j = job.addChildJobFn(generate_annotation_hints, input_file_ids['annotation'], disk=disk_usage) + if input_file_ids["annotation"] is not None: + disk_usage = tools.toilInterface.find_total_disk_usage(input_file_ids["annotation"]) + j = job.addChildJobFn(generate_annotation_hints, input_file_ids["annotation"], disk=disk_usage) annotation_hints_file_id = j.rv() else: annotation_hints_file_id = None - return job.addFollowOnJobFn(merge_bams, filtered_bam_file_ids, annotation_hints_file_id, - iso_seq_hints_file_ids, protein_hints_file_id).rv() + return job.addFollowOnJobFn( + merge_bams, filtered_bam_file_ids, annotation_hints_file_id, iso_seq_hints_file_ids, protein_hints_file_id + ).rv() def namesort_bam(job, bam_file_id, bai_file_id, reference_subset, disk_usage, num_reads=50 ** 6): @@ -146,10 +163,11 @@ def namesort_bam(job, bam_file_id, bai_file_id, reference_subset, disk_usage, nu Slices out the reference subset from a BAM, name sorts that subset, then chunks the resulting reads up for processing by filterBam. 
""" + def write_bam(r, ns_handle): """Write to the path, returns file ID""" outf = tools.fileOps.get_tmp_toil_file() - outf_h = pysam.Samfile(outf, 'wb', template=ns_handle) + outf_h = pysam.Samfile(outf, "wb", template=ns_handle) for rec in r: outf_h.write(rec) outf_h.close() @@ -157,11 +175,24 @@ def write_bam(r, ns_handle): bam_path = job.fileStore.readGlobalFile(bam_file_id) is_paired = bam_is_paired(bam_path) - job.fileStore.readGlobalFile(bai_file_id, bam_path + '.bai') - name_sorted = tools.fileOps.get_tmp_toil_file(suffix='name_sorted.bam') - cmd = [['samtools', 'view', '-b', bam_path] + list(reference_subset), - ['sambamba', 'sort', '--tmpdir={}'.format(job.fileStore.getLocalTempDir()), - '-t', '4', '-m', '15G', '-o', '/dev/stdout', '-n', '/dev/stdin']] + job.fileStore.readGlobalFile(bai_file_id, bam_path + ".bai") + name_sorted = tools.fileOps.get_tmp_toil_file(suffix="name_sorted.bam") + cmd = [ + ["samtools", "view", "-b", bam_path] + list(reference_subset), + [ + "sambamba", + "sort", + "--tmpdir={}".format(job.fileStore.getLocalTempDir()), + "-t", + "4", + "-m", + "15G", + "-o", + "/dev/stdout", + "-n", + "/dev/stdin", + ], + ] tools.procOps.run_proc(cmd, stdout=name_sorted) ns_handle = pysam.Samfile(name_sorted) # this group may come up empty -- check to see if we have at least one mapped read @@ -177,15 +208,15 @@ def write_bam(r, ns_handle): r.extend(list(reads)) if len(r) >= num_reads: file_id = write_bam(r, ns_handle) - j = job.addChildJobFn(filter_bam, file_id, is_paired, disk='4G', memory='2G') + j = job.addChildJobFn(filter_bam, file_id, is_paired, disk="4G", memory="2G") filtered_file_ids.append(j.rv()) r = [] # do the last bin, if its non-empty if len(r) > 0: file_id = write_bam(r, ns_handle) - j = job.addChildJobFn(filter_bam, file_id, is_paired, disk='4G', memory='2G') + j = job.addChildJobFn(filter_bam, file_id, is_paired, disk="4G", memory="2G") filtered_file_ids.append(j.rv()) - return job.addFollowOnJobFn(merge_filtered_bams, filtered_file_ids, disk=disk_usage, memory='16G').rv() + return job.addFollowOnJobFn(merge_filtered_bams, filtered_file_ids, disk=disk_usage, memory="16G").rv() def filter_bam(job, file_id, is_paired): @@ -195,16 +226,16 @@ def filter_bam(job, file_id, is_paired): bam_path = job.fileStore.readGlobalFile(file_id) assert os.path.getsize(bam_path) > 0 tmp_filtered = tools.fileOps.get_tmp_toil_file() - filter_cmd = ['filterBam', '--uniq', '--in', bam_path, '--out', tmp_filtered] + filter_cmd = ["filterBam", "--uniq", "--in", bam_path, "--out", tmp_filtered] if is_paired is True: - filter_cmd.extend(['--paired', '--pairwiseAlignments']) + filter_cmd.extend(["--paired", "--pairwiseAlignments"]) tools.procOps.run_proc(filter_cmd) if os.path.getsize(tmp_filtered) == 0: - raise RuntimeError('After filtering one BAM subset became empty. This could be bad.') + raise RuntimeError("After filtering one BAM subset became empty. 
This could be bad.") out_filter = tools.fileOps.get_tmp_toil_file() - sort_cmd = ['sambamba', 'sort', tmp_filtered, '-o', out_filter, '-t', '1'] + sort_cmd = ["sambamba", "sort", tmp_filtered, "-o", out_filter, "-t", "1"] tools.procOps.run_proc(sort_cmd) return job.fileStore.writeGlobalFile(out_filter) @@ -215,34 +246,35 @@ def merge_filtered_bams(job, filtered_file_ids): """ local_paths = [job.fileStore.readGlobalFile(x) for x in filtered_file_ids] fofn = tools.fileOps.get_tmp_toil_file() - with open(fofn, 'w') as outf: + with open(fofn, "w") as outf: for l in local_paths: - if os.environ.get('CAT_BINARY_MODE') == 'singularity': + if os.environ.get("CAT_BINARY_MODE") == "singularity": l = tools.procOps.singularify_arg(l) - outf.write(l + '\n') + outf.write(l + "\n") out_bam = tools.fileOps.get_tmp_toil_file() - cmd = ['samtools', 'merge', '-b', fofn, out_bam] + cmd = ["samtools", "merge", "-b", fofn, out_bam] tools.procOps.run_proc(cmd) return job.fileStore.writeGlobalFile(out_bam) -def merge_bams(job, filtered_bam_file_ids, annotation_hints_file_id, iso_seq_hints_file_ids, - protein_hints_file_id): +def merge_bams(job, filtered_bam_file_ids, annotation_hints_file_id, iso_seq_hints_file_ids, protein_hints_file_id): """ Takes a dictionary mapping reference chunks to filtered BAMs. For each reference chunk, these BAMs will be first concatenated then sorted, then passed off to hint building. Passes through the annotation/protein hints file IDs for inclusion. """ - merged_bam_file_ids = {'BAM': {}, 'INTRONBAM': {}} + merged_bam_file_ids = {"BAM": {}, "INTRONBAM": {}} for dtype in filtered_bam_file_ids: for ref_group, file_ids in filtered_bam_file_ids[dtype].items(): file_ids = [x for x in file_ids if x is not None] # some groups will end up empty if len(file_ids) > 0: disk_usage = tools.toilInterface.find_total_disk_usage(file_ids) - merged_bam_file_ids[dtype][ref_group] = job.addChildJobFn(cat_sort_bams, file_ids, disk=disk_usage, - memory='16G', cores=4).rv() - return job.addFollowOnJobFn(build_hints, merged_bam_file_ids, annotation_hints_file_id, iso_seq_hints_file_ids, - protein_hints_file_id).rv() + merged_bam_file_ids[dtype][ref_group] = job.addChildJobFn( + cat_sort_bams, file_ids, disk=disk_usage, memory="16G", cores=4 + ).rv() + return job.addFollowOnJobFn( + build_hints, merged_bam_file_ids, annotation_hints_file_id, iso_seq_hints_file_ids, protein_hints_file_id + ).rv() def cat_sort_bams(job, bam_file_ids): @@ -257,7 +289,7 @@ def cat_sort_bams(job, bam_file_ids): sam_iter = tools.dataOps.grouper(bamfiles, 4095) # do the first one - cmd = ['samtools', 'cat', '-o', catfile] + cmd = ["samtools", "cat", "-o", catfile] cmd.extend(next(sam_iter)) tools.procOps.run_proc(cmd) @@ -265,13 +297,13 @@ def cat_sort_bams(job, bam_file_ids): for more in sam_iter: old_catfile = catfile catfile = tools.fileOps.get_tmp_toil_file() - cmd = ['samtools', 'cat', '-o', catfile, old_catfile] + cmd = ["samtools", "cat", "-o", catfile, old_catfile] cmd.extend(more) tools.procOps.run_proc(cmd) # combine and merge merged = tools.fileOps.get_tmp_toil_file() - cmd = ['sambamba', 'sort', catfile, '-o', merged, '-t', '4', '-m', '15G'] + cmd = ["sambamba", "sort", catfile, "-o", merged, "-t", "4", "-m", "15G"] tools.procOps.run_proc(cmd) return job.fileStore.writeGlobalFile(merged) @@ -282,16 +314,18 @@ def generate_protein_hints(job, protein_fasta_file_id, genome_fasta_file_id): """ disk_usage = tools.toilInterface.find_total_disk_usage(genome_fasta_file_id) protein_fasta = 
job.fileStore.readGlobalFile(protein_fasta_file_id) - cmd = ['pyfasta', 'flatten', protein_fasta] + cmd = ["samtools", "faidx", protein_fasta] tools.procOps.run_proc(cmd) protein_handle = tools.bio.get_sequence_dict(protein_fasta) # group up proteins for sub-jobs results = [] - for chunk in tools.dataOps.grouper(protein_handle.items(), 100): - j = job.addChildJobFn(run_protein_aln, chunk, genome_fasta_file_id, disk=disk_usage, memory='8G') + for chunk in tools.dataOps.grouper(protein_handle.items(), 500): + # make pickleable + chunk = [[name, str(seq)] for name, seq in chunk] + j = job.addChildJobFn(run_protein_aln, chunk, genome_fasta_file_id, disk=disk_usage, memory="8G") results.append(j.rv()) # return merged results - return job.addFollowOnJobFn(convert_protein_aln_results_to_hints, results, memory='8G').rv() + return job.addFollowOnJobFn(convert_protein_aln_results_to_hints, results, memory="8G").rv() def run_protein_aln(job, protein_subset, genome_fasta_file_id): @@ -301,13 +335,24 @@ def run_protein_aln(job, protein_subset, genome_fasta_file_id): genome_fasta = job.fileStore.readGlobalFile(genome_fasta_file_id) # write proteins to fasta protein_fasta = tools.fileOps.get_tmp_toil_file() - with open(protein_fasta, 'w') as outf: + with open(protein_fasta, "w") as outf: for name, seq in protein_subset: tools.bio.write_fasta(outf, name, str(seq)) # perform alignment tmp_exonerate = tools.fileOps.get_tmp_toil_file() - cmd = ['exonerate', '--model', 'protein2genome', '--showvulgar', 'no', '--showalignment', 'no', - '--showquerygff', 'yes', protein_fasta, genome_fasta] + cmd = [ + "exonerate", + "--model", + "protein2genome", + "--showvulgar", + "no", + "--showalignment", + "no", + "--showquerygff", + "yes", + protein_fasta, + genome_fasta, + ] tools.procOps.run_proc(cmd, stdout=tmp_exonerate) return job.fileStore.writeGlobalFile(tmp_exonerate) @@ -317,7 +362,7 @@ def convert_protein_aln_results_to_hints(job, results): Concatenates exonerate protein2genome, converts to hints """ merged_exonerate = tools.fileOps.get_tmp_toil_file() - with open(merged_exonerate, 'w') as outf: + with open(merged_exonerate, "w") as outf: for r in results: f = job.fileStore.readGlobalFile(r) outf.write(open(f).read()) @@ -325,7 +370,7 @@ def convert_protein_aln_results_to_hints(job, results): tmp_sorted = tools.fileOps.get_tmp_toil_file() tools.misc.sort_gff(merged_exonerate, tmp_sorted) out_hints = tools.fileOps.get_tmp_toil_file() - cmd = ['exonerate2hints.pl', '--in={}'.format(tmp_sorted), '--CDSpart_cutoff=5', '--out={}'.format(out_hints)] + cmd = ["exonerate2hints.pl", "--in={}".format(tmp_sorted), "--CDSpart_cutoff=5", "--out={}".format(out_hints)] tools.procOps.run_proc(cmd) return job.fileStore.writeGlobalFile(out_hints) @@ -339,22 +384,34 @@ def build_hints(job, merged_bam_file_ids, annotation_hints_file_id, iso_seq_hint for dtype in merged_bam_file_ids: for ref_group, file_ids in merged_bam_file_ids[dtype].items(): intron_hints_file_ids.append(job.addChildJobFn(build_intron_hints, file_ids).rv()) - if dtype == 'BAM': + if dtype == "BAM": exon_hints_file_ids.append(job.addChildJobFn(build_exon_hints, file_ids).rv()) - disk_usage = tools.toilInterface.find_total_disk_usage(itertools.chain.from_iterable([intron_hints_file_ids, - exon_hints_file_ids, - iso_seq_hints_file_ids, - [annotation_hints_file_id, - protein_hints_file_id]])) - return job.addFollowOnJobFn(cat_hints, intron_hints_file_ids, exon_hints_file_ids, annotation_hints_file_id, - iso_seq_hints_file_ids, protein_hints_file_id, 
disk=disk_usage).rv() + disk_usage = tools.toilInterface.find_total_disk_usage( + itertools.chain.from_iterable( + [ + intron_hints_file_ids, + exon_hints_file_ids, + iso_seq_hints_file_ids, + [annotation_hints_file_id, protein_hints_file_id], + ] + ) + ) + return job.addFollowOnJobFn( + cat_hints, + intron_hints_file_ids, + exon_hints_file_ids, + annotation_hints_file_id, + iso_seq_hints_file_ids, + protein_hints_file_id, + disk=disk_usage, + ).rv() def build_intron_hints(job, merged_bam_file_id): """Builds intronhints from a BAM. Returns a fileID to the hints.""" bam_file = job.fileStore.readGlobalFile(merged_bam_file_id) intron_gff_path = tools.fileOps.get_tmp_toil_file() - cmd = ['bam2hints', '--intronsonly', '--in', bam_file, '--out', intron_gff_path] + cmd = ["bam2hints", "--intronsonly", "--in", bam_file, "--out", intron_gff_path] tools.procOps.run_proc(cmd) return job.fileStore.writeGlobalFile(intron_gff_path) @@ -362,9 +419,23 @@ def build_intron_hints(job, merged_bam_file_id): def build_exon_hints(job, merged_bam_file_id): """Builds exonhints from a BAM Returns a fileID to the hints.""" bam_file = job.fileStore.readGlobalFile(merged_bam_file_id) - cmd = [['bam2wig', bam_file], - ['wig2hints.pl', '--width=10', '--margin=10', '--minthresh=2', '--minscore=4', '--prune=0.1', '--src=W', - '--type=ep', '--UCSC=/dev/null', '--radius=4.5', '--pri=4', '--strand=.']] + cmd = [ + ["bam2wig", bam_file], + [ + "wig2hints.pl", + "--width=10", + "--margin=10", + "--minthresh=2", + "--minscore=4", + "--prune=0.1", + "--src=W", + "--type=ep", + "--UCSC=/dev/null", + "--radius=4.5", + "--pri=4", + "--strand=.", + ], + ] exon_gff_path = tools.fileOps.get_tmp_toil_file() tools.procOps.run_proc(cmd, stdout=exon_gff_path) return job.fileStore.writeGlobalFile(exon_gff_path) @@ -378,15 +449,23 @@ def generate_iso_seq_hints(job, bam_file_id, bai_file_id): Adapted from http://bioinf.uni-greifswald.de/bioinf/wiki/pmwiki.php?n=Augustus.PacBioGMAP """ bam_path = job.fileStore.readGlobalFile(bam_file_id) - job.fileStore.readGlobalFile(bai_file_id, bam_path + '.bai') + job.fileStore.readGlobalFile(bai_file_id, bam_path + ".bai") pacbio_gff_path = tools.fileOps.get_tmp_toil_file() - cmd = [['samtools', 'view', '-b', '-F', '4', bam_path], # unmapped reads causes bamToPsl to crash - ['bamToPsl', '-nohead', '/dev/stdin', '/dev/stdout'], - ['sort', '-n', '-k', '16,16'], - ['sort', '-s', '-k', '14,14'], - ['perl', '-ne', '@f=split; print if ($f[0]>=100)'], - ['blat2hints.pl', '--source=PB', '--nomult', '--ep_cutoff=20', '--in=/dev/stdin', - '--out={}'.format(pacbio_gff_path)]] + cmd = [ + ["samtools", "view", "-b", "-F", "4", bam_path], # unmapped reads causes bamToPsl to crash + ["bamToPsl", "-nohead", "/dev/stdin", "/dev/stdout"], + ["sort", "-n", "-k", "16,16"], + ["sort", "-s", "-k", "14,14"], + ["perl", "-ne", "@f=split; print if ($f[0]>=100)"], + [ + "blat2hints.pl", + "--source=PB", + "--nomult", + "--ep_cutoff=20", + "--in=/dev/stdin", + "--out={}".format(pacbio_gff_path), + ], + ] tools.procOps.run_proc(cmd) return job.fileStore.writeGlobalFile(pacbio_gff_path) @@ -406,23 +485,47 @@ def generate_annotation_hints(job, annotation_hints_file_id): # rather than try to re-do the arithmetic, we will use the get_bed() function to convert this transcript cds_tx = tools.transcripts.Transcript(tx.get_bed(new_start=tx.thick_start, new_stop=tx.thick_stop)) for intron in cds_tx.intron_intervals: - r = [intron.chromosome, 'a2h', 'intron', intron.start + 1, intron.stop, 0, intron.strand, '.', - 
'grp={};src=M;pri=2'.format(tx_id)] + r = [ + intron.chromosome, + "a2h", + "intron", + intron.start + 1, + intron.stop, + 0, + intron.strand, + ".", + "grp={};src=M;pri=2".format(tx_id), + ] hints.append(r) for exon in cds_tx.exon_intervals: - r = [exon.chromosome, 'a2h', 'CDS', exon.start + 1, exon.stop, 0, exon.strand, '.', - 'grp={};src=M;pri=2'.format(tx_id)] + r = [ + exon.chromosome, + "a2h", + "CDS", + exon.start + 1, + exon.stop, + 0, + exon.strand, + ".", + "grp={};src=M;pri=2".format(tx_id), + ] hints.append(r) annotation_hints_gff = tools.fileOps.get_tmp_toil_file() tools.fileOps.print_rows(annotation_hints_gff, hints) return job.fileStore.writeGlobalFile(annotation_hints_gff) -def cat_hints(job, intron_hints_file_ids, exon_hints_file_ids, annotation_hints_file_id, iso_seq_hints_file_ids, - protein_hints_file_id): +def cat_hints( + job, + intron_hints_file_ids, + exon_hints_file_ids, + annotation_hints_file_id, + iso_seq_hints_file_ids, + protein_hints_file_id, +): """Returns file ID to combined, sorted hints""" cat_hints = tools.fileOps.get_tmp_toil_file() - with open(cat_hints, 'w') as outf: + with open(cat_hints, "w") as outf: for file_id in itertools.chain(intron_hints_file_ids, exon_hints_file_ids): f = job.fileStore.readGlobalFile(file_id) for line in open(f): @@ -433,15 +536,17 @@ def cat_hints(job, intron_hints_file_ids, exon_hints_file_ids, annotation_hints_ for line in open(f): outf.write(line) # sorted so that hints that should be summarized are below each other - cmd = [['sort', '-n', '-k4,4', cat_hints], - ['sort', '-s', '-n', '-k5,5'], - ['sort', '-s', '-k3,3'], - ['sort', '-s', '-k1,1'], - ['join_mult_hints.pl']] + cmd = [ + ["sort", "-n", "-k4,4", cat_hints], + ["sort", "-s", "-n", "-k5,5"], + ["sort", "-s", "-k3,3"], + ["sort", "-s", "-k1,1"], + ["join_mult_hints.pl"], + ] combined_hints = tools.fileOps.get_tmp_toil_file() tools.procOps.run_proc(cmd, stdout=combined_hints) # don't add the IsoSeq until after join_mult_hints because we don't want them to be joined - with open(combined_hints, 'a') as outf: + with open(combined_hints, "a") as outf: for file_id in iso_seq_hints_file_ids: f = job.fileStore.readGlobalFile(file_id) for line in open(f): @@ -462,17 +567,17 @@ def validate_bam_fasta_pairs(bam_path, fasta_sequences, genome): Make sure that this BAM is actually aligned to this fasta. Every sequence should be the same length. Sequences can exist in the reference that do not exist in the BAM, but not the other way around. """ - handle = pysam.Samfile(bam_path, 'rb') + handle = pysam.Samfile(bam_path, "rb") bam_sequences = {(n, s) for n, s in zip(*[handle.references, handle.lengths])} difference = bam_sequences - fasta_sequences if len(difference) > 0: - base_err = 'Error: BAM {} has the following sequence/length pairs not found in the {} fasta: {}.' - err = base_err.format(bam_path, genome, ','.join(['-'.join(map(str, x)) for x in difference])) + base_err = "Error: BAM {} has the following sequence/length pairs not found in the {} fasta: {}." + err = base_err.format(bam_path, genome, ",".join(["-".join(map(str, x)) for x in difference])) raise UserException(err) missing_seqs = fasta_sequences - bam_sequences if len(missing_seqs) > 0: - base_msg = 'BAM {} does not have the following sequence/length pairs in its header: {}.' - msg = base_msg.format(bam_path, ','.join(['-'.join(map(str, x)) for x in missing_seqs])) + base_msg = "BAM {} does not have the following sequence/length pairs in its header: {}." 
+ msg = base_msg.format(bam_path, ",".join(["-".join(map(str, x)) for x in missing_seqs])) logger.warning(msg) diff --git a/cat/parent_gene_assignment.py b/cat/parent_gene_assignment.py index e5ec1ec0..9467e7f7 100644 --- a/cat/parent_gene_assignment.py +++ b/cat/parent_gene_assignment.py @@ -43,8 +43,9 @@ def assign_parents(filtered_tm_gp, unfiltered_tm_gp, chrom_sizes, denovo_gp, min # extract only gene names for the filtered set filtered_gene_ids = {tx.name2 for tx in filtered_overlapping_tm_txs} if len(filtered_gene_ids) > 1: # we have more than one match, so resolve it - resolved_name, resolution_method = resolve_multiple_genes(denovo_tx, filtered_overlapping_tm_txs, - min_distance) + resolved_name, resolution_method = resolve_multiple_genes( + denovo_tx, filtered_overlapping_tm_txs, min_distance + ) elif len(filtered_gene_ids) == 1: # yay, we have exactly one match resolved_name = list(filtered_gene_ids)[0] resolution_method = None @@ -52,12 +53,13 @@ def assign_parents(filtered_tm_gp, unfiltered_tm_gp, chrom_sizes, denovo_gp, min resolved_name = resolution_method = None # we have no matches, which means putative novel # find only genes for the unfiltered set that are not present in the filtered set alternative_gene_ids = {tx.name2 for tx in unfiltered_overlapping_tm_txs} - {resolved_name} - alternative_gene_ids = ','.join(alternative_gene_ids) if len(alternative_gene_ids) > 0 else None + alternative_gene_ids = ",".join(alternative_gene_ids) if len(alternative_gene_ids) > 0 else None r.append([denovo_tx.name, resolved_name, alternative_gene_ids, resolution_method]) - combined_alternatives = pd.DataFrame(r, columns=['TranscriptId', 'AssignedGeneId', 'AlternativeGeneIds', - 'ResolutionMethod']) - combined_alternatives = combined_alternatives.set_index('TranscriptId') + combined_alternatives = pd.DataFrame( + r, columns=["TranscriptId", "AssignedGeneId", "AlternativeGeneIds", "ResolutionMethod"] + ) + combined_alternatives = combined_alternatives.set_index("TranscriptId") return combined_alternatives @@ -95,7 +97,7 @@ def resolve_multiple_genes(denovo_tx, overlapping_tm_txs, min_distance): tm_txs_by_gene = tools.transcripts.group_transcripts_by_name2(overlapping_tm_txs) tm_jaccards = [find_highest_gene_jaccard(x, y) for x, y in itertools.combinations(list(tm_txs_by_gene.values()), 2)] if any(x > 0.001 for x in tm_jaccards): - return None, 'badAnnotOrTm' + return None, "badAnnotOrTm" # calculate asymmetric difference for this prediction scores = collections.defaultdict(list) for tx in overlapping_tm_txs: @@ -104,9 +106,9 @@ def resolve_multiple_genes(denovo_tx, overlapping_tm_txs, min_distance): high_score = max(best_scores.values()) if all(high_score - x >= min_distance for x in best_scores.values() if x != high_score): best = sorted(iter(best_scores.items()), key=lambda gene_id_score: gene_id_score[1])[-1][0] - return best, 'rescued' + return best, "rescued" else: - return None, 'ambiguousOrFusion' + return None, "ambiguousOrFusion" def find_highest_gene_jaccard(gene_list_a, gene_list_b): @@ -114,6 +116,7 @@ def find_highest_gene_jaccard(gene_list_a, gene_list_b): Calculates the overall distance between two sets of transcripts by finding their distinct exonic intervals and then measuring the Jaccard distance. 
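    The returned value is consumed by resolve_multiple_genes, where any pairwise value
    above 0.001 between candidate parent genes is treated as evidence of overlapping
    annotation or transMap (badAnnotOrTm) and no parent is assigned.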
""" + def find_interval(gene_list): gene_intervals = set() for tx in gene_list: diff --git a/cat/plots.py b/cat/plots.py index 8fd065cb..2aaa282b 100644 --- a/cat/plots.py +++ b/cat/plots.py @@ -6,26 +6,28 @@ import luigi import matplotlib import logging -matplotlib.rcParams['pdf.fonttype'] = 42 -matplotlib.use('Agg') + +matplotlib.rcParams["pdf.fonttype"] = 42 +matplotlib.use("Agg") import itertools import warnings from collections import OrderedDict from matplotlib.backends.backend_pdf import PdfPages import matplotlib.pyplot as plt import seaborn as sns -sns.set_style('ticks') + +sns.set_style("ticks") import numpy as np import pandas as pd import tools.psl import tools.sqlInterface import tools.nameConversions -logger = logging.getLogger('cat') +logger = logging.getLogger("cat") # suppress all warnings to make logging cleaner. The only warnings should be the chained assignment warning from pandas # as well as the bottom == top when plots have no data. -warnings.filterwarnings('ignore') +warnings.filterwarnings("ignore") bar_width = 0.45 boxplot_saturation = 0.7 @@ -46,35 +48,45 @@ def generate_plots(args): # hack to bring coding to the top try: - biotypes.insert(0, biotypes.pop(biotypes.index('protein_coding'))) + biotypes.insert(0, biotypes.pop(biotypes.index("protein_coding"))) except ValueError: pass tx_modes_plot(consensus_data, args.ordered_genomes, args.tx_modes) - tm_metrics_plot(tm_metrics, args.ordered_genomes, biotypes, transcript_biotype_map, args.tm_coverage, - args.tm_identity) + tm_metrics_plot( + tm_metrics, args.ordered_genomes, biotypes, transcript_biotype_map, args.tm_coverage, args.tm_identity + ) tm_para_plot(tm_data, args.ordered_genomes, biotypes, args.paralogy, args.unfiltered_paralogy) tm_gene_family_plot(tm_data, args.ordered_genomes, biotypes, args.gene_collapse) consensus_metrics_plot(consensus_data, args.ordered_genomes, biotypes, args.coverage, args.identity) missing_rate_plot(consensus_data, args.ordered_genomes, biotypes, args.missing) - consensus_support_plot(consensus_data, args.ordered_genomes, biotypes, - modes=['Splice Annotation Support', 'Exon Annotation Support', 'Original Introns'], - title='Reference annotation support', - tgt=args.consensus_annot_support) - consensus_support_plot(consensus_data, args.ordered_genomes, biotypes, - modes=['Splice Support', 'Exon Support'], - title='Extrinsic support', - tgt=args.consensus_extrinsic_support) - completeness_plot(consensus_data, args.ordered_genomes, biotypes, args.completeness, gene_biotype_map, - transcript_biotype_map) + consensus_support_plot( + consensus_data, + args.ordered_genomes, + biotypes, + modes=["Splice Annotation Support", "Exon Annotation Support", "Original Introns"], + title="Reference annotation support", + tgt=args.consensus_annot_support, + ) + consensus_support_plot( + consensus_data, + args.ordered_genomes, + biotypes, + modes=["Splice Support", "Exon Support"], + title="Extrinsic support", + tgt=args.consensus_extrinsic_support, + ) + completeness_plot( + consensus_data, args.ordered_genomes, biotypes, args.completeness, gene_biotype_map, transcript_biotype_map + ) indel_plot(consensus_data, args.ordered_genomes, args.indel) - if 'denovo' in args: + if "denovo" in args: denovo_plot(consensus_data, args.ordered_genomes, args.denovo) - if 'split_genes' in args: + if "split_genes" in args: split_genes_plot(tm_data, args.ordered_genomes, args.split_genes) - if 'pb_support' in args: + if "pb_support" in args: pb_support_plot(consensus_data, args.ordered_genomes, args.pb_genomes, 
args.pb_support) - if 'improvement' in args: + if "improvement" in args: improvement_plot(consensus_data, args.ordered_genomes, args.improvement) @@ -85,12 +97,12 @@ def generate_plots(args): def load_tm_metrics(dbs): """Loads transMap data from PSLs""" - tm_metrics = {'transMap Coverage': OrderedDict(), 'transMap Identity': OrderedDict()} - tm_name_map = {'TransMapCoverage': 'transMap Coverage', 'TransMapIdentity': 'transMap Identity'} + tm_metrics = {"transMap Coverage": OrderedDict(), "transMap Identity": OrderedDict()} + tm_name_map = {"TransMapCoverage": "transMap Coverage", "TransMapIdentity": "transMap Identity"} for genome, db_path in dbs.items(): session = tools.sqlInterface.start_session(db_path) table = tools.sqlInterface.TmEval - for classifier in ['TransMapCoverage', 'TransMapIdentity']: + for classifier in ["TransMapCoverage", "TransMapIdentity"]: query = session.query(table.AlignmentId, table.value).filter(table.classifier == classifier) tm_metrics[tm_name_map[classifier]][genome] = dict(query.all()) return tm_metrics @@ -103,32 +115,31 @@ def load_tm_metrics(dbs): def tm_metrics_plot(tm_metrics, ordered_genomes, biotypes, transcript_biotype_map, tm_coverage_tgt, tm_identity_tgt): """plots for transMap coverage, identity""" - tm_iter = list(zip(*[['transMap Coverage', 'transMap Identity'], - [tm_coverage_tgt, tm_identity_tgt]])) + tm_iter = list(zip(*[["transMap Coverage", "transMap Identity"], [tm_coverage_tgt, tm_identity_tgt]])) for mode, tgt in tm_iter: df = dict_to_df_with_biotype(tm_metrics[mode], transcript_biotype_map) - df = pd.melt(df, id_vars='biotype', value_vars=ordered_genomes).dropna() - df.columns = ['biotype', 'genome', mode] - cov_ident_plot(biotypes, ordered_genomes, mode, tgt, df, x=mode, y='genome') + df = pd.melt(df, id_vars="biotype", value_vars=ordered_genomes).dropna() + df.columns = ["biotype", "genome", mode] + cov_ident_plot(biotypes, ordered_genomes, mode, tgt, df, x=mode, y="genome") def consensus_metrics_plot(consensus_data, ordered_genomes, biotypes, coverage_tgt, identity_tgt): """plots for consensus coverage, identity, score""" - cons_iter = list(zip(*[['Coverage', 'Identity'], - [coverage_tgt, identity_tgt]])) + cons_iter = list(zip(*[["Coverage", "Identity"], [coverage_tgt, identity_tgt]])) for mode, tgt in cons_iter: df = json_to_df_with_biotype(consensus_data, mode) - cov_ident_plot(biotypes, ordered_genomes, mode, tgt, df, x=mode, y='genome') + cov_ident_plot(biotypes, ordered_genomes, mode, tgt, df, x=mode, y="genome") def consensus_support_plot(consensus_data, ordered_genomes, biotypes, modes, title, tgt): """grouped violin plots of original intron / intron annotation / exon annotation support""" + def adjust_plot(g, this_title): g.set_xticklabels(rotation=90) g.fig.suptitle(this_title) g.fig.subplots_adjust(top=0.9) for ax in g.axes.flat: - ax.set_ylabel('Percent supported') + ax.set_ylabel("Percent supported") ax.set_ylim(-1, 101) dfs = [] @@ -138,28 +149,64 @@ def adjust_plot(g, this_title): df = df[mode] dfs.append(df) df = pd.concat(dfs, axis=1) - df = pd.melt(df, value_vars=modes, id_vars=['genome', 'biotype']) + df = pd.melt(df, value_vars=modes, id_vars=["genome", "biotype"]) af = luigi.local_target.atomic_file(tgt.path) with PdfPages(af.tmp_path) as pdf: if len(ordered_genomes) > 1: - g = sns.factorplot(data=df, y='value', x='genome', col='variable', col_wrap=2, kind='violin', sharex=True, - sharey=True, row_order=ordered_genomes, cut=0) + g = sns.factorplot( + data=df, + y="value", + x="genome", + col="variable", + 
col_wrap=2, + kind="violin", + sharex=True, + sharey=True, + row_order=ordered_genomes, + cut=0, + ) else: - g = sns.factorplot(data=df, y='value', x='variable', kind='violin', sharex=True, - sharey=True, row_order=ordered_genomes, cut=0) + g = sns.factorplot( + data=df, + y="value", + x="variable", + kind="violin", + sharex=True, + sharey=True, + row_order=ordered_genomes, + cut=0, + ) adjust_plot(g, title) multipage_close(pdf, tight_layout=False) - title += ' for {}' + title += " for {}" for biotype in biotypes: this_title = title.format(biotype) biotype_df = biotype_filter(df, biotype) if biotype_df is not None: if len(ordered_genomes) > 1: - g = sns.factorplot(data=biotype_df, y='value', x='genome', col='variable', col_wrap=2, - kind='violin', sharex=True, sharey=True, row_order=ordered_genomes, cut=0) + g = sns.factorplot( + data=biotype_df, + y="value", + x="genome", + col="variable", + col_wrap=2, + kind="violin", + sharex=True, + sharey=True, + row_order=ordered_genomes, + cut=0, + ) else: - g = sns.factorplot(data=df, y='value', x='variable', kind='violin', sharex=True, - sharey=True, row_order=ordered_genomes, cut=0) + g = sns.factorplot( + data=df, + y="value", + x="variable", + kind="violin", + sharex=True, + sharey=True, + row_order=ordered_genomes, + cut=0, + ) adjust_plot(g, this_title) multipage_close(pdf, tight_layout=False) af.move_to_final_destination() @@ -167,72 +214,97 @@ def adjust_plot(g, this_title): def tm_para_plot(tm_data, ordered_genomes, biotypes, para_tgt, unfiltered_para_tgt): """transMap paralogy plots""" - for key, tgt in [['Paralogy', para_tgt], ['UnfilteredParalogy', unfiltered_para_tgt]]: - legend_labels = ['= 1', '= 2', '= 3', '\u2265 4'] - title_string = 'Proportion of transcripts that have multiple alignments' - biotype_title_string = 'Proportion of {} transcripts that have multiple alignments' + for key, tgt in [["Paralogy", para_tgt], ["UnfilteredParalogy", unfiltered_para_tgt]]: + legend_labels = ["= 1", "= 2", "= 3", "\u2265 4"] + title_string = "Proportion of transcripts that have multiple alignments" + biotype_title_string = "Proportion of {} transcripts that have multiple alignments" df = json_biotype_nested_counter_to_df(tm_data, key) # we want a dataframe where each row is the counts, in genome order # we construct the transpose first r = [] df[key] = pd.to_numeric(df[key]) # make sure genomes are in order - df['genome'] = pd.Categorical(df['genome'], ordered_genomes, ordered=True) - df = df.sort_values('genome') - for biotype, biotype_df in df.groupby('biotype'): - for genome, genome_df in biotype_df.groupby('genome'): - high_para = genome_df[genome_df[key] >= 4]['count'].sum() - counts = dict(list(zip(genome_df[key], genome_df['count']))) + df["genome"] = pd.Categorical(df["genome"], ordered_genomes, ordered=True) + df = df.sort_values("genome") + for biotype, biotype_df in df.groupby("biotype"): + for genome, genome_df in biotype_df.groupby("genome"): + high_para = genome_df[genome_df[key] >= 4]["count"].sum() + counts = dict(list(zip(genome_df[key], genome_df["count"]))) r.append([biotype, genome, counts.get(1, 0), counts.get(2, 0), counts.get(3, 0), high_para]) - df = pd.DataFrame(r, columns=['biotype', 'genome', '1', '2', '3', '\u2265 4']) - sum_df = df.groupby('genome', sort=False).aggregate(sum).T + df = pd.DataFrame(r, columns=["biotype", "genome", "1", "2", "3", "\u2265 4"]) + sum_df = df.groupby("genome", sort=False).aggregate(sum).T plot_fn = generic_unstacked_barplot if len(df.columns) <= 5 else generic_stacked_barplot - 
box_label = 'Number of\nalignments' + box_label = "Number of\nalignments" af = luigi.local_target.atomic_file(tgt.path) with PdfPages(af.tmp_path) as pdf: - plot_fn(sum_df, pdf, title_string, legend_labels, 'Number of transcripts', ordered_genomes, box_label) + plot_fn(sum_df, pdf, title_string, legend_labels, "Number of transcripts", ordered_genomes, box_label) for biotype in biotypes: biotype_df = biotype_filter(df, biotype) if biotype_df is not None: - biotype_df = biotype_df.drop(['genome', 'biotype'], axis=1).T + biotype_df = biotype_df.drop(["genome", "biotype"], axis=1).T title_string = biotype_title_string.format(biotype) - plot_fn(biotype_df, pdf, title_string, legend_labels, 'Number of transcripts', ordered_genomes, - box_label) + plot_fn( + biotype_df, + pdf, + title_string, + legend_labels, + "Number of transcripts", + ordered_genomes, + box_label, + ) af.move_to_final_destination() def tm_gene_family_plot(tm_data, ordered_genomes, biotypes, gene_family_tgt): """transMap gene family collapse plots.""" try: - df = json_biotype_nested_counter_to_df(tm_data, 'Gene Family Collapse') + df = json_biotype_nested_counter_to_df(tm_data, "Gene Family Collapse") except ValueError: # no gene family collapse. probably the test set. - with gene_family_tgt.open('wb') as outf: + with gene_family_tgt.open("wb") as outf: pass return - df['Gene Family Collapse'] = pd.to_numeric(df['Gene Family Collapse']) - tot_df = df[['Gene Family Collapse', 'genome', 'count']].\ - groupby(['genome', 'Gene Family Collapse']).aggregate(sum).reset_index() - tot_df = tot_df.sort_values('Gene Family Collapse') + df["Gene Family Collapse"] = pd.to_numeric(df["Gene Family Collapse"]) + tot_df = ( + df[["Gene Family Collapse", "genome", "count"]] + .groupby(["genome", "Gene Family Collapse"]) + .aggregate(sum) + .reset_index() + ) + tot_df = tot_df.sort_values("Gene Family Collapse") af = luigi.local_target.atomic_file(gene_family_tgt.path) with PdfPages(af.tmp_path) as pdf: - g = sns.factorplot(y='count', col='genome', x='Gene Family Collapse', data=tot_df, kind='bar', - col_order=ordered_genomes, col_wrap=4) - g.fig.suptitle('Number of genes collapsed during gene family collapse') - g.set_xlabels('Number of genes collapsed to one locus') - g.set_ylabels('Number of genes') + g = sns.factorplot( + y="count", + col="genome", + x="Gene Family Collapse", + data=tot_df, + kind="bar", + col_order=ordered_genomes, + col_wrap=4, + ) + g.fig.suptitle("Number of genes collapsed during gene family collapse") + g.set_xlabels("Number of genes collapsed to one locus") + g.set_ylabels("Number of genes") g.fig.subplots_adjust(top=0.9) multipage_close(pdf, tight_layout=False) for biotype in biotypes: biotype_df = biotype_filter(df, biotype) if biotype_df is None: continue - biotype_df = biotype_df.sort_values('Gene Family Collapse') - g = sns.factorplot(y='count', col='genome', x='Gene Family Collapse', data=biotype_df, kind='bar', - col_order=[x for x in ordered_genomes if x in set(biotype_df.genome)], col_wrap=4) - g.fig.suptitle('Number of genes collapsed during gene family collapse for {}'.format(biotype)) - g.set_xlabels('Number of genes collapsed to one locus') - g.set_ylabels('Number of genes') + biotype_df = biotype_df.sort_values("Gene Family Collapse") + g = sns.factorplot( + y="count", + col="genome", + x="Gene Family Collapse", + data=biotype_df, + kind="bar", + col_order=[x for x in ordered_genomes if x in set(biotype_df.genome)], + col_wrap=4, + ) + g.fig.suptitle("Number of genes collapsed during gene family collapse 
for {}".format(biotype)) + g.set_xlabels("Number of genes collapsed to one locus") + g.set_ylabels("Number of genes") g.fig.subplots_adjust(top=0.9) multipage_close(pdf, tight_layout=False) af.move_to_final_destination() @@ -240,58 +312,87 @@ def tm_gene_family_plot(tm_data, ordered_genomes, biotypes, gene_family_tgt): def missing_rate_plot(consensus_data, ordered_genomes, biotypes, missing_plot_tgt): """Missing genes/transcripts""" - base_title = 'Number of missing orthologs in consensus set' - gene_missing_df = json_biotype_counter_to_df(consensus_data, 'Gene Missing') - gene_missing_df.columns = ['biotype', 'Genes', 'genome'] - transcript_missing_df = json_biotype_counter_to_df(consensus_data, 'Transcript Missing') - transcript_missing_df.columns = ['biotype', 'Transcripts', 'genome'] - df = transcript_missing_df.merge(gene_missing_df, on=['genome', 'biotype']) - df = pd.melt(df, id_vars=['biotype', 'genome']) - ylabel = 'Number of genes or transcripts' + base_title = "Number of missing orthologs in consensus set" + gene_missing_df = json_biotype_counter_to_df(consensus_data, "Gene Missing") + gene_missing_df.columns = ["biotype", "Genes", "genome"] + transcript_missing_df = json_biotype_counter_to_df(consensus_data, "Transcript Missing") + transcript_missing_df.columns = ["biotype", "Transcripts", "genome"] + df = transcript_missing_df.merge(gene_missing_df, on=["genome", "biotype"]) + df = pd.melt(df, id_vars=["biotype", "genome"]) + ylabel = "Number of genes or transcripts" af = luigi.local_target.atomic_file(missing_plot_tgt.path) with PdfPages(af.tmp_path) as pdf: - tot_df = df.groupby(['genome', 'biotype', 'variable']).aggregate(sum).reset_index() - generic_barplot(tot_df, pdf, '', ylabel, base_title, x='genome', y='value', - col='variable', row_order=ordered_genomes) + tot_df = df.groupby(["genome", "biotype", "variable"]).aggregate(sum).reset_index() + generic_barplot( + tot_df, pdf, "", ylabel, base_title, x="genome", y="value", col="variable", row_order=ordered_genomes + ) for biotype in biotypes: biotype_df = biotype_filter(df, biotype) if biotype_df is None: continue - biotype_df = biotype_df.groupby(['genome', 'variable']).aggregate(sum).reset_index() - title = base_title + ' for biotype {}'.format(biotype) - generic_barplot(biotype_df, pdf, '', ylabel, title, x='genome', y='value', - col='variable', row_order=ordered_genomes) + biotype_df = biotype_df.groupby(["genome", "variable"]).aggregate(sum).reset_index() + title = base_title + " for biotype {}".format(biotype) + generic_barplot( + biotype_df, pdf, "", ylabel, title, x="genome", y="value", col="variable", row_order=ordered_genomes + ) af.move_to_final_destination() def tx_modes_plot(consensus_data, ordered_genomes, tx_mode_plot_tgt): - ordered_groups = ['transMap', 'transMap+TM', 'transMap+TMR', 'transMap+TM+TMR', 'TM', 'TMR', 'TM+TMR', 'CGP', 'PB', - 'exRef', 'other'] - ordered_groups = OrderedDict([[frozenset(x.split('+')), x] for x in ordered_groups]) + ordered_groups = [ + "transMap", + "transMap+TM", + "transMap+TMR", + "transMap+TM+TMR", + "TM", + "TMR", + "TM+TMR", + "CGP", + "PB", + "exRef", + "other", + ] + ordered_groups = OrderedDict([[frozenset(x.split("+")), x] for x in ordered_groups]) def split_fn(s): - return ordered_groups.get(frozenset(s['Transcript Modes'].replace('aug', '').split(',')), 'Other') + return ordered_groups.get(frozenset(s["Transcript Modes"].replace("aug", "").split(",")), "Other") - modes_df = json_biotype_counter_to_df(consensus_data, 'Transcript Modes') - df = 
modes_df.pivot(index='genome', columns='Transcript Modes').transpose().reset_index() - df['Modes'] = df.apply(split_fn, axis=1) - df = df[['Modes'] + ordered_genomes] - ordered_values = [x for x in ordered_groups.values() if x in set(df['Modes'])] + modes_df = json_biotype_counter_to_df(consensus_data, "Transcript Modes") + df = modes_df.pivot(index="genome", columns="Transcript Modes").transpose().reset_index() + df["Modes"] = df.apply(split_fn, axis=1) + df = df[["Modes"] + ordered_genomes] + ordered_values = [x for x in ordered_groups.values() if x in set(df["Modes"])] af = luigi.local_target.atomic_file(tx_mode_plot_tgt.path) with PdfPages(af.tmp_path) as pdf: - title_string = 'Transcript modes in protein coding consensus gene set' - ylabel = 'Number of transcripts' + title_string = "Transcript modes in protein coding consensus gene set" + ylabel = "Number of transcripts" if len(ordered_genomes) > 1: - df['Ordered Modes'] = pd.Categorical(df['Modes'], ordered_values, ordered=True) - df = df.sort_values('Ordered Modes') - df = df[['Ordered Modes'] + ordered_genomes].set_index('Ordered Modes') + df["Ordered Modes"] = pd.Categorical(df["Modes"], ordered_values, ordered=True) + df = df.sort_values("Ordered Modes") + df = df[["Ordered Modes"] + ordered_genomes].set_index("Ordered Modes") df = df.fillna(0) - generic_stacked_barplot(df, pdf, title_string, df.index, ylabel, ordered_genomes, 'Transcript mode(s)', - bbox_to_anchor=(1.25, 0.7)) + generic_stacked_barplot( + df, + pdf, + title_string, + df.index, + ylabel, + ordered_genomes, + "Transcript mode(s)", + bbox_to_anchor=(1.25, 0.7), + ) else: - generic_barplot(pd.melt(df, id_vars='Modes'), pdf, 'Transcript mode(s)', ylabel, title_string, x='Modes', - y='value', order=ordered_values) + generic_barplot( + pd.melt(df, id_vars="Modes"), + pdf, + "Transcript mode(s)", + ylabel, + title_string, + x="Modes", + y="value", + order=ordered_values, + ) af.move_to_final_destination() @@ -299,28 +400,46 @@ def denovo_plot(consensus_data, ordered_genomes, denovo_tgt): af = luigi.local_target.atomic_file(denovo_tgt.path) with PdfPages(af.tmp_path) as pdf: try: - df = json_biotype_nested_counter_to_df(consensus_data, 'denovo') + df = json_biotype_nested_counter_to_df(consensus_data, "denovo") except ValueError: # No de novo results. Probably the test set. 
return # fix column names because json_biotype_nested_counter_to_df makes assumptions - df.columns = ['Result', 'Number of transcripts', 'Augustus mode', 'genome'] - has_pb = len(set(df['Augustus mode'])) == 2 + df.columns = ["Result", "Number of transcripts", "Augustus mode", "genome"] + has_pb = len(set(df["Augustus mode"])) == 2 if len(set(df.genome)) > 1: # if we ran in PB only, we may not have multiple genomes if has_pb is True: - ax = sns.factorplot(data=df, x='genome', y='Number of transcripts', kind='bar', col='Result', - hue='Augustus mode', col_wrap=2, row_order=ordered_genomes, sharex=True, - sharey=False) + ax = sns.factorplot( + data=df, + x="genome", + y="Number of transcripts", + kind="bar", + col="Result", + hue="Augustus mode", + col_wrap=2, + row_order=ordered_genomes, + sharex=True, + sharey=False, + ) else: - ax = sns.factorplot(data=df, x='genome', y='Number of transcripts', kind='bar', col='Result', - col_wrap=2, row_order=ordered_genomes, sharex=True, sharey=False) + ax = sns.factorplot( + data=df, + x="genome", + y="Number of transcripts", + kind="bar", + col="Result", + col_wrap=2, + row_order=ordered_genomes, + sharex=True, + sharey=False, + ) else: if has_pb is True: - ax = sns.factorplot(data=df, x='Result', y='Number of transcripts', kind='bar', hue='Augustus mode') + ax = sns.factorplot(data=df, x="Result", y="Number of transcripts", kind="bar", hue="Augustus mode") else: - ax = sns.factorplot(data=df, x='Result', y='Number of transcripts', kind='bar') + ax = sns.factorplot(data=df, x="Result", y="Number of transcripts", kind="bar") ax.set_xticklabels(rotation=90) - ax.fig.suptitle('Incorporation of de-novo predictions') + ax.fig.suptitle("Incorporation of de-novo predictions") ax.fig.subplots_adjust(top=0.9) multipage_close(pdf, tight_layout=False) af.move_to_final_destination() @@ -329,16 +448,33 @@ def denovo_plot(consensus_data, ordered_genomes, denovo_tgt): def split_genes_plot(tm_data, ordered_genomes, split_plot_tgt): af = luigi.local_target.atomic_file(split_plot_tgt.path) with PdfPages(af.tmp_path) as pdf: - df = json_biotype_counter_to_df(tm_data, 'Split Genes') - df.columns = ['category', 'count', 'genome'] - title = 'Split genes' + df = json_biotype_counter_to_df(tm_data, "Split Genes") + df.columns = ["category", "count", "genome"] + title = "Split genes" if len(ordered_genomes) > 1: - g = generic_barplot(pdf=pdf, data=df, x='genome', y='count', col='category', xlabel='', col_wrap=2, - sharey=False, ylabel='Number of transcripts or genes', row_order=ordered_genomes, - title=title) + g = generic_barplot( + pdf=pdf, + data=df, + x="genome", + y="count", + col="category", + xlabel="", + col_wrap=2, + sharey=False, + ylabel="Number of transcripts or genes", + row_order=ordered_genomes, + title=title, + ) else: - g = generic_barplot(pdf=pdf, data=df, x='category', y='count', ylabel='Number of transcripts or genes', - title=title, xlabel='Category') + g = generic_barplot( + pdf=pdf, + data=df, + x="category", + y="count", + ylabel="Number of transcripts or genes", + title=title, + xlabel="Category", + ) af.move_to_final_destination() @@ -346,38 +482,56 @@ def pb_support_plot(consensus_data, ordered_genomes, pb_genomes, pb_support_tgt) af = luigi.local_target.atomic_file(pb_support_tgt.path) with PdfPages(af.tmp_path) as pdf: pb_genomes = [x for x in ordered_genomes if x in pb_genomes] # fix order - df = json_biotype_counter_to_df(consensus_data, 'IsoSeq Transcript Validation') + df = json_biotype_counter_to_df(consensus_data, "IsoSeq Transcript 
Validation") if len(df) == 0: # no support information return - df.columns = ['IsoSeq Transcript Validation', 'Number of transcripts', 'genome'] - ax = sns.factorplot(data=df, x='genome', y='Number of transcripts', hue='IsoSeq Transcript Validation', - kind='bar', row_order=pb_genomes) + df.columns = ["IsoSeq Transcript Validation", "Number of transcripts", "genome"] + ax = sns.factorplot( + data=df, + x="genome", + y="Number of transcripts", + hue="IsoSeq Transcript Validation", + kind="bar", + row_order=pb_genomes, + ) ax.set_xticklabels(rotation=90) - ax.fig.suptitle('Isoforms validated by at least one IsoSeq read') + ax.fig.suptitle("Isoforms validated by at least one IsoSeq read") multipage_close(pdf, tight_layout=False) af.move_to_final_destination() -def completeness_plot(consensus_data, ordered_genomes, biotypes, completeness_plot_tgt, gene_biotype_map, - transcript_biotype_map): +def completeness_plot( + consensus_data, ordered_genomes, biotypes, completeness_plot_tgt, gene_biotype_map, transcript_biotype_map +): def adjust_plot(g, gene_count, tx_count): for ax, c in zip(*[g.axes[0], [gene_count, tx_count]]): _ = ax.set_ylim(0, c) - ax.spines['top'].set_edgecolor('#e74c3c') - ax.spines['top'].set_linewidth(2) - ax.spines['top'].set_visible(True) - ax.spines['top'].set_linestyle('dashed') + ax.spines["top"].set_edgecolor("#e74c3c") + ax.spines["top"].set_linewidth(2) + ax.spines["top"].set_visible(True) + ax.spines["top"].set_linestyle("dashed") - df = json_grouped_biotype_nested_counter_to_df(consensus_data, 'Completeness') + df = json_grouped_biotype_nested_counter_to_df(consensus_data, "Completeness") af = luigi.local_target.atomic_file(completeness_plot_tgt.path) with PdfPages(af.tmp_path) as pdf: - tot_df = df.groupby(by=['genome', 'category']).aggregate(np.sum).reset_index() + tot_df = df.groupby(by=["genome", "category"]).aggregate(np.sum).reset_index() tot_df = sort_long_df(tot_df, ordered_genomes) - title = 'Number of comparative genes/transcripts present' - g = generic_barplot(pdf=pdf, data=tot_df, x='genome', y='count', col='category', xlabel='', - sharey=False, ylabel='Number of genes/transcripts', title=title, - col_order=['Gene', 'Transcript'], close=False, palette=choose_palette(ordered_genomes)) + title = "Number of comparative genes/transcripts present" + g = generic_barplot( + pdf=pdf, + data=tot_df, + x="genome", + y="count", + col="category", + xlabel="", + sharey=False, + ylabel="Number of genes/transcripts", + title=title, + col_order=["Gene", "Transcript"], + close=False, + palette=choose_palette(ordered_genomes), + ) adjust_plot(g, len(gene_biotype_map), len(transcript_biotype_map)) multipage_close(pdf, tight_layout=False) for biotype in biotypes: @@ -386,65 +540,114 @@ def adjust_plot(g, gene_count, tx_count): biotype_df = sort_long_df(biotype_df, ordered_genomes) gene_biotype_count = len({i for i, b in gene_biotype_map.items() if b == biotype}) tx_biotype_count = len({i for i, b in transcript_biotype_map.items() if b == biotype}) - title = 'Number of comparative genes/transcripts present for biotype {}'.format(biotype) - g = generic_barplot(pdf=pdf, data=biotype_df, x='genome', y='count', col='category', xlabel='', - sharey=False, ylabel='Number of genes/transcripts', - title=title, col_order=['Gene', 'Transcript'], close=False, - palette=choose_palette(ordered_genomes)) + title = "Number of comparative genes/transcripts present for biotype {}".format(biotype) + g = generic_barplot( + pdf=pdf, + data=biotype_df, + x="genome", + y="count", + col="category", 
+ xlabel="", + sharey=False, + ylabel="Number of genes/transcripts", + title=title, + col_order=["Gene", "Transcript"], + close=False, + palette=choose_palette(ordered_genomes), + ) adjust_plot(g, gene_biotype_count, tx_biotype_count) multipage_close(pdf, tight_layout=False) af.move_to_final_destination() def improvement_plot(consensus_data, ordered_genomes, improvement_tgt): - def do_kdeplot(x, y, ax, n_levels=None, bw='scott'): + def do_kdeplot(x, y, ax, n_levels=None, bw="scott"): try: - sns.kdeplot(x, y, ax=ax, cut=0, cmap='Purples_d', shade=True, shade_lowest=False, n_levels=n_levels, bw=bw, - rasterized=True) + sns.kdeplot( + x, + y, + ax=ax, + cut=0, + cmap="Purples_d", + shade=True, + shade_lowest=False, + n_levels=n_levels, + bw=bw, + rasterized=True, + ) except: - logger.warning('Unable to do a KDE fit to AUGUSTUS improvement.') + logger.warning("Unable to do a KDE fit to AUGUSTUS improvement.") pass af = luigi.local_target.atomic_file(improvement_tgt.path) with PdfPages(af.tmp_path) as pdf, sns.axes_style("whitegrid"): for genome in ordered_genomes: - data = pd.DataFrame(consensus_data[genome]['Evaluation Improvement']['changes']) - unchanged = consensus_data[genome]['Evaluation Improvement']['unchanged'] + data = pd.DataFrame(consensus_data[genome]["Evaluation Improvement"]["changes"]) + unchanged = consensus_data[genome]["Evaluation Improvement"]["unchanged"] if len(data) == 0: continue - data.columns = ['transMap original introns', - 'transMap intron annotation support', - 'transMap intron RNA support', - 'Original introns', - 'Intron annotation support', - 'Intron RNA support', - 'transMap alignment goodness', - 'Alignment goodness'] + data.columns = [ + "transMap original introns", + "transMap intron annotation support", + "transMap intron RNA support", + "Original introns", + "Intron annotation support", + "Intron RNA support", + "transMap alignment goodness", + "Alignment goodness", + ] fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols=2, nrows=2) - for ax in [ax1, ax2, ax3, ax4]: + for ax in [ax1, ax2, ax3, ax4]: ax.set_xlim(0, 100) ax.set_ylim(0, 100) - - do_kdeplot(data['transMap original introns'], data['Original introns'], ax1, n_levels=25, bw=2) - sns.regplot(x=data['transMap original introns'], y=data['Original introns'], ax=ax1, - color='#A9B36F', scatter_kws={"s": 3, 'alpha': 0.7, 'rasterized': True}, fit_reg=False) - do_kdeplot(data['transMap intron annotation support'], data['Intron annotation support'], ax2, - n_levels=25, bw=2) - sns.regplot(x=data['transMap intron annotation support'], y=data['Intron annotation support'], ax=ax2, - color='#A9B36F', scatter_kws={"s": 3, 'alpha': 0.7, 'rasterized': True}, fit_reg=False) - do_kdeplot(data['transMap intron RNA support'], data['Intron RNA support'], ax3, n_levels=25, bw=2) - sns.regplot(x=data['transMap intron RNA support'], y=data['Intron RNA support'], ax=ax3, - color='#A9B36F', scatter_kws={"s": 3, 'alpha': 0.7, 'rasterized': True}, fit_reg=False) - - do_kdeplot(data['transMap alignment goodness'], data['Alignment goodness'], ax4, n_levels=20, bw=1) - sns.regplot(x=data['transMap alignment goodness'], y=data['Alignment goodness'], ax=ax4, - color='#A9B36F', scatter_kws={"s": 3, 'alpha': 0.7, 'rasterized': True}, fit_reg=False) - - fig.suptitle('AUGUSTUS metric improvements for {:,} transcripts in {}.\n' - '{:,} transMap transcripts were chosen.'.format(len(data), genome, unchanged)) - + + do_kdeplot(data["transMap original introns"], data["Original introns"], ax1, n_levels=25, bw=2) + sns.regplot( + 
x=data["transMap original introns"], + y=data["Original introns"], + ax=ax1, + color="#A9B36F", + scatter_kws={"s": 3, "alpha": 0.7, "rasterized": True}, + fit_reg=False, + ) + do_kdeplot( + data["transMap intron annotation support"], data["Intron annotation support"], ax2, n_levels=25, bw=2 + ) + sns.regplot( + x=data["transMap intron annotation support"], + y=data["Intron annotation support"], + ax=ax2, + color="#A9B36F", + scatter_kws={"s": 3, "alpha": 0.7, "rasterized": True}, + fit_reg=False, + ) + do_kdeplot(data["transMap intron RNA support"], data["Intron RNA support"], ax3, n_levels=25, bw=2) + sns.regplot( + x=data["transMap intron RNA support"], + y=data["Intron RNA support"], + ax=ax3, + color="#A9B36F", + scatter_kws={"s": 3, "alpha": 0.7, "rasterized": True}, + fit_reg=False, + ) + + do_kdeplot(data["transMap alignment goodness"], data["Alignment goodness"], ax4, n_levels=20, bw=1) + sns.regplot( + x=data["transMap alignment goodness"], + y=data["Alignment goodness"], + ax=ax4, + color="#A9B36F", + scatter_kws={"s": 3, "alpha": 0.7, "rasterized": True}, + fit_reg=False, + ) + + fig.suptitle( + "AUGUSTUS metric improvements for {:,} transcripts in {}.\n" + "{:,} transMap transcripts were chosen.".format(len(data), genome, unchanged) + ) + for ax in [ax1, ax2, ax3, ax4]: - ax.set(adjustable='box', aspect='equal') + ax.set(adjustable="box", aspect="equal") fig.subplots_adjust(hspace=0.3) multipage_close(pdf, tight_layout=False) af.move_to_final_destination() @@ -453,27 +656,45 @@ def do_kdeplot(x, y, ax, n_levels=None, bw='scott'): def indel_plot(consensus_data, ordered_genomes, indel_plot_tgt): af = luigi.local_target.atomic_file(indel_plot_tgt.path) with PdfPages(af.tmp_path) as pdf: - tm_df = pd.concat([pd.DataFrame.from_dict(consensus_data[genome]['transMap Indels'], orient='index').T - for genome in ordered_genomes]) + tm_df = pd.concat( + [ + pd.DataFrame.from_dict(consensus_data[genome]["transMap Indels"], orient="index").T + for genome in ordered_genomes + ] + ) try: # this is a hack to deal with weird small input datasets - tm_df['genome'] = ordered_genomes + tm_df["genome"] = ordered_genomes except: return - tm_df['transcript set'] = ['transMap'] * len(tm_df) - consensus_df = pd.concat([pd.DataFrame.from_dict(consensus_data[genome]['Consensus Indels'], orient='index').T - for genome in ordered_genomes]) - consensus_df['genome'] = ordered_genomes - consensus_df['transcript set'] = ['Consensus'] * len(consensus_df) + tm_df["transcript set"] = ["transMap"] * len(tm_df) + consensus_df = pd.concat( + [ + pd.DataFrame.from_dict(consensus_data[genome]["Consensus Indels"], orient="index").T + for genome in ordered_genomes + ] + ) + consensus_df["genome"] = ordered_genomes + consensus_df["transcript set"] = ["Consensus"] * len(consensus_df) df = pd.concat([consensus_df, tm_df]) - df = pd.melt(df, id_vars=['genome', 'transcript set'], - value_vars=['CodingDeletion', 'CodingInsertion', 'CodingMult3Indel']) - df.columns = ['Genome', 'Transcript set', 'Type', 'Percent of transcripts'] - g = sns.factorplot(data=df, x='Genome', y='Percent of transcripts', col='Transcript set', - hue='Type', kind='bar', row_order=ordered_genomes, - col_order=['transMap', 'Consensus']) + df = pd.melt( + df, + id_vars=["genome", "transcript set"], + value_vars=["CodingDeletion", "CodingInsertion", "CodingMult3Indel"], + ) + df.columns = ["Genome", "Transcript set", "Type", "Percent of transcripts"] + g = sns.factorplot( + data=df, + x="Genome", + y="Percent of transcripts", + col="Transcript set", + 
hue="Type", + kind="bar", + row_order=ordered_genomes, + col_order=["transMap", "Consensus"], + ) g.set_xticklabels(rotation=90) - g.fig.subplots_adjust(top=.8) - g.fig.suptitle('Coding indels') + g.fig.subplots_adjust(top=0.8) + g.fig.suptitle("Coding indels") multipage_close(pdf, tight_layout=False) af.move_to_final_destination() @@ -486,34 +707,66 @@ def indel_plot(consensus_data, ordered_genomes, indel_plot_tgt): def cov_ident_plot(biotypes, ordered_genomes, mode, tgt, df, x=None, y=None, xlabel=None): """violin plots for coverage and identity.""" if xlabel is None: - xlabel = 'Percent {}'.format(mode) + xlabel = "Percent {}".format(mode) af = luigi.local_target.atomic_file(tgt.path) with PdfPages(af.tmp_path) as pdf: - title = 'Overall {}'.format(mode) + title = "Overall {}".format(mode) xmin = int(min(df[mode])) horizontal_violin_plot(df, ordered_genomes, title, xlabel, pdf, x=x, y=y, xlim=(xmin, 100)) for biotype in biotypes: biotype_df = biotype_filter(df, biotype) if biotype_df is not None: - title = '{} for biotype {}'.format(mode, biotype) + title = "{} for biotype {}".format(mode, biotype) xmin = int(min(df[mode])) horizontal_violin_plot(biotype_df, ordered_genomes, title, xlabel, pdf, x=x, y=y, xlim=(xmin, 100)) af.move_to_final_destination() + ### # generic plotting functions ### -def generic_barplot(data, pdf, xlabel, ylabel, title, row_order=None, x=None, y=None, hue=None, hue_order=None, - order=None, col=None, col_wrap=None, sharex=True, sharey=True, col_order=None, palette=None, - close=True): - g = sns.factorplot(data=data, x=x, y=y, hue=hue, ci=None, kind='bar', hue_order=hue_order, row_order=row_order, - col=col, col_wrap=col_wrap, sharex=sharex, sharey=sharey, col_order=col_order, palette=palette, - order=order) +def generic_barplot( + data, + pdf, + xlabel, + ylabel, + title, + row_order=None, + x=None, + y=None, + hue=None, + hue_order=None, + order=None, + col=None, + col_wrap=None, + sharex=True, + sharey=True, + col_order=None, + palette=None, + close=True, +): + g = sns.factorplot( + data=data, + x=x, + y=y, + hue=hue, + ci=None, + kind="bar", + hue_order=hue_order, + row_order=row_order, + col=col, + col_wrap=col_wrap, + sharex=sharex, + sharey=sharey, + col_order=col_order, + palette=palette, + order=order, + ) g.set_xticklabels(rotation=90) g.fig.suptitle(title) - g.fig.subplots_adjust(top=.8) + g.fig.subplots_adjust(top=0.8) g.set_axis_labels(xlabel, ylabel) try: # depending on columns, axes could be flat or not axes = list(itertools.chain.from_iterable(g.axes)) @@ -522,7 +775,7 @@ def generic_barplot(data, pdf, xlabel, ylabel, title, row_order=None, x=None, y= for ax in axes: ax.yaxis.set_major_locator(matplotlib.ticker.MaxNLocator(nbins=10, steps=[1, 2, 5, 10], integer=True)) ax.margins(y=0.15) - ax.autoscale(enable=True, axis='y', tight=False) + ax.autoscale(enable=True, axis="y", tight=False) ax.set_ylim(0, ax.get_ylim()[1]) if close is True: multipage_close(pdf, tight_layout=False) @@ -532,8 +785,19 @@ def generic_barplot(data, pdf, xlabel, ylabel, title, row_order=None, x=None, y= def horizontal_violin_plot(data, ordered_genomes, title, xlabel, pdf, hue=None, x=None, y=None, xlim=None): """not so generic function that specifically produces a paired boxplot/violinplot""" fig, ax = plt.subplots() - sns.violinplot(data=data, x=x, y=y, hue=hue, order=ordered_genomes, palette=choose_palette(ordered_genomes), - saturation=boxplot_saturation, orient='h', cut=0, scale='count', ax=ax) + sns.violinplot( + data=data, + x=x, + y=y, + hue=hue, + 
order=ordered_genomes, + palette=choose_palette(ordered_genomes), + saturation=boxplot_saturation, + orient="h", + cut=0, + scale="count", + ax=ax, + ) fig.suptitle(title) ax.set_xlabel(xlabel) if xlim is not None: @@ -542,8 +806,9 @@ def horizontal_violin_plot(data, ordered_genomes, title, xlabel, pdf, hue=None, def _generic_histogram(bars, legend_labels, title_string, pdf, ax, fig, ylabel, names, box_label, bbox_to_anchor): - fig.legend([x[0] for x in bars[::-1]], legend_labels[::-1], bbox_to_anchor=bbox_to_anchor, frameon=True, - title=box_label) + fig.legend( + [x[0] for x in bars[::-1]], legend_labels[::-1], bbox_to_anchor=bbox_to_anchor, frameon=True, title=box_label + ) ax.set_title(title_string) ax.set_ylabel(ylabel) set_ticks(names, ax) @@ -552,14 +817,22 @@ def _generic_histogram(bars, legend_labels, title_string, pdf, ax, fig, ylabel, multipage_close(pdf) -def generic_unstacked_barplot(df, pdf, title_string, legend_labels, ylabel, names, box_label, - bbox_to_anchor=(1.12, 0.7)): +def generic_unstacked_barplot( + df, pdf, title_string, legend_labels, ylabel, names, box_label, bbox_to_anchor=(1.12, 0.7) +): fig, ax = plt.subplots() bars = [] shorter_bar_width = bar_width / len(df) for i, (_, d) in enumerate(df.iterrows()): - bars.append(ax.bar(np.arange(len(df.columns)) + shorter_bar_width * i, d, shorter_bar_width, - color=sns.color_palette()[i], linewidth=0.0)) + bars.append( + ax.bar( + np.arange(len(df.columns)) + shorter_bar_width * i, + d, + shorter_bar_width, + color=sns.color_palette()[i], + linewidth=0.0, + ) + ) _generic_histogram(bars, legend_labels, title_string, pdf, ax, fig, ylabel, names, box_label, bbox_to_anchor) @@ -569,8 +842,9 @@ def generic_stacked_barplot(df, pdf, title_string, legend_labels, ylabel, names, cumulative = np.zeros(len(df.columns)) color_palette = choose_palette(legend_labels) for i, (_, d) in enumerate(df.iterrows()): - bars.append(ax.bar(np.arange(len(df.columns)), d, bar_width, bottom=cumulative, - color=color_palette[i], linewidth=0.0)) + bars.append( + ax.bar(np.arange(len(df.columns)), d, bar_width, bottom=cumulative, color=color_palette[i], linewidth=0.0) + ) cumulative += d _generic_histogram(bars, legend_labels, title_string, pdf, ax, fig, ylabel, names, box_label, bbox_to_anchor) @@ -609,7 +883,7 @@ def json_biotype_nested_counter_to_df(consensus_data, key): for biotype, vals in d[key].items(): df = pd.DataFrame(list(vals.items())) if len(df) > 0: - df.columns = [key, 'count'] + df.columns = [key, "count"] df = df.assign(biotype=[biotype] * len(df), genome=[genome] * len(df)) dfs.append(df) return pd.concat(dfs) @@ -622,7 +896,7 @@ def json_grouped_biotype_nested_counter_to_df(consensus_data, key): for group, vals in d[key].items(): df = pd.DataFrame(list(vals.items())) if len(df) > 0: - df.columns = ['biotype', 'count'] + df.columns = ["biotype", "count"] df = df.assign(category=[group] * len(df), genome=[genome] * len(df)) dfs.append(df) return pd.concat(dfs) @@ -635,7 +909,7 @@ def json_biotype_counter_to_df(consensus_data, key): vals = consensus_data[genome][key] df = pd.DataFrame(list(vals.items())) if len(df) > 0: - df.columns = [key, 'count'] + df.columns = [key, "count"] df = df.assign(genome=[genome] * len(df)) dfs.append(df) return pd.concat(dfs) @@ -644,10 +918,10 @@ def json_biotype_counter_to_df(consensus_data, key): def dict_to_df_with_biotype(data, transcript_biotype_map): df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data.items()])) try: - df['biotype'] = [transcript_biotype_map[tx] for tx in df.index] + 
df["biotype"] = [transcript_biotype_map[tx] for tx in df.index] except KeyError: # try removing names - df['biotype'] = [transcript_biotype_map[tools.nameConversions.strip_alignment_numbers(tx)] for tx in df.index] + df["biotype"] = [transcript_biotype_map[tools.nameConversions.strip_alignment_numbers(tx)] for tx in df.index] return df @@ -660,8 +934,8 @@ def multipage_close(pdf, tight_layout=True): """convenience function for closing up a pdf page""" if tight_layout: plt.tight_layout() - pdf.savefig(bbox_inches='tight') - plt.close('all') + pdf.savefig(bbox_inches="tight") + plt.close("all") def choose_palette(ordered_genomes): @@ -674,7 +948,7 @@ def choose_palette(ordered_genomes): def set_ticks(names, ax, nbins=10.0): ax.margins(y=0.15) - ax.autoscale(enable=True, axis='y', tight=False) + ax.autoscale(enable=True, axis="y", tight=False) ax.set_ylim(0, plt.ylim()[1]) ax.yaxis.set_major_locator(matplotlib.ticker.MaxNLocator(nbins=nbins, steps=[1, 2, 5, 10], integer=True)) ax.xaxis.set_major_locator(matplotlib.ticker.LinearLocator(len(names))) @@ -685,6 +959,6 @@ def set_ticks(names, ax, nbins=10.0): def sort_long_df(df, ordered_genomes): """sorts a long form dataframe by ordered genomes""" ordered_index = dict(list(zip(ordered_genomes, list(range(len(ordered_genomes)))))) - df['order'] = df['genome'].map(ordered_index) - df = df.sort_values('order') - return df.drop('order', axis=1) + df["order"] = df["genome"].map(ordered_index) + df = df.sort_values("order") + return df.drop("order", axis=1) diff --git a/cat/transmap_classify.py b/cat/transmap_classify.py index 4ca7f853..f33cf13d 100644 --- a/cat/transmap_classify.py +++ b/cat/transmap_classify.py @@ -29,6 +29,7 @@ import tools.procOps import tools.tm2hints import tools.mathOps +import tools.intervals def transmap_classify(tm_eval_args): @@ -51,21 +52,21 @@ def transmap_classify(tm_eval_args): tx_id = tools.nameConversions.strip_alignment_numbers(aln_id) ref_aln = ref_psl_dict[tx_id] gene_id = ref_gp_dict[tx_id].name2 - r.append([aln_id, tx_id, gene_id, 'AlnExtendsOffContig', aln_extends_off_contig(aln)]) - r.append([aln_id, tx_id, gene_id, 'AlnPartialMap', alignment_partial_map(aln)]) - r.append([aln_id, tx_id, gene_id, 'AlnAbutsUnknownBases', aln_abuts_unknown_bases(tx, fasta)]) - r.append([aln_id, tx_id, gene_id, 'PercentN', aln.percent_n]) - r.append([aln_id, tx_id, gene_id, 'TransMapCoverage', 100 * aln.coverage]) - r.append([aln_id, tx_id, gene_id, 'TransMapIdentity', 100 * aln.identity]) - r.append([aln_id, tx_id, gene_id, 'TransMapGoodness', 100 * (1 - aln.badness)]) - r.append([aln_id, tx_id, gene_id, 'TransMapOriginalIntronsPercent', percent_original_introns(aln, tx, ref_aln)]) - r.append([aln_id, tx_id, gene_id, 'Synteny', synteny_scores[aln_id]]) - r.append([aln_id, tx_id, gene_id, 'ValidStart', tools.transcripts.has_start_codon(fasta, tx)]) - r.append([aln_id, tx_id, gene_id, 'ValidStop', tools.transcripts.has_stop_codon(fasta, tx)]) - r.append([aln_id, tx_id, gene_id, 'ProperOrf', tx.cds_size % 3 == 0]) - df = pd.DataFrame(r, columns=['AlignmentId', 'TranscriptId', 'GeneId', 'classifier', 'value']) + r.append([aln_id, tx_id, gene_id, "AlnExtendsOffContig", aln_extends_off_contig(aln)]) + r.append([aln_id, tx_id, gene_id, "AlnPartialMap", alignment_partial_map(aln)]) + r.append([aln_id, tx_id, gene_id, "AlnAbutsUnknownBases", aln_abuts_unknown_bases(tx, fasta)]) + r.append([aln_id, tx_id, gene_id, "PercentN", aln.percent_n]) + r.append([aln_id, tx_id, gene_id, "TransMapCoverage", 100 * aln.coverage]) + r.append([aln_id, 
tx_id, gene_id, "TransMapIdentity", 100 * aln.identity]) + r.append([aln_id, tx_id, gene_id, "TransMapGoodness", 100 * (1 - aln.badness)]) + r.append([aln_id, tx_id, gene_id, "TransMapOriginalIntronsPercent", percent_original_introns(aln, tx, ref_aln)]) + r.append([aln_id, tx_id, gene_id, "Synteny", synteny_scores[aln_id]]) + r.append([aln_id, tx_id, gene_id, "ValidStart", tools.transcripts.has_start_codon(fasta, tx)]) + r.append([aln_id, tx_id, gene_id, "ValidStop", tools.transcripts.has_stop_codon(fasta, tx)]) + r.append([aln_id, tx_id, gene_id, "ProperOrf", tx.cds_size % 3 == 0]) + df = pd.DataFrame(r, columns=["AlignmentId", "TranscriptId", "GeneId", "classifier", "value"]) df.value = pd.to_numeric(df.value) - return df.set_index(['GeneId', 'TranscriptId', 'AlignmentId', 'classifier']) + return df.set_index(["GeneId", "TranscriptId", "AlignmentId", "classifier"]) ### @@ -110,7 +111,7 @@ def aln_abuts_unknown_bases(tx, fasta): Do any exons in this alignment immediately touch Ns? :param tx: a GenePredTranscript object - :param fasta: pyfasta Fasta object for genome + :param fasta: pyfaidx Fasta object for genome :return: boolean """ chrom = tx.chromosome @@ -123,7 +124,7 @@ def aln_abuts_unknown_bases(tx, fasta): right_base = None else: right_base = fasta[chrom][exon.stop] - if left_base == 'N' or right_base == 'N': + if left_base == "N" or right_base == "N": return True return False @@ -136,6 +137,7 @@ def synteny(ref_gp_dict, gp_dict): :param gp_dict: Dictionary of GenePredTranscript objects from the transMap output :return: """ + def create_interval_dict(tx_dict): """ Creates a dict mapping chromosome sequences to gene intervals [chrom][gene_id]: [list of tx intervals] @@ -151,7 +153,7 @@ def merge_interval_dict(interval_dict): merged_interval_dict = collections.defaultdict(dict) for chrom in interval_dict: for gene_id, gene_intervals in interval_dict[chrom].items(): - merged_intervals = tools.intervals.gap_merge_intervals(gene_intervals, float('inf')) + merged_intervals = tools.intervals.gap_merge_intervals(gene_intervals, float("inf")) assert len(merged_intervals) == 1 merged_interval = merged_intervals[0] merged_interval.data = gene_id @@ -187,12 +189,12 @@ def make_ref_interval_map(ref_intervals): # find the genes from -5 to +5 in the target genome target_intervals = tm_chrom_intervals[tx.chromosome] target_position = bisect.bisect_left(target_intervals, tx.interval) - target_genes = {x.data for x in target_intervals[target_position - 5: target_position + 5]} + target_genes = {x.data for x in target_intervals[target_position - 5 : target_position + 5]} # find the same gene list in the reference genome ref_interval = ref_interval_map[tx.name2] ref_intervals = ref_chrom_intervals[ref_interval.chromosome] ref_position = bisect.bisect_left(ref_intervals, ref_interval) - reference_genes = {x.data for x in ref_intervals[ref_position - 5: ref_position + 5]} + reference_genes = {x.data for x in ref_intervals[ref_position - 5 : ref_position + 5]} scores[tx.name] = len(reference_genes & target_genes) return scores diff --git a/programs/cat_parse_ncbi_gff3 b/programs/cat_parse_ncbi_gff3 new file mode 100755 index 00000000..6d156d55 --- /dev/null +++ b/programs/cat_parse_ncbi_gff3 @@ -0,0 +1,110 @@ +#!/usr/bin/env python +""" +Convert a eukaryotic NCBI GFF3 to be CAT compatible +""" +import gffutils +import argparse +from copy import deepcopy + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('input_gff3', help='NCBI GFF3 file') + 
parser.add_argument('output_gff3', help='Output GFF3') + return parser.parse_args() + + +def update_gene_attributes(gene): + """ + Updates attributes for genes. Assumes that the gene is tagged by /gene, and may or may not have a gene_id. + """ + gene.attributes["gene_name"] = gene.attributes["gene"] + del gene.attributes["gene"] + + if "gene_id" not in gene.attributes: + gene.attributes["gene_id"] = gene.attributes["gene_name"] + assert "gene_biotype" in gene.attributes + + +def update_transcript_attributes(gene, transcript, i): + """ + Updates attributes for transcripts. Uses information from the gene. + + Assumes that transcript_biotype == gene_biotype. + + Ensures that the transcript level feature has the required keys: + + 1. gene_id + 2. gene_name + 3. transcript_id + 4. transcript_name + 5. transcript_biotype + 6. gene_biotype + + Args: + gene: Current gene. + transcript: Current transcript. + i: Number of this transcript, used to create unique identifiers if necessary. + """ + if "transcript_id" not in transcript.attributes: + transcript.attributes["transcript_id"] = [f"transcript-{transcript.id.replace('id-', '')}-{i}"] + + # propagate gene information to transcript information + transcript.attributes["transcript_name"] = gene.attributes["gene_name"] + transcript.attributes["gene_id"] = gene.attributes["gene_id"] + transcript.attributes["gene_name"] = gene.attributes["gene_name"] + + transcript.attributes["gene_biotype"] = gene.attributes["gene_biotype"] + transcript.attributes["transcript_biotype"] = gene.attributes["gene_biotype"] + + +def infer_transcript(gene, i): + """ + Creates a transcript from a gene. Sets up relationships. + + Args: + gene: A feature. + i: Index of this transcript. + + Returns: + A new copy of the Feature converted to be a transcript. 
+ """ + tx = deepcopy(gene) + tx.featuretype = "transcript" + tx.attributes["Parent"] = gene.attributes["ID"] + new_id = [f"transcript-{tx.attributes['ID'][0].replace('id-', '').replace('gene-', '')}-{i}"] + tx.attributes["ID"] = tx.attributes["Name"] = tx.attributes["transcript_id"] = tx.attributes["transcript_name"] = new_id + tx.attributes["transcript_biotype"] = tx.attributes["gene_biotype"] + + return tx + + +if __name__ == "__main__": + args = parse_args() + db = gffutils.create_db(args.input_gff3, dbfn=":memory:", merge_strategy="create_unique") + with open(args.output_gff3, "w") as fh: + print("##gff-version 3", file=fh) + + for gene in db.features_of_type(["gene", "pseudogene"]): + + update_gene_attributes(gene) + print(gene, file=fh) + + for i, transcript in enumerate(db.children(gene, level=1), 1): + + if transcript.featuretype not in ["CDS", "exon"]: + update_transcript_attributes(gene, transcript, i) + print(transcript, file=fh) + + for exon_or_cds in db.children(transcript, level=1): + print(exon_or_cds, file=fh) + + else: + # CDS/exon are a direct child of a gene; infer a transcript feature + inferred_transcript = infer_transcript(gene, i) + print(inferred_transcript, file=fh) + + # now walk ALL children of the gene, update their Parent, and print + for exon_or_cds in db.children(gene, level=1): + exon_or_cds.attributes["Parent"] = inferred_transcript.attributes["ID"] + print(exon_or_cds, file=fh) diff --git a/programs/validate_gff3 b/programs/validate_gff3 index 1d16f1bf..5cca61a3 100755 --- a/programs/validate_gff3 +++ b/programs/validate_gff3 @@ -30,8 +30,19 @@ if __name__ == '__main__': c[l[0]] += 1 duplicates = {x for x, y in c.items() if y > 1} assert len(duplicates) == 0, 'Found {} duplicate genes: {}'.format(len(duplicates), '\n'.join(duplicates)) + df = tools.gff3.parse_gff3(attrs, gp) tx_dict = tools.transcripts.get_gene_pred_dict(gp) + + grouped_genes = tools.transcripts.group_transcripts_by_name2(tx_dict.values()) + multi_chrom_genes = [] + for gene_id, txs in grouped_genes.items(): + if len({x.chromosome for x in txs}) != 1: + multi_chrom_genes.append(gene_id) + if len(multi_chrom_genes) > 0: + raise Exception('Found {} genes on multiple chromosomes. ' + 'This is not allowed. 
Genes: {}'.format(len(multi_chrom_genes), ",".join(multi_chrom_genes))) + assert len(tx_dict) == len(df) exons = {len(x.exon_intervals) for x in tx_dict.values()} if len(exons) == 1: diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..02bab777 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,4 @@ +[tool.black] +line-length = 120 +target-version = ['py37'] +include = '\.pyi?$' diff --git a/scripts/extract_cds_bed.py b/scripts/extract_cds_bed.py index 2b04d5e4..8bb7de27 100644 --- a/scripts/extract_cds_bed.py +++ b/scripts/extract_cds_bed.py @@ -11,20 +11,22 @@ def parse_args(): - parser = argparse.ArgumentParser(usage='Given a GFF3 file, print an uber-CDS BED file to stdout.') - parser.add_argument('gff', help='GFF file to parse') + parser = argparse.ArgumentParser(usage="Given a GFF3 file, print an uber-CDS BED file to stdout.") + parser.add_argument("gff", help="GFF file to parse") return parser.parse_args() -if __name__ == '__main__': +if __name__ == "__main__": args = parse_args() - cmd = convert_gff3_cmd('/dev/null', args.gff) + cmd = convert_gff3_cmd("/dev/null", args.gff) with TemporaryFilePath() as tmp: run_proc(cmd, stdout=tmp) tx_dict = get_gene_pred_dict(tmp) tx_dict = group_transcripts_by_name2(tx_dict.values()) for gene, txs in tx_dict.items(): - cds_txs = [Transcript(tx.get_bed(new_start=tx.thick_start, new_stop=tx.thick_stop)) for tx in txs if tx.cds_size > 0] + cds_txs = [ + Transcript(tx.get_bed(new_start=tx.thick_start, new_stop=tx.thick_stop)) for tx in txs if tx.cds_size > 0 + ] if len(cds_txs) == 0: continue intervals = flatten_list_of_lists([x.exon_intervals for x in cds_txs]) diff --git a/scripts/fantom_lv3_parser.py b/scripts/fantom_lv3_parser.py index e57a2154..0e6c8d20 100644 --- a/scripts/fantom_lv3_parser.py +++ b/scripts/fantom_lv3_parser.py @@ -2,45 +2,50 @@ This hacky script makes the LV3 FANTOM GTF work with CAT. 
""" + def parse_gtf_attr_line(attr_line): """parse a GTF attributes line""" - attr_line = [x.split(' ') for x in re.split('; +', attr_line.replace('"', ''))] - attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(';', '') + attr_line = [x.split(" ") for x in re.split("; +", attr_line.replace('"', ""))] + attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(";", "") return dict(attr_line) + import re -recs = [x.rstrip().split('\t') for x in open('FANTOM_CAT.lv3_robust.gtf')] + +recs = [x.rstrip().split("\t") for x in open("FANTOM_CAT.lv3_robust.gtf")] attr_map = {} new_recs = [] for rec in recs[1:]: rec = rec[:] attrs = parse_gtf_attr_line(rec[-1]) - if rec[2] == 'gene': - if 'gene_id' not in attrs: - attrs['gene_id'] = attrs['gene_name'] - attrs['gene_biotype'] = attrs['geneSuperClass'] - if attrs['gene_biotype'] == 'all_mRNA': - attrs['gene_biotype'] = 'protein_coding' - attr_map[attrs['gene_id']] = attrs - attrs['ID'] = attrs['gene_id'] - elif rec[2] == 'transcript': - gene_attrs = attr_map[attrs['gene_id']] + if rec[2] == "gene": + if "gene_id" not in attrs: + attrs["gene_id"] = attrs["gene_name"] + attrs["gene_biotype"] = attrs["geneSuperClass"] + if attrs["gene_biotype"] == "all_mRNA": + attrs["gene_biotype"] = "protein_coding" + attr_map[attrs["gene_id"]] = attrs + attrs["ID"] = attrs["gene_id"] + elif rec[2] == "transcript": + gene_attrs = attr_map[attrs["gene_id"]] attrs.update(gene_attrs) - attrs['transcript_biotype'] = 'protein_coding' if attrs['coding_status'] == 'coding' else 'non_coding' - attrs['transcript_name'] = attrs['transcript_id'] - attrs['Parent'] = attrs['gene_id'] - attrs['ID'] = attrs['transcript_id'] - elif rec[2] == 'CDS' or rec[2] == 'exon': - attrs['Parent'] = attrs['transcript_id'] - rec[-1] = ';'.join(['{}={}'.format(x[0].lower() if x[0] != 'Parent' and x[0] != 'ID' else x[0], x[1]) for x in attrs.iteritems()]) + attrs["transcript_biotype"] = "protein_coding" if attrs["coding_status"] == "coding" else "non_coding" + attrs["transcript_name"] = attrs["transcript_id"] + attrs["Parent"] = attrs["gene_id"] + attrs["ID"] = attrs["transcript_id"] + elif rec[2] == "CDS" or rec[2] == "exon": + attrs["Parent"] = attrs["transcript_id"] + rec[-1] = ";".join( + ["{}={}".format(x[0].lower() if x[0] != "Parent" and x[0] != "ID" else x[0], x[1]) for x in attrs.iteritems()] + ) new_recs.append(rec) -fh = open('FANTOM_CAT.lv3_robust.gff3', 'w') -fh.write(recs[0][0] + '\n') +fh = open("FANTOM_CAT.lv3_robust.gff3", "w") +fh.write(recs[0][0] + "\n") for rec in new_recs: - fh.write('\t'.join(rec) + '\n') + fh.write("\t".join(rec) + "\n") fh.close() diff --git a/setup.py b/setup.py index 94706655..29e68bc3 100644 --- a/setup.py +++ b/setup.py @@ -1,41 +1,48 @@ from setuptools import setup setup( - name='cat', - version='2.0', - packages=['cat', 'tools'], - python_requires='>=3.7.0', + name="cat", + version="2.0", + packages=["cat", "tools"], + python_requires=">=3.7.0", install_requires=[ - 'pyfasta>=0.5.2', - 'toil>=3.5', - 'luigi>=2.5', - 'seaborn>=0.7', - 'pandas>=1.0', - 'frozendict', - 'configobj>=5.0', - 'sqlalchemy>=1.0', - 'ete3', - 'pysam>=0.10', - 'numpy>=1.10', - 'scipy>=0.18.1', - 'bx-python>=0.7.1', - 'parasail', - 'bcbio-gff', - 'biopython' + "pyfaidx", + "toil>=3.5", + "luigi>=2.5", + "seaborn>=0.7", + "pandas>=1.0", + "frozendict", + "configobj>=5.0", + "sqlalchemy>=1.0", + "ete3", + "pysam>=0.10", + "numpy>=1.10", + "scipy>=0.18.1", + "bx-python>=0.7.1", + "parasail", + "bcbio-gff", + "biopython", + "gffutils", ], - scripts=['programs/cat_to_ncbi_submit', 
'programs/translate_gene_pred', - 'programs/validate_gff3', 'programs/cat_parse_ncbi_genbank', - 'programs/cat_parse_ncbi_refseq', 'programs/cat_parse_prokka_gff3'], - author='Ian Fiddes', - description='Comparative Annotation Toolkit', - url='https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit', - license='Apache 2.0', + scripts=[ + "programs/cat_to_ncbi_submit", + "programs/translate_gene_pred", + "programs/validate_gff3", + "programs/cat_parse_ncbi_genbank", + "programs/cat_parse_ncbi_refseq", + "programs/cat_parse_prokka_gff3", + "programs/cat_parse_ncbi_gff3", + ], + author="Ian Fiddes", + description="Comparative Annotation Toolkit", + url="https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit", + license="Apache 2.0", classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Bioinformatics', - 'Topic :: Bioinformatics', - 'License :: Apache 2.0', - 'Programming Language :: Python :: 3.7' + "Development Status :: 3 - Alpha", + "Intended Audience :: Bioinformatics", + "Topic :: Bioinformatics", + "License :: Apache 2.0", + "Programming Language :: Python :: 3.7", ], - keywords='bioinformatics comparative genomics', + keywords="bioinformatics comparative genomics", ) diff --git a/test_parser/parser.py b/test_parser/parser.py index 440c12e0..f81ac7b6 100644 --- a/test_parser/parser.py +++ b/test_parser/parser.py @@ -4,17 +4,20 @@ import sys import tools.gff3 + def test_ncbi(): - test = tools.gff3.parse_gff3("test_parser/ncbiGp","test_parser/ncbiAttrs") - ncbi = pd.read_csv("test_parser/ncbiTest",sep='\t',index_col=0) - assert_frame_equal(test,ncbi,check_dtype=False) + test = tools.gff3.parse_gff3("test_parser/ncbiGp", "test_parser/ncbiAttrs") + ncbi = pd.read_csv("test_parser/ncbiTest", sep="\t", index_col=0) + assert_frame_equal(test, ncbi, check_dtype=False) + def test_gencode(): - test = tools.gff3.parse_gff3("test_parser/gencodeGp","test_parser/gencodeAttrs") - gencode = pd.read_csv("test_parser/gencodeTest",sep='\t',index_col=0) - assert_frame_equal(test,gencode,check_dtype=False) + test = tools.gff3.parse_gff3("test_parser/gencodeGp", "test_parser/gencodeAttrs") + gencode = pd.read_csv("test_parser/gencodeTest", sep="\t", index_col=0) + assert_frame_equal(test, gencode, check_dtype=False) + def test_ensembl(): - test = tools.gff3.parse_gff3("test_parser/ensemblGp","test_parser/ensemblAttrs") - ensembl = pd.read_csv("test_parser/ensemblTest",sep='\t',index_col=0) - assert_frame_equal(test,ensembl,check_dtype=False) + test = tools.gff3.parse_gff3("test_parser/ensemblGp", "test_parser/ensemblAttrs") + ensembl = pd.read_csv("test_parser/ensemblTest", sep="\t", index_col=0) + assert_frame_equal(test, ensembl, check_dtype=False) diff --git a/tests/transcript_tests.py b/tests/transcript_tests.py index bb04c310..73ad0e8e 100644 --- a/tests/transcript_tests.py +++ b/tests/transcript_tests.py @@ -14,12 +14,12 @@ class PositiveStrandTranscriptTests(unittest.TestCase): """ def setUp(self): - self.tokens = ['chr1', '2', '15', 'test', '0', '+', '4', '13', '0,128,0', '3', '4,3,3', '0,5,10'] + self.tokens = ["chr1", "2", "15", "test", "0", "+", "4", "13", "0,128,0", "3", "4,3,3", "0,5,10"] self.t = Transcript(self.tokens) - self.transcript_seq = 'ATTCTGGCTA' - self.cds_seq = 'TCTGGC' - self.amino_acid = 'SG' - self.chrom_seq = {'chr1': 'GTATTCTTGGACCTAA'} + self.transcript_seq = "ATTCTGGCTA" + self.cds_seq = "TCTGGC" + self.amino_acid = "SG" + self.chrom_seq = {"chr1": "GTATTCTTGGACCTAA"} def test_sizes(self): """ @@ -150,11 +150,11 
@@ class NegativeStrandTranscriptTests(unittest.TestCase): """ def setUp(self): - self.t = Transcript(['chr1', '2', '15', 'A', '0', '-', '4', '13', '0,128,0', '3', '4,3,3', '0,5,10']) - self.transcript_seq = 'TAGCCAGAAT' - self.cds_seq = 'GCCAGA' - self.amino_acid = 'AR' - self.chrom_seq = {'chr1': 'GTATTCTTGGACCTAA'} + self.t = Transcript(["chr1", "2", "15", "A", "0", "-", "4", "13", "0,128,0", "3", "4,3,3", "0,5,10"]) + self.transcript_seq = "TAGCCAGAAT" + self.cds_seq = "GCCAGA" + self.amino_acid = "AR" + self.chrom_seq = {"chr1": "GTATTCTTGGACCTAA"} def test_chromosome_invalid_coordinates(self): """ @@ -269,12 +269,12 @@ class ComplicatedTranscript1(unittest.TestCase): """ def setUp(self): - self.tokens = ['chr1', '1', '20', 'A', '0', '+', '8', '16', '0,128,0', '4', '3,4,3,3', '0,5,12,16'] + self.tokens = ["chr1", "1", "20", "A", "0", "+", "8", "16", "0,128,0", "4", "3,4,3,3", "0,5,12,16"] self.t = Transcript(self.tokens) - self.transcript_seq = 'TATTTGGTAACCT' - self.cds_seq = 'GGTAA' - self.amino_acid = 'G' - self.chrom_seq = {'chr1':'GTATTCTTGGACCTAAGCCTG'} + self.transcript_seq = "TATTTGGTAACCT" + self.cds_seq = "GGTAA" + self.amino_acid = "G" + self.chrom_seq = {"chr1": "GTATTCTTGGACCTAAGCCTG"} def test_sizes(self): """ @@ -288,7 +288,29 @@ def test_chromosome_coordinate_translations(self): """ Check all possible chromosome translations for correct result """ - cds_result = [None, None, None, None, None, None, None, None, 0, 1, None, None, None, 2, 3, 4, None, None, None, None, None] + cds_result = [ + None, + None, + None, + None, + None, + None, + None, + None, + 0, + 1, + None, + None, + None, + 2, + 3, + 4, + None, + None, + None, + None, + None, + ] mrna_result = [None, 0, 1, 2, None, None, 3, 4, 5, 6, None, None, None, 7, 8, 9, None, 10, 11, 12, None] for i in xrange(21): self.assertEqual(self.t.chromosome_coordinate_to_cds(i), cds_result[i]) @@ -365,12 +387,18 @@ def test_sequences(self): def test_get_bed(self): self.assertEqual(self.t.get_bed(), self.tokens) - self.assertEqual(self.t.get_bed(new_start=1, new_stop=12), - ['chr1', '1', '10', 'A', '0', '+', '8', '10', '0,128,0', '2', '3,4', '0,5']) - self.assertEqual(self.t.get_bed(new_start=19, new_stop=19), - ['chr1', '19', '19', 'A', '0', '+', '0', '0', '0,128,0', '1', '0', '0']) - self.assertEqual(self.t.get_bed(new_start=1, new_stop=4), - ['chr1', '1', '4', 'A', '0', '+', '0', '0', '0,128,0', '1', '3', '0']) + self.assertEqual( + self.t.get_bed(new_start=1, new_stop=12), + ["chr1", "1", "10", "A", "0", "+", "8", "10", "0,128,0", "2", "3,4", "0,5"], + ) + self.assertEqual( + self.t.get_bed(new_start=19, new_stop=19), + ["chr1", "19", "19", "A", "0", "+", "0", "0", "0,128,0", "1", "0", "0"], + ) + self.assertEqual( + self.t.get_bed(new_start=1, new_stop=4), + ["chr1", "1", "4", "A", "0", "+", "0", "0", "0,128,0", "1", "3", "0"], + ) class ComplicatedTranscript2(unittest.TestCase): @@ -385,12 +413,12 @@ class ComplicatedTranscript2(unittest.TestCase): """ def setUp(self): - self.tokens = ['chr1', '1', '20', 'A', '0', '-', '7', '16', '0,128,0', '4', '3,4,3,3', '0,5,12,16'] + self.tokens = ["chr1", "1", "20", "A", "0", "-", "7", "16", "0,128,0", "4", "3,4,3,3", "0,5,12,16"] self.t = Transcript(self.tokens) - self.transcript_seq = 'AGGTTACCAAATA' - self.cds_seq = 'TTACCA' - self.amino_acid = 'LP' - self.chrom_seq = {'chr1': 'GTATTCTTGGACCTAAGCCTG'} + self.transcript_seq = "AGGTTACCAAATA" + self.cds_seq = "TTACCA" + self.amino_acid = "LP" + self.chrom_seq = {"chr1": "GTATTCTTGGACCTAAGCCTG"} def test_sizes(self): """ @@ -405,7 
+433,29 @@ def test_chromosome_coordinate_translations(self): """ Check all possible chromosome translations for correct result """ - cds_result = [None, None, None, None, None, None, None, 5, 4, 3, None, None, None, 2, 1, 0, None, None, None, None, None] + cds_result = [ + None, + None, + None, + None, + None, + None, + None, + 5, + 4, + 3, + None, + None, + None, + 2, + 1, + 0, + None, + None, + None, + None, + None, + ] mrna_result = [None, 12, 11, 10, None, None, 9, 8, 7, 6, None, None, None, 5, 4, 3, None, 2, 1, 0, None] for i in xrange(21): self.assertEqual(self.t.chromosome_coordinate_to_cds(i), cds_result[i]) @@ -482,12 +532,17 @@ def test_sequences(self): def test_get_bed(self): self.assertEqual(self.t.get_bed(), self.tokens) - self.assertEqual(self.t.get_bed(new_start=4), - ['chr1', '6', '20', 'A', '0', '-', '7', '16', '0,128,0', '3', '4,3,3', '0,7,11']) - self.assertEqual(self.t.get_bed(new_start=17), - ['chr1', '17', '20', 'A', '0', '-', '0', '0', '0,128,0', '1', '3', '0']) - self.assertEqual(self.t.get_bed(new_start=10, new_stop=17), - ['chr1', '13', '16', 'A', '0', '-', '13', '16', '0,128,0', '1', '3', '0']) + self.assertEqual( + self.t.get_bed(new_start=4), + ["chr1", "6", "20", "A", "0", "-", "7", "16", "0,128,0", "3", "4,3,3", "0,7,11"], + ) + self.assertEqual( + self.t.get_bed(new_start=17), ["chr1", "17", "20", "A", "0", "-", "0", "0", "0,128,0", "1", "3", "0"] + ) + self.assertEqual( + self.t.get_bed(new_start=10, new_stop=17), + ["chr1", "13", "16", "A", "0", "-", "13", "16", "0,128,0", "1", "3", "0"], + ) class SingleExonTranscript1(unittest.TestCase): @@ -502,11 +557,11 @@ class SingleExonTranscript1(unittest.TestCase): """ def setUp(self): - self.t = Transcript(['chr1', '0', '6', 'A', '0', '+', '1', '4', '0,128,0', '1', '6', '0']) - self.transcript_seq = 'GTATTC' - self.cds_seq = 'TAT' - self.amino_acid = 'Y' - self.chrom_seq = {'chr1': 'GTATTCTTGGACCTAA'} + self.t = Transcript(["chr1", "0", "6", "A", "0", "+", "1", "4", "0,128,0", "1", "6", "0"]) + self.transcript_seq = "GTATTC" + self.cds_seq = "TAT" + self.amino_acid = "Y" + self.chrom_seq = {"chr1": "GTATTCTTGGACCTAA"} def test_sizes(self): """ @@ -627,11 +682,11 @@ class SingleExonTranscript2(unittest.TestCase): """ def setUp(self): - self.t = Transcript(['chr1', '0', '6', 'A', '0', '+', '0', '6', '0,128,0', '1', '6', '0']) - self.transcript_seq = 'GTATTC' + self.t = Transcript(["chr1", "0", "6", "A", "0", "+", "0", "6", "0,128,0", "1", "6", "0"]) + self.transcript_seq = "GTATTC" self.cds_seq = self.transcript_seq - self.amino_acid = 'VF' - self.chrom_seq = {'chr1': 'GTATTCTTGGACCTAA'} + self.amino_acid = "VF" + self.chrom_seq = {"chr1": "GTATTCTTGGACCTAA"} def test_sizes(self): """ @@ -729,11 +784,11 @@ class NoncodingTranscript(unittest.TestCase): """ def setUp(self): - self.t = Transcript(['chr1', '0', '11', 'A', '0', '+', '0', '0', '0,128,0', '3', '4,1,3', '0,6,8']) - self.transcript_seq = 'GTATTGGA' - self.cds_seq = '' - self.amino_acid = '' - self.chrom_seq = {'chr1': 'GTATTCTTGGACCTAA'} + self.t = Transcript(["chr1", "0", "11", "A", "0", "+", "0", "0", "0,128,0", "3", "4,1,3", "0,6,8"]) + self.transcript_seq = "GTATTGGA" + self.cds_seq = "" + self.amino_acid = "" + self.chrom_seq = {"chr1": "GTATTCTTGGACCTAA"} def test_sizes(self): """ @@ -834,13 +889,28 @@ class PositiveStrandGenePredTranscript(PositiveStrandTranscriptTests): """ def setUp(self): - self.tokens = ['A', 'chr1', '+', '2', '15', '4', '13', '3', '2,7,12', '6,10,15', '1', - 'q2', 'cmpl', 'cmpl', '2,1,1'] + self.tokens = [ + "A", + "chr1", + 
"+", + "2", + "15", + "4", + "13", + "3", + "2,7,12", + "6,10,15", + "1", + "q2", + "cmpl", + "cmpl", + "2,1,1", + ] self.t = GenePredTranscript(self.tokens) - self.transcript_seq = 'ATTCTGGCTA' - self.cds_seq = 'TCTGGC' - self.amino_acid = 'L' # this transcript has a offset of 2, so the first in-frame codon is TGG - self.chrom_seq = {'chr1': 'GTATTCTTGGACCTAA'} + self.transcript_seq = "ATTCTGGCTA" + self.cds_seq = "TCTGGC" + self.amino_acid = "L" # this transcript has a offset of 2, so the first in-frame codon is TGG + self.chrom_seq = {"chr1": "GTATTCTTGGACCTAA"} def test_sequences(self): """ @@ -866,13 +936,28 @@ class NegativeStrandGenePredTranscript(NegativeStrandTranscriptTests): """ def setUp(self): - self.tokens = ['A', 'chr1', '-', '2', '15', '4', '13', '3', '2,7,12', '6,10,15', '1', - 'q2', 'cmpl', 'cmpl', '2,2,1'] + self.tokens = [ + "A", + "chr1", + "-", + "2", + "15", + "4", + "13", + "3", + "2,7,12", + "6,10,15", + "1", + "q2", + "cmpl", + "cmpl", + "2,2,1", + ] self.t = GenePredTranscript(self.tokens) - self.transcript_seq = 'TAGCCAGAAT' - self.cds_seq = 'GCCAGA' - self.amino_acid = 'Q' # this transcript has a offset of 1, so the first in-frame codon is CAG - self.chrom_seq = {'chr1': 'GTATTCTTGGACCTAA'} + self.transcript_seq = "TAGCCAGAAT" + self.cds_seq = "GCCAGA" + self.amino_acid = "Q" # this transcript has a offset of 1, so the first in-frame codon is CAG + self.chrom_seq = {"chr1": "GTATTCTTGGACCTAA"} def test_sequences(self): """ @@ -886,5 +971,5 @@ def test_get_gp(self): self.assertEqual(self.t.get_gene_pred(), self.tokens) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/tools/__init__.py b/tools/__init__.py index d156f36f..0340d775 100644 --- a/tools/__init__.py +++ b/tools/__init__.py @@ -12,6 +12,7 @@ class PycbioException(Exception): except Exception as ex: raise PycbioException("more stuff", ex) """ + def __init__(self, msg, cause=None): """Constructor.""" if (cause is not None) and (not isinstance(cause, PycbioException)): @@ -28,7 +29,7 @@ def __str__(self): "recursively construct message for chained exception" desc = self.msg if self.cause is not None: - desc += ",\n caused by: " + self.cause.__class__.__name__ + ": " + str(self.cause) + desc += ",\n caused by: " + self.cause.__class__.__name__ + ": " + str(self.cause) return desc def format(self): @@ -44,7 +45,7 @@ def formatExcept(ex, doneStacks=None): if isinstance(ex, PycbioException): desc += ex.msg + "\n" else: - desc += str(ex) + "\n" + desc += str(ex) + "\n" st = getattr(ex, "stackTrace", None) if st is not None: if doneStacks is None: diff --git a/tools/bio.py b/tools/bio.py index 972e5529..3f0cc36b 100644 --- a/tools/bio.py +++ b/tools/bio.py @@ -1,33 +1,22 @@ """ Basic biology related functions """ -import string import os -from pyfasta import Fasta, NpyFastaRecord +from pyfaidx import Fasta from .fileOps import opengz -class UpperNpyFastaRecord(NpyFastaRecord): - """ - Used when we want only upper case records. - If as_string is False, will no longer return a memmap object but instead a list. - """ - def __getitem__(self, islice): - d = self.getdata(islice) - return d.tostring().decode().upper() if self.as_string else list(map(string.upper, d)) - - def write_fasta(path_or_handle, name, seq, chunk_size=100, validate=None): """Writes out fasta file. if path ends in gz, will be gzipped. 
""" if isinstance(path_or_handle, str): - fh = opengz(path_or_handle, 'w') + fh = opengz(path_or_handle, "w") else: fh = path_or_handle - if validate is 'DNA': - valid_chars = set('ACGTUYSWKMBDHVNacgtuyswkmbdhvn.-*') - elif validate is 'protein': - valid_chars = set('ABCDEFGHIKLMPQSRTVWXYZUabcdefghiklmpqsrtvwxyzuNn.-*') + if validate is "DNA": + valid_chars = set("ACGTUYSWKMBDHVNacgtuyswkmbdhvn.-*") + elif validate is "protein": + valid_chars = set("ABCDEFGHIKLMPQSRTVWXYZUabcdefghiklmpqsrtvwxyzuNn.-*") else: valid_chars = set() try: @@ -42,7 +31,7 @@ def write_fasta(path_or_handle, name, seq, chunk_size=100, validate=None): raise RuntimeError("Invalid FASTA character(s) seen in fasta sequence: {}".format(bad_chars)) fh.write(">%s\n" % name) for i in range(0, len(seq), chunk_size): - fh.write("%s\n" % seq[i:i+chunk_size]) + fh.write("%s\n" % seq[i : i + chunk_size]) if isinstance(path_or_handle, str): fh.close() @@ -62,32 +51,95 @@ def reverse_complement(seq): _codon_table = { - 'ATG': 'M', - 'TAA': '*', 'TAG': '*', 'TGA': '*', 'TAR': '*', 'TRA': '*', - 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GCN': 'A', - 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'AGA': 'R', - 'AGG': 'R', 'CGN': 'R', 'MGR': 'R', - 'AAT': 'N', 'AAC': 'N', 'AAY': 'N', - 'GAT': 'D', 'GAC': 'D', 'GAY': 'D', - 'TGT': 'C', 'TGC': 'C', 'TGY': 'C', - 'CAA': 'Q', 'CAG': 'Q', 'CAR': 'Q', - 'GAA': 'E', 'GAG': 'E', 'GAR': 'E', - 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G', 'GGN': 'G', - 'CAT': 'H', 'CAC': 'H', 'CAY': 'H', - 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATH': 'I', - 'TTA': 'L', 'TTG': 'L', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', - 'CTG': 'L', 'YTR': 'L', 'CTN': 'L', - 'AAA': 'K', 'AAG': 'K', 'AAR': 'K', - 'TTT': 'F', 'TTC': 'F', 'TTY': 'F', - 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CCN': 'P', - 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'AGT': 'S', - 'AGC': 'S', 'TCN': 'S', 'AGY': 'S', - 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'ACN': 'T', - 'TGG': 'W', - 'TAT': 'Y', 'TAC': 'Y', 'TAY': 'Y', - 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GTN': 'V', - '': '' - } + "ATG": "M", + "TAA": "*", + "TAG": "*", + "TGA": "*", + "TAR": "*", + "TRA": "*", + "GCT": "A", + "GCC": "A", + "GCA": "A", + "GCG": "A", + "GCN": "A", + "CGT": "R", + "CGC": "R", + "CGA": "R", + "CGG": "R", + "AGA": "R", + "AGG": "R", + "CGN": "R", + "MGR": "R", + "AAT": "N", + "AAC": "N", + "AAY": "N", + "GAT": "D", + "GAC": "D", + "GAY": "D", + "TGT": "C", + "TGC": "C", + "TGY": "C", + "CAA": "Q", + "CAG": "Q", + "CAR": "Q", + "GAA": "E", + "GAG": "E", + "GAR": "E", + "GGT": "G", + "GGC": "G", + "GGA": "G", + "GGG": "G", + "GGN": "G", + "CAT": "H", + "CAC": "H", + "CAY": "H", + "ATT": "I", + "ATC": "I", + "ATA": "I", + "ATH": "I", + "TTA": "L", + "TTG": "L", + "CTT": "L", + "CTC": "L", + "CTA": "L", + "CTG": "L", + "YTR": "L", + "CTN": "L", + "AAA": "K", + "AAG": "K", + "AAR": "K", + "TTT": "F", + "TTC": "F", + "TTY": "F", + "CCT": "P", + "CCC": "P", + "CCA": "P", + "CCG": "P", + "CCN": "P", + "TCT": "S", + "TCC": "S", + "TCA": "S", + "TCG": "S", + "AGT": "S", + "AGC": "S", + "TCN": "S", + "AGY": "S", + "ACT": "T", + "ACC": "T", + "ACA": "T", + "ACG": "T", + "ACN": "T", + "TGG": "W", + "TAT": "Y", + "TAC": "Y", + "TAY": "Y", + "GTT": "V", + "GTC": "V", + "GTA": "V", + "GTG": "V", + "GTN": "V", + "": "", +} def codon_to_amino_acid(c): @@ -100,7 +152,7 @@ def codon_to_amino_acid(c): return None if c in _codon_table: return _codon_table[c] - return 'X' + return "X" def translate_sequence(sequence): @@ -112,12 +164,12 @@ def 
translate_sequence(sequence): sequence = sequence.upper() i = 0 for i in range(0, len(sequence) - len(sequence) % 3, 3): - result.append(codon_to_amino_acid(sequence[i: i + 3])) + result.append(codon_to_amino_acid(sequence[i : i + 3])) if len(sequence) % 3 == 2: - c = codon_to_amino_acid(sequence[i + 3:] + 'N') - if c != 'X': + c = codon_to_amino_acid(sequence[i + 3 :] + "N") + if c != "X": result.append(c) - return ''.join(result) + return "".join(result) def read_codons(seq, offset=0, skip_last=True): @@ -127,8 +179,8 @@ def read_codons(seq, offset=0, skip_last=True): l = len(seq) if skip_last: l -= 3 - for i in range(offset, l - l % 3, 3): - yield seq[i:i + 3] + for i in range(offset, l - l % 3, 3): + yield seq[i : i + 3] def read_codons_with_position(seq, offset=0, skip_last=True): @@ -140,21 +192,17 @@ def read_codons_with_position(seq, offset=0, skip_last=True): if skip_last: l -= 3 for i in range(offset, l - l % 3, 3): - yield i, seq[i:i + 3] + yield i, seq[i : i + 3] def get_sequence_dict(file_path, upper=True): """ Returns a dictionary of fasta records. If upper is true, all bases will be uppercased. """ - assert os.path.exists(file_path), ('Error: FASTA file {} does not exist'.format(file_path)) - gdx_path = file_path + ".gdx" - assert os.path.exists(gdx_path), ("Error: gdx does not exist for this fasta. We need the fasta files to be " - "flattened in place prior to running the pipeline because of concurrency issues.") - flat_path = file_path + '.flat' - assert os.path.exists(flat_path), ("Error: flat file does not exist for this fasta. We need the fasta files to be " - "flattened in place prior to running the pipeline because of concurrency issues.") + assert os.path.exists(file_path), "Error: FASTA file {} does not exist".format(file_path) + gdx_path = file_path + ".fai" + assert os.path.exists(gdx_path), "Error: FASTA index file {}.fai does not exist".format(file_path) if upper is True: - return Fasta(file_path, record_class=UpperNpyFastaRecord) + return Fasta(file_path, sequence_always_upper=True, as_raw=True) else: - return Fasta(file_path) + return Fasta(file_path, as_raw=True) diff --git a/tools/defaultOrderedDict.py b/tools/defaultOrderedDict.py index be74e121..4b9fbdfc 100644 --- a/tools/defaultOrderedDict.py +++ b/tools/defaultOrderedDict.py @@ -8,10 +8,10 @@ class DefaultOrderedDict(OrderedDict): """ Source: http://stackoverflow.com/a/6190500/562769 """ + def __init__(self, default_factory=None, *a, **kw): - if (default_factory is not None and - not isinstance(default_factory, Callable)): - raise TypeError('first argument must be callable') + if default_factory is not None and not isinstance(default_factory, Callable): + raise TypeError("first argument must be callable") OrderedDict.__init__(self, *a, **kw) self.default_factory = default_factory @@ -31,7 +31,7 @@ def __reduce__(self): if self.default_factory is None: args = tuple() else: - args = self.default_factory, + args = (self.default_factory,) return type(self), args, None, None, list(self.items()) def copy(self): @@ -42,9 +42,8 @@ def __copy__(self): def __deepcopy__(self, memo): import copy - return type(self)(self.default_factory, - copy.deepcopy(list(self.items()))) + + return type(self)(self.default_factory, copy.deepcopy(list(self.items()))) def __repr__(self): - return 'OrderedDefaultDict(%s, %s)' % (self.default_factory, - OrderedDict.__repr__(self)) + return "OrderedDefaultDict(%s, %s)" % (self.default_factory, OrderedDict.__repr__(self)) diff --git a/tools/fifo.py b/tools/fifo.py index 
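With the pyfasta-specific .gdx/.flat flattening gone, get_sequence_dict now expects a standard samtools-style .fai index next to the FASTA. A minimal sketch of preparing and using such a file with pyfaidx (file names are placeholders):

    from pyfaidx import Fasta, Faidx

    Faidx("genome.fa")  # writes genome.fa.fai, satisfying the new assertion
    seqs = Fasta("genome.fa", sequence_always_upper=True, as_raw=True)
    # as_raw=True returns plain strings from slices, like the old UpperNpyFastaRecord did
    print(seqs["chr1"][0:10])

The index can equally be produced up front with samtools faidx genome.fa.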
da481a06..320790fb 100644 --- a/tools/fifo.py +++ b/tools/fifo.py @@ -1,9 +1,11 @@ # Copyright 2006-2012 Mark Diekhans import os, errno, socket, fcntl + class _Fifo(object): """Object wrapper for pipes, abstracting traditional and named pipes, and hiding OS differences""" + __slots__ = ("rfd", "wfd", "rfh", "wfh", "rpath", "wpath") def __init__(self): @@ -13,7 +15,8 @@ def __del__(self): "finalizer" try: self.close() - except: pass + except: + pass def getRfh(self): "get read file object" @@ -53,6 +56,7 @@ def close(self): if self.rfd is not None: self.rclose() + class _LinuxFifo(_Fifo): """Linus FIFO, that used /proc to get file paths""" @@ -66,12 +70,13 @@ def __init__(self): @staticmethod def __mkFdPath(fd): "get linux /proc path for an fd" - assert(fd is not None) + assert fd is not None p = "/proc/" + str(os.getpid()) + "/fd/" + str(fd) if not os.path.exists(p): raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), p) return p + class _NamedFifo(_Fifo): """FIFO, that used named pipes to get file paths""" @@ -86,13 +91,14 @@ def __fifoOpen(path, mode): "open a FIFO file descriptor without blocking during open" # FIXME: O_NONBLOCK not right for write, maybe just drop this omode = os.O_RDONLY if (mode.startswith("r")) else os.O_WRONLY - fd = os.open(path, omode|os.O_NONBLOCK) + fd = os.open(path, omode | os.O_NONBLOCK) try: - fcntl.fcntl(fd, fcntl.F_SETFL, omode) # clear O_NONBLOCK + fcntl.fcntl(fd, fcntl.F_SETFL, omode) # clear O_NONBLOCK except: try: os.close(fd) - finally: pass + finally: + pass raise return fd @@ -103,22 +109,28 @@ def __fifoMk(suffix="tmp", tmpDir=None): if tmpDir is None: tmpDir = os.getenv("TMPDIR", "/var/tmp") prefix = tmpDir + "/" + socket.gethostname() + "." + str(os.getpid()) - maxTries=1000 + maxTries = 1000 unum = 0 while unum < maxTries: - path = prefix + "." + str(unum) + "." + suffix + path = prefix + "." + str(unum) + "." + suffix if _NamedFifo.__fifoMkAtomic(path): return path unum += 1 - raise Exception("unable to create a unique FIFO name in the form \"" - + prefix + ".*." + suffix + "\" after " + str(maxTries) - + " tries") + raise Exception( + 'unable to create a unique FIFO name in the form "' + + prefix + + ".*." + + suffix + + '" after ' + + str(maxTries) + + " tries" + ) @staticmethod def __checkFifo(path): """check that fifo matches expected types and perms, catch security hold were it could be replace with another file""" - pass # FIXME implement + pass # FIXME implement @staticmethod def __fifoMkAtomic(path): @@ -142,7 +154,10 @@ def close(self): os.unlink(self.rpath) self.rpath = self.wpath = None + _fifoClass = None + + def factory(): "get a FIFO object of the correct type for this OS" global _fifoClass diff --git a/tools/fileOps.py b/tools/fileOps.py index dc5af841..18e27b88 100644 --- a/tools/fileOps.py +++ b/tools/fileOps.py @@ -21,6 +21,7 @@ class TemporaryFilePath(object): """ Generates a path pointing to a temporary file. Context manager wrapper for get_tmp_file. Deletes the file on exit. """ + def __init__(self, prefix=None, suffix="tmp", tmp_dir=None): self.path = get_tmp_file(prefix=prefix, suffix=suffix, tmp_dir=tmp_dir) @@ -39,6 +40,7 @@ class TemporaryDirectoryPath(object): Generates a path pointing to a temporary directory. Context manager wrapper for get_tmp_file, except creates a directory out of the path. Deletes the directory and all of its contents on exit. 
""" + def __init__(self, prefix=None, suffix="tmp", tmp_dir=None): self.path = get_tmp_file(prefix=prefix, suffix=suffix, tmp_dir=tmp_dir) ensure_dir(self.path) @@ -75,9 +77,9 @@ def ensure_dir(d): elif len(d) == 0: pass else: - raise RuntimeError('Unable to create directory {}'.format(d)) + raise RuntimeError("Unable to create directory {}".format(d)) if not dir_is_writeable(d): - raise RuntimeError('{} is not writeable.'.format(d)) + raise RuntimeError("{} is not writeable.".format(d)) def ensure_file_dir(file_path): @@ -86,7 +88,7 @@ def ensure_file_dir(file_path): :param file_path: Path of file to ensure a parent directory of. """ d = os.path.dirname(file_path) - if d != '': + if d != "": ensure_dir(d) @@ -97,15 +99,15 @@ def opengz(file, mode="r"): :param mode: Same mode options as python's default open. :return: A open file handle. """ - assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb'] - if mode == 'wb' or (mode == 'w' and file.endswith('.gz')): - return gzip.open(file, 'wb') - elif mode == 'ab' or (mode == 'a' and file.endswith('.gz')): - return gzip.open(file, 'ab') - elif mode == 'w': - return open(file, 'w') - f = open(file, 'rb') - if f.read(2) == '\x1f\x8b': + assert mode in ["r", "rb", "a", "ab", "w", "wb"] + if mode == "wb" or (mode == "w" and file.endswith(".gz")): + return gzip.open(file, "wb") + elif mode == "ab" or (mode == "a" and file.endswith(".gz")): + return gzip.open(file, "ab") + elif mode == "w": + return open(file, "w") + f = open(file, "rb") + if f.read(2) == "\x1f\x8b": f.seek(0) return gzip.GzipFile(fileobj=f, mode=mode) else: @@ -113,7 +115,7 @@ def opengz(file, mode="r"): return open(file, mode) -def iter_lines(fspec, skip_lines=0, sep='\t'): +def iter_lines(fspec, skip_lines=0, sep="\t"): """generator over lines in file, dropping newlines. If fspec is a string, open the file and close at end. Otherwise it is file-like object and will not be closed. @@ -121,7 +123,7 @@ def iter_lines(fspec, skip_lines=0, sep='\t'): :param skip_lines: A integer of the number of lines to skip from the start of the file :param sep: Character used to separate columns in the file. If set to None, will not split the line. 
:return: Iterator of lines""" - fh = _resolve_fspec(fspec, 'r') + fh = _resolve_fspec(fspec, "r") try: _ = [next(fh) for _ in range(skip_lines)] for line in fh: @@ -145,12 +147,12 @@ def get_tmp_file(prefix=None, suffix="tmp", tmp_dir=None): if tmp_dir is None: tmp_dir = tempfile.gettempdir() if prefix is None: - base_path = os.path.join(tmp_dir, '.'.join([socket.gethostname(), str(os.getpid())])) + base_path = os.path.join(tmp_dir, ".".join([socket.gethostname(), str(os.getpid())])) else: - base_path = os.path.join(tmp_dir, '.'.join([prefix, socket.gethostname(), str(os.getpid())])) + base_path = os.path.join(tmp_dir, ".".join([prefix, socket.gethostname(), str(os.getpid())])) while True: - rand = ''.join([random.choice(string.digits) for _ in range(10)]) - path = '.'.join([base_path, rand, suffix]) + rand = "".join([random.choice(string.digits) for _ in range(10)]) + path = ".".join([base_path, rand, suffix]) if not os.path.exists(path): return path @@ -192,29 +194,29 @@ def touch(file_path): :return: None """ ensure_file_dir(file_path) - with open(file_path, 'a'): + with open(file_path, "a"): os.utime(file_path, None) -def print_row(fspec, line, sep='\t'): +def print_row(fspec, line, sep="\t"): """ Convenience function that writes a delimited line to fspec (file handle or file) :param fspec: A open file handle or file path :param line: One or more things to write. Must be convertible to strings. :param sep: separator to use """ - fh = _resolve_fspec(fspec, 'w') - fh.write(sep.join(map(str, line)) + '\n') + fh = _resolve_fspec(fspec, "w") + fh.write(sep.join(map(str, line)) + "\n") -def print_rows(fspec, item_iter, sep='\t'): +def print_rows(fspec, item_iter, sep="\t"): """ Convenience function that writes a iterable of lines to fspec (file handle or file) :param fspec: A open file handle or file path :param item_iter: One or more things to write. Must be convertible to strings. :param sep: separator to use """ - fh = _resolve_fspec(fspec, 'w') + fh = _resolve_fspec(fspec, "w") for line in item_iter: print_row(fh, line, sep) @@ -225,12 +227,12 @@ def print_iterable(fspec, item_iter): :param fspec: A open file handle or file path :param item_iter: One or more things to write. Assumed to be fully formatted strings with newlines """ - fh = _resolve_fspec(fspec, 'w') + fh = _resolve_fspec(fspec, "w") for line in item_iter: fh.write(line) -def _resolve_fspec(fspec, mode='r'): +def _resolve_fspec(fspec, mode="r"): """ Determine if this is a file or a handle, passing a file name to opengz() :param fspec: A open file handle or file path @@ -255,6 +257,6 @@ def hashfile(fspec, hasher=hashlib.sha256, blocksize=65536, num_characters=12): buf = fh.read(blocksize) hasher = hasher() # instantiate this hashing instance while len(buf) > 0: - hasher.update(buf.encode('utf-8')) + hasher.update(buf.encode("utf-8")) buf = fh.read(blocksize) return int(hasher.hexdigest(), 16) % 10 ** num_characters diff --git a/tools/gff3.py b/tools/gff3.py index 64b0a6be..43233fd2 100644 --- a/tools/gff3.py +++ b/tools/gff3.py @@ -5,17 +5,19 @@ from . 
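As a standalone illustration of the transparent gzip detection opengz performs: the probe handle is opened in binary mode, so the two-byte gzip signature must be compared against a bytes literal. This sketch is the idea only, not the repository function:

    import gzip

    def open_maybe_gzipped(path):
        # open path for reading text, decompressing transparently if it is gzipped
        fh = open(path, "rb")
        magic = fh.read(2)
        fh.seek(0)
        if magic == b"\x1f\x8b":  # gzip magic number, as bytes
            return gzip.open(fh, "rt")
        fh.close()
        return open(path, "r")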
import misc -reserved_keys = ['gene_biotype', - 'transcript_biotype', - 'gene_type', - 'transcript_type', - 'gene_name', - 'gene_id', - 'transcript_id', - 'transcript_name', - 'ID', - 'Name', - 'Parent'] +reserved_keys = [ + "gene_biotype", + "transcript_biotype", + "gene_type", + "transcript_type", + "gene_name", + "gene_id", + "transcript_id", + "transcript_name", + "ID", + "Name", + "Parent", +] def parse_gff3(annotation_attrs, annotation_gp, is_external_reference=False): @@ -31,36 +33,58 @@ def parse_attrs(attrs): results = [] for tx_id, gene_id in tx_name_map.items(): d = attrs_dict[tx_id] - gene_biotype = d.get('gene_biotype', d.get('gene_type', None)) + gene_biotype = d.get("gene_biotype", d.get("gene_type", None)) if gene_biotype is None: raise Exception("Did not find a gene biotype or gene type for {} (attrs={})".format(gene_id, d)) - tx_biotype = d.get('transcript_biotype', d.get('transcript_type', None)) + tx_biotype = d.get("transcript_biotype", d.get("transcript_type", None)) if tx_biotype is None: raise Exception("Did not find a transcript biotype or type for {} (attrs={})".format(tx_id, d)) - gene_name = d['gene_name'] - gene_id = d['gene_id'] - tx_id = d['transcript_id'] - tx_name = d['transcript_name'] - extra_tags = ';'.join(['{}={}'.format(x, y.replace(';', '%3B').replace('=', '%3D')) - for x, y in d.items() if x not in reserved_keys]) + gene_name = d["gene_name"] + gene_id = d["gene_id"] + tx_id = d["transcript_id"] + tx_name = d["transcript_name"] + extra_tags = ";".join( + [ + "{}={}".format(x, y.replace(";", "%3B").replace("=", "%3D")) + for x, y in d.items() + if x not in reserved_keys + ] + ) if len(extra_tags) > 0: try: misc.parse_gff_attr_line(extra_tags) except: - raise Exception(f'Error parsing extra tags in input GFF3 {extra_tags}') + raise Exception(f"Error parsing extra tags in input GFF3 {extra_tags}") if is_external_reference is True: # hack to fix names - gene_id = f'exRef-{gene_id}' - tx_id = f'exRef-{tx_id}' + gene_id = f"exRef-{gene_id}" + tx_id = f"exRef-{tx_id}" results.append([gene_id, tx_id, tx_name, gene_name, gene_biotype, tx_biotype, extra_tags]) - df = pd.DataFrame(results, columns=['GeneId', 'TranscriptId', 'TranscriptName', 'GeneName', - 'GeneBiotype', 'TranscriptBiotype', 'ExtraTags']) - df = df.set_index('TranscriptId') + df = pd.DataFrame( + results, + columns=[ + "GeneId", + "TranscriptId", + "TranscriptName", + "GeneName", + "GeneBiotype", + "TranscriptBiotype", + "ExtraTags", + ], + ) + df = df.set_index("TranscriptId") return df def convert_gff3_cmd(annotation_attrs, annotation): - cmd = ['gff3ToGenePred', '-rnaNameAttr=transcript_id', '-geneNameAttr=gene_id', '-honorStartStopCodons', - '-refseqHacks', - '-attrsOut={}'.format(annotation_attrs), annotation, '/dev/stdout'] - return cmd \ No newline at end of file + cmd = [ + "gff3ToGenePred", + "-rnaNameAttr=transcript_id", + "-geneNameAttr=gene_id", + "-honorStartStopCodons", + "-refseqHacks", + "-attrsOut={}".format(annotation_attrs), + annotation, + "/dev/stdout", + ] + return cmd diff --git a/tools/hal.py b/tools/hal.py index d5736b2d..854ab868 100644 --- a/tools/hal.py +++ b/tools/hal.py @@ -13,7 +13,7 @@ def get_tree(hal): :param hal: HAL file. 
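The ExtraTags column built in parse_attrs keeps every non-reserved attribute in a single GFF3-style string, so embedded separators are percent-escaped. A small illustration with an invented attribute dict (only a subset of reserved_keys is shown):

    reserved = {"gene_id", "transcript_id", "gene_name", "transcript_name",
                "gene_biotype", "transcript_biotype", "ID", "Name", "Parent"}
    d = {"gene_id": "G1", "transcript_id": "T1",
         "db_xref": "GeneID:1234", "note": "partial=true;low_quality"}
    extra_tags = ";".join(
        "{}={}".format(k, v.replace(";", "%3B").replace("=", "%3D"))
        for k, v in d.items() if k not in reserved
    )
    # -> 'db_xref=GeneID:1234;note=partial%3Dtrue%3Blow_quality'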
:return: Tree object """ - cmd = ['halStats', '--tree', hal] + cmd = ["halStats", "--tree", hal] newick = call_proc_lines(cmd)[0] return ete3.Tree(newick, format=1) @@ -57,7 +57,9 @@ def extract_genomes(hal, include_ancestors=False, target_genomes=None): return tuple(x.name for x in t.get_descendants()) else: anc = t.get_common_ancestor(target_genomes) - return tuple(x.name for x in anc.get_descendants() if len(x.get_descendants()) != 0 or x.name in target_genomes) + return tuple( + x.name for x in anc.get_descendants() if len(x.get_descendants()) != 0 or x.name in target_genomes + ) else: if target_genomes is None: return tuple(t.get_leaf_names()) diff --git a/tools/hintsDatabaseInterface.py b/tools/hintsDatabaseInterface.py index 7a4d3020..fc5afd93 100644 --- a/tools/hintsDatabaseInterface.py +++ b/tools/hintsDatabaseInterface.py @@ -17,7 +17,7 @@ def reflect_hints_db(db_path): :param db_path: path to hints sqlite database :return: sqlalchemy.MetaData object, sqlalchemy.orm.Session object """ - engine = sqlalchemy.create_engine('sqlite:///{}'.format(db_path), poolclass=NullPool) + engine = sqlalchemy.create_engine("sqlite:///{}".format(db_path), poolclass=NullPool) metadata = sqlalchemy.MetaData() metadata.reflect(bind=engine) Base = automap_base(metadata=metadata) @@ -47,23 +47,24 @@ def get_rnaseq_hints(genome, chromosome, start, stop, speciesnames, seqnames, hi """ speciesid = session.query(speciesnames.speciesid).filter_by(speciesname=genome) seqnr = session.query(seqnames.seqnr).filter( - sqlalchemy.and_( - seqnames.speciesid.in_(speciesid), - (seqnames.seqname == chromosome))) + sqlalchemy.and_(seqnames.speciesid.in_(speciesid), (seqnames.seqname == chromosome)) + ) query = session.query(hints, featuretypes).filter( - sqlalchemy.and_( - hints.speciesid.in_(speciesid), - hints.seqnr.in_(seqnr), - hints.start >= start, - hints.end <= stop, - featuretypes.typeid == hints.type)) + sqlalchemy.and_( + hints.speciesid.in_(speciesid), + hints.seqnr.in_(seqnr), + hints.start >= start, + hints.end <= stop, + featuretypes.typeid == hints.type, + ) + ) hints = [] for h, f in query: - tags = 'pri=3;src={};mult={}'.format(h.esource, h.mult) + tags = "pri=3;src={};mult={}".format(h.esource, h.mult) # add 1 to both start and end to shift to 1-based - l = [chromosome, h.source, f.typename, h.start + 1, h.end + 1, h.score, '.', '.', tags] - hints.append('\t'.join(map(str, l)) + '\n') - return ''.join(hints) + l = [chromosome, h.source, f.typename, h.start + 1, h.end + 1, h.score, ".", ".", tags] + hints.append("\t".join(map(str, l)) + "\n") + return "".join(hints) def get_wiggle_hints(genome, speciesnames, seqnames, hints, session): @@ -81,7 +82,8 @@ def get_wiggle_hints(genome, speciesnames, seqnames, hints, session): # chunk up the genome to reduce memory usage for seqnr, seqname in seqs.items(): query = session.query(hints.start, hints.end, hints.score).filter( - sqlalchemy.and_(hints.speciesid.in_(speciesid), hints.source == 'w2h', hints.seqnr == seqnr)) + sqlalchemy.and_(hints.speciesid.in_(speciesid), hints.source == "w2h", hints.seqnr == seqnr) + ) for start, end, score in query: # add 1 to end to convert to half-open interval yield seqname, start, end + 1, score @@ -95,7 +97,7 @@ def hints_db_has_rnaseq(db_path, genome=None): :return: boolean """ speciesnames, seqnames, hints, featuretypes, session = reflect_hints_db(db_path) - query = session.query(hints).filter(sqlalchemy.or_(hints.source == 'w2h', hints.source == 'b2h')) + query = session.query(hints).filter(sqlalchemy.or_(hints.source == 
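Each hint returned by get_rnaseq_hints is a tab-separated GFF line: start and end are both shifted by one to 1-based coordinates, and the extrinsic-evidence tags go in column 9. An illustrative construction with invented values (the feature type and source letter come from the hints database in practice):

    chromosome, source, typename = "chr1", "w2h", "exonpart"
    start, end, score, esource, mult = 999, 1099, 42, "E", 7
    tags = "pri=3;src={};mult={}".format(esource, mult)
    line = "\t".join(map(str, [chromosome, source, typename, start + 1, end + 1, score, ".", ".", tags]))
    # -> chr1  w2h  exonpart  1000  1100  42  .  .  pri=3;src=E;mult=7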
"w2h", hints.source == "b2h")) if genome is not None: speciesid = session.query(speciesnames.speciesid).filter_by(speciesname=genome) query = query.filter(hints.speciesid == speciesid) @@ -112,7 +114,7 @@ def genome_has_no_wiggle_hints(db_path, genome): :return: boolean """ speciesnames, seqnames, hints, featuretypes, session = reflect_hints_db(db_path) - query = session.query(hints).filter(hints.source == 'w2h') + query = session.query(hints).filter(hints.source == "w2h") speciesid = session.query(speciesnames.speciesid).filter_by(speciesname=genome) query = query.filter(hints.speciesid == speciesid) r = query.first() is None @@ -128,7 +130,7 @@ def hints_db_has_annotation(db_path, genome=None): :return: boolean """ speciesnames, seqnames, hints, featuretypes, session = reflect_hints_db(db_path) - query = session.query(hints).filter(hints.source == 'a2h') + query = session.query(hints).filter(hints.source == "a2h") if genome is not None: speciesid = session.query(speciesnames.speciesid).filter_by(speciesname=genome) query = query.filter(hints.speciesid == speciesid) diff --git a/tools/intervals.py b/tools/intervals.py index 14e8dce6..1161517c 100644 --- a/tools/intervals.py +++ b/tools/intervals.py @@ -5,7 +5,7 @@ from . import mathOps from .bio import reverse_complement, translate_sequence -__author__ = 'Ian Fiddes' +__author__ = "Ian Fiddes" class ChromosomeInterval(object): @@ -13,14 +13,15 @@ class ChromosomeInterval(object): Represents a continuous genomic interval. interval arithmetic adapted from http://code.activestate.com/recipes/576816-interval/ """ - __slots__ = ('chromosome', 'start', 'stop', 'strand', 'data') + + __slots__ = ("chromosome", "start", "stop", "strand", "data") def __init__(self, chromosome, start, stop, strand, data=None): self.chromosome = str(chromosome) - self.start = int(start) # 0 based - self.stop = int(stop) # exclusive + self.start = int(start) # 0 based + self.stop = int(stop) # exclusive assert self.start <= self.stop, "start > stop {}:{}-{} ({})".format(chromosome, start, stop, strand) - self.strand = strand # + or - + self.strand = strand # + or - self.data = data def __len__(self): @@ -29,32 +30,47 @@ def __len__(self): def __hash__(self): m = hashlib.sha256() for key in self.__slots__: - m.update(str(self.__getattribute__(key)).encode('utf-8')) + m.update(str(self.__getattribute__(key)).encode("utf-8")) return int(m.hexdigest(), 16) % 10 ** 12 def __eq__(self, other): - return (isinstance(other, type(self)) and - (self.chromosome, self.start, self.stop, self.strand) == - (other.chromosome, other.start, other.stop, other.strand)) + return isinstance(other, type(self)) and (self.chromosome, self.start, self.stop, self.strand) == ( + other.chromosome, + other.start, + other.stop, + other.strand, + ) def __ne__(self, other): return not self == other def __gt__(self, other): - return (isinstance(other, type(self)) and self.chromosome == other.chromosome and - (self.start, self.stop) > (other.start, other.stop)) + return ( + isinstance(other, type(self)) + and self.chromosome == other.chromosome + and (self.start, self.stop) > (other.start, other.stop) + ) def __ge__(self, other): - return (isinstance(other, type(self)) and self.chromosome == other.chromosome and - (self.start, self.stop) >= (other.start, other.stop)) + return ( + isinstance(other, type(self)) + and self.chromosome == other.chromosome + and (self.start, self.stop) >= (other.start, other.stop) + ) def __lt__(self, other): - return (isinstance(other, type(self)) and self.chromosome == 
other.chromosome and - (self.start, self.stop) < (other.start, other.stop)) + return ( + isinstance(other, type(self)) + and self.chromosome == other.chromosome + and (self.start, self.stop) < (other.start, other.stop) + ) def __le__(self, other): - return (isinstance(other, type(self)) and self.chromosome == other.chromosome and - (self.start, self.stop) <= (other.start, other.stop)) + return ( + isinstance(other, type(self)) + and self.chromosome == other.chromosome + and (self.start, self.stop) <= (other.start, other.stop) + ) def __contains__(self, other): return self.start <= other < self.stop @@ -73,8 +89,9 @@ def __repr__(self): if self.data is None: return "ChromosomeInterval('{}', {}, {}, '{}')".format(self.chromosome, self.start, self.stop, self.strand) else: - return "ChromosomeInterval('{}', {}, {}, '{}', '{}')".format(self.chromosome, self.start, self.stop, - self.strand, self.data) + return "ChromosomeInterval('{}', {}, {}, '{}', '{}')".format( + self.chromosome, self.start, self.stop, self.strand, self.data + ) @property def is_null(self): @@ -109,8 +126,10 @@ def complement(self, size): :return: Two ChromosomeInterval objects representing the complement of this interval and the size. """ assert 0 <= len(self) < size - return [ChromosomeInterval(self.chromosome, 0, self.start, self.strand), - ChromosomeInterval(self.chromosome, self.stop, size, self.strand)] + return [ + ChromosomeInterval(self.chromosome, 0, self.start, self.strand), + ChromosomeInterval(self.chromosome, self.stop, size, self.strand), + ] def union(self, other): """ @@ -216,10 +235,10 @@ def get_sequence(self, seq_dict, stranded=True): :param stranded: Should we reverse complement negative strand sequences? :return: A sequence string. """ - if stranded is False or self.strand is '+': - return seq_dict[self.chromosome][self.start: self.stop] - elif self.strand is '-': - return reverse_complement(seq_dict[self.chromosome][self.start: self.stop]) + if stranded is False or self.strand is "+": + return seq_dict[self.chromosome][self.start : self.stop] + elif self.strand is "-": + return reverse_complement(seq_dict[self.chromosome][self.start : self.stop]) def get_protein_sequence(self, seq_dict, frame, truncate=True): """ @@ -231,9 +250,9 @@ def get_protein_sequence(self, seq_dict, frame, truncate=True): """ seq = self.get_sequence(seq_dict) if truncate: - return translate_sequence(seq[frame:len(seq) - len(seq) % 3]) + return translate_sequence(seq[frame : len(seq) - len(seq) % 3]) else: - return translate_sequence(seq[frame:len(seq)]) + return translate_sequence(seq[frame : len(seq)]) def gap_merge_intervals(intervals, gap): @@ -246,13 +265,15 @@ def gap_merge_intervals(intervals, gap): new_intervals = [] for interval in sorted(intervals): if not new_intervals: - new_intervals.append(ChromosomeInterval(interval.chromosome, interval.start, interval.stop, - interval.strand, interval.data)) + new_intervals.append( + ChromosomeInterval(interval.chromosome, interval.start, interval.stop, interval.strand, interval.data) + ) elif interval.separation(new_intervals[-1]) <= gap: new_intervals[-1] = new_intervals[-1].hull(interval) else: - new_intervals.append(ChromosomeInterval(interval.chromosome, interval.start, interval.stop, - interval.strand, interval.data)) + new_intervals.append( + ChromosomeInterval(interval.chromosome, interval.start, interval.stop, interval.strand, interval.data) + ) return new_intervals diff --git a/tools/luigiAddons.py b/tools/luigiAddons.py index b447ab2d..b734f21a 100644 --- 
a/tools/luigiAddons.py +++ b/tools/luigiAddons.py @@ -51,6 +51,7 @@ def clone_parent(self, **args): for additional_task in tasks_to_inherit[1:]: task = task.clone(cls=additional_task, **args) return task + return Wrapped @@ -73,6 +74,7 @@ def __call__(self, task_that_requires): class Wrapped(task_that_requires): def requires(self): return (self.clone(x) for x in tasks_to_require) + return Wrapped @@ -80,6 +82,7 @@ class IndexTarget(luigi.Target): """ luigi target that determines if the indices have been built on a hints database. """ + def __init__(self, db): self.db = db @@ -87,7 +90,7 @@ def exists(self, timeout=6000): con = sqlite3.connect(self.db, timeout=timeout) cur = con.cursor() r = [] - for idx in ['gidx', 'hidx']: + for idx in ["gidx", "hidx"]: query = 'PRAGMA index_info("{}")'.format(idx) try: v = cur.execute(query).fetchall() diff --git a/tools/mathOps.py b/tools/mathOps.py index 7cc914f6..875a3ca6 100644 --- a/tools/mathOps.py +++ b/tools/mathOps.py @@ -16,7 +16,7 @@ def format_ratio(numerator, denominator, num_digits=None, resolve_nan=None): """ if denominator == 0 or math.isnan(denominator) or math.isnan(numerator): if resolve_nan is None: - return float('nan') + return float("nan") else: return resolve_nan r = float(numerator) / float(denominator) diff --git a/tools/misc.py b/tools/misc.py index 1f3d5f73..ecf6459d 100644 --- a/tools/misc.py +++ b/tools/misc.py @@ -18,10 +18,11 @@ class HashableNamespace(argparse.Namespace): """ Adds a __hash__ function to argparse's Namespace. """ + def __hash__(self): m = hashlib.sha256() for val in self.__dict__.values(): - m.update(str(val).encode('utf-8')) + m.update(str(val).encode("utf-8")) return int(m.hexdigest(), 16) % 10 ** 12 @@ -31,6 +32,7 @@ class PipelineNamespace(object): Used to maintain information on the pipeline state but allow users to change insignificant features without forcing the pipeline to rerun expensive modules. 
""" + def __init__(self): self.significant = {} @@ -39,35 +41,35 @@ def set(self, name, val, significant=True): self.significant[name] = significant def __hash__(self): - vals = tuple(name for name in self.__dict__ if name != 'significant' and self.significant[name]) + vals = tuple(name for name in self.__dict__ if name != "significant" and self.significant[name]) m = hashlib.sha256() for val in vals: - m.update(str(val).encode('utf-8')) + m.update(str(val).encode("utf-8")) return int(m.hexdigest(), 16) % 10 ** 12 def convert_gtf_gp(gp_target, gtf_target): """converts a GTF to genePred""" - cmd = ['gtfToGenePred', '-genePredExt', gtf_target.path, '/dev/stdout'] - with gp_target.open('w') as outf: + cmd = ["gtfToGenePred", "-genePredExt", gtf_target.path, "/dev/stdout"] + with gp_target.open("w") as outf: procOps.run_proc(cmd, stdout=outf) -def convert_gp_gtf(gtf_target, gp_target, source='CAT'): +def convert_gp_gtf(gtf_target, gp_target, source="CAT"): """Converts a genePred to GTF""" - cmd = ['genePredToGtf', 'file', gp_target.path, '-utr', '-honorCdsStat', '-source={}'.format(source), '/dev/stdout'] - with gtf_target.open('w') as outf: + cmd = ["genePredToGtf", "file", gp_target.path, "-utr", "-honorCdsStat", "-source={}".format(source), "/dev/stdout"] + with gtf_target.open("w") as outf: procOps.run_proc(cmd, stdout=outf) def samtools_version(): """checks the version of samtools installed""" try: - r = procOps.call_proc_lines(['samtools', '--version']) - if StrictVersion(r[0].split()[1].split('-')[0]) < '1.3': - raise Exception('samtools version is not >= 1.3.0') + r = procOps.call_proc_lines(["samtools", "--version"]) + if StrictVersion(r[0].split()[1].split("-")[0]) < "1.3": + raise Exception("samtools version is not >= 1.3.0") except ProcException: - raise Exception('samtools is not installed') + raise Exception("samtools is not installed") def is_bam(path): @@ -75,7 +77,7 @@ def is_bam(path): try: pysam.Samfile(path) except IOError: - raise RuntimeError('Path {} does not exist'.format(path)) + raise RuntimeError("Path {} does not exist".format(path)) except ValueError: return False return True @@ -96,7 +98,7 @@ def pairwise_adjacent(iterable): def sort_gff(input_file, output_file): """Sorts a GFF format file by column 1 (chromosome) then column 4(start integer)""" - cmd = [['sort', '-n', '-k4,4', input_file], ['sort', '-s', '-n', '-k5,5'], ['sort', '-s', '-k1,1']] + cmd = [["sort", "-n", "-k4,4", input_file], ["sort", "-s", "-n", "-k5,5"], ["sort", "-s", "-k1,1"]] procOps.run_proc(cmd, stdout=output_file) @@ -104,8 +106,8 @@ def parse_gtf_attr_line(attr_line): """parse a GTF attributes line""" if len(attr_line) == 0: return {} - attr_line = [x.split(' ') for x in re.split('; +', attr_line.replace('"', ''))] - attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(';', '') + attr_line = [x.split(" ") for x in re.split("; +", attr_line.replace('"', ""))] + attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(";", "") return dict(attr_line) @@ -113,8 +115,8 @@ def parse_gff_attr_line(attr_line): """parse a GFF attributes line""" if len(attr_line) == 0: return {} - attr_line = [x.split('=') for x in re.split('; *', attr_line.replace('"', ''))] - attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(';', '') + attr_line = [x.split("=") for x in re.split("; *", attr_line.replace('"', ""))] + attr_line[-1][-1] = attr_line[-1][-1].rstrip().replace(";", "") return dict(attr_line) @@ -148,16 +150,20 @@ def is_exec(program): # just to run "which" can be surprisingly expensive. 
But we do # check for the presence of Docker or Singularity, since that should take # only a few ms. - binary_mode = os.environ.get('CAT_BINARY_MODE') - cmd = ['which', binary_mode] - pl = Procline(cmd, stdin='/dev/null', stdout='/dev/null', stderr='/dev/null') + binary_mode = os.environ.get("CAT_BINARY_MODE") + cmd = ["which", binary_mode] + pl = Procline(cmd, stdin="/dev/null", stdout="/dev/null", stderr="/dev/null") try: pl.wait() return True except ProcException: - raise Exception("{0} not found. Either install {0}, or install CAT's dependencies and use --binary-mode local.".format(binary_mode)) + raise Exception( + "{0} not found. Either install {0}, or install CAT's dependencies and use --binary-mode local.".format( + binary_mode + ) + ) else: - cmd = ['which', program] + cmd = ["which", program] try: return procOps.call_proc_lines(cmd)[0].endswith(program) except ProcException: diff --git a/tools/nameConversions.py b/tools/nameConversions.py index 05e9d147..f5905bca 100644 --- a/tools/nameConversions.py +++ b/tools/nameConversions.py @@ -51,27 +51,31 @@ def aln_id_is_transmap(aln_id): :param aln_id: name string :return: boolean """ - return True if remove_augustus_alignment_number(aln_id) == aln_id and remove_alignment_number(aln_id) != aln_id else False + return ( + True + if remove_augustus_alignment_number(aln_id) == aln_id and remove_alignment_number(aln_id) != aln_id + else False + ) def aln_id_is_augustus_tm(aln_id): - return aln_id.startswith('augTM-') + return aln_id.startswith("augTM-") def aln_id_is_augustus_tmr(aln_id): - return aln_id.startswith('augTMR-') + return aln_id.startswith("augTMR-") def aln_id_is_cgp(aln_id): - return aln_id.startswith('augCGP-') + return aln_id.startswith("augCGP-") def aln_id_is_pb(aln_id): - return aln_id.startswith('augPB-') + return aln_id.startswith("augPB-") def aln_id_is_exref(aln_id): - return aln_id.startswith('exRef-') + return aln_id.startswith("exRef-") def aln_id_is_denovo(aln_id): @@ -81,15 +85,15 @@ def aln_id_is_denovo(aln_id): def alignment_type(aln_id): """returns what type of alignment this ID is""" if aln_id_is_augustus_tmr(aln_id): - return 'augTMR' + return "augTMR" elif aln_id_is_augustus_tm(aln_id): - return 'augTM' + return "augTM" elif aln_id_is_cgp(aln_id): - return 'augCGP' + return "augCGP" elif aln_id_is_pb(aln_id): - return 'augPB' + return "augPB" elif aln_id_is_exref(aln_id): - return 'exRef' + return "exRef" elif aln_id_is_transmap(aln_id): - return 'transMap' + return "transMap" assert False diff --git a/tools/parasail_wrapper.py b/tools/parasail_wrapper.py index 82a016ba..ad0ea025 100644 --- a/tools/parasail_wrapper.py +++ b/tools/parasail_wrapper.py @@ -7,17 +7,17 @@ from .misc import pairwise from .psl import PslRow -cigar_re = re.compile('([MIDNSHPX=])') -INS = 'I' -DEL = 'D' -MATCH = '=' -MISMATCH = 'X' +cigar_re = re.compile("([MIDNSHPX=])") +INS = "I" +DEL = "D" +MATCH = "=" +MISMATCH = "X" def iter_cigar(cigar): ref_pos = cigar.beg_ref tgt_pos = cigar.beg_query - for num, op in pairwise(re.split(cigar_re, cigar.decode.decode('utf-8'))): + for num, op in pairwise(re.split(cigar_re, cigar.decode.decode("utf-8"))): num = int(num) yield ref_pos, tgt_pos, num, op if op == MATCH or op == MISMATCH: @@ -36,20 +36,20 @@ def construct_fa(name1, seq1, name2, seq2, result): aln2 = [] for ref_pos, tgt_pos, num, op in iter_cigar(result.cigar): if op == MATCH or op == MISMATCH: - aln1.append(seq1[ref_pos:ref_pos + num]) - aln2.append(seq2[tgt_pos:tgt_pos + num]) + aln1.append(seq1[ref_pos : ref_pos + num]) + 
aln2.append(seq2[tgt_pos : tgt_pos + num]) elif op == DEL: - aln1.append(''.join(['-'] * min(num, len(seq2) - tgt_pos))) - aln2.append(seq2[tgt_pos:tgt_pos + num]) + aln1.append("".join(["-"] * min(num, len(seq2) - tgt_pos))) + aln2.append(seq2[tgt_pos : tgt_pos + num]) elif op == INS: - aln1.append(seq1[ref_pos:ref_pos + num]) - aln2.append(''.join(['-'] * min(num, len(seq1) - ref_pos))) + aln1.append(seq1[ref_pos : ref_pos + num]) + aln2.append("".join(["-"] * min(num, len(seq1) - ref_pos))) assert len(aln1[-1]) == len(aln2[-1]) - aln1 = ''.join(aln1) - aln2 = ''.join(aln2) + aln1 = "".join(aln1) + aln2 = "".join(aln2) assert len(aln1) == len(aln2) assert max(len(seq1), len(seq2)) == len(aln1) - return f'>{name1}\n{aln1}\n>{name2}\n{aln2}' + return f">{name1}\n{aln1}\n>{name2}\n{aln2}" def construct_psl(name1, name2, result): @@ -68,7 +68,7 @@ def construct_psl(name1, name2, result): t_pos = result.cigar.beg_ref t_size = result.len_ref - parsed_cigar = list(pairwise(re.split(cigar_re, result.cigar.decode.decode('utf-8')))) + parsed_cigar = list(pairwise(re.split(cigar_re, result.cigar.decode.decode("utf-8")))) for i, (num, op) in enumerate(parsed_cigar): num = int(num) @@ -96,10 +96,31 @@ def construct_psl(name1, name2, result): else: assert False block_count = len(block_sizes) - p = PslRow((matches, mismatches, 0, 0, q_num_insert, q_base_insert, t_num_insert, t_base_insert, '+', - name1, q_size, q_starts[0], q_starts[block_count - 1] + block_sizes[block_count - 1], - name2, t_size, t_starts[0], t_starts[block_count - 1] + block_sizes[block_count - 1], - block_count, ','.join(map(str, block_sizes)), ','.join(map(str, q_starts)), ','.join(map(str, t_starts)))) + p = PslRow( + ( + matches, + mismatches, + 0, + 0, + q_num_insert, + q_base_insert, + t_num_insert, + t_base_insert, + "+", + name1, + q_size, + q_starts[0], + q_starts[block_count - 1] + block_sizes[block_count - 1], + name2, + t_size, + t_starts[0], + t_starts[block_count - 1] + block_sizes[block_count - 1], + block_count, + ",".join(map(str, block_sizes)), + ",".join(map(str, q_starts)), + ",".join(map(str, t_starts)), + ) + ) return p diff --git a/tools/pipeline.py b/tools/pipeline.py index 30b6b27f..a6b18bef 100644 --- a/tools/pipeline.py +++ b/tools/pipeline.py @@ -18,7 +18,8 @@ def _getSigName(num): for key in vars(signal): if (getattr(signal, key) == num) and key.startswith("SIG") and (key.find("_") < 0): return key - return "signal"+str(num) + return "signal" + str(num) + def _setPgid(pid, pgid): """set pgid of a process, ignored exception caused by race condition @@ -28,17 +29,18 @@ def _setPgid(pid, pgid): # or EPERM. To handle this is a straight-forward way, just check that the # change has been made. However, in some cases the change didn't take, # retrying seems to make the problem go away. - for i in range(0,5): + for i in range(0, 5): try: os.setpgid(pid, pgid) return except OSError: if os.getpgid(pid) == pgid: return - time.sleep(0.25) # sleep for retry + time.sleep(0.25) # sleep for retry # last try, let it return an error os.setpgid(pid, pgid) + # FIXME: why not use pipes.quote? def _quoteStr(a): "return string with quotes if it contains white space" @@ -47,14 +49,16 @@ def _quoteStr(a): a = '"' + a + '"' return a + class ProcException(PycbioException): "Process error exception. A None returncode indicates a exec failure." 
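# A minimal sketch of how callers typically surface this exception; the input
# file name is illustrative, and run_proc is the wrapper already used in
# tools/misc.py. Whether stderr is captured depends on how the process was
# wired, so this only prints the exception's own message.
from tools import procOps
from tools.pipeline import ProcException

try:
    procOps.run_proc(["pslCheck", "alignments.psl"])
except ProcException as e:
    # the exception message includes the command description and exit status
    print("pslCheck failed:", e)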
+ def __init__(self, procDesc, returncode=None, stderr=None, cause=None): self.returncode = returncode self.stderr = stderr if returncode is None: msg = "exec failed" - elif (returncode < 0): + elif returncode < 0: msg = "process signaled: " + _getSigName(-returncode) else: msg = "process exited " + str(returncode) @@ -64,25 +68,30 @@ def __init__(self, procDesc, returncode=None, stderr=None, cause=None): msg += ":\n" + stderr PycbioException.__init__(self, msg, cause=cause) + class ProcDagException(PycbioException): "Exception not associate with process execution" + def __init__(self, msg, cause=None): PycbioException.__init__(self, msg, cause) + def nonBlockClear(fd): "clear the non-blocking flag on a fd" flags = fcntl.fcntl(fd, fcntl.F_GETFL) - fcntl.fcntl(fd, fcntl.F_SETFL, flags&~os.O_NONBLOCK) + fcntl.fcntl(fd, fcntl.F_SETFL, flags & ~os.O_NONBLOCK) + class _StatusPipe(object): """Used to communicate from child to parent. Child is close-on-exec, so this can be used to get the status of an exec.""" + __slots__ = ("rfd", "wfd") def __init__(self): self.rfd, self.wfd = os.pipe() flags = fcntl.fcntl(self.wfd, fcntl.F_GETFD) - fcntl.fcntl(self.wfd, fcntl.F_SETFD, flags|fcntl.FD_CLOEXEC) + fcntl.fcntl(self.wfd, fcntl.F_SETFD, flags | fcntl.FD_CLOEXEC) def postForkParent(self): "post fork handling in parent" @@ -107,15 +116,17 @@ def recvStatus(self): """read status from child, return exception if received, otherwise None or True""" # FIXME add read loop, or read through pickle?? - data = os.read(self.rfd, 1024*1024) + data = os.read(self.rfd, 1024 * 1024) os.close(self.rfd) if len(data) > 0: return pickle.loads(data) else: - return None + return None + class PInOut(object): """base class for PIn and POut""" + def __init__(self, dev, argPrefix=None): self.dev = dev self.argPrefix = argPrefix @@ -125,19 +136,19 @@ def __init__(self, dev, argPrefix=None): def __radd__(self, argPrefix): "string concatiation operator that sets the argPrefix" - assert(self.argPrefix is None) + assert self.argPrefix is None self.argPrefix = argPrefix return self def assocByPath(self, proc): "associate Dev with a Proc that will access it by path" - assert(self.proc is None) + assert self.proc is None self.proc = proc self.named = True def assocByFd(self, proc): "associate Dev with a Proc that will access it by file descriptor" - assert(self.proc is None) + assert self.proc is None self.proc = proc self.named = False @@ -154,14 +165,14 @@ def getConnectedProc(self): def getFd(self): "get file descriptor for this object" - assert(not self.named) + assert not self.named return self.dev.getFd(self) - + def getFh(self): "get file object for this object, or error if not supported by Dev" - assert(not self.named) + assert not self.named return self.dev.getFh(self) - + def getPath(self): "get path for this object" return self.dev.getPath(self) @@ -176,7 +187,7 @@ def getArg(self): def close(self): "terminate association with device" self.dev.close(self) - + def __str__(self): """return input file argument""" if not self.named: @@ -206,6 +217,7 @@ def pHasOtherProc(obj): else: return False + class PIn(PInOut): """Process input object that links Dev object as input to a process, either as stdin or as a command line argument. That is, it's output @@ -215,9 +227,11 @@ class PIn(PInOut): option-equals is prepended to the file (--in=fname). 
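# A sketch of the '"--in=" + PIn(d)' idiom described above; "someProg" and its
# --in=/--out= options are hypothetical, DataWriter is assumed to take its
# payload in the constructor, and the create()/wait() calls are assumed from
# how Procline drives its ProcDag base class further down.
from tools import pipeline

dag = pipeline.ProcDag()
writer = pipeline.DataWriter("chr1\t100\t200\n")  # in-memory data exposed to the child via a FIFO
reader = pipeline.DataReader()                    # collects the child's output in memory
dag.create(["someProg", "--in=" + pipeline.PIn(writer), "--out=" + pipeline.POut(reader)])
dag.wait()
result = reader.get()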
The argPrefix can be specified as an option to the constructor, or in a string concatination ("--in="+PIn(d)).""" + def __init__(self, dev, argPrefix=None): PInOut.__init__(self, dev, argPrefix) + class POut(PInOut): """Process output object that links Dev object as output from a process, either as stdout/stderr or as a command line argument. That is, it's input @@ -230,10 +244,12 @@ class POut(PInOut): If append is True, file is opened with append access, if approriate. """ + def __init__(self, dev, argPrefix=None, append=False): PInOut.__init__(self, dev, argPrefix) self.append = append # FIXME: not implemented + class Dev(object): """Base class for objects specifiying process input or output. Usually implemented as pipes or named pipes, they provide a way of hide details of @@ -241,7 +257,7 @@ class Dev(object): processes by PIn or POut objects. """ def __init__(self): - self.pin = None # dev output/process input + self.pin = None # dev output/process input self.pout = None # dev input/process output def _addPio(self, pio): @@ -259,8 +275,7 @@ def _addPio(self, pio): def needNamed(self): """does this device need a named pipe?""" - return (((self.pin is not None) and (self.pin.named)) - or ((self.pout is not None) and (self.pout.named))) + return ((self.pin is not None) and (self.pin.named)) or ((self.pout is not None) and (self.pout.named)) def preFork(self): """pre-fork setup.""" @@ -269,7 +284,7 @@ def preFork(self): def postExecParent(self): "called do any post-exec handling in the parent" pass - + def close(self, pio): """remove association of process with device; PInOut association remains for debugging purposes""" @@ -286,11 +301,12 @@ def getFd(self, pio): def getFh(self, pio): "get file object for given PInOut object, or error if not supported" raise AttributeError("getFh not supported for this Dev: " + str(self.__class__)) - + def getPath(self, pio): "get path for given PInOut object" raise AttributeError("getPath not implemented") - + + class DataReader(Dev): """Object to read data from process into memory via a pipe.""" @@ -306,11 +322,13 @@ def __del__(self): if self.thread is not None: try: self.thread.join() - except: pass + except: + pass if self.fifo is not None: try: self.fifo.close() - except: pass + except: + pass def __str__(self): return "[DataWriter]" @@ -327,7 +345,7 @@ def postExecParent(self): self.fifo.wclose() self.thread = threading.Thread(target=self.__reader) self.thread.start() - + def finish(self): "called in parent when processing is complete" if self.fifo is not None: @@ -348,14 +366,15 @@ def get(self): def getFd(self, pio): "get file descriptor for given PInOut object" - assert(pio == self.pout) + assert pio == self.pout return self.fifo.wfd - + def getPath(self, pio): "get path for given PInOut object" - assert(pio == self.pout) + assert pio == self.pout return self.fifo.wpath - + + class DataWriter(Dev): """Object to write data from memory to process via a pipe.""" @@ -370,11 +389,13 @@ def __del__(self): if self.thread is not None: try: self.thread.join() - except: pass + except: + pass if self.fifo is not None: try: self.fifo.close() - except: pass + except: + pass def __str__(self): return "[DataWriter]" @@ -391,7 +412,7 @@ def postExecParent(self): self.fifo.rclose() self.thread = threading.Thread(target=self.__writer) self.thread.start() - + def finish(self): "called in parent when processing is complete" if self.thread is not None: @@ -411,14 +432,15 @@ def __writer(self): def getFd(self, pio): "get file descriptor for given PInOut object" - 
assert(pio == self.pin) + assert pio == self.pin return self.fifo.rfd - + def getPath(self, pio): "get path for given PInOut object" - assert(pio == self.pin) + assert pio == self.pin return self.fifo.rpath - + + class Pipe(Dev): """Interprocess communication between two Procs, either by named or anonymous pipes. One end can also be attached to read/write @@ -433,7 +455,8 @@ def __del__(self): if self.fifo is not None: try: self.fifo.close() - except: pass + except: + pass def __str__(self): return "[Pipe]" @@ -450,7 +473,7 @@ def postExecParent(self): self.fifo.wclose() if (not self.pin.named) and (self.pin.proc is not None): self.fifo.rclose() - + def close(self, pio): """remove association of process with device; PInOut association remains for debugging purposes""" @@ -459,7 +482,7 @@ def close(self, pio): elif pio == self.pout: self.fifo.wclose() else: - assert(False) + assert False def finish(self): "called in parent when processing is complete" @@ -472,7 +495,7 @@ def getFd(self, pio): return self.fifo.rfd else: return self.fifo.wfd - + def getFh(self, pio): "get file object for given PInOut object" if pio == self.pin: @@ -486,7 +509,8 @@ def getPath(self, pio): return self.fifo.rpath else: return self.fifo.wpath - + + class File(Dev): """A file path for input or output, used for specifying stdio associated with files. Proc wraps these around string arguments automatically""" @@ -507,15 +531,16 @@ def getFd(self, pio): if isinstance(pio, PIn): self.fd = os.open(self.path, os.O_RDONLY) elif self.append: - self.fd = os.open(self.path, os.O_WRONLY|os.O_CREAT|os.O_APPEND, 0o666) + self.fd = os.open(self.path, os.O_WRONLY | os.O_CREAT | os.O_APPEND, 0o666) else: - self.fd = os.open(self.path, os.O_WRONLY|os.O_CREAT|os.O_TRUNC, 0o666) + self.fd = os.open(self.path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o666) return self.fd - + def getPath(self, pio): "get path for given PInOut object" return self.path + class Proc(object): """A process, represented as a node in a DAG of Proc objects, connected by PInOut and Dev objects. All processes in a ProcDag are part of the same @@ -553,10 +578,10 @@ def __init__(self, dag, cmd, stdin=None, stdout=None, stderr=None): self.pid = None self.statusPipe = None self.returncode = None # exit code, or -signal - self.exceptInfo = None # (exception, value, traceback) + self.exceptInfo = None # (exception, value, traceback) self.started = False self.finished = False - self.forced = False # force termination during ProcDag cleanup + self.forced = False # force termination during ProcDag cleanup @staticmethod def __devStr(dev): @@ -580,7 +605,7 @@ def __str__(self): def getPios(self): "get set of associated PIn and POut objects" - return self.pins|self.pouts + return self.pins | self.pouts def __stdioAssoc(self, spec, mode): """check a stdio spec validity and associate if PInOut or Dev""" @@ -644,7 +669,7 @@ def __stdioSetup(self, spec, stdfd): if stdfd == 0: # stdin? 
fd = os.open(spec, os.O_RDONLY) else: - fd = os.open(spec, os.O_WRONLY|os.O_CREAT|os.O_TRUNC, 0o666) + fd = os.open(spec, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o666) elif isinstance(spec, int): fd = spec if (fd is not None) and (fd != stdfd): @@ -654,11 +679,12 @@ def __stdioSetup(self, spec, stdfd): def __closeFiles(self): "clone non-stdio files" keepOpen = set([self.statusPipe.wfd]) | trace.getActiveTraceFds() - for fd in range(3, MAXFD+1): + for fd in range(3, MAXFD + 1): try: if not fd in keepOpen: os.close(fd) - except: pass + except: + pass def __doChildStart(self): "guts of start child process" @@ -671,7 +697,7 @@ def __doChildStart(self): self.__closeFiles() signal.signal(signal.SIGPIPE, signal.SIG_DFL) os.execvp(cmd[0], cmd) - + def __childStart(self): "start in child process" try: @@ -681,7 +707,7 @@ def __childStart(self): if type(ex) != ProcException: ex = ProcException(str(self), cause=ex) self.statusPipe.sendExcept(ex) - + def __parentStart(self): "start in parent process" # first process is process leader. @@ -690,7 +716,7 @@ def __parentStart(self): try: _setPgid(self.pid, self.dag.pgid) except OSError: - pass # igore error if child has already come and gone + pass # igore error if child has already come and gone self.statusPipe.postForkParent() def __start(self): @@ -702,7 +728,7 @@ def __start(self): try: self.__childStart() finally: - os.abort() # should never make it here + os.abort() # should never make it here else: self.__parentStart() @@ -755,12 +781,12 @@ def __handleErrExit(self): stderr = self.stderr.dev.get() # FIXME: shouldn't save if we killed it self.exceptInfo = (ProcException(str(self), self.returncode, stderr), None, None) - + def _handleExit(self, waitStat): """Handle process exiting, saving status Call close on all PInOut objects to disassociate """ self.finished = True - assert(os.WIFEXITED(waitStat) or os.WIFSIGNALED(waitStat)) + assert os.WIFEXITED(waitStat) or os.WIFSIGNALED(waitStat) self.returncode = os.WEXITSTATUS(waitStat) if os.WIFEXITED(waitStat) else -os.WTERMSIG(waitStat) if not ((self.returncode == 0) or (self.returncode == -signal.SIGPIPE)): self.__handleErrExit() @@ -777,7 +803,7 @@ def _poll(self): w = os.waitpid(self.pid, os.WNOHANG) if w[0] != 0: self._handleExit(w[1]) - return (w[0] != 0) + return w[0] != 0 def _forceFinish(self): """Forced termination of process. 
The forced flag is set, as an @@ -793,14 +819,16 @@ def _forceFinish(self): def failed(self): "check if process failed, call after poll() or wait()" - return (self.exceptInfo is not None) + return self.exceptInfo is not None + class _ProcDagDesc(object): """Generate a description of a ProcDag for debugging purposes.""" + def __init__(self, dag): self.dag = dag self.procsSeen = set() - self.piosSeen = set() # avoid cycles + self.piosSeen = set() # avoid cycles @staticmethod def __isPipelinePipe(spec): @@ -814,7 +842,7 @@ def __isPipelinePipe(spec): def __findPipelineStart(self, proc): "starting at a proc, walk back stdin->stdout pipeline to process" - seen = set() # don't hang on cycles + seen = set() # don't hang on cycles while self.__isPipelinePipe(proc.stdin) and (not proc in seen): seen.add(proc) proc = proc.stdin.dev.pout.proc @@ -886,9 +914,9 @@ def __nonPipeStdioDesc(spec, stdfd, sym): else: return "" # default, so display nothing elif isinstance(spec, PInOut): - return " " + sym + str(spec.dev) + return " " + sym + str(spec.dev) else: - return " " + sym + str(spec) + return " " + sym + str(spec) def __descProc(self, proc): """describe a single process in a pipeline, recursively handling args @@ -904,7 +932,7 @@ def __descProc(self, proc): strs.append(self.__descPInOutArg(a)) else: strs.append(_quoteStr(a)) - desc = " " .join(strs) + desc = " ".join(strs) # stdin if not PInOut.pIsPipe(proc.stdin): desc += self.__nonPipeStdioDesc(proc.stdin, 0, "<") @@ -934,7 +962,7 @@ def __str__(self): """get a string more or less describing the DAG""" # find sub-pipelines not connected as args or stderr and start # formatting these - (notConn, areConn)= self.__partPipelines() + (notConn, areConn) = self.__partPipelines() notConn.sort() # consistent test results areConn.sort() descs = [] @@ -951,16 +979,18 @@ def __str__(self): descs.sort() # reproducible desc += "{CYCLE}: " + " ; ".join(descs) return desc - + + class ProcDag(object): """Process DAG. 
Controls creation and management of process graph.""" + def __init__(self): self.procs = set() self.devs = set() - self.pgid = None # process group leader - self.byPid = dict() # indexed by pid + self.pgid = None # process group leader + self.byPid = dict() # indexed by pid self.started = False # have procs been started - self.finished = False # have all procs finished + self.finished = False # have all procs finished def __str__(self): """get a string more or less describing the DAG""" @@ -1046,16 +1076,16 @@ def __cleanupDev(self, dev): except Exception as ex: # FIXME: make optional, or record, or something exi = sys.exc_info() - stack = "" if exi is None else "".join(traceback.format_list(traceback.extract_tb(exi[2])))+"\n" - sys.stderr.write("ProcDag dev cleanup exception: " +str(ex)+"\n"+stack) + stack = "" if exi is None else "".join(traceback.format_list(traceback.extract_tb(exi[2]))) + "\n" + sys.stderr.write("ProcDag dev cleanup exception: " + str(ex) + "\n" + stack) def __cleanupProc(self, proc): try: proc._forceFinish() except Exception as ex: # FIXME: make optional - sys.stderr.write("ProcDag proc cleanup exception: " +str(ex)+"\n") - + sys.stderr.write("ProcDag proc cleanup exception: " + str(ex) + "\n") + def __cleanup(self): """forced cleanup of child processed after failure""" self.finished = True @@ -1136,9 +1166,11 @@ def failed(self): def kill(self, sig=signal.SIGTERM): "send a signal to the process" os.kill(-self.pgid, sig) - + + class Procline(ProcDag): """Process pipeline""" + def __init__(self, cmds, stdin=None, stdout=None, stderr=None): """cmds is either a list of arguments for a single process, or a list of such lists for a pipeline. If the stdin/out/err arguments are none, @@ -1155,25 +1187,26 @@ def __init__(self, cmds, stdin=None, stdout=None, stderr=None): if isinstance(cmds[0], str): cmds = [cmds] # one-process pipeline prevPipe = None - lastCmd = cmds[len(cmds)-1] + lastCmd = cmds[len(cmds) - 1] for cmd in cmds: - prevPipe = self._createProc(cmd, prevPipe, (cmd==lastCmd), stdin, stdout, stderr) - + prevPipe = self._createProc(cmd, prevPipe, (cmd == lastCmd), stdin, stdout, stderr) + def _createProc(self, cmd, prevPipe, isLastCmd, stdinFirst, stdoutLast, stderr): """create one process""" - if (prevPipe is None): + if prevPipe is None: stdin = stdinFirst # first process in pipeline else: stdin = PIn(prevPipe) - if (isLastCmd): + if isLastCmd: outPipe = None - stdout = stdoutLast # last process in pipeline + stdout = stdoutLast # last process in pipeline else: outPipe = Pipe() stdout = POut(outPipe) self.create(cmd, stdin=stdin, stdout=stdout, stderr=stderr) return outPipe + class Pipeline(Procline): """Object to create and manage a pipeline of processes. It can either run an independent set of processes, or a file-like object that either writes @@ -1182,7 +1215,7 @@ class Pipeline(Procline): """ # FIXME: change otherEnd stdio, or stdin/stdout, match with mode - def __init__(self, cmds, mode='r', otherEnd=None): + def __init__(self, cmds, mode="r", otherEnd=None): """cmds is either a list of arguments for a single process, or a list of such lists for a pipeline. 
Mode is 'r' for a pipeline who's output will be read, or 'w' for a pipeline to that is to @@ -1234,7 +1267,7 @@ def _getOtherFh(self): otherFh = None closeOther = False elif isinstance(self.otherEnd, str): - if self.mode == 'r': + if self.mode == "r": otherFh = PIn(File(self.otherEnd)) else: otherFh = POut(File(self.otherEnd)) @@ -1250,7 +1283,7 @@ def __iter__(self): def __next__(self): return next(self.fh) - + def flush(self): "Flush the internal I/O buffer." self.fh.flush() @@ -1258,7 +1291,7 @@ def flush(self): def fileno(self): "get the integer OS-dependent file handle" return self.fh.fileno() - + def write(self, str): "Write string str to file." self.fh.write(str) @@ -1295,6 +1328,18 @@ def close(self): if not self.finished: self.wait() -__all__ = [ProcException.__name__, PIn.__name__, POut.__name__, Dev.__name__, - DataReader.__name__, DataWriter.__name__, Pipe.__name__, File.__name__, - Proc.__name__, ProcDag.__name__, Procline.__name__, Pipeline.__name__] + +__all__ = [ + ProcException.__name__, + PIn.__name__, + POut.__name__, + Dev.__name__, + DataReader.__name__, + DataWriter.__name__, + Pipe.__name__, + File.__name__, + Proc.__name__, + ProcDag.__name__, + Procline.__name__, + Pipeline.__name__, +] diff --git a/tools/procOps.py b/tools/procOps.py index af75be52..ec6e0800 100644 --- a/tools/procOps.py +++ b/tools/procOps.py @@ -9,7 +9,7 @@ import logging import time -logger = logging.getLogger('cat') +logger = logging.getLogger("cat") def cmdLists(cmd): @@ -17,19 +17,20 @@ def cmdLists(cmd): creates docker or singularity command(s) from either a single command or a list of commands. """ - if os.environ.get('CAT_BINARY_MODE') == 'docker': + if os.environ.get("CAT_BINARY_MODE") == "docker": + docker_image = os.getenv("DOCKER_IMAGE", "quay.io/ucsc_cgl/cat:latest") if isinstance(cmd[0], list): docList = [] for e in cmd: - docList.append(getDockerCommand('quay.io/ucsc_cgl/cat', e)) + docList.append(getDockerCommand(docker_image, e)) return docList else: - return getDockerCommand('quay.io/ucsc_cgl/cat', cmd) - elif os.environ.get('CAT_BINARY_MODE') == 'singularity': - if os.environ.get('SINGULARITY_IMAGE'): - img = os.environ['SINGULARITY_IMAGE'] + return getDockerCommand(docker_image, cmd) + elif os.environ.get("CAT_BINARY_MODE") == "singularity": + if os.environ.get("SINGULARITY_IMAGE"): + img = os.environ["SINGULARITY_IMAGE"] else: - img = os.path.join(os.environ['SINGULARITY_PULLFOLDER'], 'cat.img') + img = os.path.join(os.environ["SINGULARITY_PULLFOLDER"], "cat.img") assert os.path.exists(img) if isinstance(cmd[0], list): return list([get_singularity_command(img, c) for c in cmd]) @@ -45,11 +46,11 @@ def call_proc(cmd, keepLastNewLine=False): a list of lists of commands and arguments.""" stdout = pipeline.DataReader() cmd = cmdLists(cmd) - logger.debug('About to run command: %s' % cmd) + logger.debug("About to run command: %s" % cmd) now = time.time() pl = pipeline.Procline(cmd, stdin="/dev/null", stdout=stdout) pl.wait() - logger.debug('Command %s took %s seconds.' % (cmd, time.time() - now)) + logger.debug("Command %s took %s seconds." 
% (cmd, time.time() - now)) out = stdout.get() if (not keepLastNewLine) and (len(out) > 0) and (out[-1] == "\n"): out = out[0:-1] @@ -95,11 +96,12 @@ def popen_catch(command, stdin=None): """ command = cmdLists(command) if stdin is not None: - process = subprocess.Popen(command, encoding='utf-8', - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=sys.stderr, bufsize=-1) + process = subprocess.Popen( + command, encoding="utf-8", stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=sys.stderr, bufsize=-1 + ) output, nothing = process.communicate(stdin) else: - process = subprocess.Popen(command, encoding='utf-8', stdout=subprocess.PIPE, stderr=sys.stderr, bufsize=-1) + process = subprocess.Popen(command, encoding="utf-8", stdout=subprocess.PIPE, stderr=sys.stderr, bufsize=-1) output, nothing = process.communicate() sts = process.wait() if sts != 0: @@ -148,7 +150,7 @@ def add_to_work_dirs(dirname, work_dirs): # override the existing filesystem within the container.) for i, work_dir in enumerate(work_dirs): mrca = mrca_path(dirname, work_dir) - if mrca == '/': + if mrca == "/": # Avoid bind-mounting the root dir. if i == len(work_dirs) - 1: # No mergeable directories. @@ -167,27 +169,27 @@ def getDockerCommand(image, cmd): image: the Docker image to use, e.g. 'quay.io/comparative-genomics-toolkit/cactus:latest' cmd: list of arguments """ - dockerPreamble = ['docker', 'run', '-i', '--rm', '-u', "%s:%s" % (os.getuid(), os.getgid())] + dockerPreamble = ["docker", "run", "-i", "--rm", "-u", "%s:%s" % (os.getuid(), os.getgid())] if "TMPDIR" in os.environ: tmpdir = os.environ["TMPDIR"] dockerPreamble.extend(["--env", "TMPDIR={}".format(tmpdir)]) - dockerPreamble.extend(['-v', tmpdir + ':' + tmpdir]) + dockerPreamble.extend(["-v", tmpdir + ":" + tmpdir]) work_dirs = [] for i, arg in enumerate(cmd): - if arg.startswith('-') and '=' in arg: + if arg.startswith("-") and "=" in arg: # We assume this is -option=value syntax. Special-case # this to check if the value is a path. - arg = arg.split('=')[1] + arg = arg.split("=")[1] dirname = os.path.dirname(arg) if os.path.exists(dirname): # The dirname exists, so we will try to mount it. arg = os.path.abspath(arg) - if arg.startswith('/dev'): + if arg.startswith("/dev"): continue add_to_work_dirs(dirname, work_dirs) for work_dir in work_dirs: work_dir = os.path.abspath(work_dir) - dockerPreamble += ['-v', work_dir + ':' + work_dir] + dockerPreamble += ["-v", work_dir + ":" + work_dir] return dockerPreamble + [image] + cmd @@ -209,13 +211,13 @@ def get_singularity_command(image, cmd): # getDockerCommand, we mount the entire root of the # outside file system in '/mnt' of the container, and # then prepend '/mnt' to all file paths in the command. - singularity_cmd = ['singularity', 'exec', '-B', '/:/mnt', image] + singularity_cmd = ["singularity", "exec", "-B", "/:/mnt", image] for arg in cmd: - if arg.startswith('-') and len(arg.split('=')) == 2: + if arg.startswith("-") and len(arg.split("=")) == 2: # We assume this is -option=value syntax. Special-case # this to check if the value is a path. 
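# A rough sketch of the command cmdLists now builds in docker mode; the uid/gid
# and mounted directories depend on the host, and /data is assumed to exist there.
import os
from tools import procOps

os.environ["CAT_BINARY_MODE"] = "docker"
os.environ["DOCKER_IMAGE"] = "quay.io/ucsc_cgl/cat:develop"  # falls back to quay.io/ucsc_cgl/cat:latest if unset

wrapped = procOps.cmdLists(["halStats", "--tree", "/data/vertebrates.hal"])
# roughly: ['docker', 'run', '-i', '--rm', '-u', '<uid>:<gid>', '-v', '/data:/data',
#           'quay.io/ucsc_cgl/cat:develop', 'halStats', '--tree', '/data/vertebrates.hal']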
- option, value = arg.split('=') - singularified_arg = '='.join([option, singularify_arg(value)]) + option, value = arg.split("=") + singularified_arg = "=".join([option, singularify_arg(value)]) else: singularified_arg = singularify_arg(arg) @@ -224,7 +226,7 @@ def get_singularity_command(image, cmd): return singularity_cmd -def singularify_arg(arg, singularity_mount_point='/mnt'): +def singularify_arg(arg, singularity_mount_point="/mnt"): """ Check to see if 'arg' is a path; if it is, modify it to be accessible from inside the singularity container. @@ -247,4 +249,3 @@ def singularify_arg(arg, singularity_mount_point='/mnt'): arg = str(singularity_mount_point) + str(os.path.abspath(arg)) return arg - diff --git a/tools/psl.py b/tools/psl.py index fded61c0..11cc730b 100644 --- a/tools/psl.py +++ b/tools/psl.py @@ -11,19 +11,40 @@ from tools.mathOps import format_ratio from tools.nameConversions import strip_alignment_numbers -__author__ = 'Ian Fiddes' +__author__ = "Ian Fiddes" class PslRow(object): """ Represents a single row in a PSL file. http://genome.ucsc.edu/FAQ/FAQformat.html#format2 """ - __slots__ = ('matches', 'mismatches', 'repmatches', 'n_count', 'q_num_insert', 'q_base_insert', 't_num_insert', - 't_base_insert', 'strand', 'q_name', 'q_size', 'q_start', 'q_end', 't_name', 't_size', 't_start', - 't_end', 'block_count', 'block_sizes', 'q_starts', 't_starts') + + __slots__ = ( + "matches", + "mismatches", + "repmatches", + "n_count", + "q_num_insert", + "q_base_insert", + "t_num_insert", + "t_base_insert", + "strand", + "q_name", + "q_size", + "q_start", + "q_end", + "t_name", + "t_size", + "t_start", + "t_end", + "block_count", + "block_sizes", + "q_starts", + "t_starts", + ) def __init__(self, data_tokens): - assert(len(data_tokens) == 21) + assert len(data_tokens) == 21 self.matches = int(data_tokens[0]) self.mismatches = int(data_tokens[1]) self.repmatches = int(data_tokens[2]) @@ -43,16 +64,18 @@ def __init__(self, data_tokens): self.t_end = int(data_tokens[16]) self.block_count = int(data_tokens[17]) # lists of ints - self.block_sizes = [int(x) for x in data_tokens[18].split(',') if x] - self.q_starts = [int(x) for x in data_tokens[19].split(',') if x] - self.t_starts = [int(x) for x in data_tokens[20].split(',') if x] + self.block_sizes = [int(x) for x in data_tokens[18].split(",") if x] + self.q_starts = [int(x) for x in data_tokens[19].split(",") if x] + self.t_starts = [int(x) for x in data_tokens[20].split(",") if x] def target_coordinate_to_query(self, p): """ Take position P in target coordinates (positive) and convert it to query coordinates (positive). """ - if self.strand not in ['+', '-', '++']: - raise NotImplementedError('PslRow does not support coordinate conversions for strand {}'.format(self.strand)) + if self.strand not in ["+", "-", "++"]: + raise NotImplementedError( + "PslRow does not support coordinate conversions for strand {}".format(self.strand) + ) if p < self.t_start: return None if p >= self.t_end: @@ -64,7 +87,7 @@ def target_coordinate_to_query(self, p): continue # p must be in block offset = p - t - if self.strand == '+' or self.strand == '++': + if self.strand == "+" or self.strand == "++": return self.q_starts[i] + offset else: return self.q_size - (self.q_starts[i] + offset) - 1 @@ -74,13 +97,15 @@ def query_coordinate_to_target(self, p): """ Take position P in query coordinates (positive) and convert it to target coordinates (positive). 
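# A tiny worked example of these conversions using a hand-made 21-field row
# (not a real alignment): 10 query bases aligned to target 100-110 on the + strand.
from tools.psl import PslRow

row = PslRow(["10", "0", "0", "0", "0", "0", "0", "0", "+",
              "q1", "10", "0", "10", "t1", "1000", "100", "110",
              "1", "10,", "0,", "100,"])
assert row.query_coordinate_to_target(3) == 103
assert row.target_coordinate_to_query(103) == 3
assert row.coverage == 1.0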
""" - if self.strand not in ['+', '-', '++']: - raise NotImplementedError('PslRow does not support coordinate conversions for strand {}'.format(self.strand)) + if self.strand not in ["+", "-", "++"]: + raise NotImplementedError( + "PslRow does not support coordinate conversions for strand {}".format(self.strand) + ) if p < self.q_start: return None if p >= self.q_end: return None - if self.strand == '-': + if self.strand == "-": p = self.q_size - p - 1 for i, q in enumerate(self.q_starts): if p < q: @@ -94,25 +119,29 @@ def query_coordinate_to_target(self, p): @property def coverage(self): - return format_ratio(self.matches + self.mismatches + self.repmatches, self.q_size, - num_digits=5, resolve_nan=0) + return format_ratio(self.matches + self.mismatches + self.repmatches, self.q_size, num_digits=5, resolve_nan=0) @property def identity(self): - return format_ratio(self.matches + self.repmatches, - self.matches + self.repmatches + self.mismatches + self.q_base_insert, - num_digits=5, resolve_nan=0) + return format_ratio( + self.matches + self.repmatches, + self.matches + self.repmatches + self.mismatches + self.q_base_insert, + num_digits=5, + resolve_nan=0, + ) @property def target_identity(self): - return format_ratio(self.matches + self.repmatches, - self.matches + self.repmatches + self.mismatches + self.q_base_insert + self.t_base_insert, - num_digits=5, resolve_nan=0) + return format_ratio( + self.matches + self.repmatches, + self.matches + self.repmatches + self.mismatches + self.q_base_insert + self.t_base_insert, + num_digits=5, + resolve_nan=0, + ) @property def target_coverage(self): - return format_ratio(self.matches + self.mismatches + self.repmatches, self.t_size, - num_digits=5, resolve_nan=0) + return format_ratio(self.matches + self.mismatches + self.repmatches, self.t_size, num_digits=5, resolve_nan=0) @property def percent_n(self): @@ -129,21 +158,46 @@ def badness(self): :return: A float between 0 and 1 where 1 is very bad """ - b = format_ratio(self.mismatches + self.q_num_insert + 3 * math.log(1 + abs(self.q_size - self.t_size)), - self.matches + self.mismatches + self.repmatches, - num_digits=5, resolve_nan=1) + b = format_ratio( + self.mismatches + self.q_num_insert + 3 * math.log(1 + abs(self.q_size - self.t_size)), + self.matches + self.mismatches + self.repmatches, + num_digits=5, + resolve_nan=1, + ) return min(b, 1) def psl_string(self): """ Return a list capable of producing a new PslRow object """ - return list(map(str, [self.matches, self.mismatches, self.repmatches, self.n_count, self.q_num_insert, - self.q_base_insert, self.t_num_insert, self.t_base_insert, self.strand, self.q_name, - self.q_size, self.q_start, self.q_end, self.t_name, self.t_size, self.t_start, - self.t_end, self.block_count, ','.join([str(b) for b in self.block_sizes]), - ','.join([str(b) for b in self.q_starts]), - ','.join([str(b) for b in self.t_starts])])) + return list( + map( + str, + [ + self.matches, + self.mismatches, + self.repmatches, + self.n_count, + self.q_num_insert, + self.q_base_insert, + self.t_num_insert, + self.t_base_insert, + self.strand, + self.q_name, + self.q_size, + self.q_start, + self.q_end, + self.t_name, + self.t_size, + self.t_start, + self.t_end, + self.block_count, + ",".join([str(b) for b in self.block_sizes]), + ",".join([str(b) for b in self.q_starts]), + ",".join([str(b) for b in self.t_starts]), + ], + ) + ) def psl_iterator(psl_file, make_unique=False): @@ -155,7 +209,7 @@ def psl_iterator(psl_file, make_unique=False): for tokens in 
iter_lines(inf): psl = PslRow(tokens) if make_unique is True: - numbered_aln_id = '-'.join([psl.q_name, str(counts[psl.q_name])]) + numbered_aln_id = "-".join([psl.q_name, str(counts[psl.q_name])]) counts[psl.q_name] += 1 psl.q_name = numbered_aln_id yield psl diff --git a/tools/rangeFinder.py b/tools/rangeFinder.py index 8c510f1e..c0aa53eb 100644 --- a/tools/rangeFinder.py +++ b/tools/rangeFinder.py @@ -16,7 +16,6 @@ # will fit in. - class RemoveValueError(Exception): pass @@ -24,17 +23,11 @@ class RemoveValueError(Exception): class Binner(object): "functions to translate ranges to bin numbers" - binOffsetsBasic = (512 + 64 + 8 + 1, - 64 + 8 + 1, - 8 + 1, - 1, 0) - binOffsetsExtended = (4096 + 512 + 64 + 8 + 1, - 512 + 64 + 8 + 1, - 64 + 8 + 1, 8 + 1, - 1, 0) + binOffsetsBasic = (512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0) + binOffsetsExtended = (4096 + 512 + 64 + 8 + 1, 512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0) binFirstShift = 17 # How much to shift to get to finest bin. - binNextShift = 3 # How much to shift to get to next larger bin. + binNextShift = 3 # How much to shift to get to next larger bin. binBasicMaxEnd = 512 * 1024 * 1024 binOffsetToExtended = 4681 @@ -45,7 +38,7 @@ def __calcBinForOffsets(start, end, baseOffset, offsets): startBin = start >> Binner.binFirstShift endBin = (end - 1) >> Binner.binFirstShift for binOff in offsets: - if (startBin == endBin): + if startBin == endBin: return baseOffset + binOff + startBin startBin >>= Binner.binNextShift endBin >>= Binner.binNextShift @@ -80,9 +73,13 @@ def getOverlappingBins(start, end): else: if start < Binner.binBasicMaxEnd: # overlapping both basic and extended - for bins in Binner.__getOverlappingBinsForOffsets(start, Binner.binBasicMaxEnd, 0, Binner.binOffsetsBasic): + for bins in Binner.__getOverlappingBinsForOffsets( + start, Binner.binBasicMaxEnd, 0, Binner.binOffsetsBasic + ): yield bins - for bins in Binner.__getOverlappingBinsForOffsets(start, end, Binner.binOffsetToExtended, Binner.binOffsetsExtended): + for bins in Binner.__getOverlappingBinsForOffsets( + start, end, Binner.binOffsetToExtended, Binner.binOffsetsExtended + ): yield bins @staticmethod @@ -95,7 +92,9 @@ def getOverlappingSqlExpr(binCol, seqCol, startCol, endCol, seq, start, end): parts.append("({}={})".format(binCol, bins[0])) else: parts.append("({}>={} and {}<={})".format(binCol, bins[0], binCol, bins[1])) - return "(({}=\"{}\") and ({}<{}) and ({}>{}) and ({}))".format(seqCol, seq, startCol, end, endCol, start, " or ".join(parts)) + return '(({}="{}") and ({}<{}) and ({}>{}) and ({}))'.format( + seqCol, seq, startCol, end, endCol, start, " or ".join(parts) + ) class Entry(object): @@ -119,6 +118,7 @@ class RangeBins(object): """Range indexed container for a single sequence. This using a binning scheme that implements spacial indexing. Based on UCSC hg browser binRange C module. 
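# For example, a small feature near the start of a chromosome falls in the
# classic smallest 128 kb bin, and RangeFinder (defined below) wraps the binning
# for overlap queries; overlapping() is assumed to mirror add()'s
# (seqId, start, end[, strand]) signature.
from tools.rangeFinder import Binner, RangeFinder

assert Binner.calcBin(1000, 2000) == 585  # offset 512 + 64 + 8 + 1, finest-level bin 0

rf = RangeFinder()
rf.add("chr1", 1000, 2000, "geneA", strand="+")
rf.add("chr1", 1500, 3000, "geneB", strand="-")
hits = set(rf.overlapping("chr1", 1900, 2100))            # {"geneA", "geneB"}
plus_only = set(rf.overlapping("chr1", 1900, 2100, "+"))  # {"geneA"}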
""" + __slots__ = ("seqId", "strand", "bins") def __init__(self, seqId, strand): @@ -129,17 +129,17 @@ def __init__(self, seqId, strand): def add(self, start, end, value): bin = Binner.calcBin(start, end) entries = self.bins.get(bin) - if (entries is None): + if entries is None: self.bins[bin] = entries = [] entries.append(Entry(start, end, value)) def overlapping(self, start, end): "generator over values overlapping the specified range" - if (start < end): + if start < end: for bins in Binner.getOverlappingBins(start, end): for j in range(bins[0], bins[1] + 1): bin = self.bins.get(j) - if (bin is not None): + if bin is not None: for entry in bin: if entry.overlaps(start, end): yield entry.value @@ -173,6 +173,7 @@ class RangeFinder(object): have strand. A query without strand will find all overlapping entries on either strand if strand was specified when adding entries. """ + validStrands = set((None, "+", "-")) def __init__(self): @@ -182,7 +183,7 @@ def __init__(self): def add(self, seqId, start, end, value, strand=None): "add an entry for a sequence and range, and optional strand" if self.haveStrand is None: - self.haveStrand = (strand is not None) + self.haveStrand = strand is not None elif self.haveStrand != (strand is not None): raise Exception("all RangeFinder entries must either have strand or not have strand") if strand not in self.validStrands: @@ -230,9 +231,9 @@ def __removeSpecificStrand(self, seqId, start, end, value, strand): def __removeBothStrands(self, seqId, start, end, value): "remove an entry, checking both strands" - removed = self.__removeIfExists(seqId, start, end, value, '+') + removed = self.__removeIfExists(seqId, start, end, value, "+") if not removed: - removed = self.__removeIfExists(seqId, start, end, value, '-') + removed = self.__removeIfExists(seqId, start, end, value, "-") if bins is not None: removed = bins.removeIfExists(seqId, start, end, value) if not removed: @@ -262,4 +263,4 @@ def dump(self, fh): bins.dump(fh) -__all__ = (RangeFinder.__name__,) \ No newline at end of file +__all__ = (RangeFinder.__name__,) diff --git a/tools/sqlInterface.py b/tools/sqlInterface.py index 6540e304..6692d245 100644 --- a/tools/sqlInterface.py +++ b/tools/sqlInterface.py @@ -18,7 +18,8 @@ class Annotation(Base): """Table for the annotation table. Only exists in ref_genome""" - __tablename__ = 'annotation' + + __tablename__ = "annotation" GeneId = Column(Text, primary_key=True) TranscriptId = Column(Text, primary_key=True) TranscriptName = Column(Text) @@ -30,6 +31,7 @@ class Annotation(Base): class Bed12(object): """General table description for storing BED12 features""" + chromosome = Column(Text) start = Column(Integer) stop = Column(Integer) @@ -46,41 +48,49 @@ class Bed12(object): class EvaluationColumns(Bed12): """Mixin class for all TranscriptEvaluation module tables. 
Represents a bed12 with a leading ID column""" + AlignmentId = Column(Text, primary_key=True) class MrnaTmEval(EvaluationColumns, Base): """Table for evaluations of mRNA alignments of transcripts derived from transMap""" - __tablename__ = 'mRNA_transMap_Evaluation' + + __tablename__ = "mRNA_transMap_Evaluation" class MrnaAugTmEval(EvaluationColumns, Base): """Table for evaluations of mRNA alignments of transcripts derived from AugustusTM""" - __tablename__ = 'mRNA_augTM_Evaluation' + + __tablename__ = "mRNA_augTM_Evaluation" class MrnaAugTmrEval(EvaluationColumns, Base): """Table for evaluations of mRNA alignments of transcripts derived from AugustusTMR""" - __tablename__ = 'mRNA_augTMR_Evaluation' + + __tablename__ = "mRNA_augTMR_Evaluation" class CdsTmEval(EvaluationColumns, Base): """Table for evaluations of CDS alignments of transcripts derived from transMap""" - __tablename__ = 'CDS_transMap_Evaluation' + + __tablename__ = "CDS_transMap_Evaluation" class CdsAugTmEval(EvaluationColumns, Base): """Table for evaluations of CDS alignments of transcripts derived from AugustusTM""" - __tablename__ = 'CDS_augTM_Evaluation' + + __tablename__ = "CDS_augTM_Evaluation" class CdsAugTmrEval(EvaluationColumns, Base): """Table for evaluations of CDS alignments of transcripts derived from AugustusTMR""" - __tablename__ = 'CDS_augTMR_Evaluation' + + __tablename__ = "CDS_augTMR_Evaluation" class MetricsColumns(object): """Mixin class for all TranscriptMetrics module tables""" + AlignmentId = Column(Text, primary_key=True) classifier = Column(Text) value = Column(Float) @@ -88,14 +98,16 @@ class MetricsColumns(object): class TmEval(MetricsColumns, Base): """Table for evaluations from TransMapEvaluation module""" - __tablename__ = 'TransMapEvaluation' + + __tablename__ = "TransMapEvaluation" TranscriptId = Column(Text, primary_key=True) GeneId = Column(Text, primary_key=True) class TmFilterEval(MetricsColumns, Base): """Table for evaluations from FilterTransMap module. 
This table is stored in a stacked format for simplicity.""" - __tablename__ = 'TransMapFilterEvaluation' + + __tablename__ = "TransMapFilterEvaluation" GeneId = Column(Text, primary_key=True) TranscriptId = Column(Text, primary_key=True) AlignmentId = Column(Text, primary_key=True) @@ -109,41 +121,49 @@ class TmFilterEval(MetricsColumns, Base): class TmMetrics(MetricsColumns, Base): """Table for evaluations from TransMapMetrics module""" - __tablename__ = 'TransMapMetrics' + + __tablename__ = "TransMapMetrics" class MrnaTmMetrics(MetricsColumns, Base): """Table for evaluations of mRNA alignments of transcripts derived from transMap""" - __tablename__ = 'mRNA_transMap_Metrics' + + __tablename__ = "mRNA_transMap_Metrics" class MrnaAugTmMetrics(MetricsColumns, Base): """Table for evaluations of mRNA alignments of transcripts derived from AugustusTM""" - __tablename__ = 'mRNA_augTM_Metrics' + + __tablename__ = "mRNA_augTM_Metrics" class MrnaAugTmrMetrics(MetricsColumns, Base): """Table for evaluations of mRNA alignments of transcripts derived from AugustusTMR""" - __tablename__ = 'mRNA_augTMR_Metrics' + + __tablename__ = "mRNA_augTMR_Metrics" class CdsTmMetrics(MetricsColumns, Base): """Table for evaluations of CDS alignments of transcripts derived from transMap""" - __tablename__ = 'CDS_transMap_Metrics' + + __tablename__ = "CDS_transMap_Metrics" class CdsAugTmMetrics(MetricsColumns, Base): """Table for evaluations of CDS alignments of transcripts derived from AugustusTM""" - __tablename__ = 'CDS_augTM_Metrics' + + __tablename__ = "CDS_augTM_Metrics" class CdsAugTmrMetrics(MetricsColumns, Base): """Table for evaluations of CDS alignments of transcripts derived from AugustusTMR""" - __tablename__ = 'CDS_augTMR_Metrics' + + __tablename__ = "CDS_augTMR_Metrics" class HgmColumns(object): """Mixin class for all homGeneMapping tables""" + GeneId = Column(Text, primary_key=True) TranscriptId = Column(Text, primary_key=True) AlignmentId = Column(Text, primary_key=True) @@ -158,36 +178,43 @@ class HgmColumns(object): class TmIntronSupport(HgmColumns, Base): """Table for intron support of transMap transcripts from homGeneMapping""" - __tablename__ = 'transMap_Hgm' + + __tablename__ = "transMap_Hgm" class AugTmIntronSupport(HgmColumns, Base): """Table for intron support of AugustusTM transcripts from homGeneMapping""" - __tablename__ = 'augTM_Hgm' + + __tablename__ = "augTM_Hgm" class AugTmrIntronSupport(HgmColumns, Base): """Table for intron support of AugustusTMR transcripts from homGeneMapping""" - __tablename__ = 'augTMR_Hgm' + + __tablename__ = "augTMR_Hgm" class AugCgpIntronSupport(HgmColumns, Base): """Table for intron support of AugustusCGP transcripts from homGeneMapping""" - __tablename__ = 'augCGP_Hgm' + + __tablename__ = "augCGP_Hgm" class AugPbIntronSupport(HgmColumns, Base): """Table for intron support of AugustusPB transcripts from homGeneMapping""" - __tablename__ = 'augPB_Hgm' + + __tablename__ = "augPB_Hgm" class ExRefIntronSupport(HgmColumns, Base): """Table for intron support of External reference transcripts from homGeneMapping""" - __tablename__ = 'ExRef_Hgm' + + __tablename__ = "ExRef_Hgm" class AlternativeGeneIdColumns(object): """mixin class for AlternativeGenes""" + TranscriptId = Column(Text, primary_key=True) AssignedGeneId = Column(Text) AlternativeGeneIds = Column(Text) @@ -196,22 +223,26 @@ class AlternativeGeneIdColumns(object): class AugCgpAlternativeGenes(AlternativeGeneIdColumns, Base): """Table for recording a list of alternative parental genes for CGP""" - 
__tablename__ = 'augCGP_AlternativeGenes' + + __tablename__ = "augCGP_AlternativeGenes" class AugPbAlternativeGenes(AlternativeGeneIdColumns, Base): """Table for recording a list of alternative parental genes for IsoSeq""" - __tablename__ = 'augPB_AlternativeGenes' + + __tablename__ = "augPB_AlternativeGenes" class ExRefAlternativeGenes(AlternativeGeneIdColumns, Base): """Table for recording a list of alternative parental genes for external references""" - __tablename__ = 'ExRef_AlternativeGenes' + + __tablename__ = "ExRef_AlternativeGenes" class IsoSeqExonStructures(Bed12, Base): """Table for recording all distinct exon structures present in a IsoSeq hints file""" - __tablename__ = 'IsoSeqExonStructures' + + __tablename__ = "IsoSeqExonStructures" index = Column(Integer, primary_key=True) @@ -222,7 +253,7 @@ class IsoSeqExonStructures(Bed12, Base): def start_session(db_path): """basic script for starting a session""" - engine = create_engine('sqlite:///' + db_path) + engine = create_engine("sqlite:///" + db_path) Session = sessionmaker(bind=engine) return Session() @@ -232,18 +263,27 @@ def start_session(db_path): ### -tables = {'hgm': {'augCGP': AugCgpIntronSupport, 'augTM': AugTmIntronSupport, - 'augTMR': AugTmrIntronSupport, 'transMap': TmIntronSupport, - 'augPB': AugPbIntronSupport, 'exRef': ExRefIntronSupport}, - 'CDS': {'augTM': {'metrics': CdsAugTmMetrics, 'evaluation': CdsAugTmEval}, - 'augTMR': {'metrics': CdsAugTmrMetrics, 'evaluation': CdsAugTmrEval}, - 'transMap': {'metrics': CdsTmMetrics, 'evaluation': CdsTmEval}}, - 'mRNA': {'augTM': {'metrics': MrnaAugTmMetrics, 'evaluation': MrnaAugTmEval}, - 'augTMR': {'metrics': MrnaAugTmrMetrics, 'evaluation': MrnaAugTmrEval}, - 'transMap': {'metrics': MrnaTmMetrics, 'evaluation': MrnaTmEval}}, - 'alt_names': {'exRef': ExRefAlternativeGenes, - 'augPB': AugPbAlternativeGenes, - 'augCGP': AugCgpAlternativeGenes}} +tables = { + "hgm": { + "augCGP": AugCgpIntronSupport, + "augTM": AugTmIntronSupport, + "augTMR": AugTmrIntronSupport, + "transMap": TmIntronSupport, + "augPB": AugPbIntronSupport, + "exRef": ExRefIntronSupport, + }, + "CDS": { + "augTM": {"metrics": CdsAugTmMetrics, "evaluation": CdsAugTmEval}, + "augTMR": {"metrics": CdsAugTmrMetrics, "evaluation": CdsAugTmrEval}, + "transMap": {"metrics": CdsTmMetrics, "evaluation": CdsTmEval}, + }, + "mRNA": { + "augTM": {"metrics": MrnaAugTmMetrics, "evaluation": MrnaAugTmEval}, + "augTMR": {"metrics": MrnaAugTmrMetrics, "evaluation": MrnaAugTmrEval}, + "transMap": {"metrics": MrnaTmMetrics, "evaluation": MrnaTmEval}, + }, + "alt_names": {"exRef": ExRefAlternativeGenes, "augPB": AugPbAlternativeGenes, "augCGP": AugCgpAlternativeGenes}, +} ### @@ -251,7 +291,7 @@ def start_session(db_path): ### -def read_attrs(db_path, table=Annotation.__tablename__, index_col='TranscriptId'): +def read_attrs(db_path, table=Annotation.__tablename__, index_col="TranscriptId"): """ Read the attributes database file into a pandas DataFrame :param db_path: path to the attributes database @@ -259,11 +299,11 @@ def read_attrs(db_path, table=Annotation.__tablename__, index_col='TranscriptId' :param index_col: column to index on. should generally be tx_id. 
:return: pandas DataFrame """ - engine = create_engine('sqlite:///{}'.format(db_path)) + engine = create_engine("sqlite:///{}".format(db_path)) return pd.read_sql_table(table, engine, index_col=index_col) -def get_transcript_gene_map(db_path, table=Annotation.__tablename__, index_col='TranscriptId'): +def get_transcript_gene_map(db_path, table=Annotation.__tablename__, index_col="TranscriptId"): """ Convenience wrapper for read_attrs that returns a dictionary mapping transcript IDs to gene IDs. :param db_path: path to the attributes database @@ -275,7 +315,7 @@ def get_transcript_gene_map(db_path, table=Annotation.__tablename__, index_col=' return dict(list(zip(df.index, df.GeneId))) -def get_gene_transcript_map(db_path, table=Annotation.__tablename__, index_col='TranscriptId'): +def get_gene_transcript_map(db_path, table=Annotation.__tablename__, index_col="TranscriptId"): """ Convenience wrapper for read_attrs that returns a dictionary mapping transcript IDs to gene IDs. :param db_path: path to the attributes database @@ -285,12 +325,12 @@ def get_gene_transcript_map(db_path, table=Annotation.__tablename__, index_col=' """ df = read_attrs(db_path, table, index_col).reset_index() r = {} - for gene_id, s in df.groupby('GeneId'): + for gene_id, s in df.groupby("GeneId"): r[gene_id] = s.TranscriptId.tolist() return r -def get_transcript_biotype_map(db_path, table=Annotation.__tablename__, index_col='TranscriptId'): +def get_transcript_biotype_map(db_path, table=Annotation.__tablename__, index_col="TranscriptId"): """ Convenience wrapper for read_attrs that returns a dictionary mapping transcript IDs to their biotype :param db_path: path to the attributes database @@ -302,7 +342,7 @@ def get_transcript_biotype_map(db_path, table=Annotation.__tablename__, index_co return dict(list(zip(df.index, df.TranscriptBiotype))) -def get_gene_biotype_map(db_path, table=Annotation.__tablename__, index_col='TranscriptId'): +def get_gene_biotype_map(db_path, table=Annotation.__tablename__, index_col="TranscriptId"): """ Convenience wrapper for read_attrs that returns a dictionary mapping gene IDs to their biotype :param db_path: path to the attributes database @@ -349,7 +389,7 @@ def load_annotation(ref_db_path): :param ref_db_path: path to reference genome database. 
Must have table Annotation.__tablename__ :return: DataFrame """ - engine = create_engine('sqlite:///' + ref_db_path) + engine = create_engine("sqlite:///" + ref_db_path) df = pd.read_sql_table(Annotation.__tablename__, engine) return df @@ -360,9 +400,9 @@ def load_alignment_evaluation(db_path): :param db_path: path to genome database :return: DataFrame """ - engine = create_engine('sqlite:///' + db_path) + engine = create_engine("sqlite:///" + db_path) df = pd.read_sql_table(TmEval.__tablename__, engine) - df = pd.pivot_table(df, index=['TranscriptId', 'AlignmentId'], columns='classifier', values='value') + df = pd.pivot_table(df, index=["TranscriptId", "AlignmentId"], columns="classifier", values="value") return df.reset_index() @@ -372,7 +412,7 @@ def load_filter_evaluation(db_path): :param db_path: path to genome database :return: DataFrame """ - engine = create_engine('sqlite:///' + db_path) + engine = create_engine("sqlite:///" + db_path) return pd.read_sql_table(TmFilterEval.__tablename__, engine) @@ -382,8 +422,8 @@ def load_isoseq_txs(db_path): :param db_path: path to genome db :return: list of Transcript objects """ - engine = create_engine('sqlite:///' + db_path) - df = pd.read_sql_table(IsoSeqExonStructures.__tablename__, engine, index_col='index') + engine = create_engine("sqlite:///" + db_path) + df = pd.read_sql_table(IsoSeqExonStructures.__tablename__, engine, index_col="index") txs = [transcripts.Transcript(list(s)) for _, s in df.iterrows()] return txs @@ -395,10 +435,12 @@ def load_evaluation(table, session): :param session: Active sqlalchemy session. :return: DataFrame """ - assert any(table == cls for cls in (MrnaAugTmrEval, MrnaAugTmEval, MrnaTmEval, - CdsAugTmrEval, CdsAugTmEval, CdsTmEval)) - query = session.query(table.AlignmentId, table.name, func.count(table.name).label('value')). \ - group_by(table.AlignmentId, table.name) + assert any( + table == cls for cls in (MrnaAugTmrEval, MrnaAugTmEval, MrnaTmEval, CdsAugTmrEval, CdsAugTmEval, CdsTmEval) + ) + query = session.query(table.AlignmentId, table.name, func.count(table.name).label("value")).group_by( + table.AlignmentId, table.name + ) return pd.read_sql(query.statement, session.bind) @@ -409,8 +451,10 @@ def load_metrics(table, session): :param session: Active sqlalchemy session. :return: DataFrame """ - assert any(table == cls for cls in (MrnaAugTmrMetrics, MrnaAugTmMetrics, MrnaTmMetrics, - CdsAugTmrMetrics, CdsAugTmMetrics, CdsTmMetrics)) + assert any( + table == cls + for cls in (MrnaAugTmrMetrics, MrnaAugTmMetrics, MrnaTmMetrics, CdsAugTmrMetrics, CdsAugTmMetrics, CdsTmMetrics) + ) query = session.query(table) return pd.read_sql(query.statement, session.bind) @@ -422,8 +466,17 @@ def load_intron_vector(table, session): :param session: Active sqlalchemy session. 
:return: DataFrame """ - assert any(table == cls for cls in (TmIntronSupport, AugCgpIntronSupport, AugTmIntronSupport, AugPbIntronSupport, - AugTmrIntronSupport, ExRefIntronSupport)) + assert any( + table == cls + for cls in ( + TmIntronSupport, + AugCgpIntronSupport, + AugTmIntronSupport, + AugPbIntronSupport, + AugTmrIntronSupport, + ExRefIntronSupport, + ) + ) query = session.query(table) return pd.read_sql(query.statement, session.bind) @@ -444,11 +497,12 @@ def load_alternatives(table, session): # Stats functions ### + def load_luigi_stats(db_path, table): """ Loads the luigi stats from the stats db :param db_path: path to database :return: DataFrame """ - engine = create_engine('sqlite:///' + db_path) + engine = create_engine("sqlite:///" + db_path) return pd.read_sql_table(table, engine) diff --git a/tools/sqlite.py b/tools/sqlite.py index a79f22c9..e767de58 100644 --- a/tools/sqlite.py +++ b/tools/sqlite.py @@ -8,6 +8,7 @@ class ExclusiveSqlConnection(object): """Context manager for an exclusive SQL connection""" + def __init__(self, path, timeout=6000): self.path = path self.timeout = timeout diff --git a/tools/strOps.py b/tools/strOps.py index 059c2166..d8d620e7 100644 --- a/tools/strOps.py +++ b/tools/strOps.py @@ -5,14 +5,17 @@ # matches one or more whitespaces spaceRe = re.compile("[ \t\n\v\f\r]+") + def hasSpaces(s): "test if there are any whitespace characters in a string" return spaceRe.search(s) is not None + def splitAtSpaces(s): "split a string at one or more contiguous whitespaces" return spaceRe.split(s) + def dup(n, s): "make a string with n copies of s" l = [] @@ -20,17 +23,20 @@ def dup(n, s): l.append(s) return "".join(l) + def emptyOrNone(s): "is a string empty of None" return (s is None) or (len(s) == 0) + def emptyForNone(s): "return an empty string if s is None, else s" return "" if s is None else s + def noneForEmpty(s): "return non if s is a empty string, else s" return None if s == "" else s -__all__ = (hasSpaces.__name__, splitAtSpaces.__name__, dup.__name__, emptyForNone.__name__, noneForEmpty.__name__) +__all__ = (hasSpaces.__name__, splitAtSpaces.__name__, dup.__name__, emptyForNone.__name__, noneForEmpty.__name__) diff --git a/tools/tm2hints.py b/tools/tm2hints.py index 84977f0f..c8982538 100644 --- a/tools/tm2hints.py +++ b/tools/tm2hints.py @@ -10,8 +10,17 @@ """ from . import procOps -cmd = ['transMap2hints.pl', '--ep_cutoff=0', '--ep_margin=12', '--min_intron_len=50', '--start_stop_radius=5', - '--tss_tts_radius=10', '--utrend_cutoff=10', '--in=/dev/stdin', '--out=/dev/stdout'] +cmd = [ + "transMap2hints.pl", + "--ep_cutoff=0", + "--ep_margin=12", + "--min_intron_len=50", + "--start_stop_radius=5", + "--tss_tts_radius=10", + "--utrend_cutoff=10", + "--in=/dev/stdin", + "--out=/dev/stdout", +] def tm_to_hints(tm_tx, tm_psl, ref_psl): @@ -25,9 +34,9 @@ def tm_to_hints(tm_tx, tm_psl, ref_psl): :return: GFF formatted string. 
""" ref_starts = fix_ref_q_starts(ref_psl) - intron_vector = ['1' if is_fuzzy_intron(i, tm_psl, ref_starts) else '0' for i in tm_tx.intron_intervals] - tm_gp = '\t'.join(tm_tx.get_gene_pred()) - tm_rec = ''.join([tm_gp, '\t', ','.join(intron_vector), '\n']) + intron_vector = ["1" if is_fuzzy_intron(i, tm_psl, ref_starts) else "0" for i in tm_tx.intron_intervals] + tm_gp = "\t".join(tm_tx.get_gene_pred()) + tm_rec = "".join([tm_gp, "\t", ",".join(intron_vector), "\n"]) return procOps.popen_catch(cmd, tm_rec) @@ -37,9 +46,10 @@ def fix_ref_q_starts(ref_psl): :param ref_psl: PslRow object generated by GenePredToFakePsl :return: list """ - if ref_psl.strand == '-': - ref_starts = [ref_psl.q_size - (ref_psl.q_starts[i] + ref_psl.block_sizes[i]) for i in - range(len(ref_psl.q_starts))] + if ref_psl.strand == "-": + ref_starts = [ + ref_psl.q_size - (ref_psl.q_starts[i] + ref_psl.block_sizes[i]) for i in range(len(ref_psl.q_starts)) + ] else: ref_starts = ref_psl.q_starts return ref_starts diff --git a/tools/toilInterface.py b/tools/toilInterface.py index fc2b07e5..889dcc7c 100644 --- a/tools/toilInterface.py +++ b/tools/toilInterface.py @@ -5,6 +5,7 @@ import math import argparse from toil.lib.humanize import human2bytes + try: from toil.fileStores import FileID except ImportError: @@ -15,39 +16,37 @@ ### -def load_fasta_from_filestore(job, fasta_file_ids, prefix='genome', upper=False): +def load_fasta_from_filestore(job, fasta_file_ids, prefix="genome", upper=False): """ Convenience function that will load a fasta from the fileStore and return the local path to it. This works with - the pyfasta module to load all of the required files. + the pyfaidx module to load all of the required files. :param job: current job. - :param fasta_file_ids: list of fileStore file ID for the fasta, gdx, and flat file. + :param fasta_file_ids: list of fileStore file ID for the fasta and fasta index files. :param prefix: local file path prefix :param upper: force all entries to upper case - :return: open pyfasta Fasta record pointing to the file. + :return: open pyfaidx Fasta record pointing to the file. """ - fasta_local_path = '{}.fasta'.format(prefix) - fasta_file_id, gdx_file_id, flat_file_id = fasta_file_ids + fasta_local_path = "{}.fasta".format(prefix) + fasta_file_id, fai_file_id = fasta_file_ids job.fileStore.readGlobalFile(fasta_file_id, fasta_local_path) - job.fileStore.readGlobalFile(gdx_file_id, '{}.fasta.gdx'.format(prefix)) - job.fileStore.readGlobalFile(flat_file_id, '{}.fasta.flat'.format(prefix)) + job.fileStore.readGlobalFile(fai_file_id, "{}.fasta.fai".format(prefix)) return bio.get_sequence_dict(fasta_local_path, upper=upper) def write_fasta_to_filestore(toil, fasta_local_path): """ Convenience function that loads a fasta and its associated gdx/flat file into the fileStore. - Assumes that the paths are consistent with the requirements (i.e. $path.gdx and $path.flat) + Assumes that the paths are consistent with the requirements (i.e. $path.fai) :param toil: Toil context manager :param fasta_local_path: Path to local fasta to load. 
- :return: List of fileStore IDs for fasta, fasta_gdx, fasta_flat + :return: List of fileStore IDs for fasta, fasta_fai """ - fasta_file_id = FileID.forPath(toil.importFile('file:///' + fasta_local_path), fasta_local_path) - gdx_file_id = FileID.forPath(toil.importFile('file:///' + fasta_local_path + '.gdx'), fasta_local_path + '.gdx') - flat_file_id = FileID.forPath(toil.importFile('file:///' + fasta_local_path + '.flat'), fasta_local_path + '.flat') - return fasta_file_id, gdx_file_id, flat_file_id + fasta_file_id = FileID.forPath(toil.importFile("file:///" + fasta_local_path), fasta_local_path) + fai_file_id = FileID.forPath(toil.importFile("file:///" + fasta_local_path + ".fai"), fasta_local_path + ".fai") + return fasta_file_id, fai_file_id -def find_total_disk_usage(input_file_ids, buffer='2G', round='2G'): +def find_total_disk_usage(input_file_ids, buffer="2G", round="2G"): """ Takes a input_file_id namespace or dict or list and finds all members that are FileID objects, and finds their sizes. @@ -57,6 +56,7 @@ def find_total_disk_usage(input_file_ids, buffer='2G', round='2G'): :param round: amount to round up. Human readable parsed by human2bytes :return: integer """ + def roundup(x, base): return int(math.ceil(x / float(base))) * base diff --git a/tools/trace.py b/tools/trace.py index 8c1c5a4a..02958868 100644 --- a/tools/trace.py +++ b/tools/trace.py @@ -8,10 +8,12 @@ # used to detect traces that are currently open to prevent closing _activeTraceFds = set() + def getActiveTraceFds(): "return snapshot of currently active traces" return frozenset(_activeTraceFds) + class Trace(object): """Trace object, associate with an open trace file. File is flushed after each write to debug blocking""" @@ -33,7 +35,7 @@ def __init__(self, traceFile, ignoreMods=None, inclThread=False, inclPid=False, def enable(self): """enable logging on all threads.""" - assert(self.fh is not None) + assert self.fh is not None sys.settrace(self.__callback) threading.settrace(self.__callback) @@ -70,27 +72,29 @@ def log(self, *args): except IOError as ex: pass - __indentStrs = {} # cache of spaces for identation, indexed by depth + __indentStrs = {} # cache of spaces for identation, indexed by depth + def __getIndent(self): "get indentation string" if not self.callIndent: return "" i = Trace.__indentStrs.get(self.depth) if i is None: - i = Trace.__indentStrs[self.depth] = "".ljust(4*self.depth) + i = Trace.__indentStrs[self.depth] = "".ljust(4 * self.depth) return i def __logLine(self, frame, event): "log a code line" lineno = frame.f_lineno fname = frame.f_globals["__file__"] - if (fname.endswith(".pyc") or fname.endswith(".pyo")): + if fname.endswith(".pyc") or fname.endswith(".pyo"): fname = fname[:-1] name = frame.f_globals["__name__"] line = linecache.getline(fname, lineno) self.log(name, ":", lineno, self.__getIndent(), line.rstrip()) __logEvents = frozenset(["call", "line"]) + def __callback(self, frame, event, arg): "trace event callback" if frame.f_globals["__name__"] not in self.ignoreMods: @@ -100,9 +104,10 @@ def __callback(self, frame, event, arg): self.depth -= 1 if self.depth < 0: self.depth = 0 - + if event in Trace.__logEvents: self.__logLine(frame, event) return self.__callback + __all__ = (getActiveTraceFds.__name__, Trace.__name__) diff --git a/tools/transcripts.py b/tools/transcripts.py index 6aaaf85b..104aa95b 100644 --- a/tools/transcripts.py +++ b/tools/transcripts.py @@ -18,9 +18,27 @@ class Transcript(object): """ Represent a transcript record from a bed file. 
""" - __slots__ = ('name', 'strand', 'score', 'thick_start', 'rgb', 'thick_stop', 'start', 'stop', 'intron_intervals', - 'exon_intervals', 'exons', 'block_sizes', 'block_starts', 'block_count', 'chromosome', - 'interval', 'coding_interval', 'stranded') + + __slots__ = ( + "name", + "strand", + "score", + "thick_start", + "rgb", + "thick_stop", + "start", + "stop", + "intron_intervals", + "exon_intervals", + "exons", + "block_sizes", + "block_starts", + "block_count", + "chromosome", + "interval", + "coding_interval", + "stranded", + ) def __init__(self, bed_tokens, stranded=True): self.chromosome = bed_tokens[0] @@ -28,7 +46,7 @@ def __init__(self, bed_tokens, stranded=True): self.stop = int(bed_tokens[2]) self.name = bed_tokens[3] self.score = int(bed_tokens[4]) - self.strand = bed_tokens[5] if stranded else '.' + self.strand = bed_tokens[5] if stranded else "." self.thick_start = int(bed_tokens[6]) self.thick_stop = int(bed_tokens[7]) self.rgb = bed_tokens[8] @@ -46,11 +64,11 @@ def __len__(self): def __hash__(self): m = hashlib.sha256() for key in self.__slots__: - m.update(str(self.__getattribute__(key)).encode('utf-8')) + m.update(str(self.__getattribute__(key)).encode("utf-8")) return int(m.hexdigest(), 16) % 10 ** 12 def __repr__(self): - return 'Transcript({})'.format(self.get_bed()) + return "Transcript({})".format(self.get_bed()) @property def cds_size(self): @@ -146,8 +164,25 @@ def get_bed(self, rgb=None, name=None, new_start=None, new_stop=None): else: thick_start = new_start thick_stop = new_stop - return list(map(str, [self.chromosome, new_start, new_stop, name, self.score, self.strand, thick_start, - thick_stop, rgb, 1, 0, 0])) + return list( + map( + str, + [ + self.chromosome, + new_start, + new_stop, + name, + self.score, + self.strand, + thick_start, + thick_stop, + rgb, + 1, + 0, + 0, + ], + ) + ) if self.chromosome_coordinate_to_mrna(new_start) is None: new_start = find_closest([x.start for x in self.exon_intervals], new_start) @@ -174,10 +209,27 @@ def get_bed(self, rgb=None, name=None, new_start=None, new_stop=None): thick_start = 0 thick_stop = 0 block_count = len(exon_intervals) - block_sizes = ','.join(map(str, [len(x) for x in exon_intervals])) - block_starts = ','.join(map(str, [x.start - new_start for x in exon_intervals])) - return list(map(str, [self.chromosome, new_start, new_stop, name, self.score, self.strand, thick_start, thick_stop, - rgb, block_count, block_sizes, block_starts])) + block_sizes = ",".join(map(str, [len(x) for x in exon_intervals])) + block_starts = ",".join(map(str, [x.start - new_start for x in exon_intervals])) + return list( + map( + str, + [ + self.chromosome, + new_start, + new_stop, + name, + self.score, + self.strand, + thick_start, + thick_stop, + rgb, + block_count, + block_sizes, + block_starts, + ], + ) + ) def chromosome_coordinate_to_mrna(self, coord): if not (self.start <= coord < self.stop): @@ -186,10 +238,10 @@ def chromosome_coordinate_to_mrna(self, coord): i = ChromosomeInterval(self.chromosome, coord, coord + 1, self.strand) if not any(i.overlap(x) for x in self.exon_intervals): return None - exon_intervals = self.exon_intervals if self.strand == '+' else reversed(self.exon_intervals) + exon_intervals = self.exon_intervals if self.strand == "+" else reversed(self.exon_intervals) for e in exon_intervals: if i.overlap(e): - if self.strand == '+': + if self.strand == "+": p += coord - e.start else: p += e.stop - coord - 1 @@ -209,17 +261,17 @@ def mrna_coordinate_to_chromosome(self, coord): if not (0 <= coord < 
len(self)): return None p = 0 - exon_intervals = self.exon_intervals if self.strand == '+' else reversed(self.exon_intervals) + exon_intervals = self.exon_intervals if self.strand == "+" else reversed(self.exon_intervals) for e in exon_intervals: if p + len(e) > coord: - if self.strand == '+': + if self.strand == "+": return e.start + (coord - p) else: return e.stop - (coord - p) - 1 p += len(e) def mrna_coordinate_to_cds(self, coord): - if self.strand == '+': + if self.strand == "+": cds_start = self.chromosome_coordinate_to_mrna(self.thick_start) else: cds_start = self.chromosome_coordinate_to_mrna(self.thick_stop - 1) @@ -231,7 +283,7 @@ def mrna_coordinate_to_cds(self, coord): def cds_coordinate_to_mrna(self, coord): if not (0 <= coord < self.cds_size): return None - if self.strand == '+': + if self.strand == "+": cds_start = self.chromosome_coordinate_to_mrna(self.thick_start) else: cds_start = self.chromosome_coordinate_to_mrna(self.thick_stop - 1) @@ -240,7 +292,7 @@ def cds_coordinate_to_mrna(self, coord): def cds_coordinate_to_chromosome(self, coord): if not (0 <= coord < self.cds_size): return None - if self.strand == '+': + if self.strand == "+": cds_start = self.chromosome_coordinate_to_mrna(self.thick_start) else: cds_start = self.chromosome_coordinate_to_mrna(self.thick_stop - 1) @@ -257,11 +309,11 @@ def get_mrna(self, seq_dict): assert self.stop <= len(sequence) + 1 s = [] for e in self.exon_intervals: - s.append(sequence[e.start:e.stop]) - if self.strand == '+': - mrna = ''.join(s) + s.append(sequence[e.start : e.stop]) + if self.strand == "+": + mrna = "".join(s) else: - mrna = reverse_complement(''.join(s)) + mrna = reverse_complement("".join(s)) return str(mrna) def get_sequence(self, seq_dict): @@ -269,7 +321,7 @@ def get_sequence(self, seq_dict): Returns the entire chromosome sequence for this transcript, (+) strand orientation. 
""" sequence = seq_dict[self.chromosome] - return sequence[self.start:self.stop] + return sequence[self.start : self.stop] def get_cds(self, seq_dict): """ @@ -282,25 +334,25 @@ def get_cds(self, seq_dict): assert self.stop <= len(sequence) + 1 # make sure this isn't a non-coding gene if self.thick_start == self.thick_stop == 0: - return '' + return "" s = [] for e in self.exon_intervals: if self.thick_start < e.start and e.stop < self.thick_stop: # squarely in the CDS - s.append(sequence[e.start:e.stop]) + s.append(sequence[e.start : e.stop]) elif e.start <= self.thick_start < e.stop < self.thick_stop: # thickStart marks the start of the CDS - s.append(sequence[self.thick_start:e.stop]) + s.append(sequence[self.thick_start : e.stop]) elif e.start <= self.thick_start and self.thick_stop <= e.stop: # thickStart and thickStop mark the whole CDS - s.append(sequence[self.thick_start: self.thick_stop]) + s.append(sequence[self.thick_start : self.thick_stop]) elif self.thick_start < e.start < self.thick_stop <= e.stop: # thickStop marks the end of the CDS - s.append(sequence[e.start:self.thick_stop]) - if self.strand == '-': - cds = reverse_complement(''.join(s)) + s.append(sequence[e.start : self.thick_stop]) + if self.strand == "-": + cds = reverse_complement("".join(s)) else: - cds = ''.join(s) + cds = "".join(s) return str(cds) def get_protein_sequence(self, seq_dict): @@ -310,7 +362,7 @@ def get_protein_sequence(self, seq_dict): """ cds = self.get_cds(seq_dict) if len(cds) < 3: - return '' + return "" return translate_sequence(self.get_cds(seq_dict).upper()) def get_start_intervals(self): @@ -349,7 +401,7 @@ def get_5p_interval(self): """ Returns a ChromosomeInterval representing the 5' end """ - if self.strand == '+': + if self.strand == "+": return ChromosomeInterval(self.chromosome, self.start, self.start + 1, self.strand) else: return ChromosomeInterval(self.chromosome, self.stop - 1, self.stop, self.strand) @@ -358,7 +410,7 @@ def get_3p_interval(self): """ Returns a ChromosomeInterval representing the 3' end """ - if self.strand == '-': + if self.strand == "-": return ChromosomeInterval(self.chromosome, self.start, self.start + 1, self.strand) else: return ChromosomeInterval(self.chromosome, self.stop - 1, self.stop, self.strand) @@ -369,13 +421,14 @@ class GenePredTranscript(Transcript): Subclasses Transcript to represent genePred entries. genePred entries have the same information, except that they also tell you whether the CDS is complete on both ends, and the frame information of each exon. """ + # adding slots for new fields - __slots__ = ('cds_start_stat', 'cds_end_stat', 'exon_frames', 'name2', 'score') + __slots__ = ("cds_start_stat", "cds_end_stat", "exon_frames", "name2", "score") def __init__(self, gene_pred_tokens, stranded=True): name = gene_pred_tokens[0] chrom = gene_pred_tokens[1] - strand = gene_pred_tokens[2] if stranded is True else '.' + strand = gene_pred_tokens[2] if stranded is True else "." 
start = gene_pred_tokens[3] stop = gene_pred_tokens[4] thick_start = gene_pred_tokens[5] @@ -387,25 +440,37 @@ def __init__(self, gene_pred_tokens, stranded=True): self.name2 = gene_pred_tokens[11] self.cds_start_stat = gene_pred_tokens[12] self.cds_end_stat = gene_pred_tokens[13] - self.exon_frames = [int(x) for x in gene_pred_tokens[14].split(',') if x != ''] + self.exon_frames = [int(x) for x in gene_pred_tokens[14].split(",") if x != ""] # convert genePred format coordinates to BED-like coordinates to make intervals - block_starts = [int(x) for x in exon_starts.split(',') if x != ''] - block_ends = [int(x) for x in exon_ends.split(',') if x != ''] + block_starts = [int(x) for x in exon_starts.split(",") if x != ""] + block_ends = [int(x) for x in exon_ends.split(",") if x != ""] block_sizes = ",".join(map(str, [e - s for e, s in zip(block_ends, block_starts)])) block_starts = ",".join(map(str, [x - int(start) for x in block_starts])) - bed_tokens = [chrom, start, stop, name, self.score, strand, thick_start, thick_stop, '0', block_count, - block_sizes, block_starts] + bed_tokens = [ + chrom, + start, + stop, + name, + self.score, + strand, + thick_start, + thick_stop, + "0", + block_count, + block_sizes, + block_starts, + ] super(GenePredTranscript, self).__init__(bed_tokens, stranded=stranded) def __repr__(self): - return 'GenePredTranscript({})'.format(self.get_gene_pred()) + return "GenePredTranscript({})".format(self.get_gene_pred()) @property def offset(self): frames = [x for x in self.exon_frames if x != -1] if len(frames) == 0: return 0 - if self.strand == '+': + if self.strand == "+": offset = 3 - frames[0] else: offset = 3 - frames[-1] @@ -422,12 +487,12 @@ def _get_exon_intervals(self): for block_size, block_start, frame in zip(*(self.block_sizes, self.block_starts, self.exon_frames)): start = self.start + block_start stop = self.start + block_start + block_size - exon_intervals.append(ChromosomeInterval(self.chromosome, start, stop, self.strand, data={'frame': frame})) + exon_intervals.append(ChromosomeInterval(self.chromosome, start, stop, self.strand, data={"frame": frame})) return exon_intervals def _make_exon_idx_iter(self): """make iterator exon indexes in order of transcriptions""" - if self.strand == '+': + if self.strand == "+": return range(0, len(self.exon_intervals)) else: return range(len(self.exon_intervals) - 1, -1, -1) @@ -462,7 +527,7 @@ def _adjust_cds_start(self, cds_interval, expected_frame, frame): frame = self._frame_incr(frame) amt += 1 # min/max here avoids going negative, making a zero-length block - if cds_interval.strand == '+': + if cds_interval.strand == "+": start = min(cds_interval.start + amt, cds_interval.stop) stop = cds_interval.stop gap_start = cds_interval.start @@ -473,7 +538,7 @@ def _adjust_cds_start(self, cds_interval, expected_frame, frame): gap_start = cds_interval.stop - amt gap_stop = cds_interval.stop cds_interval = ChromosomeInterval(cds_interval.chromosome, start, stop, cds_interval.strand) - gap_interval = ChromosomeInterval(cds_interval.chromosome, gap_start, gap_stop, cds_interval.strand, 'gap') + gap_interval = ChromosomeInterval(cds_interval.chromosome, gap_start, gap_stop, cds_interval.strand, "gap") return cds_interval, gap_interval def _get_codon_intervals(self): @@ -502,10 +567,10 @@ def get_cds(self, seq_dict, ignore_frameshift=False): else: codon_regions = sorted(codon_regions, key=lambda x: x.start) - if self.strand == '+': - cds = ''.join([str(x.get_sequence(seq_dict)) for x in codon_regions]) + if self.strand == 
"+": + cds = "".join([str(x.get_sequence(seq_dict)) for x in codon_regions]) else: - cds = ''.join([str(x.get_sequence(seq_dict)) for x in codon_regions[::-1]]) + cds = "".join([str(x.get_sequence(seq_dict)) for x in codon_regions[::-1]]) return cds def codon_iterator(self, seq_dict): @@ -524,12 +589,12 @@ def codon_iterator(self, seq_dict): positions.append(p) cds_pos += 1 - if self.strand == '-': + if self.strand == "-": positions = positions[::-1] for i in range(0, cds_pos - cds_pos % 3, 3): - codon = cds[i:i + 3] - if self.strand == '+': + codon = cds[i : i + 3] + if self.strand == "+": if positions[i + 2] + 1 != self.thick_stop: yield positions[i], positions[i + 2] + 1, codon else: @@ -546,7 +611,7 @@ def get_protein_sequence(self, seq_dict): try: return translate_sequence(cds.upper()) except AssertionError: - raise RuntimeError('Failed to translate transcript {} with sequence {}'.format(self.name, cds)) + raise RuntimeError("Failed to translate transcript {} with sequence {}".format(self.name, cds)) def get_gene_pred(self, name=None, new_start=None, new_stop=None, name2=None, score=None): """ @@ -564,12 +629,31 @@ def get_gene_pred(self, name=None, new_start=None, new_stop=None, name2=None, sc # if no resizing, just return what we have if new_start is None and new_stop is None: - exon_starts = ','.join(map(str, [exon.start for exon in self.exon_intervals])) - exon_ends = ','.join(map(str, [exon.stop for exon in self.exon_intervals])) - exon_frames = ','.join(map(str, self.exon_frames)) - return list(map(str, [name, self.chromosome, self.strand, self.start, self.stop, self.thick_start, - self.thick_stop, len(self.exon_intervals), exon_starts, exon_ends, score, name2, - self.cds_start_stat, self.cds_end_stat, exon_frames])) + exon_starts = ",".join(map(str, [exon.start for exon in self.exon_intervals])) + exon_ends = ",".join(map(str, [exon.stop for exon in self.exon_intervals])) + exon_frames = ",".join(map(str, self.exon_frames)) + return list( + map( + str, + [ + name, + self.chromosome, + self.strand, + self.start, + self.stop, + self.thick_start, + self.thick_stop, + len(self.exon_intervals), + exon_starts, + exon_ends, + score, + name2, + self.cds_start_stat, + self.cds_end_stat, + exon_frames, + ], + ) + ) if new_start is not None and new_stop is not None: assert new_start <= new_stop if new_start is not None: @@ -585,8 +669,8 @@ def get_gene_pred(self, name=None, new_start=None, new_stop=None, name2=None, sc new_interval = ChromosomeInterval(self.chromosome, new_start, new_stop, self.strand) exon_intervals = [] exon_frames = [] - exon_iter = self.exon_intervals if self.strand == '+' else self.exon_intervals[::-1] - frame_iter = self.exon_frames if self.strand == '+' else reversed(self.exon_frames) + exon_iter = self.exon_intervals if self.strand == "+" else self.exon_intervals[::-1] + frame_iter = self.exon_frames if self.strand == "+" else reversed(self.exon_frames) # attempt to find the first frame. 
If there is none, then we have a non-coding transcript and this is easy try: @@ -614,7 +698,7 @@ def get_gene_pred(self, name=None, new_start=None, new_stop=None, name2=None, sc cds_counter += len(coding_exon) # flip back around negative strand transcripts - if self.strand == '-': + if self.strand == "-": exon_intervals = exon_intervals[::-1] exon_frames = exon_frames[::-1] @@ -626,14 +710,34 @@ def get_gene_pred(self, name=None, new_start=None, new_stop=None, name2=None, sc thick_start = max(self.thick_start, new_start) thick_stop = min(self.thick_stop, new_stop) - cds_start_stat = 'unk' if thick_start != self.thick_start else self.cds_start_stat - cds_end_stat = 'unk' if thick_stop != self.thick_stop else self.cds_end_stat + cds_start_stat = "unk" if thick_start != self.thick_start else self.cds_start_stat + cds_end_stat = "unk" if thick_stop != self.thick_stop else self.cds_end_stat exon_count = len(exon_intervals) - exon_starts = ','.join(map(str, [exon.start for exon in exon_intervals])) - exon_ends = ','.join(map(str, [exon.stop for exon in exon_intervals])) - exon_frames = ','.join(map(str, exon_frames)) - return list(map(str, [name, self.chromosome, self.strand, new_start, new_stop, thick_start, thick_stop, exon_count, - exon_starts, exon_ends, score, name2, cds_start_stat, cds_end_stat, exon_frames])) + exon_starts = ",".join(map(str, [exon.start for exon in exon_intervals])) + exon_ends = ",".join(map(str, [exon.stop for exon in exon_intervals])) + exon_frames = ",".join(map(str, exon_frames)) + return list( + map( + str, + [ + name, + self.chromosome, + self.strand, + new_start, + new_stop, + thick_start, + thick_stop, + exon_count, + exon_starts, + exon_ends, + score, + name2, + cds_start_stat, + cds_end_stat, + exon_frames, + ], + ) + ) def get_gene_pred_dict(gp_file, stranded=True): @@ -654,9 +758,9 @@ def gene_pred_iterator(gp_file, stranded=True): :return: tuples of (name, GenePredTranscript) """ for i, x in enumerate(open(gp_file)): - tokens = x.rstrip().split('\t') + tokens = x.rstrip().split("\t") if len(tokens) != 15: - raise RuntimeError('GenePred line {} had {} tokens, not 15. Record: {}'.format(i + 1, len(tokens), tokens)) + raise RuntimeError("GenePred line {} had {} tokens, not 15. Record: {}".format(i + 1, len(tokens), tokens)) t = GenePredTranscript(tokens, stranded=stranded) yield t @@ -681,7 +785,7 @@ def transcript_iterator(bed_file, stranded=True): with open(bed_file) as inf: for tokens in iter_lines(inf): if len(tokens) != 12: - raise RuntimeError('BED line had {} tokens, not 12. Record: {}'.format(len(tokens), tokens)) + raise RuntimeError("BED line had {} tokens, not 12. 
Record: {}".format(len(tokens), tokens)) t = Transcript(tokens, stranded=stranded) yield t @@ -692,22 +796,22 @@ def load_gps(gp_list): for gp in gp_list: for t in gene_pred_iterator(gp): if t.name in r: - raise RuntimeError('Attempted to add duplicate GenePredTranscript object with name {}'.format(t.name)) + raise RuntimeError("Attempted to add duplicate GenePredTranscript object with name {}".format(t.name)) r[t.name] = t return r def convert_frame(exon_frame): """converts genePred-style exonFrame to GFF-style phase""" - mapping = {0: 0, 1: 2, 2: 1, -1: '.'} + mapping = {0: 0, 1: 2, 2: 1, -1: "."} return mapping[exon_frame] def create_bed_info_gp(gp): """Creates the block_starts, block_sizes and exon_frames fields from a GenePredTranscript object""" - block_starts = ','.join(map(str, gp.block_starts)) - block_sizes = ','.join(map(str, gp.block_sizes)) - exon_frames = ','.join(map(str, gp.exon_frames)) + block_starts = ",".join(map(str, gp.block_starts)) + block_sizes = ",".join(map(str, gp.block_sizes)) + exon_frames = ",".join(map(str, gp.exon_frames)) return block_starts, block_sizes, exon_frames @@ -726,11 +830,25 @@ def intervals_to_bed(intervals, name=None, score=0, rgb=0, thick_start=0, thick_ intervals = sorted(intervals) start = intervals[0].start stop = intervals[-1].stop - block_sizes = ','.join(map(str, [len(i) for i in intervals])) - block_starts = ','.join(map(str, [i.start - start for i in intervals])) + block_sizes = ",".join(map(str, [len(i) for i in intervals])) + block_starts = ",".join(map(str, [i.start - start for i in intervals])) i = intervals[0] - return Transcript([i.chromosome, start, stop, name, score, i.strand, thick_start, thick_stop, rgb, - len(intervals), block_sizes, block_starts]) + return Transcript( + [ + i.chromosome, + start, + stop, + name, + score, + i.strand, + thick_start, + thick_stop, + rgb, + len(intervals), + block_sizes, + block_starts, + ] + ) def cluster_txs(txs): @@ -828,7 +946,7 @@ def has_start_codon(fasta, tx): s = tx.get_protein_sequence(fasta) if len(s) == 0: return False - return s[0] == 'M' + return s[0] == "M" def has_stop_codon(fasta, tx): @@ -843,4 +961,4 @@ def has_stop_codon(fasta, tx): s = tx.get_protein_sequence(fasta) if len(s) == 0: return False - return s[-1] == '*' + return s[-1] == "*"