diff --git a/.travis.yml b/.travis.yml index 21bedea..32257a9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -53,3 +53,10 @@ jobs: luigi --module cat RunCat --hal=test_data/vertebrates.hal --target-genomes='("hg38", "galGal4")' --ref-genome=mm10 --workers=2 --config=test_data/test.config --work-dir test_install --out-dir test_install --local-scheduler --augustus --augustus-cgp --augustus-pb --assembly-hub --log-level DEBUG + - stage: test_standalone + script: + - > + docker run -v $PWD/test_data:/test_data/ -i quay.io/ucsc_cgl/cat:latest + luigi --module cat RunCat --hal=test_data/vertebrates.hal --target-genomes='("hg38", "galGal4")' --ref-genome=mm10 + --workers=2 --config=test_data/test.config --work-dir test_install --out-dir test_install --local-scheduler + --augustus --augustus-cgp --augustus-pb --assembly-hub --log-level DEBUG --binary-mode local diff --git a/Dockerfile b/Dockerfile index dbc3e9e..642eeaa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,90 +1,143 @@ -FROM ubuntu:18.04 AS builder -ARG AUGUSTUS_COMMIT=36ae43d -RUN apt-get update -RUN apt-get install -y build-essential libssl-dev libncurses5-dev libcurl4-openssl-dev liblzma-dev libbz2-dev \ -libboost-all-dev sqlite3 libsqlite3-0 libsqlite3-dev libgsl0-dev lp-solve liblpsolve55-dev libbamtools-dev wget git - -# htslib -RUN git clone --recursive git://github.com/samtools/htslib.git -RUN cd htslib && make install - -# bcftools -RUN git clone git://github.com/samtools/bcftools.git -RUN cd bcftools && make - -# samtools -RUN git clone git://github.com/samtools/samtools -RUN cd samtools && make && make install - -# MOVE Directories INTO $HOME/tool -RUN mkdir /root/tools -RUN mv samtools /root/tools -RUN mv htslib /root/tools -RUN mv bcftools /root/tools - -# Augustus -RUN git clone https://github.com/Gaius-Augustus/Augustus augustus -RUN cd augustus && git reset --hard ${AUGUSTUS_COMMIT} -RUN echo 'COMPGENEPRED = true' >> augustus/common.mk -RUN echo 'SQLITE = true' >> augustus/common.mk -RUN cd augustus/auxprogs/homGeneMapping/src && sed 's/# BOOST = true/BOOST = true/g' -i Makefile && sed 's/# SQLITE = true/SQLITE = true/g' -i Makefile -RUN cd augustus && make - -# HDF5 -RUN wget -q http://www.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.1/src/hdf5-1.10.1.tar.gz -RUN tar xzf hdf5-1.10.1.tar.gz -RUN cd hdf5-1.10.1 && ./configure --enable-cxx --prefix=/usr -RUN cd hdf5-1.10.1 && make && make install - -# sonLib -RUN git clone git://github.com/ComparativeGenomicsToolkit/sonLib.git - -# HAL -RUN git clone git://github.com/ComparativeGenomicsToolkit/hal.git -RUN cd sonLib && make -RUN cd hal && make - -# LibBigWig -RUN git clone https://github.com/dpryan79/libBigWig.git -RUN cd libBigWig && make install - -# WiggleTools -RUN git clone https://github.com/dahlo/WiggleTools -# Their makefile now hardcodes /bin/cc as compiler :( -RUN ln -s /usr/bin/cc /bin/cc -RUN cd WiggleTools && make - -# sambamba -RUN wget -q https://github.com/biod/sambamba/releases/download/v0.6.7/sambamba_v0.6.7_linux.tar.bz2 -RUN tar xvjf sambamba_v0.6.7_linux.tar.bz2 - -# Slimmer final Docker image - -FROM ubuntu:18.04 -RUN apt-get update -RUN apt-get install -y wget bedtools bamtools samtools sqlite3 libgsl0-dev libcolamd2 software-properties-common libcurl4-openssl-dev exonerate -RUN add-apt-repository -y ppa:deadsnakes/ppa -RUN apt-get install -y python3.7 python3-pip -# Kent -RUN for i in wigToBigWig faToTwoBit gff3ToGenePred genePredToBed genePredToFakePsl bamToPsl transMapPslToGenePred \ -pslPosTarget axtChain chainMergeSort pslMap pslRecalcMatch pslMapPostChain gtfToGenePred genePredToGtf bedtools \ -pslCheck pslCDnaFilter clusterGenes pslToBigPsl bedSort bedToBigBed; do \ -wget -q http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/$i -O /bin/$i ; chmod +x /bin/$i ; done - -COPY --from=builder /hal/bin/* /bin/ -COPY --from=builder /sambamba /bin/ -COPY --from=builder /augustus/bin/* /bin/ -COPY --from=builder /augustus/scripts/* /bin/ -COPY --from=builder /WiggleTools/bin/* /bin/ - -RUN mkdir -p /augustus -COPY --from=builder /augustus/config /augustus/config - -# Python deps -RUN pip3 install bd2k-python-lib toil[all]==5.0 pyfasta numpy matplotlib pandas==1.0 - -# make Python 3 primary python -RUN rm /usr/bin/python -RUN ln -s /usr/bin/python3.7 /usr/bin/python - -ENV AUGUSTUS_CONFIG_PATH=/augustus/config/ +# install python dependencies +FROM ubuntu:20.04 AS cat-python + +RUN apt update && apt install -y --no-install-recommends \ + gcc \ + python3-dev \ + python3-pip \ + wget + +COPY ./setup.py / + +RUN wget https://ont-research.s3-eu-west-1.amazonaws.com/parasail-1.1.17-py2.py3-none-manylinux1_x86_64.whl +RUN pip3 install parasail-1.1.17-py2.py3-none-manylinux1_x86_64.whl + +RUN mkdir cat tools \ + && python3 setup.py egg_info \ + && pip3 install -r cat.egg-info/requires.txt + +COPY ./ /cat + +RUN cd /cat \ + && sed -i'' "s#'augustus_cfgs/#'/opt/augustus/config/extrinsic/#g" cat/__init__.py \ + && python3 setup.py install + +######################################## + +FROM curlimages/curl:7.70.0 AS cat-binaries + +USER root + +WORKDIR /binaries + +# Need >= v395 for clusterGenes -minOverlappingBases option +RUN curl -LO http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/{axtChain,bamToPsl,bedSort,bedToBigBed,chainMergeSort,clusterGenes,faToTwoBit,genePredToBed,genePredToFakePsl,genePredToGtf,gff3ToGenePred,gtfToGenePred,pslCDnaFilter,pslMap,pslMapPostChain,pslPosTarget,pslRecalcMatch,pslToBigPsl,transMapPslToGenePred,wigToBigWig} \ + && chmod a+x /binaries/* + +RUN set -o pipefail && curl -L https://github.com/biod/sambamba/releases/download/v0.7.1/sambamba-0.7.1-linux-static.gz \ + | gzip -d > /binaries/sambamba && chmod a+x /binaries/sambamba + +# CAT v2.1.0 needs more recent hal2fasta supporting the --onlySequenceNames option +RUN set -o pipefail && curl -L https://github.com/ComparativeGenomicsToolkit/cactus/releases/download/v1.0.0/cactus-bin-v1.0.0.tar.gz \ + | tar -C /tmp -xzf - \ + cactus-bin-v1.0.0/bin/hal2maf \ + cactus-bin-v1.0.0/bin/hal2fasta \ + cactus-bin-v1.0.0/bin/halLiftover \ + cactus-bin-v1.0.0/bin/halStats \ + && mv /tmp/cactus-bin-v1.0.0/bin/* /binaries && chmod a+x /binaries/hal* + +######################################## + +FROM ubuntu:20.04 AS cat-augustus + +# Install required packages +RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \ + autoconf \ + build-essential \ + ca-certificates \ + curl \ + libbamtools-dev \ + libboost-iostreams-dev \ + libgsl-dev \ + libhts-dev \ + liblpsolve55-dev \ + libsqlite3-dev \ + libsuitesparse-dev \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* + +# 2020-07-03 snapshot + bam2wig build simplification +# https://github.com/Gaius-Augustus/Augustus/pull/153 +RUN mkdir /src && cd /src \ + && curl -L https://github.com/harvardinformatics/Augustus/archive/08b7d320cbee586ebfbee410aeae90d81ce03f1e.tar.gz \ + | tar --strip-components=1 -xzf - \ + && make INSTALLDIR=/opt/augustus MYSQL=false HTSLIBS='-lhts' \ + && make -j install \ + && mv /opt/augustus-* /opt/augustus \ + && rm -rf /src + +######################################## +# https://github.com/Ensembl/WiggleTools/blob/597d84/Dockerfile + +FROM ubuntu:20.04 AS cat-wiggletools + +RUN apt update && apt install -y --no-install-recommends \ + ca-certificates \ + libgsl-dev \ + libhts-dev \ + libbigwig-dev \ + libcurl4-openssl-dev \ + gcc \ + python \ + make + +WORKDIR /build + +# 2020-06-02 snapshot +ADD https://github.com/Ensembl/WiggleTools/archive/c1daac89e3775bc8f96376fc1ed7f7e645ce168c.tar.gz wiggletools.tar.gz + +RUN tar --strip-components=1 -xzf wiggletools.tar.gz \ + && make LIBS='-lwiggletools -lBigWig -lcurl -lhts -lgsl -lgslcblas -lz -lpthread -lm -llzma' + +######################################## + +FROM ubuntu:20.04 AS final + +RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \ + bamtools \ + bedtools \ + exonerate \ + libbamtools2.5.1 \ + libbigwig0 \ + libboost-iostreams1.71.0 \ + libcolamd2 \ + libcurl4 \ + libgsl23 \ + libhts3 \ + libsqlite3-0 \ + libsz2 \ + libsuitesparseconfig5 \ + python3-pip \ + samtools \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=cat-python /usr/local /usr/local +COPY --from=cat-binaries /binaries /usr/local/bin +COPY --from=cat-wiggletools /build/bin/wiggletools /usr/local/bin +COPY --from=cat-augustus /opt/augustus /opt/augustus + +# (2020-06-04) augustus_cfgs/log_reg_parameters_default.cfg identical to Augustus config/cgp/log_reg_parameters_default.cfg +COPY ./augustus_cfgs/*extrinsic*.cfg /opt/augustus/config/extrinsic/ + +# luigi looks for luigi.cfg in /etc/luigi/luigi.cfg by default +COPY ./logging.cfg ./luigi.cfg /etc/luigi/ + +# but need to tell luigi to look for logging.cfg at /etc/luigi/logging.cfg +RUN sed -i'' '/logging_conf_file/s#.*#logging_conf_file=/etc/luigi/logging.cfg#' /etc/luigi/luigi.cfg + +# also modify the Toil default resource rules in order to allow this container to be used for unit tests +RUN sed -i "s/maxDisk = self.physicalDisk/pass/g" /usr/local/lib/python3.8/dist-packages/toil/batchSystems/singleMachine.py && \ + sed -i "s/maxCores = self.numCores/maxCores = 8/g" /usr/local/lib/python3.8/dist-packages/toil/batchSystems/singleMachine.py && \ + sed -i "s/maxMemory = self.physicalMemory/pass/g" /usr/local/lib/python3.8/dist-packages/toil/batchSystems/singleMachine.py + +ENV PATH=/opt/augustus/bin:/opt/augustus/scripts:${PATH} diff --git a/Dockerfile.complete b/Dockerfile.complete deleted file mode 100644 index 6c50cbb..0000000 --- a/Dockerfile.complete +++ /dev/null @@ -1,165 +0,0 @@ -# install python dependencies -FROM ubuntu:20.04 AS cat-python - -RUN apt update && apt install -y --no-install-recommends \ - gcc \ - python3-dev \ - python3-pip \ - wget - -COPY ./setup.py / - -RUN wget https://ont-research.s3-eu-west-1.amazonaws.com/parasail-1.1.17-py2.py3-none-manylinux1_x86_64.whl -RUN pip3 install parasail-1.1.17-py2.py3-none-manylinux1_x86_64.whl - -RUN mkdir cat tools \ - && python3 setup.py egg_info \ - && pip3 install -r cat.egg-info/requires.txt - -COPY ./ /cat - -RUN cd /cat \ - && sed -i'' "s#'augustus_cfgs/#'/opt/augustus/config/extrinsic/#g" cat/__init__.py \ - && python3 setup.py install - -######################################## - -FROM curlimages/curl:7.70.0 AS cat-binaries - -USER root - -WORKDIR /binaries - -# Need >= v395 for clusterGenes -minOverlappingBases option -RUN curl -LO http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/{axtChain,bamToPsl,bedSort,bedToBigBed,chainMergeSort,clusterGenes,faToTwoBit,genePredToBed,genePredToFakePsl,genePredToGtf,gff3ToGenePred,gtfToGenePred,pslCDnaFilter,pslCheck,pslMap,pslMapPostChain,pslPosTarget,pslRecalcMatch,pslToBigPsl,transMapPslToGenePred,wigToBigWig} \ - && chmod a+x /binaries/* - -RUN set -o pipefail && curl -L https://github.com/biod/sambamba/releases/download/v0.7.1/sambamba-0.7.1-linux-static.gz \ - | gzip -d > /binaries/sambamba && chmod a+x /binaries/sambamba - -# CAT v2.1.0 needs more recent hal2fasta supporting the --onlySequenceNames option -#RUN set -o pipefail && curl -L https://github.com/ComparativeGenomicsToolkit/cactus/releases/download/v1.0.0/cactus-bin-v1.0.0.tar.gz \ -# | tar -C /tmp -xzf - \ -# cactus-bin-v1.0.0/bin/hal2maf \ -# cactus-bin-v1.0.0/bin/hal2fasta \ -# cactus-bin-v1.0.0/bin/halLiftover \ -# cactus-bin-v1.0.0/bin/halStats \ -# && mv /tmp/cactus-bin-v1.0.0/bin/* /binaries && chmod a+x /binaries/hal* - -######################################## - -FROM ubuntu:20.04 AS cat-augustus - -# Install required packages -RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \ - autoconf \ - build-essential \ - ca-certificates \ - curl \ - libbamtools-dev \ - libboost-iostreams-dev \ - libgsl-dev \ - libhts-dev \ - liblpsolve55-dev \ - libsqlite3-dev \ - libsuitesparse-dev \ - zlib1g-dev \ - && rm -rf /var/lib/apt/lists/* - -# 2020-07-03 snapshot + bam2wig build simplification -# https://github.com/Gaius-Augustus/Augustus/pull/153 -RUN mkdir /src && cd /src \ - && curl -L https://github.com/harvardinformatics/Augustus/archive/08b7d320cbee586ebfbee410aeae90d81ce03f1e.tar.gz \ - | tar --strip-components=1 -xzf - \ - && make INSTALLDIR=/opt/augustus MYSQL=false HTSLIBS='-lhts' \ - && make -j install \ - && mv /opt/augustus-* /opt/augustus \ - && rm -rf /src - -######################################## -# https://github.com/Ensembl/WiggleTools/blob/597d84/Dockerfile - -FROM ubuntu:20.04 AS cat-wiggletools - -RUN apt update && apt install -y --no-install-recommends \ - ca-certificates \ - libgsl-dev \ - libhts-dev \ - libbigwig-dev \ - libcurl4-openssl-dev \ - gcc \ - python \ - make - -WORKDIR /build - -# 2020-06-02 snapshot -ADD https://github.com/Ensembl/WiggleTools/archive/c1daac89e3775bc8f96376fc1ed7f7e645ce168c.tar.gz wiggletools.tar.gz - -RUN tar --strip-components=1 -xzf wiggletools.tar.gz \ - && make LIBS='-lwiggletools -lBigWig -lcurl -lhts -lgsl -lgslcblas -lz -lpthread -lm -llzma' - -######################################## - -FROM ubuntu:20.04 AS cat-hal - -RUN apt update && apt install -y \ - libhdf5-dev \ - g++ \ - make \ - zlib1g-dev - -WORKDIR /sonLib -# 2020-06-16 snapshot -ADD https://github.com/ComparativeGenomicsToolkit/sonLib/archive/ea0b939828ba24d998a7c1aa407ff5a016912f56.tar.gz sonLib.tar.gz -RUN tar --strip-components=1 -xzf sonLib.tar.gz -RUN make -j - -WORKDIR /hal -# 2020-07-08 snapshot -ADD https://github.com/ComparativeGenomicsToolkit/hal/archive/cb7c044731271ec41640db71f5694af53a0ead57.tar.gz hal.tar.gz -RUN tar --strip-components=1 -xzf hal.tar.gz \ - && make -j \ - && mkdir /binaries \ - && mv bin/hal2fasta bin/hal2maf bin/halStats bin/halLiftover /binaries \ - && strip /binaries/* \ - && rm -rf /hal - -######################################## - -FROM ubuntu:20.04 AS final - -RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \ - bamtools \ - bedtools \ - exonerate \ - libbamtools2.5.1 \ - libbigwig0 \ - libboost-iostreams1.71.0 \ - libcolamd2 \ - libcurl4 \ - libgsl23 \ - libhts3 \ - libsqlite3-0 \ - libsz2 \ - libsuitesparseconfig5 \ - python3-pip \ - samtools \ - && rm -rf /var/lib/apt/lists/* - -COPY --from=cat-python /usr/local /usr/local -COPY --from=cat-binaries /binaries /usr/local/bin -COPY --from=cat-hal /binaries /usr/local/bin -COPY --from=cat-wiggletools /build/bin/wiggletools /usr/local/bin -COPY --from=cat-augustus /opt/augustus /opt/augustus - -# (2020-06-04) augustus_cfgs/log_reg_parameters_default.cfg identical to Augustus config/cgp/log_reg_parameters_default.cfg -COPY ./augustus_cfgs/*extrinsic*.cfg /opt/augustus/config/extrinsic/ - -# luigi looks for luigi.cfg in /etc/luigi/luigi.cfg by default -COPY ./logging.cfg ./luigi.cfg /etc/luigi/ - -# but need to tell luigi to look for logging.cfg at /etc/luigi/logging.cfg -RUN sed -i'' '/logging_conf_file/s#.*#logging_conf_file=/etc/luigi/logging.cfg#' /etc/luigi/luigi.cfg - -ENV PATH=/opt/augustus/bin:/opt/augustus/scripts:${PATH} diff --git a/tools/bio.py b/tools/bio.py index 972e552..6fd7c6b 100644 --- a/tools/bio.py +++ b/tools/bio.py @@ -24,9 +24,9 @@ def write_fasta(path_or_handle, name, seq, chunk_size=100, validate=None): fh = opengz(path_or_handle, 'w') else: fh = path_or_handle - if validate is 'DNA': + if validate == 'DNA': valid_chars = set('ACGTUYSWKMBDHVNacgtuyswkmbdhvn.-*') - elif validate is 'protein': + elif validate == 'protein': valid_chars = set('ABCDEFGHIKLMPQSRTVWXYZUabcdefghiklmpqsrtvwxyzuNn.-*') else: valid_chars = set() diff --git a/tools/intervals.py b/tools/intervals.py index 14e8dce..ad822e6 100644 --- a/tools/intervals.py +++ b/tools/intervals.py @@ -216,9 +216,9 @@ def get_sequence(self, seq_dict, stranded=True): :param stranded: Should we reverse complement negative strand sequences? :return: A sequence string. """ - if stranded is False or self.strand is '+': + if stranded is False or self.strand == '+': return seq_dict[self.chromosome][self.start: self.stop] - elif self.strand is '-': + elif self.strand == '-': return reverse_complement(seq_dict[self.chromosome][self.start: self.stop]) def get_protein_sequence(self, seq_dict, frame, truncate=True):