diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 26c0983..8e14a32 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -2,49 +2,89 @@ name: C/C++ CI on: push: - branches: [ master, develop, 'release/**' ] + branches: [master, develop, "release/**", "feature/**"] pull_request: - branches: [ master, develop ] + branches: [master, develop] jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: dependencies - run: | - sudo apt-get update - sudo DEBIAN_FRONTEND=noninteractive apt-get install -y yaggo gettext swig python3-dev ruby-dev libperl-dev - - name: autotools - run: autoreconf -fi - - name: configure - run: ./configure --enable-all-binding --enable-swig - - - name: make - run: make -j$(nproc) - - - name: make check - run: make -j$(nproc) check - - name: Check logs - if: failure() - uses: actions/upload-artifact@v4 - with: - name: checklog - path: "**/*tests/*.log" - - - name: make distcheck - run: make -j$(nproc) distcheck - - name: Distcheck logs - if: failure() - uses: actions/upload-artifact@v4 - with: - name: distchecklog - path: | - **/*tests/*.log - **/*tests/**/tee.* - - - name: Distribution tar ball - uses: actions/upload-artifact@v4 - with: - name: disttarball - path: "mummer-*.tar.gz" + - uses: actions/checkout@v3 + - name: dependencies + run: | + sudo apt-get update + sudo DEBIAN_FRONTEND=noninteractive apt-get install -y yaggo gettext swig python3-dev ruby-dev libperl-dev + - name: autotools + run: autoreconf -fi + - name: configure + run: ./configure --enable-all-binding --enable-swig + + - name: make + run: make -j$(nproc) + + - name: make check + run: make -j$(nproc) check + - name: Check logs + if: failure() + uses: actions/upload-artifact@v4 + with: + name: checklog + path: "**/*tests/*.log" + + - name: make distcheck + run: make -j$(nproc) distcheck + - name: Distcheck logs + if: failure() + uses: actions/upload-artifact@v4 + with: + name: distchecklog + path: | + 
**/*tests/*.log + **/*tests/**/tee.* + + - name: Distribution tar ball + uses: actions/upload-artifact@v4 + with: + name: disttarball + path: "mummer-*.tar.gz" + + testbsd: + runs-on: ubuntu-latest + name: Compile on FreeBSD + steps: + - uses: actions/checkout@v4 + - name: Test in FreeBSD + id: test + uses: vmactions/freebsd-vm@v1 + with: + usesh: true + prepare: | + pkg install -y gmake yaggo autoconf automake libtool bash gcc14 + run: | + autoreconf -fi && \ + ./configure MAKE=gmake CC=gcc14 CXX=g++14 LDFLAGS=-Wl,-rpath=/usr/local/lib/gcc14 && \ + gmake -j $(sysctl -n hw.ncpu) check + + testmacos: + runs-on: macos-latest + steps: + - uses: actions/checkout@v3 + - name: dependencies + run: | + brew install autoconf automake libtool md5sha1sum + gem install yaggo + - name: autotools + run: autoreconf -fi + - name: configure + run: ./configure CC=gcc-14 CXX=g++-14 + - name: make + run: make -j$(sysctl -n hw.ncpu) + - name: make check + run: make -j$(sysctl -n hw.ncpu) check + - name: Check logs + if: failure() + uses: actions/upload-artifact@v4 + with: + name: checklog + path: "**/*tests/*.log" diff --git a/ChangeLog b/ChangeLog index 68ba812..b353558 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,16 @@ -- Version history -- +4.0.0 48bit suffix array. Go past previous sequence length limitations + nucmer is all C++ (not Perl script) and multi-threaded + nucmer can save the SA index for later reuse + Added SAM output format to nucmer + Moved to autoconf compilation system + Bindings to scripting languages: Python, Perl, Ruby + Improved mummerplot + Better unittesting and conformance testing + Provide a library with pkg-config configuration + Moved to artistic license 2.0 + Moved to C++17 standard. 3.23 - Added -D and --banded option to nucmer. These options can be used to specify the largest indel that will be included in an alignment segment without breaking it in two pieces. 
diff --git a/INSTALL.md b/INSTALL.md index 6d348a8..897d782 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -1,48 +1,91 @@ -# MUMmer4 INSTALLATION README +# MUMmer4 Compilation README ## Dependencies -If compiling from a [release source tarball](../../releases) you need a -recent version of the GCC compiler (g++ version >= 4.7) and other -essential tools for compilation (GNU make, ar, etc. Install -`build-essentials` on Debian or Ubuntu derivatives). Additional -requirements are needed to compile the SWIG script bindings. See the -[SWIG installation guide](swig/INSTALL.md). +If compiling from a [release source tarball](../../releases) (recommended), you need a recent version of the GCC compiler (see below, only GCC is supported) and other essential tools for compilation (GNU make, ar, etc). +Additional requirements are needed to compile the SWIG script bindings. +See the [SWIG installation guide](swig/INSTALL.md). -If compiling from the github development tree, additionally you need autotools (autoconf, automake and libtools), -[yaggo](https://github.com/gmarcais/yaggo/releases). -You should compile from a [release source tarball](../../releases), unless you plan on modifying the code of MUMmer. +If compiling from the github development tree, additionally you need autotools (autoconf, automake and libtools), [yaggo](https://github.com/gmarcais/yaggo/releases). 
-On Ubuntu: +### On Ubuntu +From the tarball: ```Shell -sudo apt install git build-essential yaggo autoconf automake libtool gettext +sudo apt install build-essential # For the bindings to scripting, additionally install sudo apt install swig python3-dev ruby-dev libperl-dev ``` +From the git tree: +```Shell +sudo apt install build-essential git yaggo autoconf automake libtool gettext +# For the bindings to scripting, additionally install +sudo apt install swig python3-dev ruby-dev libperl-dev +``` + +### On MacOS + +MUMmer must be compiled with GCC, not Clang (and not the Apple provided `gcc` which is really `clang`). +Install with Brew: + +```Shell +brew install autoconf automake libtool md5sha1sum +gem install yaggo +``` + +### On FreeBSD + +MUMmer must be compiled with GCC, not Clang. +Install with pkg: + +```Shell +pkg install -y gmake autoconf automake libtool bash gcc14 +pkg install -y yaggo +``` + + ## Compilation & Installation -To compile and install from a [release source tarball](../../releases): +If compiling from the release tarball (recommended), then the first command `autoreconf -fi` is not necessary. + +### On Ubuntu ```Shell +autoreconf -fi # Optional, only if compiling from git tree ./configure --prefix=/path/to/installation make +make check # Optional make install ``` -If `--prefix` is omitted, the software is installed in -`/usr/local`. One may need `sudo make install` if installing in a -system location. +If `--prefix` is omitted, the software is installed in `/usr/local`. +One may need `sudo make install` if installing in a system location. + +### On MacOS + +Compile with `gcc-14`. -To compile from the github tree, `autoreconf` must additionally be run: ```Shell -autoreconf -fi -./configure --prefix=/path/to/installation +autoreconf -fi # Optional, only if compiling from git tree +./configure --prefix=/path/to/installation CC=gcc-14 CXX=g++-14 make +make check # Optional make install ``` +### On FreeBSD + +Compile with `gcc14`. 
+ +```Shell +autoreconf -fi # Optional, only if compiling from git tree +./configure --prefix=/path/to/installation MAKE=gmake CC=gcc14 CXX=g++14 LDFLAGS=-Wl,-rpath=/usr/local/lib/gcc14 +gmake +gmake check # Optional +gmake install +``` + ## SOFTWARE REQUIREMENTS The MUMmer4.x package requires the following to run successfully. In diff --git a/include/thread_pipe.hpp b/include/thread_pipe.hpp index 692f59a..53e7c52 100644 --- a/include/thread_pipe.hpp +++ b/include/thread_pipe.hpp @@ -48,6 +48,7 @@ class ostream_buffered : public consumer std::ostream& os_; public: ostream_buffered(std::ostream& os) : os_(os) { } + ~ostream_buffered() { close(); } bool operator()(stringstream_wrapper& e) { bool res = false; auto rdbuf = e.p_->rdbuf(); diff --git a/src/tigr/show-aligns.cc b/src/tigr/show-aligns.cc index 24cb401..51d9689 100644 --- a/src/tigr/show-aligns.cc +++ b/src/tigr/show-aligns.cc @@ -696,7 +696,8 @@ void append(ColoredBuffer& Buff1, ColoredBuffer& Buff2, std::string &Buff3, void add_prefix(ColoredBuffer& Buff, long int pos, long int seqlen, int frame) { char b[LINE_BUFFER_LEN + 1]; - sprintf(b, PREFIX_FORMAT, toFwd(pos, seqlen, frame)); + snprintf(b, sizeof(b), PREFIX_FORMAT, toFwd(pos, seqlen, frame)); + b[sizeof(b)-1] = '\0'; Buff.clear(); Buff += b; } diff --git a/tests/check_cigar.cc b/tests/check_cigar.cc index f7a96ec..a917a2d 100644 --- a/tests/check_cigar.cc +++ b/tests/check_cigar.cc @@ -6,13 +6,12 @@ typedef std::map name_seq_map; -name_seq_map read_fasta(const char* path) { +name_seq_map read_fasta(std::istream& is) { name_seq_map res; - std::ifstream is(path); int c = is.peek(); if(c != '>') - check_cigar_cmdline::error() << "Invalid fasta file '" << path << '\''; + check_cigar_cmdline::error() << "Invalid fasta file"; std::string line; while(c != EOF) { std::getline(is, line); @@ -26,6 +25,48 @@ name_seq_map read_fasta(const char* path) { return res; } +name_seq_map read_fasta(const char* path) { + std::ifstream is(path); + return read_fasta(is); +} 
+ +name_seq_map read_fastq(std::istream& is) { /* Parse FASTQ records from is; returns a map from record name (header text after '@', up to the first space or tab) to sequence */ + name_seq_map res; + + std::string line; + int c = is.peek(); + while(c != EOF) { + std::getline(is, line); /* header line; c holds its first character, peeked before the read */ + if(c != '@') + check_cigar_cmdline::error() << "Invalid fastq file: " << line; + std::string& seq = res[line.substr(1, line.find_first_of(" \t", 1) - 1)]; + for(c = is.peek(); c != EOF && c != '+'; c = is.peek()) { /* sequence lines run until a line starting with '+' or end of input */ + std::getline(is, line); + seq += line; + } + if(c == '+') { + std::getline(is, line); /* discard the '+' separator line */ + std::getline(is, line); /* discard one line of qualities */ + c = is.peek(); + } + } + + return res; +} + +name_seq_map read_seq(const char* path) { /* Open path and dispatch on its first character: '>' to read_fasta, '@' to read_fastq; anything else is reported as an invalid sequence file */ + std::ifstream is(path); + switch(is.peek()) { + case '>': return read_fasta(is); + case '@': return read_fastq(is); + default: + check_cigar_cmdline::error() << "Invalid sequence file '" << path << '\''; + } + // should never get there + exit(1); +} + + char comp(char c) { switch(c) { case 'a': case 'A': return 't'; @@ -97,7 +138,7 @@ long find_nm(const std::vector& fields) { int main(int argc, char *argv[]) { check_cigar_cmdline args(argc, argv); auto ref_map = read_fasta(args.ref_arg); - auto qry_map = read_fasta(args.qry_arg); + auto qry_map = read_seq(args.qry_arg); std::ifstream is(args.sam_arg); if(!is.good()) diff --git a/tests/gcc10_uniform_dist.hpp b/tests/gcc10_uniform_dist.hpp index f9e1031..59882d1 100644 --- a/tests/gcc10_uniform_dist.hpp +++ b/tests/gcc10_uniform_dist.hpp @@ -36,6 +36,10 @@ #ifndef _GCC10_UNIFORM_DIST_H #define _GCC10_UNIFORM_DIST_H +#ifndef __glibcxx_assert +#define __glibcxx_assert(x) +#endif + #include #include diff --git a/tests/genome.sh b/tests/genome.sh index d00a234..d837650 100644 --- a/tests/genome.sh +++ b/tests/genome.sh @@ -1,2 +1,2 @@ -time sed -e 's/^>\([^[:space:]]\+\).*/>\1/' $D/seed_reads_2.fa | tee genome | nucmer -G --delta /dev/stdout $D/seed_genome.fa /dev/stdin | \ +time sed -E 's/^>([^[:space:]]+).*/>\1/' "${D}/seed_reads_2.fa" | tee genome | nucmer -G --delta /dev/stdout "${D}/seed_genome.fa" /dev/stdin | \ tee genome.delta | tail -n +3 | 
test_md5 8328b1577d8656eaa53aa61a113d89b0 diff --git a/tests/mummer.sh b/tests/mummer.sh index ab4bf1f..76f8f9b 100644 --- a/tests/mummer.sh +++ b/tests/mummer.sh @@ -1,2 +1,2 @@ -mummer -mum $D/seed_reads_1.fa $D/seed_reads_0.fa | ufasta hsort -H | ufasta dsort | test_md5 4e8182c9f745abf59158f69a05b942f3 -mummer -maxmatch $D/seed_reads_1.fa $D/seed_reads_0.fa | ufasta hsort -H | ufasta dsort | test_md5 a459f93742d1c36819e53e7a4c128bf7 +mummer -mum "${D}/seed_reads_1.fa" "${D}/seed_reads_0.fa" | ufasta hsort -H | ufasta dsort | test_md5 4e8182c9f745abf59158f69a05b942f3 +mummer -maxmatch "${D}/seed_reads_1.fa" "${D}/seed_reads_0.fa" | ufasta hsort -H | ufasta dsort | test_md5 a459f93742d1c36819e53e7a4c128bf7 diff --git a/tests/sam.sh b/tests/sam.sh index 870cc82..7304c47 100644 --- a/tests/sam.sh +++ b/tests/sam.sh @@ -16,7 +16,7 @@ for i in 1 2; do [ "$(grep '^@PG' $f | head -c $pglen)" = "$pgline" ] # Test sequence headers - diff -q <(grep '^@SQ' $f) <(ufasta sizes -H $D/seed_reads_1.fa | sed 's/\([0-9]\+\) \([0-9]\+\)/@SQ\tSN:\1\tLN:\2/') + diff -q <(grep '^@SQ' $f) <(ufasta sizes -H $D/seed_reads_1.fa | sed -E 's/([0-9]+) ([0-9]+)/@SQ\tSN:\1\tLN:\2/') # Test that samtools can parse our output if [ -n "$SAMTOOLS" ]; then diff --git a/tests/test_md5 b/tests/test_md5 index c247441..2949798 100755 --- a/tests/test_md5 +++ b/tests/test_md5 @@ -1,4 +1,9 @@ -#! /bin/bash +#! /usr/bin/env bash -md5sum -c <(echo "$1 -") 2>&1 || \ - test $(md5) == "$1" 2>&1 +# The semantic of md5sum differs in some subtle way. Darwin doesn't support +# /dev/stdin, FreeBSD doesn't support "-". Linux supports both. + +case "$(uname)" in + (Darwin) exec md5sum -c <(echo "$1 -") 2>&1 ;; + (*) exec md5sum -c <(echo "$1 /dev/stdin") 2>&1 ;; +esac diff --git a/tests/testsh.in b/tests/testsh.in index 9f2aa20..973e7dd 100644 --- a/tests/testsh.in +++ b/tests/testsh.in @@ -1,4 +1,4 @@ -#! /bin/bash +#! 
/usr/bin/env bash set -e set -x @@ -18,15 +18,14 @@ SCRIPT=$1 # Path to data D=$BUILD/tests/data -PATH=$BUILD:$BUILD/tests:$SRC/tests:$PATH +PATH=${BUILD}:${BUILD}/tests:${SRC}/tests:$PATH + # Name N=$(basename $1 .sh) # Working directory WORKDIR=tests/$N mkdir -p $WORKDIR -# Read script in fd 3 so that it is availabe after changing directory -# (path to SCRIPT may not be absolute). -exec 3<$SCRIPT +SCRIPT=$(realpath "$SCRIPT") # Absolute path, so the script can still be sourced after the cd below cd $WORKDIR -source /dev/fd/3 +source "$SCRIPT" diff --git a/unittests/Makefile.am b/unittests/Makefile.am index afa8429..82895ad 100644 --- a/unittests/Makefile.am +++ b/unittests/Makefile.am @@ -20,12 +20,15 @@ EXTRA_DIST += $(GTEST_SRC) # Unittest programs # ##################### unittests_programs = %D%/test_all +unittests_script = %D%/unittests check_PROGRAMS += $(unittests_programs) -TESTS += $(unittests_programs) +TESTS += $(unittests_script) +EXTRA_DIST += $(unittests_script) +CLEANDIRS += %D%/tmp %C%_test_all_SOURCES = %D%/test_nucmer.cc %D%/test_cooperative_pool2.cc \ %D%/test_whole_sequence_parser.cc %D%/test_sparse_sa.cc %D%/test_qsort.cc \ - %D%/test_multi_thread_skip_list_set.cc + %D%/test_multi_thread_skip_list_set.cc %D%/test_thread_pipe.cc %C%_test_all_LDADD = $(LDADD) %D%/libgtest_main.la %C%_test_all_CXXFLAGS = $(AM_CXXFLAGS) -I$(srcdir)/unittests noinst_HEADERS += %D%/misc.hpp diff --git a/unittests/test_thread_pipe.cc b/unittests/test_thread_pipe.cc new file mode 100644 index 0000000..8866496 --- /dev/null +++ b/unittests/test_thread_pipe.cc @@ -0,0 +1,89 @@ +#include +#include +#include +#include +#include +#include + +#include + +namespace { + // Sizes written and block size + static const size_t times = 100000; + static const size_t size = 67; + static const ssize_t block = 1024; + + // Thread function: Output + void producer(thread_pipe::ostream_buffered* output, int thread_id) { + auto it = output->begin(); + + for(size_t i = 0; i < times; ++i) { + *it << thread_id; + for(size_t j = 0; j < size; ++j) + *it << ' ' << 
i; + *it << '\t'; + if(it->tellp() > block) + ++it; + } + it.done(); + } + + TEST(ThreadPipe, MultipleProducers) { + static const char* file = "multipleproducers"; + static const int nb_threads = 4; + + { // Write content to file + std::ofstream os(file); + thread_pipe::ostream_buffered output(os); + + std::vector threads; + for(int i = 0; i < nb_threads; ++i) + threads.push_back(std::thread(producer, &output, i)); + + for(auto& th : threads) + th.join(); + + EXPECT_TRUE(os.good()); + } + + { // Read and check content in file. It is tab separated + std::ifstream is(file); + std::vector content; + std::string block; + + while(std::getline(is, block, '\t')) { + content.push_back(block); + } + + EXPECT_TRUE(is.eof()); + EXPECT_EQ(times * (size_t)nb_threads, content.size()); + + // Expect every thread to have written its records in order, each record + // holding the thread id followed by "size" copies of the record index. + std::vector indices(nb_threads, 0); + std::istringstream iss; + int thid; + size_t count; + for(const auto& l : content) { + iss.str(l); + iss.clear(); + iss >> thid; + EXPECT_TRUE(iss.good()); + ASSERT_GE(thid, 0); + ASSERT_LT(thid, nb_threads); + for(size_t i = 0; i < size; ++i) { + EXPECT_TRUE(iss.good()) << "thid " << thid << " i " << i; + iss >> count; + EXPECT_EQ(indices[thid], count) << "thid " << thid; + } + iss >> count; + EXPECT_TRUE(iss.eof()); + ++indices[thid]; + } + + for(const auto v : indices) + EXPECT_EQ(times, v); + } + } + +} // namespace diff --git a/unittests/test_whole_sequence_parser.cc b/unittests/test_whole_sequence_parser.cc index 972e219..44a4158 100644 --- a/unittests/test_whole_sequence_parser.cc +++ b/unittests/test_whole_sequence_parser.cc @@ -28,7 +28,6 @@ TEST(SequenceParser, Fasta) { static const char* seq1 = "ATTACCTTGTACCTTCAGAGC"; static const char* seq2 = "TTCGATCCCTTGATAATTAGTCACGTTAGCT"; const char* file_name = "Fasta.fa"; - file_unlink fu(file_name); { std::ofstream sequence(file_name); @@ -73,7 +72,6 @@ TEST(SequenceParser, Fastq) { static const char* 
seq1 = "ATTACCTTGTACCTTCAGAGC"; static const char* seq2 = "TTCGATCCCTTGATAATTAGTCACGTTAGCT"; const char* file_name = "Fasta.fq"; - file_unlink fu(file_name); { std::ofstream sequence(file_name); @@ -126,7 +124,6 @@ TEST(SequenceParser, Fastq) { TEST(SequenceParser, FastaMany) { const char* file_name = "FastaMany.fa"; - file_unlink fu(file_name); static const int nb_sequences = 1000; std::uniform_int_distribution rand_byte(0, 255); diff --git a/unittests/unittests b/unittests/unittests new file mode 100755 index 0000000..1a5a147 --- /dev/null +++ b/unittests/unittests @@ -0,0 +1,9 @@ +#! /usr/bin/env bash + +set -e + +pwd +p=$(realpath unittests/test_all) # Resolve before the cd: the relative path is not valid from unittests/tmp +mkdir -p unittests/tmp +cd unittests/tmp +exec "$p" "$@"