From bfeea63bbae6260b4ddf3ecad310387bedea0302 Mon Sep 17 00:00:00 2001 From: David Heise Date: Wed, 26 Sep 2018 09:43:47 -0400 Subject: [PATCH 01/15] WIP of updating to samtools/htslib 1.9 and enabling parallal bgzf. --- copyrights/COPYING.samtools | 1 - general/BgzfFileType.cpp | 15 +++++++++++++-- general/BgzfFileType.h | 6 +++++- general/PhoneHome.cpp | 12 ++++++------ 4 files changed, 24 insertions(+), 10 deletions(-) delete mode 120000 copyrights/COPYING.samtools diff --git a/copyrights/COPYING.samtools b/copyrights/COPYING.samtools deleted file mode 120000 index e1a6318..0000000 --- a/copyrights/COPYING.samtools +++ /dev/null @@ -1 +0,0 @@ -../samtools/COPYING \ No newline at end of file diff --git a/general/BgzfFileType.cpp b/general/BgzfFileType.cpp index 9176aff..5d903e7 100644 --- a/general/BgzfFileType.cpp +++ b/general/BgzfFileType.cpp @@ -27,14 +27,23 @@ bool BgzfFileType::ourRequireEofBlock = true; BgzfFileType::BgzfFileType(const char * filename, const char * mode) { + BgzfFileType::numThreads = 1; + char threadSpec[8]; + const char* thread_start = strchr(mode, '@'); + if (thread_start != NULL && thread_start+1 != '\0'){ + thread_start++; + strncpy(threadSpec, thread_start, 7); + threadSpec[7] = '\0'; + numThreads = strtol(threadSpec, NULL, 10); + } // If the file is for write and is '-', then write to stdout. - if(((mode[0] == 'w') || (mode[0] == 'W')) && + if(((mode[0] == 'w') || (mode[0] == 'W')) && (strcmp(filename, "-") == 0)) { // Write to stdout. bgzfHandle = bgzf_dopen(fileno(stdout), mode); } - else if(((mode[0] == 'r') || (mode[0] == 'R')) && + else if(((mode[0] == 'r') || (mode[0] == 'R')) && (strcmp(filename, "-") == 0)) { // read from stdin @@ -48,6 +57,8 @@ BgzfFileType::BgzfFileType(const char * filename, const char * mode) myStartPos = 0; if (bgzfHandle != NULL) { + //If the file handle is valid, set it for multithreaded operations + bgzf_mt(bgzfHandle, numThreads, 1); // Check to see if the file is being opened for read, if the eof block // is required, and if it is, if it is there. if ((mode[0] == 'r' || mode[0] == 'R') && (strcmp(filename, "-") != 0) diff --git a/general/BgzfFileType.h b/general/BgzfFileType.h index 02da7c4..7c0ff70 100644 --- a/general/BgzfFileType.h +++ b/general/BgzfFileType.h @@ -29,6 +29,7 @@ class BgzfFileType : public FileType public: BgzfFileType() { + numThreads = 1; bgzfHandle = NULL; myEOF = false; } @@ -110,7 +111,7 @@ class BgzfFileType : public FileType } else if((bytesRead != (int)size) & (bytesRead >= 0)) { - // Less then the requested size was read + // Less then the requested size was read // and an error was not returned (bgzf_read returns -1 on error). myEOF = true; } @@ -173,6 +174,9 @@ class BgzfFileType : public FileType // at the end of the file. If the block is required, but not on the file, // the constructor fails to open the file. static bool ourRequireEofBlock; + + // The number of threads to use when using multithreaded BGZF + int64_t numThreads; }; #endif diff --git a/general/PhoneHome.cpp b/general/PhoneHome.cpp index 31ef8bd..df0c472 100644 --- a/general/PhoneHome.cpp +++ b/general/PhoneHome.cpp @@ -82,7 +82,7 @@ bool PhoneHome::checkVersion(const char* programName, const char* version, // Found this program, so extract the version. start += ourToolName.Length(); - while((start < ourReturnString.Length()) && + while((start < ourReturnString.Length()) && isspace(ourReturnString[start])) { // Consume whitespace @@ -93,7 +93,7 @@ bool PhoneHome::checkVersion(const char* programName, const char* version, String thisVersion = version; String latestVersion; int end = start; - while((end < ourReturnString.Length()) && + while((end < ourReturnString.Length()) && !isspace(ourReturnString[end])) { latestVersion += ourReturnString[end]; @@ -105,9 +105,9 @@ bool PhoneHome::checkVersion(const char* programName, const char* version, if(latestVersion.FastCompare(thisVersion) > 0) { std::cerr << "\n**************************************************************************************\n" - << "A new version, " << latestVersion + << "A new version, " << latestVersion << ", of " << ourToolName - << " is available (currently running " + << " is available (currently running " << thisVersion.c_str() << ")" << "\n**************************************************************************************\n\n"; return(false); @@ -201,7 +201,7 @@ bool PhoneHome::connect() ourReturnString.Clear(); // return(true); #ifndef _NO_PHONEHOME - knet_silent(1); + //knet_silent(1); knetFile *file = knet_open(ourURL.c_str(), "r"); if (file == 0) return(false); @@ -218,7 +218,7 @@ bool PhoneHome::connect() } knet_close(file); - knet_silent(0); + //knet_silent(0); // std::cerr << "PhoneHome URL = " << ourReturnString.c_str() << std::endl; #endif return(true); From dfd9f54f346ebcc56f414faf56e73aa8ca59224d Mon Sep 17 00:00:00 2001 From: David Heise Date: Wed, 26 Sep 2018 17:04:28 -0400 Subject: [PATCH 02/15] Insulate bgzf_open from the specialized mode for specifying threads. --- general/BgzfFileType.cpp | 30 +++++++++++++++++++++--------- general/BgzfFileType.h | 1 + 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/general/BgzfFileType.cpp b/general/BgzfFileType.cpp index 5d903e7..e9f3b52 100644 --- a/general/BgzfFileType.cpp +++ b/general/BgzfFileType.cpp @@ -29,36 +29,48 @@ BgzfFileType::BgzfFileType(const char * filename, const char * mode) { BgzfFileType::numThreads = 1; char threadSpec[8]; + char bgzfMode[4]; const char* thread_start = strchr(mode, '@'); + int mode_len = thread_start-mode; + if(mode_len < 4){ + strncpy(bgzfMode, mode, mode_len); + bgzfMode[mode_len] = '\0'; + } else { + strncpy(bgzfMode, mode, 3); + bgzfMode[3] = '\0'; + } if (thread_start != NULL && thread_start+1 != '\0'){ - thread_start++; - strncpy(threadSpec, thread_start, 7); - threadSpec[7] = '\0'; - numThreads = strtol(threadSpec, NULL, 10); + // Advance past the @, then parse the number of threads + thread_start++; + strncpy(threadSpec, thread_start, 7); + threadSpec[7] = '\0'; + numThreads = strtol(threadSpec, NULL, 10); + // If we can't parse the number, revert to one thread + if (numThreads == 0){ numThreads = 1; } } // If the file is for write and is '-', then write to stdout. if(((mode[0] == 'w') || (mode[0] == 'W')) && (strcmp(filename, "-") == 0)) { // Write to stdout. - bgzfHandle = bgzf_dopen(fileno(stdout), mode); + bgzfHandle = bgzf_dopen(fileno(stdout), bgzfMode); } else if(((mode[0] == 'r') || (mode[0] == 'R')) && (strcmp(filename, "-") == 0)) { // read from stdin - bgzfHandle = bgzf_dopen(fileno(stdin), mode); + bgzfHandle = bgzf_dopen(fileno(stdin), bgzfMode); } else { - bgzfHandle = bgzf_open(filename, mode); + bgzfHandle = bgzf_open(filename, bgzfMode); } myStartPos = 0; if (bgzfHandle != NULL) { - //If the file handle is valid, set it for multithreaded operations - bgzf_mt(bgzfHandle, numThreads, 1); + //Only do multithreaded IO if more than one thread is used. + if(numThreads > 1){ bgzf_mt(bgzfHandle, numThreads, 256); } // Check to see if the file is being opened for read, if the eof block // is required, and if it is, if it is there. if ((mode[0] == 'r' || mode[0] == 'R') && (strcmp(filename, "-") != 0) diff --git a/general/BgzfFileType.h b/general/BgzfFileType.h index 7c0ff70..b7ee28e 100644 --- a/general/BgzfFileType.h +++ b/general/BgzfFileType.h @@ -36,6 +36,7 @@ class BgzfFileType : public FileType virtual ~BgzfFileType() { + bgzf_close(bgzfHandle); bgzfHandle = NULL; } From d488b94fbe2850b77c4588795bcbcd4afcd1a44d Mon Sep 17 00:00:00 2001 From: David Heise Date: Thu, 27 Sep 2018 16:02:26 -0400 Subject: [PATCH 03/15] Remove dependency on a samtools subdirectory --- Makefile | 5 +- samtools/COPYING | 21 -- samtools/Makefile | 4 - samtools/Makefile.depends | 1 - samtools/README.txt | 1 - samtools/bam.h | 69 ---- samtools/bgzf.c | 589 --------------------------------- samtools/bgzf.h | 197 ----------- samtools/khash.h | 532 ------------------------------ samtools/knetfile.c | 669 -------------------------------------- samtools/knetfile.h | 79 ----- 11 files changed, 1 insertion(+), 2166 deletions(-) delete mode 100644 samtools/COPYING delete mode 100644 samtools/Makefile delete mode 100644 samtools/Makefile.depends delete mode 100644 samtools/README.txt delete mode 100644 samtools/bam.h delete mode 100644 samtools/bgzf.c delete mode 100644 samtools/bgzf.h delete mode 100644 samtools/khash.h delete mode 100755 samtools/knetfile.c delete mode 100644 samtools/knetfile.h diff --git a/Makefile b/Makefile index 887b019..b1d2e40 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION ?= 1.0.14 .PHONY: package -SUBDIRS=general bam fastq glf samtools vcf +SUBDIRS=general bam fastq glf vcf include Makefiles/Makefile.base @@ -12,9 +12,6 @@ clean:$(SUBDIRS) rm -f $(STAT_GEN_LIB_DEBUG) rm -f $(STAT_GEN_LIB_PROFILE) -# general depends on samtools -general: samtools - # other subdirectories depend on general bam fastq glf vcf: general diff --git a/samtools/COPYING b/samtools/COPYING deleted file mode 100644 index 82fa2f4..0000000 --- a/samtools/COPYING +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License - -Copyright (c) 2008-2009 Genome Research Ltd. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. \ No newline at end of file diff --git a/samtools/Makefile b/samtools/Makefile deleted file mode 100644 index 683f893..0000000 --- a/samtools/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -TOOLBASE = bgzf knetfile -HDRONLY = khash.h bam.h - -include ../Makefiles/Makefile.lib \ No newline at end of file diff --git a/samtools/Makefile.depends b/samtools/Makefile.depends deleted file mode 100644 index 3755e78..0000000 --- a/samtools/Makefile.depends +++ /dev/null @@ -1 +0,0 @@ -# DO NOT DELETE diff --git a/samtools/README.txt b/samtools/README.txt deleted file mode 100644 index 227fc93..0000000 --- a/samtools/README.txt +++ /dev/null @@ -1 +0,0 @@ -These files are based on samtools version 981. (retrieved 7/26/11) diff --git a/samtools/bam.h b/samtools/bam.h deleted file mode 100644 index 76c4be3..0000000 --- a/samtools/bam.h +++ /dev/null @@ -1,69 +0,0 @@ -/* The MIT License - - Copyright (c) 2008-2010 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ - -#ifndef BAM_BAM_H -#define BAM_BAM_H - -/*! - @header - - BAM library provides I/O and various operations on manipulating files - in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) - format. It now supports importing from or exporting to SAM, sorting, - merging, generating pileup, and quickly retrieval of reads overlapped - with a specified region. - - @copyright Genome Research Ltd. - */ - - -#include - -/* - * Only small section is pulled out that we use. - * 4/29/2011 - Mary Kate Trost - */ - - -/*! - @abstract Calculate the minimum bin that contains a region [beg,end). - @param beg start of the region, 0-based - @param end end of the region, 0-based - @return bin - */ -static inline int bam_reg2bin(uint32_t beg, uint32_t end) -{ - --end; - if (beg>>14 == end>>14) return 4681 + (beg>>14); - if (beg>>17 == end>>17) return 585 + (beg>>17); - if (beg>>20 == end>>20) return 73 + (beg>>20); - if (beg>>23 == end>>23) return 9 + (beg>>23); - if (beg>>26 == end>>26) return 1 + (beg>>26); - return 0; -} - -#endif diff --git a/samtools/bgzf.c b/samtools/bgzf.c deleted file mode 100644 index c52ae2b..0000000 --- a/samtools/bgzf.c +++ /dev/null @@ -1,589 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology - 2011 Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -#ifdef __ZLIB_AVAILABLE__ -#include -#include -#include -#include -#include -#include -#include "bgzf.h" - -#ifdef _USE_KNETFILE -#include "knetfile.h" -typedef knetFile *_bgzf_file_t; -#define _bgzf_open(fn, mode) knet_open(fn, mode) -#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode) -#define _bgzf_close(fp) knet_close(fp) -#define _bgzf_fileno(fp) ((fp)->fd) -#define _bgzf_tell(fp) knet_tell(fp) -#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence) -#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len) -#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len) -#else // ~defined(_USE_KNETFILE) -#if defined(_WIN32) || defined(_MSC_VER) -#define ftello(fp) ftell(fp) -#define fseeko(fp, offset, whence) fseek(fp, offset, whence) -#else // ~defined(_WIN32) -extern off_t ftello(FILE *stream); -extern int fseeko(FILE *stream, off_t offset, int whence); -#endif // ~defined(_WIN32) -typedef FILE *_bgzf_file_t; -#define _bgzf_open(fn, mode) fopen(fn, mode) -#define _bgzf_dopen(fp, mode) fdopen(fp, mode) -#define _bgzf_close(fp) fclose(fp) -#define _bgzf_fileno(fp) fileno(fp) -#define _bgzf_tell(fp) ftello(fp) -#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence) -#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp) -#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp) -#endif // ~define(_USE_KNETFILE) - -#define BLOCK_HEADER_LENGTH 18 -#define BLOCK_FOOTER_LENGTH 8 - -/* BGZF/GZIP header (speciallized from RFC 1952; little endian): - +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ - | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN| - +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ -*/ -static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; - -#ifdef BGZF_CACHE -typedef struct { - int size; - uint8_t *block; - int64_t end_offset; -} cache_t; -#include "khash.h" -KHASH_MAP_INIT_INT64(cache, cache_t) -#endif - -static inline void packInt16(uint8_t *buffer, uint16_t value) -{ - buffer[0] = value; - buffer[1] = value >> 8; -} - -static inline int unpackInt16(const uint8_t *buffer) -{ - return buffer[0] | buffer[1] << 8; -} - -static inline void packInt32(uint8_t *buffer, uint32_t value) -{ - buffer[0] = value; - buffer[1] = value >> 8; - buffer[2] = value >> 16; - buffer[3] = value >> 24; -} - -static BGZF *bgzf_read_init() -{ - BGZF *fp; - fp = calloc(1, sizeof(BGZF)); - fp->open_mode = 'r'; - fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE); - fp->compressed_block = malloc(BGZF_BLOCK_SIZE); -#ifdef BGZF_CACHE - fp->cache = kh_init(cache); -#endif - return fp; -} - -static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level -{ - BGZF *fp; - fp = calloc(1, sizeof(BGZF)); - fp->open_mode = 'w'; - fp->uncompressed_block = malloc(BGZF_BLOCK_SIZE); - fp->compressed_block = malloc(BGZF_BLOCK_SIZE); - fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 - if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; - return fp; -} -// get the compress level from the mode string -static int mode2level(const char *__restrict mode) -{ - int i, compress_level = -1; - for (i = 0; mode[i]; ++i) - if (mode[i] >= '0' && mode[i] <= '9') break; - if (mode[i]) compress_level = (int)mode[i] - '0'; - if (strchr(mode, 'u')) compress_level = 0; - return compress_level; -} - -BGZF *bgzf_open(const char *path, const char *mode) -{ - BGZF *fp = 0; - if (strchr(mode, 'r') || strchr(mode, 'R')) { - _bgzf_file_t fpr; - if ((fpr = _bgzf_open(path, "r")) == 0) return 0; - fp = bgzf_read_init(); - fp->fp = fpr; - } else if (strchr(mode, 'w') || strchr(mode, 'W')) { - FILE *fpw; - if ((fpw = fopen(path, "w")) == 0) return 0; - fp = bgzf_write_init(mode2level(mode)); - fp->fp = fpw; - } else if (strchr(mode, 'a') || strchr(mode, 'A')) { - FILE *fpw; - if ((fpw = fopen(path, "r+")) == 0) return 0; - fp = bgzf_write_init(mode2level(mode)); - fp->fp = fpw; - // Check for trailing EOF block. - if(bgzf_check_EOF(fp)) - { - // Overwrite the trailing EOF. - _bgzf_seek(fp->fp, -28, SEEK_END); - } - else - { - // No trailing EOF block, so go to the end - _bgzf_seek(fp->fp, 0, SEEK_END); - } - } - return fp; -} - -BGZF *bgzf_dopen(int fd, const char *mode) -{ - BGZF *fp = 0; - if (strchr(mode, 'r') || strchr(mode, 'R')) { - _bgzf_file_t fpr; - if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0; - fp = bgzf_read_init(); - fp->fp = fpr; - } else if (strchr(mode, 'w') || strchr(mode, 'W')) { - FILE *fpw; - if ((fpw = fdopen(fd, "w")) == 0) return 0; - fp = bgzf_write_init(mode2level(mode)); - fp->fp = fpw; - } - return fp; -} - -// Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length. -static int deflate_block(BGZF *fp, int block_length) -{ - uint8_t *buffer = fp->compressed_block; - int buffer_size = BGZF_BLOCK_SIZE; - int input_length = block_length; - int compressed_length = 0; - int remaining; - uint32_t crc; - - assert(block_length <= BGZF_BLOCK_SIZE); // guaranteed by the caller - memcpy(buffer, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block - while (1) { // loop to retry for blocks that do not compress enough - int status; - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = fp->uncompressed_block; - zs.avail_in = input_length; - zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; - zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY); // -15 to disable zlib header/footer - if (status != Z_OK) { - fp->errcode |= BGZF_ERR_ZLIB; - return -1; - } - status = deflate(&zs, Z_FINISH); - if (status != Z_STREAM_END) { // not compressed enough - deflateEnd(&zs); // reset the stream - if (status == Z_OK) { // reduce the size and recompress - input_length -= 1024; - assert(input_length > 0); // logically, this should not happen - continue; - } - fp->errcode |= BGZF_ERR_ZLIB; - return -1; - } - if (deflateEnd(&zs) != Z_OK) { - fp->errcode |= BGZF_ERR_ZLIB; - return -1; - } - compressed_length = zs.total_out; - compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; - assert(compressed_length <= BGZF_BLOCK_SIZE); - break; - } - - assert(compressed_length > 0); - packInt16((uint8_t*)&buffer[16], compressed_length - 1); // write the compressed_length; -1 to fit 2 bytes - crc = crc32(0L, NULL, 0L); - crc = crc32(crc, fp->uncompressed_block, input_length); - packInt32((uint8_t*)&buffer[compressed_length-8], crc); - packInt32((uint8_t*)&buffer[compressed_length-4], input_length); - - remaining = block_length - input_length; - if (remaining > 0) { - assert(remaining <= input_length); - memcpy(fp->uncompressed_block, fp->uncompressed_block + input_length, remaining); - } - fp->block_offset = remaining; - return compressed_length; -} - -// Inflate the block in fp->compressed_block into fp->uncompressed_block -static int inflate_block(BGZF* fp, int block_length) -{ - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = fp->compressed_block + 18; - zs.avail_in = block_length - 16; - zs.next_out = fp->uncompressed_block; - zs.avail_out = BGZF_BLOCK_SIZE; - - if (inflateInit2(&zs, -15) != Z_OK) { - fp->errcode |= BGZF_ERR_ZLIB; - return -1; - } - if (inflate(&zs, Z_FINISH) != Z_STREAM_END) { - inflateEnd(&zs); - fp->errcode |= BGZF_ERR_ZLIB; - return -1; - } - if (inflateEnd(&zs) != Z_OK) { - fp->errcode |= BGZF_ERR_ZLIB; - return -1; - } - return zs.total_out; -} - -static int check_header(const uint8_t *header) -{ - return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0 - && unpackInt16((uint8_t*)&header[10]) == 6 - && header[12] == 'B' && header[13] == 'C' - && unpackInt16((uint8_t*)&header[14]) == 2); -} - -#ifdef BGZF_CACHE -static void free_cache(BGZF *fp) -{ - khint_t k; - khash_t(cache) *h = (khash_t(cache)*)fp->cache; - if (fp->open_mode != 'r') return; - for (k = kh_begin(h); k < kh_end(h); ++k) - if (kh_exist(h, k)) free(kh_val(h, k).block); - kh_destroy(cache, h); -} - -static int load_block_from_cache(BGZF *fp, int64_t block_address) -{ - khint_t k; - cache_t *p; - khash_t(cache) *h = (khash_t(cache)*)fp->cache; - k = kh_get(cache, h, block_address); - if (k == kh_end(h)) return 0; - p = &kh_val(h, k); - if (fp->block_length != 0) fp->block_offset = 0; - fp->block_address = block_address; - fp->block_length = p->size; - memcpy(fp->uncompressed_block, p->block, BGZF_BLOCK_SIZE); - _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET); - return p->size; -} - -static void cache_block(BGZF *fp, int size) -{ - int ret; - khint_t k; - cache_t *p; - khash_t(cache) *h = (khash_t(cache)*)fp->cache; - if (BGZF_BLOCK_SIZE >= fp->cache_size) return; - if ((kh_size(h) + 1) * BGZF_BLOCK_SIZE > fp->cache_size) { - /* A better way would be to remove the oldest block in the - * cache, but here we remove a random one for simplicity. This - * should not have a big impact on performance. */ - for (k = kh_begin(h); k < kh_end(h); ++k) - if (kh_exist(h, k)) break; - if (k < kh_end(h)) { - free(kh_val(h, k).block); - kh_del(cache, h, k); - } - } - k = kh_put(cache, h, fp->block_address, &ret); - if (ret == 0) return; // if this happens, a bug! - p = &kh_val(h, k); - p->size = fp->block_length; - p->end_offset = fp->block_address + size; - p->block = malloc(BGZF_BLOCK_SIZE); - memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_BLOCK_SIZE); -} -#else -static void free_cache(BGZF *fp) {} -static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} -static void cache_block(BGZF *fp, int size) {} -#endif - -int bgzf_read_block(BGZF *fp) -{ - uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block; - int count, size = 0, block_length, remaining; - int64_t block_address; - block_address = _bgzf_tell((_bgzf_file_t)fp->fp); - if (load_block_from_cache(fp, block_address)) return 0; - count = _bgzf_read(fp->fp, header, sizeof(header)); - if (count == 0) { // no data read - fp->block_length = 0; - return 0; - } - if (count != sizeof(header) || !check_header(header)) { - fp->errcode |= BGZF_ERR_HEADER; - return -1; - } - size = count; - block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1" - compressed_block = (uint8_t*)fp->compressed_block; - memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); - remaining = block_length - BLOCK_HEADER_LENGTH; - count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining); - if (count != remaining) { - fp->errcode |= BGZF_ERR_IO; - return -1; - } - size += count; - if ((count = inflate_block(fp, block_length)) < 0) return -1; - if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek. - fp->block_address = block_address; - fp->block_length = count; - cache_block(fp, size); - return 0; -} - -ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length) -{ - ssize_t bytes_read = 0; - uint8_t *output = data; - if (length <= 0) return 0; - assert(fp->open_mode == 'r'); - while (bytes_read < length) { - int copy_length, available = fp->block_length - fp->block_offset; - uint8_t *buffer; - if (available <= 0) { - if (bgzf_read_block(fp) != 0) return -1; - available = fp->block_length - fp->block_offset; - if (available <= 0) break; - } - copy_length = length - bytes_read < available? length - bytes_read : available; - buffer = fp->uncompressed_block; - memcpy(output, buffer + fp->block_offset, copy_length); - fp->block_offset += copy_length; - output += copy_length; - bytes_read += copy_length; - } - if (fp->block_offset == fp->block_length) { - fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); - fp->block_offset = fp->block_length = 0; - } - return bytes_read; -} - -int bgzf_flush(BGZF *fp) -{ - assert(fp->open_mode == 'w'); - while (fp->block_offset > 0) { - int block_length; - block_length = deflate_block(fp, fp->block_offset); - if (block_length < 0) return -1; - if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) { - fp->errcode |= BGZF_ERR_IO; // possibly truncated file - return -1; - } - fp->block_address += block_length; - } - return 0; -} - -int bgzf_flush_try(BGZF *fp, ssize_t size) -{ - if (fp->block_offset + size > BGZF_BLOCK_SIZE) - return bgzf_flush(fp); - return -1; -} - -ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length) -{ - const uint8_t *input = data; - int block_length = BGZF_BLOCK_SIZE, bytes_written; - assert(fp->open_mode == 'w'); - input = data; - bytes_written = 0; - while (bytes_written < length) { - uint8_t* buffer = fp->uncompressed_block; - int copy_length = block_length - fp->block_offset < length - bytes_written? block_length - fp->block_offset : length - bytes_written; - memcpy(buffer + fp->block_offset, input, copy_length); - fp->block_offset += copy_length; - input += copy_length; - bytes_written += copy_length; - if (fp->block_offset == block_length && bgzf_flush(fp)) break; - } - return bytes_written; -} - -int bgzf_close(BGZF* fp) -{ - int ret, count, block_length; - if (fp == 0) return -1; - if (fp->open_mode == 'w') { - if (bgzf_flush(fp) != 0) return -1; - block_length = deflate_block(fp, 0); // write an empty block - count = fwrite(fp->compressed_block, 1, block_length, fp->fp); - if(count != 0) - { - // Something was written - } - if (fflush(fp->fp) != 0) { - fp->errcode |= BGZF_ERR_IO; - return -1; - } - } - ret = fp->open_mode == 'w'? fclose(fp->fp) : _bgzf_close(fp->fp); - if (ret != 0) return -1; - free(fp->uncompressed_block); - free(fp->compressed_block); - free_cache(fp); - free(fp); - return 0; -} - -void bgzf_set_cache_size(BGZF *fp, int cache_size) -{ - if (fp) fp->cache_size = cache_size; -} - -int bgzf_check_EOF(BGZF *fp) -{ - static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; - // Last 28 bytes of an uncompressed bgzf file which are different - // from the last 28 bytes of compressed bgzf files. - static uint8_t magic2[28] = "\4\0\0\0\0\0\377\6\0\102\103\2\0\036\0\1\0\0\377\377\0\0\0\0\0\0\0\0"; - uint8_t buf[28]; - off_t offset; - offset = _bgzf_tell((_bgzf_file_t)fp->fp); - if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0; - int count = _bgzf_read(fp->fp, buf, 28); - if(count != 28) - { - fp->errcode |= BGZF_ERR_IO; // possibly truncated file - return(0); - } - _bgzf_seek(fp->fp, offset, SEEK_SET); - if((memcmp(magic, buf, 28) == 0) || (memcmp(magic2, buf, 28) == 0)) - { - return(1); - } - return(0); -} - -int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) -{ - int block_offset; - int64_t block_address; - - if (fp->open_mode != 'r' || where != SEEK_SET) { - fp->errcode |= BGZF_ERR_MISUSE; - return -1; - } - block_offset = pos & 0xFFFF; - block_address = pos >> 16; - if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) { - fp->errcode |= BGZF_ERR_IO; - return -1; - } - fp->block_length = 0; // indicates current block has not been loaded - fp->block_address = block_address; - fp->block_offset = block_offset; - return 0; -} - -int bgzf_is_bgzf(const char *fn) -{ - uint8_t buf[16]; - int n; - _bgzf_file_t fp; - if ((fp = _bgzf_open(fn, "r")) == 0) return 0; - n = _bgzf_read(fp, buf, 16); - _bgzf_close(fp); - if (n != 16) return 0; - return memcmp(g_magic, buf, 16) == 0? 1 : 0; -} - -int bgzf_getc(BGZF *fp) -{ - int c; - if (fp->block_offset >= fp->block_length) { - if (bgzf_read_block(fp) != 0) return -2; /* error */ - if (fp->block_length == 0) return -1; /* end-of-file */ - } - c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; - if (fp->block_offset == fp->block_length) { - fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); - fp->block_offset = 0; - fp->block_length = 0; - } - return c; -} - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -int bgzf_getline(BGZF *fp, int delim, kstring_t *str) -{ - int l, state = 0; - unsigned char *buf = (unsigned char*)fp->uncompressed_block; - str->l = 0; - do { - if (fp->block_offset >= fp->block_length) { - if (bgzf_read_block(fp) != 0) { state = -2; break; } - if (fp->block_length == 0) { state = -1; break; } - } - for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l); - if (l < fp->block_length) state = 1; - l -= fp->block_offset; - if (str->l + l + 1 >= str->m) { - str->m = str->l + l + 2; - kroundup32(str->m); - str->s = (char*)realloc(str->s, str->m); - } - memcpy(str->s + str->l, buf + fp->block_offset, l); - str->l += l; - fp->block_offset += l + 1; - if (fp->block_offset >= fp->block_length) { - fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); - fp->block_offset = 0; - fp->block_length = 0; - } - } while (state == 0); - if (str->l == 0 && state < 0) return state; - str->s[str->l] = 0; - return str->l; -} -#endif diff --git a/samtools/bgzf.h b/samtools/bgzf.h deleted file mode 100644 index ea04df4..0000000 --- a/samtools/bgzf.h +++ /dev/null @@ -1,197 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology - 2011 Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. -*/ - -/* The BGZF library was originally written by Bob Handsaker from the Broad - * Institute. It was later improved by the SAMtools developers. */ - -#ifndef __BGZF_H -#define __BGZF_H - -#include -#include -#ifdef __ZLIB_AVAILABLE__ -#include -#endif - -#define BGZF_BLOCK_SIZE 0x10000 // 64k - -#define BGZF_ERR_ZLIB 1 -#define BGZF_ERR_HEADER 2 -#define BGZF_ERR_IO 4 -#define BGZF_ERR_MISUSE 8 - -typedef struct { - int open_mode:8, compress_level:8, errcode:16; - int cache_size; - int block_length, block_offset; - int64_t block_address; - void *uncompressed_block, *compressed_block; - void *cache; // a pointer to a hash table - void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading -} BGZF; - -#ifndef KSTRING_T -#define KSTRING_T kstring_t -typedef struct __kstring_t { - size_t l, m; - char *s; -} kstring_t; -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -BGZF* dummy(); - - /****************** - * Basic routines * - ******************/ - - /** - * Open an existing file descriptor for reading or writing. - * - * @param fd file descriptor - * @param mode mode matching /[rwu0-9]+/: 'r' for reading, 'w' for writing and a digit specifies - * the zlib compression level; if both 'r' and 'w' are present, 'w' is ignored. - * @return BGZF file handler; 0 on error - */ - BGZF* bgzf_dopen(int fd, const char *mode); - - /** - * Open the specified file for reading or writing. - */ - BGZF* bgzf_open(const char* path, const char *mode); - - /** - * Close the BGZF and free all associated resources. - * - * @param fp BGZF file handler - * @return 0 on success and -1 on error - */ - int bgzf_close(BGZF *fp); - - /** - * Read up to _length_ bytes from the file storing into _data_. - * - * @param fp BGZF file handler - * @param data data array to read into - * @param length size of data to read - * @return number of bytes actually read; 0 on end-of-file and -1 on error - */ - ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length); - - /** - * Write _length_ bytes from _data_ to the file. - * - * @param fp BGZF file handler - * @param data data array to write - * @param length size of data to write - * @return number of bytes actually written; -1 on error - */ - ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length); - - /** - * Write the data in the buffer to the file. - */ - int bgzf_flush(BGZF *fp); - - /** - * Return a virtual file pointer to the current location in the file. - * No interpetation of the value should be made, other than a subsequent - * call to bgzf_seek can be used to position the file at the same point. - * Return value is non-negative on success. - */ - #define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) - - /** - * Set the file to read from the location specified by _pos_. - * - * @param fp BGZF file handler - * @param pos virtual file offset returned by bgzf_tell() - * @param whence must be SEEK_SET - * @return 0 on success and -1 on error - */ - int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence); - - /** - * Check if the BGZF end-of-file (EOF) marker is present - * - * @param fp BGZF file handler opened for reading - * @return 1 if EOF is present; 0 if not or on I/O error - */ - int bgzf_check_EOF(BGZF *fp); - - /** - * Check if a file is in the BGZF format - * - * @param fn file name - * @return 1 if _fn_ is BGZF; 0 if not or on I/O error - */ - int bgzf_is_bgzf(const char *fn); - - /********************* - * Advanced routines * - *********************/ - - /** - * Set the cache size. Only effective when compiled with -DBGZF_CACHE. - * - * @param fp BGZF file handler - * @param size size of cache in bytes; 0 to disable caching (default) - */ - void bgzf_set_cache_size(BGZF *fp, int size); - - /** - * Flush the file if the remaining buffer size is smaller than _size_ - */ - int bgzf_flush_try(BGZF *fp, ssize_t size); - - /** - * Read one byte from a BGZF file. It is faster than bgzf_read() - * @param fp BGZF file handler - * @return byte read; -1 on end-of-file or error - */ - int bgzf_getc(BGZF *fp); - - /** - * Read one line from a BGZF file. It is faster than bgzf_getc() - * - * @param fp BGZF file handler - * @param delim delimitor - * @param str string to write to; must be initialized - * @return length of the string; 0 on end-of-file; negative on error - */ - int bgzf_getline(BGZF *fp, int delim, kstring_t *str); - - /** - * Read the next BGZF block. - */ - int bgzf_read_block(BGZF *fp); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/samtools/khash.h b/samtools/khash.h deleted file mode 100644 index 2ad433c..0000000 --- a/samtools/khash.h +++ /dev/null @@ -1,532 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, 2009, 2011 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - An example: - -#include "khash.h" -KHASH_MAP_INIT_INT(32, char) -int main() { - int ret, is_missing; - khiter_t k; - khash_t(32) *h = kh_init(32); - k = kh_put(32, h, 5, &ret); - if (!ret) kh_del(32, h, k); - kh_value(h, k) = 10; - k = kh_get(32, h, 10); - is_missing = (k == kh_end(h)); - k = kh_get(32, h, 5); - kh_del(32, h, k); - for (k = kh_begin(h); k != kh_end(h); ++k) - if (kh_exist(h, k)) kh_value(h, k) = 1; - kh_destroy(32, h); - return 0; -} -*/ - -/* - 2011-02-14 (0.2.5): - - * Allow to declare global functions. - - 2009-09-26 (0.2.4): - - * Improve portability - - 2008-09-19 (0.2.3): - - * Corrected the example - * Improved interfaces - - 2008-09-11 (0.2.2): - - * Improved speed a little in kh_put() - - 2008-09-10 (0.2.1): - - * Added kh_clear() - * Fixed a compiling error - - 2008-09-02 (0.2.0): - - * Changed to token concatenation which increases flexibility. - - 2008-08-31 (0.1.2): - - * Fixed a bug in kh_get(), which has not been tested previously. - - 2008-08-31 (0.1.1): - - * Added destructor -*/ - - -#ifndef __AC_KHASH_H -#define __AC_KHASH_H - -/*! - @header - - Generic hash table library. - - @copyright Heng Li - */ - -#define AC_VERSION_KHASH_H "0.2.5" - -#include -#include -#include - -/* compiler specific configuration */ - -#if UINT_MAX == 0xffffffffu -typedef unsigned int khint32_t; -#elif ULONG_MAX == 0xffffffffu -typedef unsigned long khint32_t; -#endif - -#if ULONG_MAX == ULLONG_MAX -typedef unsigned long khint64_t; -#else -typedef unsigned long long khint64_t; -#endif - -#ifdef _MSC_VER -#define inline __inline -#endif - -#ifdef _WIN32 -#define inline __inline -#endif - -typedef khint32_t khint_t; -typedef khint_t khiter_t; - -#define __ac_HASH_PRIME_SIZE 32 -static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = -{ - 0ul, 3ul, 11ul, 23ul, 53ul, - 97ul, 193ul, 389ul, 769ul, 1543ul, - 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, - 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, - 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, - 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, - 3221225473ul, 4294967291ul -}; - -#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) -#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) -#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) -#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) -#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) -#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) -#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) - -static const double __ac_HASH_UPPER = 0.77; - -#define KHASH_DECLARE(name, khkey_t, khval_t) \ - typedef struct { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; \ - extern kh_##name##_t *kh_init_##name(); \ - extern void kh_destroy_##name(kh_##name##_t *h); \ - extern void kh_clear_##name(kh_##name##_t *h); \ - extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ - extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ - extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ - extern void kh_del_##name(kh_##name##_t *h, khint_t x); - -#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - typedef struct { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; \ - SCOPE kh_##name##_t *kh_init_##name() { \ - return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ - } \ - SCOPE void kh_destroy_##name(kh_##name##_t *h) \ - { \ - if (h) { \ - free(h->keys); free(h->flags); \ - free(h->vals); \ - free(h); \ - } \ - } \ - SCOPE void kh_clear_##name(kh_##name##_t *h) \ - { \ - if (h && h->flags) { \ - memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \ - h->size = h->n_occupied = 0; \ - } \ - } \ - SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ - { \ - if (h->n_buckets) { \ - khint_t inc, k, i, last; \ - k = __hash_func(key); i = k % h->n_buckets; \ - inc = 1 + k % (h->n_buckets - 1); last = i; \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ - else i += inc; \ - if (i == last) return h->n_buckets; \ - } \ - return __ac_iseither(h->flags, i)? h->n_buckets : i; \ - } else return 0; \ - } \ - SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { \ - khint32_t *new_flags = 0; \ - khint_t j = 1; \ - { \ - khint_t t = __ac_HASH_PRIME_SIZE - 1; \ - while (__ac_prime_list[t] > new_n_buckets) --t; \ - new_n_buckets = __ac_prime_list[t+1]; \ - if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ - else { \ - new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ - memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ - if (h->n_buckets < new_n_buckets) { \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) \ - h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ - } \ - } \ - if (j) { \ - for (j = 0; j != h->n_buckets; ++j) { \ - if (__ac_iseither(h->flags, j) == 0) { \ - khkey_t key = h->keys[j]; \ - khval_t val; \ - if (kh_is_map) val = h->vals[j]; \ - __ac_set_isdel_true(h->flags, j); \ - while (1) { \ - khint_t inc, k, i; \ - k = __hash_func(key); \ - i = k % new_n_buckets; \ - inc = 1 + k % (new_n_buckets - 1); \ - while (!__ac_isempty(new_flags, i)) { \ - if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ - else i += inc; \ - } \ - __ac_set_isempty_false(new_flags, i); \ - if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ - { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ - if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isdel_true(h->flags, i); \ - } else { \ - h->keys[i] = key; \ - if (kh_is_map) h->vals[i] = val; \ - break; \ - } \ - } \ - } \ - } \ - if (h->n_buckets > new_n_buckets) { \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) \ - h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ - free(h->flags); \ - h->flags = new_flags; \ - h->n_buckets = new_n_buckets; \ - h->n_occupied = h->size; \ - h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ - } \ - } \ - SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ - { \ - khint_t x; \ - if (h->n_occupied >= h->upper_bound) { \ - if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ - else kh_resize_##name(h, h->n_buckets + 1); \ - } \ - { \ - khint_t inc, k, i, site, last; \ - x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ - if (__ac_isempty(h->flags, i)) x = i; \ - else { \ - inc = 1 + k % (h->n_buckets - 1); last = i; \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if (__ac_isdel(h->flags, i)) site = i; \ - if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ - else i += inc; \ - if (i == last) { x = site; break; } \ - } \ - if (x == h->n_buckets) { \ - if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ - else x = i; \ - } \ - } \ - } \ - if (__ac_isempty(h->flags, x)) { \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; ++h->n_occupied; \ - *ret = 1; \ - } else if (__ac_isdel(h->flags, x)) { \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; \ - *ret = 2; \ - } else *ret = 0; \ - return x; \ - } \ - SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ - { \ - if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ - __ac_set_isdel_true(h->flags, x); \ - --h->size; \ - } \ - } - -#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) - -/* --- BEGIN OF HASH FUNCTIONS --- */ - -/*! @function - @abstract Integer hash function - @param key The integer [khint32_t] - @return The hash value [khint_t] - */ -#define kh_int_hash_func(key) (khint32_t)(key) -/*! @function - @abstract Integer comparison function - */ -#define kh_int_hash_equal(a, b) ((a) == (b)) -/*! @function - @abstract 64-bit integer hash function - @param key The integer [khint64_t] - @return The hash value [khint_t] - */ -#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) -/*! @function - @abstract 64-bit integer comparison function - */ -#define kh_int64_hash_equal(a, b) ((a) == (b)) -/*! @function - @abstract const char* hash function - @param s Pointer to a null terminated string - @return The hash value - */ -static inline khint_t __ac_X31_hash_string(const char *s) -{ - khint_t h = *s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; - return h; -} -/*! @function - @abstract Another interface to const char* hash function - @param key Pointer to a null terminated string [const char*] - @return The hash value [khint_t] - */ -#define kh_str_hash_func(key) __ac_X31_hash_string(key) -/*! @function - @abstract Const char* comparison function - */ -#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) - -/* --- END OF HASH FUNCTIONS --- */ - -/* Other necessary macros... */ - -/*! - @abstract Type of the hash table. - @param name Name of the hash table [symbol] - */ -#define khash_t(name) kh_##name##_t - -/*! @function - @abstract Initiate a hash table. - @param name Name of the hash table [symbol] - @return Pointer to the hash table [khash_t(name)*] - */ -#define kh_init(name) kh_init_##name() - -/*! @function - @abstract Destroy a hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - */ -#define kh_destroy(name, h) kh_destroy_##name(h) - -/*! @function - @abstract Reset a hash table without deallocating memory. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - */ -#define kh_clear(name, h) kh_clear_##name(h) - -/*! @function - @abstract Resize a hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param s New size [khint_t] - */ -#define kh_resize(name, h, s) kh_resize_##name(h, s) - -/*! @function - @abstract Insert a key to the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Key [type of keys] - @param r Extra return code: 0 if the key is present in the hash table; - 1 if the bucket is empty (never used); 2 if the element in - the bucket has been deleted [int*] - @return Iterator to the inserted element [khint_t] - */ -#define kh_put(name, h, k, r) kh_put_##name(h, k, r) - -/*! @function - @abstract Retrieve a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Key [type of keys] - @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] - */ -#define kh_get(name, h, k) kh_get_##name(h, k) - -/*! @function - @abstract Remove a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Iterator to the element to be deleted [khint_t] - */ -#define kh_del(name, h, k) kh_del_##name(h, k) - - -/*! @function - @abstract Test whether a bucket contains data. - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return 1 if containing data; 0 otherwise [int] - */ -#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) - -/*! @function - @abstract Get key given an iterator - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return Key [type of keys] - */ -#define kh_key(h, x) ((h)->keys[x]) - -/*! @function - @abstract Get value given an iterator - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return Value [type of values] - @discussion For hash sets, calling this results in segfault. - */ -#define kh_val(h, x) ((h)->vals[x]) - -/*! @function - @abstract Alias of kh_val() - */ -#define kh_value(h, x) ((h)->vals[x]) - -/*! @function - @abstract Get the start iterator - @param h Pointer to the hash table [khash_t(name)*] - @return The start iterator [khint_t] - */ -#define kh_begin(h) (khint_t)(0) - -/*! @function - @abstract Get the end iterator - @param h Pointer to the hash table [khash_t(name)*] - @return The end iterator [khint_t] - */ -#define kh_end(h) ((h)->n_buckets) - -/*! @function - @abstract Get the number of elements in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @return Number of elements in the hash table [khint_t] - */ -#define kh_size(h) ((h)->size) - -/*! @function - @abstract Get the number of buckets in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @return Number of buckets in the hash table [khint_t] - */ -#define kh_n_buckets(h) ((h)->n_buckets) - -/* More conenient interfaces */ - -/*! @function - @abstract Instantiate a hash set containing integer keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_INT(name) \ - KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing integer keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_INT(name, khval_t) \ - KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing 64-bit integer keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_INT64(name) \ - KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing 64-bit integer keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_INT64(name, khval_t) \ - KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) - -typedef const char *kh_cstr_t; -/*! @function - @abstract Instantiate a hash map containing const char* keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_STR(name) \ - KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing const char* keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_STR(name, khval_t) \ - KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) - -#endif /* __AC_KHASH_H */ diff --git a/samtools/knetfile.c b/samtools/knetfile.c deleted file mode 100755 index 904d92b..0000000 --- a/samtools/knetfile.c +++ /dev/null @@ -1,669 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 by Genome Research Ltd (GRL). - 2010 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Probably I will not do socket programming in the next few years and - therefore I decide to heavily annotate this file, for Linux and - Windows as well. -ac */ - -/* - * Updated 10/22/2013 by Mary Kate Wing - * Upgraded to latest version from htslib: develop branch - * 1) Fix compile warnings - * 2) Add flag to silently fail socket logic -*/ - -#include -#include -#include -#include -#include -#include -#include - -#ifndef _WIN32 -#include -#include -#include -#include -#else -#include -#endif - -#include "knetfile.h" - - -int knetsilent = 0; - - -void knet_silent(int silent) -{ - knetsilent = silent; -} - - -/* In winsock.h, the type of a socket is SOCKET, which is: "typedef - * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed - * integer -1. In knetfile.c, I use "int" for socket type - * throughout. This should be improved to avoid confusion. - * - * In Linux/Mac, recv() and read() do almost the same thing. You can see - * in the header file that netread() is simply an alias of read(). In - * Windows, however, they are different and using recv() is mandatory. - */ - -/* This function tests if the file handler is ready for reading (or - * writing if is_read==0). */ -static int socket_wait(int fd, int is_read) -{ - fd_set fds, *fdr = 0, *fdw = 0; - struct timeval tv; - int ret; - tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out - FD_ZERO(&fds); - FD_SET(fd, &fds); - if (is_read) fdr = &fds; - else fdw = &fds; - ret = select(fd+1, fdr, fdw, 0, &tv); -#ifndef _WIN32 - if (ret == -1) perror("select"); -#else - if (ret == 0) - { - if(!knetsilent) - { - fprintf(stderr, "select time-out\n"); - } - } - else if (ret == SOCKET_ERROR) - { - if(!knetsilent) - { - fprintf(stderr, "select: %d\n", WSAGetLastError()); - } - } -#endif - return ret; -} - -#ifndef _WIN32 -/* This function does not work with Windows due to the lack of - * getaddrinfo() in winsock. It is addapted from an example in "Beej's - * Guide to Network Programming" (http://beej.us/guide/bgnet/). */ -static int socket_connect(const char *host, const char *port) -{ -#define __err_connect(func) do { if(!knetsilent){perror(func);} freeaddrinfo(res); return -1; } while (0) - - int on = 1, fd; - struct linger lng = { 0, 0 }; - struct addrinfo hints, *res = 0; - memset(&hints, 0, sizeof(struct addrinfo)); - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - /* In Unix/Mac, getaddrinfo() is the most convenient way to get - * server information. */ - if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); - if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); - /* The following two setsockopt() are used by ftplib - * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they - * necessary. */ - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); - if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); - if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); - freeaddrinfo(res); - return fd; -} -#else -/* MinGW's printf has problem with "%lld" */ -char *int64tostr(char *buf, int64_t x) -{ - int cnt; - int i = 0; - do { - buf[i++] = '0' + x % 10; - x /= 10; - } while (x); - buf[i] = 0; - for (cnt = i, i = 0; i < cnt/2; ++i) { - int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c; - } - return buf; -} - -int64_t strtoint64(const char *buf) -{ - int64_t x; - for (x = 0; *buf != '\0'; ++buf) - x = x * 10 + ((int64_t) *buf - 48); - return x; -} -/* In windows, the first thing is to establish the TCP connection. */ -int knet_win32_init() -{ - WSADATA wsaData; - return WSAStartup(MAKEWORD(2, 2), &wsaData); -} -void knet_win32_destroy() -{ - WSACleanup(); -} -/* A slightly modfied version of the following function also works on - * Mac (and presummably Linux). However, this function is not stable on - * my Mac. It sometimes works fine but sometimes does not. Therefore for - * non-Windows OS, I do not use this one. */ -static SOCKET socket_connect(const char *host, const char *port) -{ -#define __err_connect(func) \ - do { \ - if(!knetsilent) {fprintf(stderr, "%s: %d\n", func, WSAGetLastError());} \ - return -1; \ - } while (0) - - int on = 1; - SOCKET fd; - struct linger lng = { 0, 0 }; - struct sockaddr_in server; - struct hostent *hp = 0; - // open socket - if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); - if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); - // get host info - if (isalpha(host[0])) hp = gethostbyname(host); - else { - struct in_addr addr; - addr.s_addr = inet_addr(host); - hp = gethostbyaddr((char*)&addr, 4, AF_INET); - } - if (hp == 0) __err_connect("gethost"); - // connect - server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); - server.sin_family= AF_INET; - server.sin_port = htons(atoi(port)); - if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); - // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) - return fd; -} -#endif - -static off_t my_netread(int fd, void *buf, off_t len) -{ - off_t rest = len, curr, l = 0; - /* recv() and read() may not read the required length of data with - * one call. They have to be called repeatedly. */ - while (rest) { - if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading - curr = netread(fd, (void*)((char*)buf + l), rest); - /* According to the glibc manual, section 13.2, a zero returned - * value indicates end-of-file (EOF), which should mean that - * read() will not return zero if EOF has not been met but data - * are not immediately available. */ - if (curr == 0) break; - l += curr; rest -= curr; - } - return l; -} - -/************************* - * FTP specific routines * - *************************/ - -static int kftp_get_response(knetFile *ftp) -{ -#ifndef _WIN32 - unsigned char c; -#else - char c; -#endif - int n = 0; - char *p; - if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; - while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O - //fputc(c, stderr); - if (n >= ftp->max_response) { - ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; - ftp->response = (char*)realloc(ftp->response, ftp->max_response); - } - ftp->response[n++] = c; - if (c == '\n') { - if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2]) - && ftp->response[3] != '-') break; - n = 0; - continue; - } - } - if (n < 2) return -1; - ftp->response[n-2] = 0; - return strtol(ftp->response, &p, 0); -} - -static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) -{ - if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing - if(netwrite(ftp->ctrl_fd, cmd, strlen(cmd)) != strlen(cmd)) - { - - } - return is_get? kftp_get_response(ftp) : 0; -} - -static int kftp_pasv_prep(knetFile *ftp) -{ - char *p; - int v[6]; - kftp_send_cmd(ftp, "PASV\r\n", 1); - for (p = ftp->response; *p && *p != '('; ++p); - if (*p != '(') return -1; - ++p; - sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); - memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); - ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; - return 0; -} - - -static int kftp_pasv_connect(knetFile *ftp) -{ - char host[80], port[10]; - if (ftp->pasv_port == 0) { - if(!knetsilent) - { - fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n"); - } - return -1; - } - sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); - sprintf(port, "%d", ftp->pasv_port); - ftp->fd = socket_connect(host, port); - if (ftp->fd == -1) return -1; - return 0; -} - -int kftp_connect(knetFile *ftp) -{ - ftp->ctrl_fd = socket_connect(ftp->host, ftp->port); - if (ftp->ctrl_fd == -1) return -1; - kftp_get_response(ftp); - kftp_send_cmd(ftp, "USER anonymous\r\n", 1); - kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); - kftp_send_cmd(ftp, "TYPE I\r\n", 1); - return 0; -} - -int kftp_reconnect(knetFile *ftp) -{ - if (ftp->ctrl_fd != -1) { - netclose(ftp->ctrl_fd); - ftp->ctrl_fd = -1; - } - netclose(ftp->fd); - ftp->fd = -1; - return kftp_connect(ftp); -} - -// initialize ->type, ->host, ->retr and ->size -knetFile *kftp_parse_url(const char *fn, const char *mode) -{ - knetFile *fp; - char *p; - int l; - if (strstr(fn, "ftp://") != fn) return 0; - for (p = (char*)fn + 6; *p && *p != '/'; ++p); - if (*p != '/') return 0; - l = p - fn - 6; - fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_FTP; - fp->fd = -1; - /* the Linux/Mac version of socket_connect() also recognizes a port - * like "ftp", but the Windows version does not. */ - fp->port = strdup("21"); - fp->host = (char*)calloc(l + 1, 1); - if (strchr(mode, 'c')) fp->no_reconnect = 1; - strncpy(fp->host, fn + 6, l); - fp->retr = (char*)calloc(strlen(p) + 8, 1); - sprintf(fp->retr, "RETR %s\r\n", p); - fp->size_cmd = (char*)calloc(strlen(p) + 8, 1); - sprintf(fp->size_cmd, "SIZE %s\r\n", p); - fp->seek_offset = 0; - return fp; -} -// place ->fd at offset off -int kftp_connect_file(knetFile *fp) -{ - int ret; - long long file_size; - if (fp->fd != -1) { - netclose(fp->fd); - if (fp->no_reconnect) kftp_get_response(fp); - } - kftp_pasv_prep(fp); - kftp_send_cmd(fp, fp->size_cmd, 1); - if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) - { - if(!knetsilent) - { - fprintf(stderr,"[kftp_connect_file] %s\n", fp->response); - } - return -1; - } - fp->file_size = file_size; - if (fp->offset>=0) { - char tmp[32]; -#ifndef _WIN32 - sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); -#else - strcpy(tmp, "REST "); - int64tostr(tmp + 5, fp->offset); - strcat(tmp, "\r\n"); -#endif - kftp_send_cmd(fp, tmp, 1); - } - kftp_send_cmd(fp, fp->retr, 0); - kftp_pasv_connect(fp); - ret = kftp_get_response(fp); - if (ret != 150) { - if(!knetsilent) - { - fprintf(stderr, "[kftp_connect_file] %s\n", fp->response); - } - netclose(fp->fd); - fp->fd = -1; - return -1; - } - fp->is_ready = 1; - return 0; -} - - -/************************** - * HTTP specific routines * - **************************/ - -knetFile *khttp_parse_url(const char *fn, const char *mode) -{ - knetFile *fp; - char *p, *proxy, *q; - int l; - if (strstr(fn, "http://") != fn) return 0; - // set ->http_host - for (p = (char*)fn + 7; *p && *p != '/'; ++p); - l = p - fn - 7; - fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->http_host = (char*)calloc(l + 1, 1); - strncpy(fp->http_host, fn + 7, l); - fp->http_host[l] = 0; - for (q = fp->http_host; *q && *q != ':'; ++q); - if (*q == ':') *q++ = 0; - // get http_proxy - proxy = getenv("http_proxy"); - // set ->host, ->port and ->path - if (proxy == 0) { - fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name. - fp->port = strdup(*q? q : "80"); - fp->path = strdup(*p? p : "/"); - } else { - fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); - for (q = fp->host; *q && *q != ':'; ++q); - if (*q == ':') *q++ = 0; - fp->port = strdup(*q? q : "80"); - fp->path = strdup(fn); - } - fp->type = KNF_TYPE_HTTP; - fp->ctrl_fd = fp->fd = -1; - fp->seek_offset = 0; - return fp; -} - -int khttp_connect_file(knetFile *fp) -{ - int ret, l = 0; - char *buf, *p; - if (fp->fd != -1) netclose(fp->fd); - fp->fd = socket_connect(fp->host, fp->port); - buf = (char*)calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. - l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); - l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); - l += sprintf(buf + l, "\r\n"); - if(netwrite(fp->fd, buf, l) != l) - { - } - l = 0; - while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency - if (buf[l] == '\n' && l >= 3) - if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; - ++l; - } - buf[l] = 0; - if (l < 14) { // prematured header - netclose(fp->fd); - fp->fd = -1; - return -1; - } - ret = strtol(buf + 8, &p, 0); // HTTP return code - if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file - off_t rest = fp->offset; - while (rest) { - off_t l = rest < 0x10000? rest : 0x10000; - rest -= my_netread(fp->fd, buf, l); - } - } else if (ret != 206 && ret != 200) { - free(buf); - if(!knetsilent) - { - fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret); - } - netclose(fp->fd); - fp->fd = -1; - return -1; - } - free(buf); - fp->is_ready = 1; - return 0; -} - -/******************** - * Generic routines * - ********************/ - -knetFile *knet_open(const char *fn, const char *mode) -{ - knetFile *fp = 0; - if (mode[0] != 'r') { - if(!knetsilent) - { - fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); - } - return 0; - } - if (strstr(fn, "ftp://") == fn) { - fp = kftp_parse_url(fn, mode); - if (fp == 0) return 0; - if (kftp_connect(fp) == -1) { - knet_close(fp); - return 0; - } - kftp_connect_file(fp); - } else if (strstr(fn, "http://") == fn) { - fp = khttp_parse_url(fn, mode); - if (fp == 0) return 0; - khttp_connect_file(fp); - } else { // local file -#ifdef _WIN32 - /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may - * be undefined on some systems, although it is defined on my - * Mac and the Linux I have tested on. */ - int fd = open(fn, O_RDONLY | O_BINARY); -#else - int fd = open(fn, O_RDONLY); -#endif - if (fd == -1) { - perror("open"); - return 0; - } - fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_LOCAL; - fp->fd = fd; - fp->ctrl_fd = -1; - } - if (fp && fp->fd == -1) { - knet_close(fp); - return 0; - } - return fp; -} - -knetFile *knet_dopen(int fd, const char *mode) -{ - knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_LOCAL; - fp->fd = fd; - return fp; -} - -ssize_t knet_read(knetFile *fp, void *buf, size_t len) -{ - off_t l = 0; - if (fp->fd == -1) return 0; - if (fp->type == KNF_TYPE_FTP) { - if (fp->is_ready == 0) { - if (!fp->no_reconnect) kftp_reconnect(fp); - kftp_connect_file(fp); - } - } else if (fp->type == KNF_TYPE_HTTP) { - if (fp->is_ready == 0) - khttp_connect_file(fp); - } - if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX - size_t rest = len; - ssize_t curr; - while (rest) { - do { - curr = read(fp->fd, (void*)((char*)buf + l), rest); - } while (curr < 0 && EINTR == errno); - if (curr < 0) return -1; - if (curr == 0) break; - l += curr; rest -= curr; - } - } else l = my_netread(fp->fd, buf, len); - fp->offset += l; - return l; -} - -off_t knet_seek(knetFile *fp, off_t off, int whence) -{ - if (whence == SEEK_SET && off == fp->offset) return 0; - if (fp->type == KNF_TYPE_LOCAL) { - /* Be aware that lseek() returns the offset after seeking, while fseek() returns zero on success. */ - off_t offset = lseek(fp->fd, off, whence); - if (offset == -1) return -1; - fp->offset = offset; - return 0; - } else if (fp->type == KNF_TYPE_FTP) { - if (whence == SEEK_CUR) fp->offset += off; - else if (whence == SEEK_SET) fp->offset = off; - else if (whence == SEEK_END) fp->offset = fp->file_size + off; - else return -1; - fp->is_ready = 0; - return 0; - } else if (fp->type == KNF_TYPE_HTTP) { - if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? - if(!knetsilent) - { - fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n"); - } - errno = ESPIPE; - return -1; - } - if (whence == SEEK_CUR) fp->offset += off; - else if (whence == SEEK_SET) fp->offset = off; - else return -1; - fp->is_ready = 0; - return 0; - } - errno = EINVAL; - if(!knetsilent) - { - fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); - } - return -1; -} - -int knet_close(knetFile *fp) -{ - if (fp == 0) return 0; - if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific - if (fp->fd != -1) { - /* On Linux/Mac, netclose() is an alias of close(), but on - * Windows, it is an alias of closesocket(). */ - if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); - else netclose(fp->fd); - } - free(fp->host); free(fp->port); - free(fp->response); free(fp->retr); // FTP specific - free(fp->path); free(fp->http_host); // HTTP specific - free(fp); - return 0; -} - -#ifdef KNETFILE_MAIN -int main(void) -{ - char *buf; - knetFile *fp; - int type = 4, l; -#ifdef _WIN32 - knet_win32_init(); -#endif - buf = calloc(0x100000, 1); - if (type == 0) { - fp = knet_open("knetfile.c", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 1) { // NCBI FTP, large file - fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); - knet_seek(fp, 2500000000ll, SEEK_SET); - l = knet_read(fp, buf, 255); - } else if (type == 2) { - fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 3) { - fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 4) { - fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); - knet_read(fp, buf, 10000); - knet_seek(fp, 20000, SEEK_SET); - knet_seek(fp, 10000, SEEK_SET); - l = knet_read(fp, buf+10000, 10000000) + 10000; - } - if (type != 4 && type != 1) { - knet_read(fp, buf, 255); - buf[255] = 0; - printf("%s\n", buf); - } else write(fileno(stdout), buf, l); - knet_close(fp); - free(buf); - return 0; -} -#endif diff --git a/samtools/knetfile.h b/samtools/knetfile.h deleted file mode 100644 index 7e23a6a..0000000 --- a/samtools/knetfile.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef KNETFILE_H -#define KNETFILE_H - -#include -#include - -#ifndef _WIN32 -#define netread(fd, ptr, len) read(fd, ptr, len) -#define netwrite(fd, ptr, len) write(fd, ptr, len) -#define netclose(fd) close(fd) -#else -#include -#define netread(fd, ptr, len) recv(fd, ptr, len, 0) -#define netwrite(fd, ptr, len) send(fd, ptr, len, 0) -#define netclose(fd) closesocket(fd) -#endif - -// FIXME: currently I/O is unbuffered - -#define KNF_TYPE_LOCAL 1 -#define KNF_TYPE_FTP 2 -#define KNF_TYPE_HTTP 3 - -typedef struct knetFile_s { - int type, fd; - int64_t offset; - char *host, *port; - - // the following are for FTP only - int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; - char *response, *retr, *size_cmd; - int64_t seek_offset; // for lazy seek - int64_t file_size; - - // the following are for HTTP only - char *path, *http_host; -} knetFile; - -#define knet_tell(fp) ((fp)->offset) -#define knet_fileno(fp) ((fp)->fd) - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _WIN32 - int knet_win32_init(); - void knet_win32_destroy(); -#endif - - // Pass in non-zero to make knetfile silent (no messages), pass in - // 0 to keep any messages (default is 0). - void knet_silent(int silent); - - knetFile *knet_open(const char *fn, const char *mode); - - /* - This only works with local files. - */ - knetFile *knet_dopen(int fd, const char *mode); - - /* - If ->is_ready==0, this routine updates ->fd; otherwise, it simply - reads from ->fd. - */ - ssize_t knet_read(knetFile *fp, void *buf, size_t len); - - /* - This routine only sets ->offset and ->is_ready=0. It does not - communicate with the FTP server. - */ - off_t knet_seek(knetFile *fp, off_t off, int whence); - int knet_close(knetFile *fp); - -#ifdef __cplusplus -} -#endif - -#endif From acf0b6d9c4955e6007c608ccfe01dbb81eb07531 Mon Sep 17 00:00:00 2001 From: David Heise Date: Thu, 27 Sep 2018 16:04:27 -0400 Subject: [PATCH 04/15] Use system paths for samtools and htslib. --- Makefiles/Makefile.include | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Makefiles/Makefile.include b/Makefiles/Makefile.include index 87b107f..2ebe870 100644 --- a/Makefiles/Makefile.include +++ b/Makefiles/Makefile.include @@ -42,6 +42,8 @@ OPTFLAG_PROFILE?=-pg CURRENT_DIR_INCLUDE?=-I. ZLIB_AVAIL ?= 1 +HTSLIB_AVAIL ?= 1 +SAMTOOLS_AVAIL ?= 1 USE_ZLIB = -D__ZLIB_AVAILABLE__ ZLIB_LIB = -lz @@ -50,6 +52,13 @@ ifeq ($(ZLIB_AVAIL), 0) ZLIB_LIB = endif + +HTSLIB_LIB = -lhts +HTS_INCLUDE= -I/usr/local/include/htslib +SAMTOOLS_LIB = -lbam -lst +SAMTOOLS_INCLUDE= -I/usr/local/include/samtools + + KNET_ON ?= 0 USE_KNET ?= @@ -69,7 +78,7 @@ REQ_SETTINGS = CFLAGS ?= $(OPTFLAG) -pipe -Wall -COMPFLAGS = $(CFLAGS) $(USER_WARNINGS) -I$(INCLUDE_PATH) $(CURRENT_DIR_INCLUDE) $(USER_INCLUDES) $(USE_KNET) $(USE_ZLIB) -D_FILE_OFFSET_BITS=64 -D__STDC_LIMIT_MACROS $(USER_COMPILE_VARS) +COMPFLAGS = $(CFLAGS) $(USER_WARNINGS) -I$(INCLUDE_PATH) $(CURRENT_DIR_INCLUDE) $(USER_INCLUDES) $(SAMTOOLS_INCLUDE) $(HTS_INCLUDE) $(USE_KNET) $(USE_ZLIB) -D_FILE_OFFSET_BITS=64 -D__STDC_LIMIT_MACROS $(USER_COMPILE_VARS) # default installation directory -INSTALLDIR?=/usr/local/bin +INSTALLDIR?=/usr/local/lib From fb5fe987122f67b193ca59dd6df3deb94f9d60f6 Mon Sep 17 00:00:00 2001 From: David Heise Date: Thu, 27 Sep 2018 16:07:55 -0400 Subject: [PATCH 05/15] Remove dependency on local samtools and rely on system locations. --- bam/Makefile.depends | 12 ++++++------ bam/SamRecord.cpp | 2 +- general/BgzfFileType.h | 2 +- general/Makefile.depends | 17 ++++++++--------- general/PhoneHome.cpp | 2 +- 5 files changed, 17 insertions(+), 18 deletions(-) diff --git a/bam/Makefile.depends b/bam/Makefile.depends index 7a19187..e1adc46 100644 --- a/bam/Makefile.depends +++ b/bam/Makefile.depends @@ -26,7 +26,7 @@ $(OBJDIR_OPT)/SamFile.o: ../include/MathVector.h ../include/CigarRoller.h $(OBJDIR_OPT)/SamFile.o: ../include/Cigar.h GenericSamInterface.h BamIndex.h $(OBJDIR_OPT)/SamFile.o: ../include/IndexBase.h SamStatistics.h $(OBJDIR_OPT)/SamFile.o: BamInterface.h SamInterface.h -$(OBJDIR_OPT)/SamFile.o: ../include/BgzfFileType.h ../include/bgzf.h +$(OBJDIR_OPT)/SamFile.o: ../include/BgzfFileType.h $(OBJDIR_OPT)/GenericSamInterface.o: GenericSamInterface.h SamStatus.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/StatGenStatus.h $(OBJDIR_OPT)/GenericSamInterface.o: ../include/ErrorHandler.h @@ -91,7 +91,7 @@ $(OBJDIR_OPT)/BamInterface.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_OPT)/BamInterface.o: ../include/MathVector.h $(OBJDIR_OPT)/BamInterface.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_OPT)/BamInterface.o: ../include/CharBuffer.h -$(OBJDIR_OPT)/SamRecord.o: ../include/bam.h SamRecord.h +$(OBJDIR_OPT)/SamRecord.o: SamRecord.h $(OBJDIR_OPT)/SamRecord.o: ../include/GenomeSequence.h $(OBJDIR_OPT)/SamRecord.o: ../include/MemoryMapArray.h ../include/Generic.h $(OBJDIR_OPT)/SamRecord.o: ../include/MemoryMap.h ../include/BaseAsciiMap.h @@ -410,7 +410,7 @@ $(OBJDIR_DEBUG)/SamFile.o: ../include/MathVector.h ../include/CigarRoller.h $(OBJDIR_DEBUG)/SamFile.o: ../include/Cigar.h GenericSamInterface.h $(OBJDIR_DEBUG)/SamFile.o: BamIndex.h ../include/IndexBase.h SamStatistics.h $(OBJDIR_DEBUG)/SamFile.o: BamInterface.h SamInterface.h -$(OBJDIR_DEBUG)/SamFile.o: ../include/BgzfFileType.h ../include/bgzf.h +$(OBJDIR_DEBUG)/SamFile.o: ../include/BgzfFileType.h $(OBJDIR_DEBUG)/GenericSamInterface.o: GenericSamInterface.h SamStatus.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/StatGenStatus.h $(OBJDIR_DEBUG)/GenericSamInterface.o: ../include/ErrorHandler.h @@ -479,7 +479,7 @@ $(OBJDIR_DEBUG)/BamInterface.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/MathVector.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_DEBUG)/BamInterface.o: ../include/CharBuffer.h -$(OBJDIR_DEBUG)/SamRecord.o: ../include/bam.h SamRecord.h +$(OBJDIR_DEBUG)/SamRecord.o: SamRecord.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/GenomeSequence.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/MemoryMapArray.h ../include/Generic.h $(OBJDIR_DEBUG)/SamRecord.o: ../include/MemoryMap.h ../include/BaseAsciiMap.h @@ -813,7 +813,7 @@ $(OBJDIR_PROFILE)/SamFile.o: ../include/MathVector.h ../include/CigarRoller.h $(OBJDIR_PROFILE)/SamFile.o: ../include/Cigar.h GenericSamInterface.h $(OBJDIR_PROFILE)/SamFile.o: BamIndex.h ../include/IndexBase.h $(OBJDIR_PROFILE)/SamFile.o: SamStatistics.h BamInterface.h SamInterface.h -$(OBJDIR_PROFILE)/SamFile.o: ../include/BgzfFileType.h ../include/bgzf.h +$(OBJDIR_PROFILE)/SamFile.o: ../include/BgzfFileType.h $(OBJDIR_PROFILE)/GenericSamInterface.o: GenericSamInterface.h SamStatus.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/StatGenStatus.h $(OBJDIR_PROFILE)/GenericSamInterface.o: ../include/ErrorHandler.h @@ -882,7 +882,7 @@ $(OBJDIR_PROFILE)/BamInterface.o: ../include/LongHash.h ../include/Error.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/MathVector.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/CigarRoller.h ../include/Cigar.h $(OBJDIR_PROFILE)/BamInterface.o: ../include/CharBuffer.h -$(OBJDIR_PROFILE)/SamRecord.o: ../include/bam.h SamRecord.h +$(OBJDIR_PROFILE)/SamRecord.o: SamRecord.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/GenomeSequence.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/MemoryMapArray.h $(OBJDIR_PROFILE)/SamRecord.o: ../include/Generic.h ../include/MemoryMap.h diff --git a/bam/SamRecord.cpp b/bam/SamRecord.cpp index dea1ce8..531c090 100644 --- a/bam/SamRecord.cpp +++ b/bam/SamRecord.cpp @@ -19,7 +19,7 @@ #include #include -#include "bam.h" +#include #include "SamRecord.h" #include "SamValidation.h" diff --git a/general/BgzfFileType.h b/general/BgzfFileType.h index b7ee28e..10bfb38 100644 --- a/general/BgzfFileType.h +++ b/general/BgzfFileType.h @@ -21,7 +21,7 @@ #ifdef __ZLIB_AVAILABLE__ #include // stdexcept header file -#include "bgzf.h" +#include #include "FileType.h" class BgzfFileType : public FileType diff --git a/general/Makefile.depends b/general/Makefile.depends index d1cac72..2a2c4b1 100644 --- a/general/Makefile.depends +++ b/general/Makefile.depends @@ -6,7 +6,7 @@ $(OBJDIR_OPT)/BaseQualityHelper.o: BaseQualityHelper.h $(OBJDIR_OPT)/BaseUtilities.o: BaseUtilities.h BaseAsciiMap.h StringBasics.h $(OBJDIR_OPT)/BaseUtilities.o: InputFile.h FileType.h $(OBJDIR_OPT)/BasicHash.o: BasicHash.h Error.h -$(OBJDIR_OPT)/BgzfFileType.o: BgzfFileType.h ../include/bgzf.h FileType.h +$(OBJDIR_OPT)/BgzfFileType.o: BgzfFileType.h FileType.h $(OBJDIR_OPT)/BgzfFileTypeRecovery.o: BgzfFileTypeRecovery.h FileType.h $(OBJDIR_OPT)/CharBuffer.o: CharBuffer.h InputFile.h FileType.h $(OBJDIR_OPT)/Chromosome.o: Chromosome.h GenomeSequence.h MemoryMapArray.h @@ -44,7 +44,7 @@ $(OBJDIR_OPT)/IndexBase.o: IndexBase.h InputFile.h FileType.h StatGenStatus.h $(OBJDIR_OPT)/IndexBase.o: ErrorHandler.h $(OBJDIR_OPT)/Input.o: Input.h Error.h Constant.h $(OBJDIR_OPT)/InputFile.o: InputFile.h FileType.h StringBasics.h GzipHeader.h -$(OBJDIR_OPT)/InputFile.o: BgzfFileType.h ../include/bgzf.h +$(OBJDIR_OPT)/InputFile.o: BgzfFileType.h $(OBJDIR_OPT)/InputFile.o: BgzfFileTypeRecovery.h GzipFileType.h $(OBJDIR_OPT)/InputFile.o: UncompressedFileType.h $(OBJDIR_OPT)/IntArray.o: IntArray.h Error.h Hash.h Sort.h Constant.h @@ -105,7 +105,6 @@ $(OBJDIR_OPT)/PedigreePerson.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_OPT)/PedigreePerson.o: StringHash.h Hash.h IntArray.h MathVector.h $(OBJDIR_OPT)/PedigreePerson.o: Error.h $(OBJDIR_OPT)/PhoneHome.o: PhoneHome.h StringBasics.h InputFile.h FileType.h -$(OBJDIR_OPT)/PhoneHome.o: ../include/knetfile.h $(OBJDIR_OPT)/QuickIndex.o: QuickIndex.h MathVector.h StringBasics.h $(OBJDIR_OPT)/QuickIndex.o: InputFile.h FileType.h StringArray.h StringHash.h $(OBJDIR_OPT)/QuickIndex.o: Constant.h Hash.h IntArray.h StringMap.h Error.h @@ -159,7 +158,7 @@ $(OBJDIR_DEBUG)/BaseQualityHelper.o: BaseQualityHelper.h $(OBJDIR_DEBUG)/BaseUtilities.o: BaseUtilities.h BaseAsciiMap.h $(OBJDIR_DEBUG)/BaseUtilities.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_DEBUG)/BasicHash.o: BasicHash.h Error.h -$(OBJDIR_DEBUG)/BgzfFileType.o: BgzfFileType.h ../include/bgzf.h FileType.h +$(OBJDIR_DEBUG)/BgzfFileType.o: BgzfFileType.h FileType.h $(OBJDIR_DEBUG)/BgzfFileTypeRecovery.o: BgzfFileTypeRecovery.h FileType.h $(OBJDIR_DEBUG)/CharBuffer.o: CharBuffer.h InputFile.h FileType.h $(OBJDIR_DEBUG)/Chromosome.o: Chromosome.h GenomeSequence.h MemoryMapArray.h @@ -198,7 +197,7 @@ $(OBJDIR_DEBUG)/IndexBase.o: IndexBase.h InputFile.h FileType.h $(OBJDIR_DEBUG)/IndexBase.o: StatGenStatus.h ErrorHandler.h $(OBJDIR_DEBUG)/Input.o: Input.h Error.h Constant.h $(OBJDIR_DEBUG)/InputFile.o: InputFile.h FileType.h StringBasics.h -$(OBJDIR_DEBUG)/InputFile.o: GzipHeader.h BgzfFileType.h ../include/bgzf.h +$(OBJDIR_DEBUG)/InputFile.o: GzipHeader.h BgzfFileType.h $(OBJDIR_DEBUG)/InputFile.o: BgzfFileTypeRecovery.h GzipFileType.h $(OBJDIR_DEBUG)/InputFile.o: UncompressedFileType.h $(OBJDIR_DEBUG)/IntArray.o: IntArray.h Error.h Hash.h Sort.h Constant.h @@ -262,7 +261,7 @@ $(OBJDIR_DEBUG)/PedigreePerson.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_DEBUG)/PedigreePerson.o: StringHash.h Hash.h IntArray.h MathVector.h $(OBJDIR_DEBUG)/PedigreePerson.o: Error.h $(OBJDIR_DEBUG)/PhoneHome.o: PhoneHome.h StringBasics.h InputFile.h -$(OBJDIR_DEBUG)/PhoneHome.o: FileType.h ../include/knetfile.h +$(OBJDIR_DEBUG)/PhoneHome.o: FileType.h $(OBJDIR_DEBUG)/QuickIndex.o: QuickIndex.h MathVector.h StringBasics.h $(OBJDIR_DEBUG)/QuickIndex.o: InputFile.h FileType.h StringArray.h $(OBJDIR_DEBUG)/QuickIndex.o: StringHash.h Constant.h Hash.h IntArray.h @@ -319,7 +318,7 @@ $(OBJDIR_PROFILE)/BaseQualityHelper.o: BaseQualityHelper.h $(OBJDIR_PROFILE)/BaseUtilities.o: BaseUtilities.h BaseAsciiMap.h $(OBJDIR_PROFILE)/BaseUtilities.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_PROFILE)/BasicHash.o: BasicHash.h Error.h -$(OBJDIR_PROFILE)/BgzfFileType.o: BgzfFileType.h ../include/bgzf.h FileType.h +$(OBJDIR_PROFILE)/BgzfFileType.o: BgzfFileType.h FileType.h $(OBJDIR_PROFILE)/BgzfFileTypeRecovery.o: BgzfFileTypeRecovery.h FileType.h $(OBJDIR_PROFILE)/CharBuffer.o: CharBuffer.h InputFile.h FileType.h $(OBJDIR_PROFILE)/Chromosome.o: Chromosome.h GenomeSequence.h @@ -359,7 +358,7 @@ $(OBJDIR_PROFILE)/IndexBase.o: IndexBase.h InputFile.h FileType.h $(OBJDIR_PROFILE)/IndexBase.o: StatGenStatus.h ErrorHandler.h $(OBJDIR_PROFILE)/Input.o: Input.h Error.h Constant.h $(OBJDIR_PROFILE)/InputFile.o: InputFile.h FileType.h StringBasics.h -$(OBJDIR_PROFILE)/InputFile.o: GzipHeader.h BgzfFileType.h ../include/bgzf.h +$(OBJDIR_PROFILE)/InputFile.o: GzipHeader.h BgzfFileType.h $(OBJDIR_PROFILE)/InputFile.o: BgzfFileTypeRecovery.h GzipFileType.h $(OBJDIR_PROFILE)/InputFile.o: UncompressedFileType.h $(OBJDIR_PROFILE)/IntArray.o: IntArray.h Error.h Hash.h Sort.h Constant.h @@ -426,7 +425,7 @@ $(OBJDIR_PROFILE)/PedigreePerson.o: StringBasics.h InputFile.h FileType.h $(OBJDIR_PROFILE)/PedigreePerson.o: StringHash.h Hash.h IntArray.h $(OBJDIR_PROFILE)/PedigreePerson.o: MathVector.h Error.h $(OBJDIR_PROFILE)/PhoneHome.o: PhoneHome.h StringBasics.h InputFile.h -$(OBJDIR_PROFILE)/PhoneHome.o: FileType.h ../include/knetfile.h +$(OBJDIR_PROFILE)/PhoneHome.o: FileType.h $(OBJDIR_PROFILE)/QuickIndex.o: QuickIndex.h MathVector.h StringBasics.h $(OBJDIR_PROFILE)/QuickIndex.o: InputFile.h FileType.h StringArray.h $(OBJDIR_PROFILE)/QuickIndex.o: StringHash.h Constant.h Hash.h IntArray.h diff --git a/general/PhoneHome.cpp b/general/PhoneHome.cpp index df0c472..86e53bd 100644 --- a/general/PhoneHome.cpp +++ b/general/PhoneHome.cpp @@ -16,7 +16,7 @@ */ #include "PhoneHome.h" -#include "knetfile.h" +#include #include #include From 29349d77c6602a522c8ce5ba450bdeda3b750559 Mon Sep 17 00:00:00 2001 From: David Heise Date: Thu, 27 Sep 2018 16:10:47 -0400 Subject: [PATCH 06/15] Note dependency on samtools and htslib 1.9 --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e949d6e..ba04122 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,12 @@ On debian type systems (including Ubuntu), add the following packages if they ar sudo apt-get install g++ libssl-dev zlib1g-dev +The following libraries are dependencies that need to be installed on the build system. + +[htslib 1.9](https://github.com/samtools/htslib/tree/1.9) + +[samtools 1.9](https://github.com/samtools/samtools/tree/1.9) + Building -------- @@ -23,7 +29,6 @@ Under the main statgen repository, there are: - `glf` - library code for operating on glf files. - `include` - after compiling, the library headers are linked here - `Makefiles` - directory containing Makefiles that are used in the library and can be used for developing programs using the library -- `samtools` - library code used from samtools After Compiling: `libStatGen.a`, `libStatGen_debug.a`, and `libStatGen_profile.a` are created at the top level. From 16aa67652279b0380b7d68249a6dbcae001f66f1 Mon Sep 17 00:00:00 2001 From: David Heise Date: Mon, 1 Oct 2018 13:28:55 -0400 Subject: [PATCH 07/15] Fix a segfault on gzip files. --- general/BgzfFileType.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/general/BgzfFileType.cpp b/general/BgzfFileType.cpp index e9f3b52..9a82b0f 100644 --- a/general/BgzfFileType.cpp +++ b/general/BgzfFileType.cpp @@ -29,15 +29,21 @@ BgzfFileType::BgzfFileType(const char * filename, const char * mode) { BgzfFileType::numThreads = 1; char threadSpec[8]; - char bgzfMode[4]; + char bgzfMode[8]; const char* thread_start = strchr(mode, '@'); - int mode_len = thread_start-mode; - if(mode_len < 4){ + int mode_len = 0; + if (thread_start != NULL){ + mode_len = thread_start-mode; + } else { + mode_len = strnlen(mode, 7); + } + printf("%s", mode); + if(mode_len < 8){ strncpy(bgzfMode, mode, mode_len); bgzfMode[mode_len] = '\0'; } else { - strncpy(bgzfMode, mode, 3); - bgzfMode[3] = '\0'; + strncpy(bgzfMode, mode, 7); + bgzfMode[7] = '\0'; } if (thread_start != NULL && thread_start+1 != '\0'){ // Advance past the @, then parse the number of threads From f5790d4f364f1bcab4cf65c665f1ccfe7426d516 Mon Sep 17 00:00:00 2001 From: David Heise Date: Thu, 4 Oct 2018 10:00:50 -0400 Subject: [PATCH 08/15] Add a write buffer to speed up small writes. --- general/InputFile.cpp | 15 +++-- general/InputFile.h | 128 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 120 insertions(+), 23 deletions(-) diff --git a/general/InputFile.cpp b/general/InputFile.cpp index 71537fc..4f5defd 100644 --- a/general/InputFile.cpp +++ b/general/InputFile.cpp @@ -32,9 +32,12 @@ InputFile::InputFile(const char * filename, const char * mode, myAttemptRecovery = false; myFileTypePtr = NULL; myBufferIndex = 0; + myWriteIndex = 0; myCurrentBufferSize = 0; myAllocatedBufferSize = DEFAULT_BUFFER_SIZE; myFileBuffer = new char[myAllocatedBufferSize]; + myWriteBuffer = new char[myAllocatedBufferSize]; + memset(myWriteBuffer, '\0', myAllocatedBufferSize); myFileName.clear(); openFile(filename, mode, compressionMode); @@ -55,13 +58,13 @@ int InputFile::readTilChar(const std::string& stopChars, std::string& stringRef) { return(-1); } - + // Try to find the character in the stopChars. pos = stopChars.find(charRead); if(pos == std::string::npos) { - // Didn't find a stop character and it is not an EOF, + // Didn't find a stop character and it is not an EOF, // so add it to the string. stringRef += charRead; } @@ -84,7 +87,7 @@ int InputFile::readTilChar(const std::string& stopChars) { return(-1); } - + // Try to find the character in the stopChars. pos = stopChars.find(charRead); } @@ -163,7 +166,7 @@ bool InputFile::openFile(const char * filename, const char * mode, { // // if recovering, we don't want to issue big readaheads, since - // that interferes with the decompression - we only want to + // that interferes with the decompression - we only want to // decompress one at a time, and handle the exceptions immediately // rather than at some indeterminate point in time. // @@ -180,7 +183,7 @@ bool InputFile::openFile(const char * filename, const char * mode, // Check if reading from stdin. if((strcmp(filename, "-") == 0) || (strcmp(filename, "-.gz") == 0)) { - // Reading from stdin, open it based on the + // Reading from stdin, open it based on the // compression mode. openFileUsingMode(filename, mode, compressionMode); } @@ -219,7 +222,7 @@ bool InputFile::openFile(const char * filename, const char * mode, // Read the file to see if it a gzip file. GzipHeader gzipHeader; bool isGzip = gzipHeader.readHeader(file); - + // The file header has been read, so close the file, so it can // be re-opened as the correct type. file.close(); diff --git a/general/InputFile.h b/general/InputFile.h index 1f242bb..2c71772 100644 --- a/general/InputFile.h +++ b/general/InputFile.h @@ -14,7 +14,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -/*! \file */ +/*! \file */ #ifndef __INPUTFILE_H__ #define __INPUTFILE_H__ @@ -54,10 +54,13 @@ class InputFile myAttemptRecovery = false; myFileTypePtr = NULL; myBufferIndex = 0; + myWriteIndex = 0; myCurrentBufferSize = 0; // Default to buffer. myAllocatedBufferSize = DEFAULT_BUFFER_SIZE; myFileBuffer = new char[myAllocatedBufferSize]; + myWriteBuffer = new char[myAllocatedBufferSize]; + memset(myWriteBuffer, '\0', myAllocatedBufferSize); myFileName.clear(); } @@ -68,7 +71,7 @@ class InputFile /// \param filename file to open /// \param mode same format as fopen: "r" for read & "w" for write. /// \param compressionMode set the type of file to open for writing or - /// for reading from stdin (when reading files, the compression type is + /// for reading from stdin (when reading files, the compression type is /// determined by reading the file). InputFile(const char * filename, const char * mode, InputFile::ifileCompression compressionMode = InputFile::DEFAULT); @@ -127,7 +130,7 @@ class InputFile } } - + /// Close the file. /// \return status of the close (0 is success). inline int ifclose() @@ -136,8 +139,10 @@ class InputFile { return EOF; } + ifflush(); int result = myFileTypePtr->close(); delete myFileTypePtr; + myWriteBuffer = NULL; myFileTypePtr = NULL; myFileName.clear(); return result; @@ -228,13 +233,13 @@ class InputFile } // Now copy the rest of the bytes into the buffer. - memcpy((char*)buffer+availableBytes, + memcpy((char*)buffer+availableBytes, myFileBuffer, copySize); // set the buffer index to the location after what we are // returning as read. myBufferIndex = copySize; - + returnSize += copySize; } } @@ -301,7 +306,7 @@ class InputFile /// Read, appending the characters into the specified string until new /// line or EOF is found, returning -1 if EOF is found first and 0 if new - /// line is found first. The new line and EOF are not written into the + /// line is found first. The new line and EOF are not written into the /// specified string. /// \param line reference to a string that the read characters should /// be apppended to (does not include the new line or eof). @@ -415,7 +420,92 @@ class InputFile // No myFileTypePtr, so return 0 - nothing written. return 0; } - return myFileTypePtr->write(buffer, size); + if(myAllocatedBufferSize == 0){ + return myFileTypePtr->write(buffer, size); + } + //return myFileTypePtr->write(buffer, size); + // There are 2 cases: + // 1) There are already size available bytes in buffer. + // 2) There are not size bytes in buffer. + + // Determine the number of available bytes in the buffer. + unsigned int availableBytes = myAllocatedBufferSize - myWriteIndex; + //printf("Index: %d\n", myWriteIndex); + //printf("Bufsize bytes: %d\n", myAllocatedBufferSize); + //printf("Available bytes: %d\n", availableBytes); + //unsigned int remainingBytes = size; + //unsigned int tmpIdx = 0; + + int returnSize = 0; + // If the incoming buffer would exceed the available bytes in the + // buffer, go ahead and write the buffer and reset + if (size > availableBytes){ + myFileTypePtr->write(myWriteBuffer, myWriteIndex); + myWriteIndex = 0; + availableBytes = myAllocatedBufferSize; + memset(myWriteBuffer, '\0', myAllocatedBufferSize); + } + // If the new write is bigger than the buffer, go ahead and write it, + // skipping the buffer. + if (size >= myAllocatedBufferSize){ + myFileTypePtr->write(buffer, size); + } else { + // Otherwise, write it into the buffer but not to disk + memcpy(myWriteBuffer+myWriteIndex, buffer, size); + myWriteIndex += size; + } + return size; + + // Case 1: There are already size available bytes in buffer. + /* + while (remainingBytes > 0) + { + if (availableBytes == 0) { + returnSize += myFileTypePtr->write(myWriteBuffer, myCurrentBufferSize); + myWriteIndex = 0; + availableBytes = myCurrentBufferSize - myWriteIndex; + memset(myWriteBuffer, '\0', myCurrentBufferSize); + } + if (remainingBytes >= availableBytes && availableBytes > 0) + { + // Size > availableBytes > 0 + // Copy the available bytes into the buffer. + memcpy(myWriteBuffer+myWriteIndex, (char*)buffer+tmpIdx, availableBytes); + myWriteIndex += availableBytes; + tmpIdx += availableBytes; + remainingBytes -= availableBytes; + returnSize += availableBytes; + availableBytes = myCurrentBufferSize - myWriteIndex; + } else if (remainingBytes > 0 && remainingBytes < availableBytes){ + memcpy(myWriteBuffer+myWriteIndex, buffer, remainingBytes); + myWriteIndex += remainingBytes; + tmpIdx += remainingBytes; + //remainingBytes -= remainingBytes; + returnSize += remainingBytes; + remainingBytes = 0; + availableBytes = myCurrentBufferSize - myWriteIndex; + } + // Increment the buffer index. + } + return returnSize; + if (remainingBytes > 0) { + memcpy(myFileBuffer+myWriteIndex, (char*)buffer+tmpIdx, remainingBytes); + myWriteIndex += remainingBytes; + tmpIdx += remainingBytes; + remainingBytes -= remainingBytes; + returnSize += remainingBytes; + } + */ + return returnSize; + } + + inline unsigned int ifflush(){ + int returnSize = 0; + if (myWriteIndex == 0 ) { return 0; } + returnSize += myFileTypePtr->write(myWriteBuffer, myWriteIndex); + memset(myWriteBuffer, '\0', myCurrentBufferSize); + myWriteIndex = 0; + return returnSize; } /// Returns whether or not the file was successfully opened. @@ -476,7 +566,7 @@ class InputFile } /// Enable (default) or disable recovery. - /// + /// /// When true, we can attach a myFileTypePtr /// that implements a recovery capable decompressor. /// This requires that the caller be able to catch @@ -489,7 +579,7 @@ class InputFile bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length) { - if(myFileTypePtr==NULL) return false; + if(myFileTypePtr==NULL) return false; return myFileTypePtr->attemptRecoverySync(checkSignature, length); } @@ -533,10 +623,14 @@ class InputFile // Buffer used to do large reads rather than 1 by 1 character reads // from the file. The class is then managed to iterate through the buffer. char* myFileBuffer; + char* myWriteBuffer; + // Current index into the buffer. Used to track where we are in reading the // file from the buffer. int myBufferIndex; + int myWriteIndex; + // Current number of entries in the buffer. Used to ensure that // if a read did not fill the buffer, we stop before hitting the @@ -551,7 +645,7 @@ class InputFile typedef InputFile* IFILE; -/// Open a file with the specified name and mode, using a filename of "-" to +/// Open a file with the specified name and mode, using a filename of "-" to /// indicate stdin/stdout. /// \param filename file to open ("-" meands stdin/stdout) /// \param mode same format as fopen: "r" for read & "w" for write. @@ -714,7 +808,7 @@ inline bool ifseek(IFILE file, int64_t offset, int origin) /// \return number of bytes written int ifprintf(IFILE output, const char * format, ...); -/// Read a line from a file using streaming. +/// Read a line from a file using streaming. /// Will not fail when the file hits EOF, so do not do: while(iFile >> iStr) /// unless within your loop you check for ifeof and break. /// Instead, do something like: @@ -736,11 +830,11 @@ inline IFILE operator >> (IFILE stream, std::string &str) inline InputFile& operator << (InputFile& stream, const std::string& str) { unsigned int numExpected = str.length(); - unsigned int numWritten = + unsigned int numWritten = stream.ifwrite(str.c_str(), numExpected); if(numExpected != numWritten) { - std::cerr << "Failed to stream to IFILE, expected " + std::cerr << "Failed to stream to IFILE, expected " << numExpected << " but only wrote " << numWritten << std::endl; } @@ -753,11 +847,11 @@ inline InputFile& operator << (InputFile& stream, const std::string& str) inline InputFile& operator << (InputFile& stream, const char* str) { unsigned int numExpected = strlen(str); - unsigned int numWritten = + unsigned int numWritten = stream.ifwrite(str, numExpected); if(numExpected != numWritten) { - std::cerr << "Failed to stream to IFILE, expected " + std::cerr << "Failed to stream to IFILE, expected " << numExpected << " but only wrote " << numWritten << std::endl; } @@ -785,11 +879,11 @@ InputFile& operator << (InputFile& stream, unsigned int num); /// \param ch character that should be written to the file. inline InputFile& operator << (InputFile& stream, char ch) { - unsigned int numWritten = + unsigned int numWritten = stream.ifwrite(&ch, 1); if(1 != numWritten) { - std::cerr << "Failed to stream to IFILE, expected 1, but only wrote " + std::cerr << "Failed to stream to IFILE, expected 1, but only wrote " << numWritten << std::endl; } return(stream); From c1edadef6211b3e26766f124b706d96dd5dbfe23 Mon Sep 17 00:00:00 2001 From: David Heise Date: Thu, 4 Oct 2018 10:20:16 -0400 Subject: [PATCH 09/15] Fix misused variable. --- general/InputFile.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/general/InputFile.h b/general/InputFile.h index 2c71772..407a1d0 100644 --- a/general/InputFile.h +++ b/general/InputFile.h @@ -503,7 +503,7 @@ class InputFile int returnSize = 0; if (myWriteIndex == 0 ) { return 0; } returnSize += myFileTypePtr->write(myWriteBuffer, myWriteIndex); - memset(myWriteBuffer, '\0', myCurrentBufferSize); + memset(myWriteBuffer, '\0', myAllocatedBufferSize); myWriteIndex = 0; return returnSize; } From 18829f70bbec7950fefe6ebbd5897d5730ecf13e Mon Sep 17 00:00:00 2001 From: David Heise Date: Wed, 10 Oct 2018 11:14:39 -0400 Subject: [PATCH 10/15] Change dependencies based on the new cram code. --- Makefiles/Makefile.include | 11 +---------- README.md | 2 -- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/Makefiles/Makefile.include b/Makefiles/Makefile.include index 2ebe870..b4b6209 100644 --- a/Makefiles/Makefile.include +++ b/Makefiles/Makefile.include @@ -42,8 +42,6 @@ OPTFLAG_PROFILE?=-pg CURRENT_DIR_INCLUDE?=-I. ZLIB_AVAIL ?= 1 -HTSLIB_AVAIL ?= 1 -SAMTOOLS_AVAIL ?= 1 USE_ZLIB = -D__ZLIB_AVAILABLE__ ZLIB_LIB = -lz @@ -52,13 +50,6 @@ ifeq ($(ZLIB_AVAIL), 0) ZLIB_LIB = endif - -HTSLIB_LIB = -lhts -HTS_INCLUDE= -I/usr/local/include/htslib -SAMTOOLS_LIB = -lbam -lst -SAMTOOLS_INCLUDE= -I/usr/local/include/samtools - - KNET_ON ?= 0 USE_KNET ?= @@ -78,7 +69,7 @@ REQ_SETTINGS = CFLAGS ?= $(OPTFLAG) -pipe -Wall -COMPFLAGS = $(CFLAGS) $(USER_WARNINGS) -I$(INCLUDE_PATH) $(CURRENT_DIR_INCLUDE) $(USER_INCLUDES) $(SAMTOOLS_INCLUDE) $(HTS_INCLUDE) $(USE_KNET) $(USE_ZLIB) -D_FILE_OFFSET_BITS=64 -D__STDC_LIMIT_MACROS $(USER_COMPILE_VARS) +COMPFLAGS = $(CFLAGS) $(USER_WARNINGS) -I$(INCLUDE_PATH) $(CURRENT_DIR_INCLUDE) $(USER_INCLUDES) $(USE_KNET) $(USE_ZLIB) -D_FILE_OFFSET_BITS=64 -D__STDC_LIMIT_MACROS $(USER_COMPILE_VARS) # default installation directory INSTALLDIR?=/usr/local/lib diff --git a/README.md b/README.md index ba04122..23ca96f 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,6 @@ The following libraries are dependencies that need to be installed on the build [htslib 1.9](https://github.com/samtools/htslib/tree/1.9) -[samtools 1.9](https://github.com/samtools/samtools/tree/1.9) - Building -------- From 18b1b921601e2f80f57c20850718e741cc6e9fdc Mon Sep 17 00:00:00 2001 From: David Heise Date: Wed, 10 Oct 2018 12:48:07 -0400 Subject: [PATCH 11/15] add dependency on htslib --- Makefiles/Makefile.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefiles/Makefile.test b/Makefiles/Makefile.test index 46a609e..52b3995 100644 --- a/Makefiles/Makefile.test +++ b/Makefiles/Makefile.test @@ -38,7 +38,7 @@ all debug: $(EXE) # dependencies for executables $(EXE) : $(LIBRARY) $(OBJECTS) - $(CXX) $(COMPFLAGS) -o $@ $(OBJECTS) $(LIBRARY) $(LDFLAGS) -lm $(ZLIB_LIB) $(UNAME_LIBS) + $(CXX) $(COMPFLAGS) -o $@ $(OBJECTS) $(LIBRARY) $(LDFLAGS) -lm -lhts $(ZLIB_LIB) $(UNAME_LIBS) $(OBJECTS): $(TOOLHDR) $(LIBHDR) | $(OBJDIR) From 5b7f8a6b5dd7ba84223b62abf4d40ee9ddbec1b8 Mon Sep 17 00:00:00 2001 From: David Heise Date: Thu, 11 Oct 2018 11:16:23 -0400 Subject: [PATCH 12/15] Remove debugging statement. --- general/BgzfFileType.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/general/BgzfFileType.cpp b/general/BgzfFileType.cpp index 9a82b0f..a445754 100644 --- a/general/BgzfFileType.cpp +++ b/general/BgzfFileType.cpp @@ -37,7 +37,7 @@ BgzfFileType::BgzfFileType(const char * filename, const char * mode) } else { mode_len = strnlen(mode, 7); } - printf("%s", mode); + if(mode_len < 8){ strncpy(bgzfMode, mode, mode_len); bgzfMode[mode_len] = '\0'; From c5ac59631cb21d1d4d107982ad9077b2952b57b4 Mon Sep 17 00:00:00 2001 From: David Heise Date: Thu, 11 Oct 2018 11:53:49 -0400 Subject: [PATCH 13/15] Fix a memort leak. --- general/InputFile.cpp | 20 +++++++++++++++++--- general/InputFile.h | 9 ++++----- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/general/InputFile.cpp b/general/InputFile.cpp index 4f5defd..c892015 100644 --- a/general/InputFile.cpp +++ b/general/InputFile.cpp @@ -35,9 +35,18 @@ InputFile::InputFile(const char * filename, const char * mode, myWriteIndex = 0; myCurrentBufferSize = 0; myAllocatedBufferSize = DEFAULT_BUFFER_SIZE; - myFileBuffer = new char[myAllocatedBufferSize]; - myWriteBuffer = new char[myAllocatedBufferSize]; - memset(myWriteBuffer, '\0', myAllocatedBufferSize); + if(strchr(mode, 'r') || strchr(mode, 'R')){ + myFileBuffer = new char[myAllocatedBufferSize]; + } else { + myFileBuffer = NULL; + } + if(strchr(mode, 'w') || strchr(mode, 'W') || + strchr(mode, 'a') || strchr(mode, 'A')){ + myWriteBuffer = new char[myAllocatedBufferSize]; + memset(myWriteBuffer, '\0', myAllocatedBufferSize); + } else { + myWriteBuffer = NULL; + } myFileName.clear(); openFile(filename, mode, compressionMode); @@ -395,6 +404,11 @@ InputFile::~InputFile() delete[] myFileBuffer; myFileBuffer = NULL; } + if(myWriteBuffer != NULL) + { + delete[] myWriteBuffer; + myWriteBuffer = NULL; + } } diff --git a/general/InputFile.h b/general/InputFile.h index 407a1d0..ca6198c 100644 --- a/general/InputFile.h +++ b/general/InputFile.h @@ -57,9 +57,9 @@ class InputFile myWriteIndex = 0; myCurrentBufferSize = 0; // Default to buffer. - myAllocatedBufferSize = DEFAULT_BUFFER_SIZE; - myFileBuffer = new char[myAllocatedBufferSize]; - myWriteBuffer = new char[myAllocatedBufferSize]; + myAllocatedBufferSize = 0; + myFileBuffer = NULL; + myWriteBuffer = NULL; memset(myWriteBuffer, '\0', myAllocatedBufferSize); myFileName.clear(); } @@ -142,7 +142,6 @@ class InputFile ifflush(); int result = myFileTypePtr->close(); delete myFileTypePtr; - myWriteBuffer = NULL; myFileTypePtr = NULL; myFileName.clear(); return result; @@ -420,7 +419,7 @@ class InputFile // No myFileTypePtr, so return 0 - nothing written. return 0; } - if(myAllocatedBufferSize == 0){ + if(myAllocatedBufferSize == 0 || myWriteBuffer == NULL){ return myFileTypePtr->write(buffer, size); } //return myFileTypePtr->write(buffer, size); From fa861785ca6e94f964d1437131c932c2300ae024 Mon Sep 17 00:00:00 2001 From: David Heise Date: Mon, 30 Nov 2020 11:38:10 -0800 Subject: [PATCH 14/15] Fix pointer issue --- general/BgzfFileType.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/general/BgzfFileType.cpp b/general/BgzfFileType.cpp index a445754..677814f 100644 --- a/general/BgzfFileType.cpp +++ b/general/BgzfFileType.cpp @@ -45,7 +45,7 @@ BgzfFileType::BgzfFileType(const char * filename, const char * mode) strncpy(bgzfMode, mode, 7); bgzfMode[7] = '\0'; } - if (thread_start != NULL && thread_start+1 != '\0'){ + if (thread_start != NULL && thread_start[1] != '\0'){ // Advance past the @, then parse the number of threads thread_start++; strncpy(threadSpec, thread_start, 7); From cc43f7e0ccc263d4341ee62f949fdfa50e45c9d3 Mon Sep 17 00:00:00 2001 From: "Heise, David" Date: Wed, 22 Sep 2021 15:31:00 -0400 Subject: [PATCH 15/15] Revert default constructor. --- general/InputFile.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/general/InputFile.h b/general/InputFile.h index ca6198c..b4b4ac4 100644 --- a/general/InputFile.h +++ b/general/InputFile.h @@ -57,9 +57,10 @@ class InputFile myWriteIndex = 0; myCurrentBufferSize = 0; // Default to buffer. - myAllocatedBufferSize = 0; - myFileBuffer = NULL; - myWriteBuffer = NULL; + myAllocatedBufferSize = DEFAULT_BUFFER_SIZE; + myFileBuffer = new char[myAllocatedBufferSize]; + myWriteBuffer = new char[myAllocatedBufferSize]; + memset(myFileBuffer, '\0', myAllocatedBufferSize); memset(myWriteBuffer, '\0', myAllocatedBufferSize); myFileName.clear(); }