diff --git a/Makefiles/Makefile.include b/Makefiles/Makefile.include index 352f6bd..e86d2ed 100644 --- a/Makefiles/Makefile.include +++ b/Makefiles/Makefile.include @@ -72,4 +72,4 @@ CFLAGS ?= $(OPTFLAG) -pipe -Wall COMPFLAGS = $(CFLAGS) $(USER_WARNINGS) -I$(INCLUDE_PATH) $(CURRENT_DIR_INCLUDE) $(USER_INCLUDES) $(USE_KNET) $(USE_ZLIB) -D_FILE_OFFSET_BITS=64 -D__STDC_LIMIT_MACROS $(USER_COMPILE_VARS) # default installation directory -INSTALLDIR?=/usr/local/bin +INSTALLDIR?=/usr/local/lib diff --git a/Makefiles/Makefile.test b/Makefiles/Makefile.test index 46a609e..52b3995 100644 --- a/Makefiles/Makefile.test +++ b/Makefiles/Makefile.test @@ -38,7 +38,7 @@ all debug: $(EXE) # dependencies for executables $(EXE) : $(LIBRARY) $(OBJECTS) - $(CXX) $(COMPFLAGS) -o $@ $(OBJECTS) $(LIBRARY) $(LDFLAGS) -lm $(ZLIB_LIB) $(UNAME_LIBS) + $(CXX) $(COMPFLAGS) -o $@ $(OBJECTS) $(LIBRARY) $(LDFLAGS) -lm -lhts $(ZLIB_LIB) $(UNAME_LIBS) $(OBJECTS): $(TOOLHDR) $(LIBHDR) | $(OBJDIR) diff --git a/README.md b/README.md index e949d6e..23ca96f 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,10 @@ On debian type systems (including Ubuntu), add the following packages if they ar sudo apt-get install g++ libssl-dev zlib1g-dev +The following libraries are dependencies that need to be installed on the build system. + +[htslib 1.9](https://github.com/samtools/htslib/tree/1.9) + Building -------- @@ -23,7 +27,6 @@ Under the main statgen repository, there are: - `glf` - library code for operating on glf files. - `include` - after compiling, the library headers are linked here - `Makefiles` - directory containing Makefiles that are used in the library and can be used for developing programs using the library -- `samtools` - library code used from samtools After Compiling: `libStatGen.a`, `libStatGen_debug.a`, and `libStatGen_profile.a` are created at the top level. diff --git a/general/BgzfFileType.cpp b/general/BgzfFileType.cpp index 9176aff..677814f 100644 --- a/general/BgzfFileType.cpp +++ b/general/BgzfFileType.cpp @@ -27,27 +27,56 @@ bool BgzfFileType::ourRequireEofBlock = true; BgzfFileType::BgzfFileType(const char * filename, const char * mode) { + BgzfFileType::numThreads = 1; + char threadSpec[8]; + char bgzfMode[8]; + const char* thread_start = strchr(mode, '@'); + int mode_len = 0; + if (thread_start != NULL){ + mode_len = thread_start-mode; + } else { + mode_len = strnlen(mode, 7); + } + + if(mode_len < 8){ + strncpy(bgzfMode, mode, mode_len); + bgzfMode[mode_len] = '\0'; + } else { + strncpy(bgzfMode, mode, 7); + bgzfMode[7] = '\0'; + } + if (thread_start != NULL && thread_start[1] != '\0'){ + // Advance past the @, then parse the number of threads + thread_start++; + strncpy(threadSpec, thread_start, 7); + threadSpec[7] = '\0'; + numThreads = strtol(threadSpec, NULL, 10); + // If we can't parse the number, revert to one thread + if (numThreads == 0){ numThreads = 1; } + } // If the file is for write and is '-', then write to stdout. - if(((mode[0] == 'w') || (mode[0] == 'W')) && + if(((mode[0] == 'w') || (mode[0] == 'W')) && (strcmp(filename, "-") == 0)) { // Write to stdout. - bgzfHandle = bgzf_dopen(fileno(stdout), mode); + bgzfHandle = bgzf_dopen(fileno(stdout), bgzfMode); } - else if(((mode[0] == 'r') || (mode[0] == 'R')) && + else if(((mode[0] == 'r') || (mode[0] == 'R')) && (strcmp(filename, "-") == 0)) { // read from stdin - bgzfHandle = bgzf_dopen(fileno(stdin), mode); + bgzfHandle = bgzf_dopen(fileno(stdin), bgzfMode); } else { - bgzfHandle = bgzf_open(filename, mode); + bgzfHandle = bgzf_open(filename, bgzfMode); } myStartPos = 0; if (bgzfHandle != NULL) { + //Only do multithreaded IO if more than one thread is used. + if(numThreads > 1){ bgzf_mt(bgzfHandle, numThreads, 256); } // Check to see if the file is being opened for read, if the eof block // is required, and if it is, if it is there. if ((mode[0] == 'r' || mode[0] == 'R') && (strcmp(filename, "-") != 0) diff --git a/general/BgzfFileType.h b/general/BgzfFileType.h index b7eef81..21c708f 100644 --- a/general/BgzfFileType.h +++ b/general/BgzfFileType.h @@ -29,12 +29,14 @@ class BgzfFileType : public FileType public: BgzfFileType() { + numThreads = 1; bgzfHandle = NULL; myEOF = false; } virtual ~BgzfFileType() { + bgzf_close(bgzfHandle); bgzfHandle = NULL; } @@ -110,7 +112,7 @@ class BgzfFileType : public FileType } else if((bytesRead != (int)size) & (bytesRead >= 0)) { - // Less then the requested size was read + // Less then the requested size was read // and an error was not returned (bgzf_read returns -1 on error). myEOF = true; } @@ -173,6 +175,9 @@ class BgzfFileType : public FileType // at the end of the file. If the block is required, but not on the file, // the constructor fails to open the file. static bool ourRequireEofBlock; + + // The number of threads to use when using multithreaded BGZF + int64_t numThreads; }; #endif diff --git a/general/InputFile.cpp b/general/InputFile.cpp index 71537fc..c892015 100644 --- a/general/InputFile.cpp +++ b/general/InputFile.cpp @@ -32,9 +32,21 @@ InputFile::InputFile(const char * filename, const char * mode, myAttemptRecovery = false; myFileTypePtr = NULL; myBufferIndex = 0; + myWriteIndex = 0; myCurrentBufferSize = 0; myAllocatedBufferSize = DEFAULT_BUFFER_SIZE; - myFileBuffer = new char[myAllocatedBufferSize]; + if(strchr(mode, 'r') || strchr(mode, 'R')){ + myFileBuffer = new char[myAllocatedBufferSize]; + } else { + myFileBuffer = NULL; + } + if(strchr(mode, 'w') || strchr(mode, 'W') || + strchr(mode, 'a') || strchr(mode, 'A')){ + myWriteBuffer = new char[myAllocatedBufferSize]; + memset(myWriteBuffer, '\0', myAllocatedBufferSize); + } else { + myWriteBuffer = NULL; + } myFileName.clear(); openFile(filename, mode, compressionMode); @@ -55,13 +67,13 @@ int InputFile::readTilChar(const std::string& stopChars, std::string& stringRef) { return(-1); } - + // Try to find the character in the stopChars. pos = stopChars.find(charRead); if(pos == std::string::npos) { - // Didn't find a stop character and it is not an EOF, + // Didn't find a stop character and it is not an EOF, // so add it to the string. stringRef += charRead; } @@ -84,7 +96,7 @@ int InputFile::readTilChar(const std::string& stopChars) { return(-1); } - + // Try to find the character in the stopChars. pos = stopChars.find(charRead); } @@ -163,7 +175,7 @@ bool InputFile::openFile(const char * filename, const char * mode, { // // if recovering, we don't want to issue big readaheads, since - // that interferes with the decompression - we only want to + // that interferes with the decompression - we only want to // decompress one at a time, and handle the exceptions immediately // rather than at some indeterminate point in time. // @@ -180,7 +192,7 @@ bool InputFile::openFile(const char * filename, const char * mode, // Check if reading from stdin. if((strcmp(filename, "-") == 0) || (strcmp(filename, "-.gz") == 0)) { - // Reading from stdin, open it based on the + // Reading from stdin, open it based on the // compression mode. openFileUsingMode(filename, mode, compressionMode); } @@ -219,7 +231,7 @@ bool InputFile::openFile(const char * filename, const char * mode, // Read the file to see if it a gzip file. GzipHeader gzipHeader; bool isGzip = gzipHeader.readHeader(file); - + // The file header has been read, so close the file, so it can // be re-opened as the correct type. file.close(); @@ -392,6 +404,11 @@ InputFile::~InputFile() delete[] myFileBuffer; myFileBuffer = NULL; } + if(myWriteBuffer != NULL) + { + delete[] myWriteBuffer; + myWriteBuffer = NULL; + } } diff --git a/general/InputFile.h b/general/InputFile.h index 1f242bb..b4b4ac4 100644 --- a/general/InputFile.h +++ b/general/InputFile.h @@ -14,7 +14,7 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -/*! \file */ +/*! \file */ #ifndef __INPUTFILE_H__ #define __INPUTFILE_H__ @@ -54,10 +54,14 @@ class InputFile myAttemptRecovery = false; myFileTypePtr = NULL; myBufferIndex = 0; + myWriteIndex = 0; myCurrentBufferSize = 0; // Default to buffer. myAllocatedBufferSize = DEFAULT_BUFFER_SIZE; myFileBuffer = new char[myAllocatedBufferSize]; + myWriteBuffer = new char[myAllocatedBufferSize]; + memset(myFileBuffer, '\0', myAllocatedBufferSize); + memset(myWriteBuffer, '\0', myAllocatedBufferSize); myFileName.clear(); } @@ -68,7 +72,7 @@ class InputFile /// \param filename file to open /// \param mode same format as fopen: "r" for read & "w" for write. /// \param compressionMode set the type of file to open for writing or - /// for reading from stdin (when reading files, the compression type is + /// for reading from stdin (when reading files, the compression type is /// determined by reading the file). InputFile(const char * filename, const char * mode, InputFile::ifileCompression compressionMode = InputFile::DEFAULT); @@ -127,7 +131,7 @@ class InputFile } } - + /// Close the file. /// \return status of the close (0 is success). inline int ifclose() @@ -136,6 +140,7 @@ class InputFile { return EOF; } + ifflush(); int result = myFileTypePtr->close(); delete myFileTypePtr; myFileTypePtr = NULL; @@ -228,13 +233,13 @@ class InputFile } // Now copy the rest of the bytes into the buffer. - memcpy((char*)buffer+availableBytes, + memcpy((char*)buffer+availableBytes, myFileBuffer, copySize); // set the buffer index to the location after what we are // returning as read. myBufferIndex = copySize; - + returnSize += copySize; } } @@ -301,7 +306,7 @@ class InputFile /// Read, appending the characters into the specified string until new /// line or EOF is found, returning -1 if EOF is found first and 0 if new - /// line is found first. The new line and EOF are not written into the + /// line is found first. The new line and EOF are not written into the /// specified string. /// \param line reference to a string that the read characters should /// be apppended to (does not include the new line or eof). @@ -415,7 +420,92 @@ class InputFile // No myFileTypePtr, so return 0 - nothing written. return 0; } - return myFileTypePtr->write(buffer, size); + if(myAllocatedBufferSize == 0 || myWriteBuffer == NULL){ + return myFileTypePtr->write(buffer, size); + } + //return myFileTypePtr->write(buffer, size); + // There are 2 cases: + // 1) There are already size available bytes in buffer. + // 2) There are not size bytes in buffer. + + // Determine the number of available bytes in the buffer. + unsigned int availableBytes = myAllocatedBufferSize - myWriteIndex; + //printf("Index: %d\n", myWriteIndex); + //printf("Bufsize bytes: %d\n", myAllocatedBufferSize); + //printf("Available bytes: %d\n", availableBytes); + //unsigned int remainingBytes = size; + //unsigned int tmpIdx = 0; + + int returnSize = 0; + // If the incoming buffer would exceed the available bytes in the + // buffer, go ahead and write the buffer and reset + if (size > availableBytes){ + myFileTypePtr->write(myWriteBuffer, myWriteIndex); + myWriteIndex = 0; + availableBytes = myAllocatedBufferSize; + memset(myWriteBuffer, '\0', myAllocatedBufferSize); + } + // If the new write is bigger than the buffer, go ahead and write it, + // skipping the buffer. + if (size >= myAllocatedBufferSize){ + myFileTypePtr->write(buffer, size); + } else { + // Otherwise, write it into the buffer but not to disk + memcpy(myWriteBuffer+myWriteIndex, buffer, size); + myWriteIndex += size; + } + return size; + + // Case 1: There are already size available bytes in buffer. + /* + while (remainingBytes > 0) + { + if (availableBytes == 0) { + returnSize += myFileTypePtr->write(myWriteBuffer, myCurrentBufferSize); + myWriteIndex = 0; + availableBytes = myCurrentBufferSize - myWriteIndex; + memset(myWriteBuffer, '\0', myCurrentBufferSize); + } + if (remainingBytes >= availableBytes && availableBytes > 0) + { + // Size > availableBytes > 0 + // Copy the available bytes into the buffer. + memcpy(myWriteBuffer+myWriteIndex, (char*)buffer+tmpIdx, availableBytes); + myWriteIndex += availableBytes; + tmpIdx += availableBytes; + remainingBytes -= availableBytes; + returnSize += availableBytes; + availableBytes = myCurrentBufferSize - myWriteIndex; + } else if (remainingBytes > 0 && remainingBytes < availableBytes){ + memcpy(myWriteBuffer+myWriteIndex, buffer, remainingBytes); + myWriteIndex += remainingBytes; + tmpIdx += remainingBytes; + //remainingBytes -= remainingBytes; + returnSize += remainingBytes; + remainingBytes = 0; + availableBytes = myCurrentBufferSize - myWriteIndex; + } + // Increment the buffer index. + } + return returnSize; + if (remainingBytes > 0) { + memcpy(myFileBuffer+myWriteIndex, (char*)buffer+tmpIdx, remainingBytes); + myWriteIndex += remainingBytes; + tmpIdx += remainingBytes; + remainingBytes -= remainingBytes; + returnSize += remainingBytes; + } + */ + return returnSize; + } + + inline unsigned int ifflush(){ + int returnSize = 0; + if (myWriteIndex == 0 ) { return 0; } + returnSize += myFileTypePtr->write(myWriteBuffer, myWriteIndex); + memset(myWriteBuffer, '\0', myAllocatedBufferSize); + myWriteIndex = 0; + return returnSize; } /// Returns whether or not the file was successfully opened. @@ -476,7 +566,7 @@ class InputFile } /// Enable (default) or disable recovery. - /// + /// /// When true, we can attach a myFileTypePtr /// that implements a recovery capable decompressor. /// This requires that the caller be able to catch @@ -489,7 +579,7 @@ class InputFile bool attemptRecoverySync(bool (*checkSignature)(void *data) , int length) { - if(myFileTypePtr==NULL) return false; + if(myFileTypePtr==NULL) return false; return myFileTypePtr->attemptRecoverySync(checkSignature, length); } @@ -533,10 +623,14 @@ class InputFile // Buffer used to do large reads rather than 1 by 1 character reads // from the file. The class is then managed to iterate through the buffer. char* myFileBuffer; + char* myWriteBuffer; + // Current index into the buffer. Used to track where we are in reading the // file from the buffer. int myBufferIndex; + int myWriteIndex; + // Current number of entries in the buffer. Used to ensure that // if a read did not fill the buffer, we stop before hitting the @@ -551,7 +645,7 @@ class InputFile typedef InputFile* IFILE; -/// Open a file with the specified name and mode, using a filename of "-" to +/// Open a file with the specified name and mode, using a filename of "-" to /// indicate stdin/stdout. /// \param filename file to open ("-" meands stdin/stdout) /// \param mode same format as fopen: "r" for read & "w" for write. @@ -714,7 +808,7 @@ inline bool ifseek(IFILE file, int64_t offset, int origin) /// \return number of bytes written int ifprintf(IFILE output, const char * format, ...); -/// Read a line from a file using streaming. +/// Read a line from a file using streaming. /// Will not fail when the file hits EOF, so do not do: while(iFile >> iStr) /// unless within your loop you check for ifeof and break. /// Instead, do something like: @@ -736,11 +830,11 @@ inline IFILE operator >> (IFILE stream, std::string &str) inline InputFile& operator << (InputFile& stream, const std::string& str) { unsigned int numExpected = str.length(); - unsigned int numWritten = + unsigned int numWritten = stream.ifwrite(str.c_str(), numExpected); if(numExpected != numWritten) { - std::cerr << "Failed to stream to IFILE, expected " + std::cerr << "Failed to stream to IFILE, expected " << numExpected << " but only wrote " << numWritten << std::endl; } @@ -753,11 +847,11 @@ inline InputFile& operator << (InputFile& stream, const std::string& str) inline InputFile& operator << (InputFile& stream, const char* str) { unsigned int numExpected = strlen(str); - unsigned int numWritten = + unsigned int numWritten = stream.ifwrite(str, numExpected); if(numExpected != numWritten) { - std::cerr << "Failed to stream to IFILE, expected " + std::cerr << "Failed to stream to IFILE, expected " << numExpected << " but only wrote " << numWritten << std::endl; } @@ -785,11 +879,11 @@ InputFile& operator << (InputFile& stream, unsigned int num); /// \param ch character that should be written to the file. inline InputFile& operator << (InputFile& stream, char ch) { - unsigned int numWritten = + unsigned int numWritten = stream.ifwrite(&ch, 1); if(1 != numWritten) { - std::cerr << "Failed to stream to IFILE, expected 1, but only wrote " + std::cerr << "Failed to stream to IFILE, expected 1, but only wrote " << numWritten << std::endl; } return(stream);