Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/fqc/algo/global_analyzer.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ struct Minimizer {
std::uint64_t hash;

/// @brief Position in the read where minimizer starts
std::uint16_t position;
std::uint32_t position;

/// @brief Whether the minimizer is from reverse complement
bool isReverseComplement;
Expand All @@ -82,7 +82,7 @@ struct Minimizer {
constexpr Minimizer() noexcept : hash(0), position(0), isReverseComplement(false) {}

/// @brief Construct with values
constexpr Minimizer(std::uint64_t h, std::uint16_t pos, bool rc) noexcept
constexpr Minimizer(std::uint64_t h, std::uint32_t pos, bool rc) noexcept
: hash(h), position(pos), isReverseComplement(rc) {}

/// @brief Equality comparison
Expand Down
4 changes: 2 additions & 2 deletions include/fqc/algo/pe_optimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ struct PEEncodedPair {
bool useComplementarity = false;

/// @brief If complementarity: positions where R2 differs from R1-RC.
std::vector<std::uint16_t> diffPositions;
std::vector<std::uint32_t> diffPositions;

/// @brief If complementarity: bases at diff positions.
std::vector<char> diffBases;
Expand Down Expand Up @@ -188,7 +188,7 @@ class PEOptimizer {
[[nodiscard]] static std::string reverseComplement(std::string_view seq);

/// @brief Compute diff between two sequences.
[[nodiscard]] static std::pair<std::vector<std::uint16_t>, std::vector<char>> computeDiff(
[[nodiscard]] static std::pair<std::vector<std::uint32_t>, std::vector<char>> computeDiff(
std::string_view seq1, std::string_view seq2);
};

Expand Down
4 changes: 2 additions & 2 deletions src/algo/global_analyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ constexpr char kComplement[256] = {
struct BucketEntry {
std::uint64_t readId; ///< Original read ID
std::uint64_t minimizer; ///< Minimizer hash
std::uint16_t position; ///< Position in read
std::uint32_t position; ///< Position in read
bool isRC; ///< Is reverse complement
};

Expand Down Expand Up @@ -338,7 +338,7 @@ std::vector<Minimizer> extractMinimizers(std::string_view sequence, std::size_t
}
bool isRC = (minHash != fwdHash);

minimizers.emplace_back(minHash, static_cast<std::uint16_t>(minPos), isRC);
minimizers.emplace_back(minHash, static_cast<std::uint32_t>(minPos), isRC);
prevMinPos = minPos;
}
}
Expand Down
22 changes: 15 additions & 7 deletions src/algo/pe_optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,22 +98,24 @@ std::string PEOptimizer::reverseComplement(std::string_view seq) {
return result;
}

std::pair<std::vector<std::uint16_t>, std::vector<char>> PEOptimizer::computeDiff(
std::pair<std::vector<std::uint32_t>, std::vector<char>> PEOptimizer::computeDiff(
std::string_view seq1, std::string_view seq2) {
std::vector<std::uint16_t> positions;
std::vector<std::uint32_t> positions;
std::vector<char> bases;
positions.reserve(seq2.length());
bases.reserve(seq2.length());

std::size_t len = std::min(seq1.length(), seq2.length());
for (std::size_t i = 0; i < len; ++i) {
if (seq1[i] != seq2[i]) {
positions.push_back(static_cast<std::uint16_t>(i));
positions.push_back(static_cast<std::uint32_t>(i));
bases.push_back(seq2[i]);
}
}

// Handle length differences
for (std::size_t i = len; i < seq2.length(); ++i) {
positions.push_back(static_cast<std::uint16_t>(i));
positions.push_back(static_cast<std::uint32_t>(i));
bases.push_back(seq2[i]);
}

Expand Down Expand Up @@ -182,7 +184,7 @@ PEEncodedPair PEOptimizer::encodePair(const io::PairedEndRecord& pair) const {
// Compute quality deltas
std::string r1QualRev(pair.read1.quality.rbegin(), pair.read1.quality.rend());
for (std::size_t i = 0; i < encoded.diffPositions.size(); ++i) {
std::uint16_t pos = encoded.diffPositions[i];
std::uint32_t pos = encoded.diffPositions[i];
if (pos < r1QualRev.length() && pos < pair.read2.quality.length()) {
std::int8_t delta =
static_cast<std::int8_t>(pair.read2.quality[pos] - r1QualRev[pos]);
Expand All @@ -194,7 +196,13 @@ PEEncodedPair PEOptimizer::encodePair(const io::PairedEndRecord& pair) const {

// Update stats
++stats_.complementarityUsed;
stats_.bytesSaved += pair.read2.sequence.length() - encoded.diffPositions.size() * 3;
const std::size_t encodedBytes =
encoded.diffPositions.size() * (sizeof(std::uint32_t) + sizeof(char)) +
encoded.qualDelta.size() * sizeof(std::int8_t);
const std::size_t rawBytes = pair.read2.sequence.size() + pair.read2.quality.size();
if (rawBytes > encodedBytes) {
stats_.bytesSaved += rawBytes - encodedBytes;
}
} else {
encoded.useComplementarity = false;
encoded.seq2 = pair.read2.sequence;
Expand Down Expand Up @@ -307,7 +315,7 @@ std::string generateR2Id(std::string_view r1Id) {

// Check for existing /1 suffix
if (id.length() >= 2 && id[id.length() - 1] == '1' &&
(id[id.length() - 2] == '/' || id[id.length() - 2] == '.')) {
(id[id.length() - 2] == '/' || id[id.length() - 2] == '.' || id[id.length() - 2] == '_')) {
id[id.length() - 1] = '2';
return id;
}
Expand Down
6 changes: 5 additions & 1 deletion src/algo/quality_compressor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -580,9 +580,13 @@ Result<std::vector<std::uint8_t>> QualityCompressorImpl::compressSCM(

Result<std::vector<std::string>> QualityCompressorImpl::decompressSCM(
std::span<const std::uint8_t> data, std::span<const std::uint32_t> lengths) {
if (data.empty() || lengths.empty()) {
if (lengths.empty()) {
return std::vector<std::string>(lengths.size());
}
if (data.empty()) {
return makeError<std::vector<std::string>>(ErrorCode::kFormatError,
"Compressed quality payload is empty");
}

// Decompress Zstd layer first
std::size_t decompressedSize = ZSTD_getFrameContentSize(data.data(), data.size());
Expand Down
6 changes: 1 addition & 5 deletions src/commands/compression_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,6 @@ auto executeSingleThreadCompression(const CompressionPlan& plan,
compressorConfig.readLengthClass = analysisResult.lengthClass;
algo::BlockCompressor blockCompressor(compressorConfig);

std::uint64_t totalCompressedBytes = 0;

for (const auto& blockBoundary : analysisResult.blockBoundaries) {
std::vector<ReadRecord> blockReads;
const auto startId = blockBoundary.archiveIdStart;
Expand Down Expand Up @@ -227,8 +225,6 @@ auto executeSingleThreadCompression(const CompressionPlan& plan,
payload.auxData = compressedBlock.auxStream;

fqcWriter.writeBlock(blockHeader, payload);

totalCompressedBytes += compressedBlock.totalCompressedSize();
stats.blocksWritten++;
}

Expand All @@ -246,7 +242,7 @@ auto executeSingleThreadCompression(const CompressionPlan& plan,
}

fqcWriter.finalize();
stats.outputBytes = totalCompressedBytes;
stats.outputBytes = std::filesystem::file_size(options.outputPath);
format::unregisterWriterForCleanup(&fqcWriter);
}

Expand Down
2 changes: 1 addition & 1 deletion src/format/fqc_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ BlockData FQCReader::readBlock(BlockId blockId, StreamSelection selection) {
result.header = readBlockHeader(blockId);

// Calculate payload start position
auto payloadStart = blockIndex_[blockId].offset + BlockHeader::kSize;
auto payloadStart = blockIndex_[blockId].offset + result.header.headerSize;

// Read selected streams
if (hasStream(selection, StreamSelection::kIds) && result.header.sizeIds > 0) {
Expand Down
18 changes: 13 additions & 5 deletions src/io/fastq_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,9 +197,12 @@ ParserStats FastqParser::sampleRecords(std::size_t maxSamples) {
}

// Save current state
const auto savedPosition = stream_ ? stream_->tellg() : std::istream::pos_type(-1);
auto savedLineNumber = lineNumber_;
auto savedRecordNumber = recordNumber_;
auto savedStats = stats_;
auto savedLastError = lastError_;
auto savedEof = eof_;

// Reset for sampling
reset();
Expand All @@ -218,10 +221,15 @@ ParserStats FastqParser::sampleRecords(std::size_t maxSamples) {
}

// Restore state
reset();
stream_->clear();
if (savedPosition != std::istream::pos_type(-1)) {
stream_->seekg(savedPosition);
}
eof_ = savedEof;
lineNumber_ = savedLineNumber;
recordNumber_ = savedRecordNumber;
stats_ = savedStats;
lastError_ = std::move(savedLastError);

return sampleStats;
}
Expand Down Expand Up @@ -388,16 +396,16 @@ bool FastqParser::validateQuality(std::string_view qual) const {
void FastqParser::setError(std::string message, std::string_view lineContent) {
lastError_ = ParseError{
.lineNumber = lineNumber_,
.recordNumber = recordNumber_,
.recordNumber = recordNumber_ + 1,
.message = std::move(message),
.lineContent = std::string(lineContent.substr(0, 100)) // Truncate long lines
};
}

void FastqParser::trimRight(std::string& str) {
auto it =
std::find_if(str.rbegin(), str.rend(), [](unsigned char c) { return !std::isspace(c); });
str.erase(it.base(), str.end());
while (!str.empty() && str.back() == '\r') {
str.pop_back();
}
}

// =============================================================================
Expand Down
6 changes: 5 additions & 1 deletion src/pipeline/pipeline_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,11 @@ std::size_t BackpressureController::maxInFlight() const noexcept {
}

void BackpressureController::reset() noexcept {
inFlight_.store(0);
{
std::lock_guard lock(mutex_);
inFlight_.store(0);
}
cv_.notify_all();
}

} // namespace fqc::pipeline
10 changes: 10 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@ fqc_add_test(chunk_marshal_test
SOURCES pipeline/chunk_marshal_test.cpp
)

# Backpressure controller regression tests
fqc_add_test(backpressure_controller_test
SOURCES pipeline/backpressure_controller_test.cpp
)

# Pipeline split-header compile test
fqc_add_test(pipeline_node_headers_test
SOURCES
Expand All @@ -145,6 +150,11 @@ fqc_add_test(block_compressor_regression_test
SOURCES algo/block_compressor_regression_test.cpp
)

# Algorithm regression tests
fqc_add_test(algo_regression_test
SOURCES algo/algo_regression_test.cpp
)

# Original-order command tests
fqc_add_test(original_order_command_test
SOURCES commands/original_order_command_test.cpp
Expand Down
65 changes: 65 additions & 0 deletions tests/algo/algo_regression_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// =============================================================================
// fq-compressor - Algorithm Regression Tests
// =============================================================================

#include "fqc/algo/global_analyzer.h"
#include "fqc/algo/pe_optimizer.h"
#include "fqc/algo/quality_compressor.h"

#include <cstdint>
#include <string>
#include <vector>

#include <gtest/gtest.h>

namespace fqc::algo::test {

TEST(AlgoRegressionTest, LosslessQualityDecompressionRejectsEmptyPayloadForNonEmptyLengths) {
QualityCompressorConfig config;
config.qualityMode = QualityMode::kLossless;

QualityCompressor compressor(config);
const std::vector<std::uint32_t> lengths = {4};

const auto result = compressor.decompress({}, lengths);

EXPECT_FALSE(result.has_value());
}

TEST(AlgoRegressionTest, GenerateR2IdPreservesUnderscoreSuffixConvention) {
const std::string r1Id = "instrument:run:flowcell_1";
const auto r2Id = generateR2Id(r1Id);

EXPECT_EQ(r2Id, "instrument:run:flowcell_2");
EXPECT_TRUE(io::arePairedIds(r1Id, r2Id));
}

TEST(AlgoRegressionTest, ExtractMinimizersPreservesPositionsBeyondUint16Range) {
std::string sequence(70000, 'A');

const auto minimizers = extractMinimizers(sequence, 1, 1);

ASSERT_FALSE(minimizers.empty());
EXPECT_EQ(minimizers.back().position, 69999u);
}

TEST(AlgoRegressionTest, EncodeDecodePairHandlesDiffPositionsBeyondUint16Range) {
io::PairedEndRecord pair;
pair.read1.id = "long-read/1";
pair.read1.sequence.assign(70000, 'A');
pair.read1.quality.assign(70000, 'I');

pair.read2.id = "long-read/2";
pair.read2.sequence.assign(70000, 'T');
pair.read2.sequence[66000] = 'G';
pair.read2.quality.assign(70000, 'I');

PEOptimizer optimizer;
const auto encoded = optimizer.encodePair(pair);
const auto decoded = optimizer.decodePair(encoded);

EXPECT_EQ(decoded.read2.sequence, pair.read2.sequence);
EXPECT_EQ(decoded.read2.quality, pair.read2.quality);
}

} // namespace fqc::algo::test
8 changes: 8 additions & 0 deletions tests/algo/pe_property_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,14 @@ TEST(PairedEndTest, EmptyPairHandling) {
EXPECT_FALSE(pair.isValid()); // Empty sequences are invalid
}

TEST(PairedEndTest, GenerateR2IdPreservesUnderscoreConvention) {
const std::string r1Id = "instrument:run:flowcell_1";
const std::string r2Id = generateR2Id(r1Id);

EXPECT_EQ(r2Id, "instrument:run:flowcell_2");
EXPECT_TRUE(io::arePairedIds(r1Id, r2Id));
}

/// @brief Test single base pair.
TEST(PairedEndTest, SingleBasePair) {
io::PairedEndRecord pair;
Expand Down
1 change: 1 addition & 0 deletions tests/commands/compression_engine_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ TEST(CompressionEngineTest, ExecutesSingleThreadArchivePlanAndReportsStats) {
EXPECT_EQ(statsResult->totalBases, 8u);
EXPECT_GT(statsResult->inputBytes, 0u);
EXPECT_GT(statsResult->outputBytes, 0u);
EXPECT_EQ(statsResult->outputBytes, std::filesystem::file_size(outputPath));
EXPECT_EQ(statsResult->blocksWritten, 1u);
EXPECT_TRUE(std::filesystem::exists(outputPath));

Expand Down
Loading
Loading