diff --git a/.github/workflows/clang-tidy-review-post.yml b/.github/workflows/clang-tidy-review-post.yml index 0ec0eb7..4c5ef8c 100644 --- a/.github/workflows/clang-tidy-review-post.yml +++ b/.github/workflows/clang-tidy-review-post.yml @@ -11,26 +11,21 @@ permissions: pull-requests: write concurrency: - group: ${{ github.workflow }}-${{ github.event.workflow_run.pull_requests[0].number }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number }} cancel-in-progress: true jobs: - post-comments: - # Only run if the triggering workflow was from a pull request - if: github.event.workflow_run.event == 'pull_request' + build: runs-on: ubuntu-latest - + steps: - name: Post review comments id: post-review uses: ZedThree/clang-tidy-review/post@v0.21.0 with: max_comments: 10 - - # Fail if there are any clang-tidy warnings - - name: Check for issues - if: steps.post-review.outputs.total_comments > 0 - run: | - echo "::error::Found ${{ steps.post-review.outputs.total_comments }} clang-tidy issues" - exit 1 + + # If there are any comments, fail the check + - if: steps.post-review.outputs.total_comments > 0 + run: exit 1 diff --git a/CMakeLists.txt b/CMakeLists.txt index 2bc3b44..d0a1d5c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,8 +74,6 @@ if(RAMTOOLS_BUILD_TOOLS) add_subdirectory(tools) endif() -# Include benchmark directory if benchmarks OR tests are enabled -# (sam_generator is needed by tests) if(RAMTOOLS_BUILD_BENCHMARKS OR RAMTOOLS_BUILD_TESTS) add_subdirectory(benchmark) endif() diff --git a/README.md b/README.md index fee725f..dd0ee2a 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,24 @@ Tested with HG00154 sample from the 1000 Genomes Project (196M reads, 72GB SAM f - LZ4 compression provides the best query performance among all compression algorithms - For a 100Mb region query: RNTuple processes **453,736 reads/sec** vs TTree+ZLIB's **197,815 reads/sec** +### RNTuple vs. CRAM (samtools) Region Queries + +Measured on `HG00096.cram` (GRCh37 reference, samtools `view -c -F 2308`) versus the converted RAM RNTuple produced by `samtoramntuple`. Benchmarks run on an Intel i5-8250U laptop with SSD storage. + +| Query (GRCh37) | Reads | RNTuple `ramntupleview` Wall (s) | samtools (CRAM) Wall (s) | Relative Speed (`CRAM / RNTuple`) | +|----------------|-------|----------------------------------|---------------------------|-----------------------------------| +| chr1:1,000,000-1,001,000 | 6 | 12.6 | 0.42 | 0.03× | +| chr1:1,000,000-2,000,000 | 41,608 | 5.65 | 0.31 | 0.06× | +| chr1:1-50,000,000 | 2,921,678 | 6.52 | 9.41 | **1.44×** | +| chr2:1-100,000,000 | 6,879,654 | 7.75 | 20.19 | **2.61×** | +| chr7:50,000,000-150,000,000 | 6,698,314 | 7.96 | 17.80 | **2.24×** | +| chr21:1-48,129,895 | 2,510,708 | 6.58 | 8.29 | **1.26×** | + +**Interpretation** +- RNTuple has higher constant overhead for tiny windows (dozens of reads); CRAM via samtools remains faster for ad‑hoc lookups. +- For broad regions that touch millions of records, the columnar layout and sparse index give RNTuple a **1.3×–2.6× throughput advantage** over samtools. +- Large windows (≥50 Mb) saturate sequential access on both formats; RNTuple’s page-level prefetch keeps bandwidth high while retaining exact read counts matching samtools. + ## TTree Implementation (Legacy) ROOT scripts to convert a SAM file to a RAM (ROOT Alignment/Map) file using the older TTree format and to work with those files. diff --git a/benchmark/generate_sam_benchmark.cxx b/benchmark/generate_sam_benchmark.cxx index 73f6b69..464464e 100644 --- a/benchmark/generate_sam_benchmark.cxx +++ b/benchmark/generate_sam_benchmark.cxx @@ -1,6 +1,6 @@ #include #include "generate_sam_benchmark.h" -#include // For std::remove +#include static void BM_GenerateSAM(benchmark::State &state) { diff --git a/benchmark/region_query_benchmark.cxx b/benchmark/region_query_benchmark.cxx index 0abfcfc..ad2ccd6 100644 --- a/benchmark/region_query_benchmark.cxx +++ b/benchmark/region_query_benchmark.cxx @@ -20,7 +20,7 @@ class RegionQueryFixture : public benchmark::Fixture { sam_file_ = "/media/aditya/213e0e46-6f86-4288-8b79-74851c34314f/output_big.sam"; ttree_root_file_ = "/media/aditya/213e0e46-6f86-4288-8b79-74851c34314f/output_big_lzma.root"; - rntuple_root_file_ = "/media/aditya/213e0e46-6f86-4288-8b79-74851c34314f/output_root.root"; + rntuple_root_file_ = "/home/aditya/ramtools/build/rntuple.root"; } void TearDown(const benchmark::State &) override {} @@ -39,24 +39,24 @@ class RegionQueryFixture : public benchmark::Fixture { const char *get_current_region() const { return regions_[region_idx_ % regions_.size()].c_str(); } }; -const std::vector RegionQueryFixture::regions_ = {"chr1:1000000-1001000", - "chr2:5000000-5010000", - "chrX:100000-150000", - "chr1:1000000-2000000", - "chr5:10000000-15000000", - "chr10:50000000-60000000", - "chr1:1-50000000", - "chr2:1-100000000", - "chr7:50000000-150000000", - "chr21:1-48129895", - "chrM:1-16571", - "chrY:2600000-2700000", +const std::vector RegionQueryFixture::regions_ = {"1:1000000-1001000", + "2:5000000-5010000", + "X:100000-150000", + "1:1000000-2000000", + "5:10000000-15000000", + "10:50000000-60000000", + "1:1-50000000", + "2:1-100000000", + "7:50000000-150000000", + "21:1-48129895", + "MT:1-16571", + "Y:2600000-2700000", "GL000227.1:1-100000", - "chr1:1-1000", - "chr1:249250621-249250621", - "chr22:51304566-51304566", - "chr17:41196312-41277500", - "chr13:32889611-32973805"}; + "1:1-1000", + "1:249250621-249250621", + "22:51304566-51304566", + "17:41196312-41277500", + "13:32889611-32973805"}; BENCHMARK_DEFINE_F(RegionQueryFixture, TTree)(benchmark::State &state) { @@ -96,8 +96,7 @@ BENCHMARK_DEFINE_F(RegionQueryFixture, RNTuple)(benchmark::State &state) state.SetLabel(std::to_string(reads_in_this_run) + " reads"); } -BENCHMARK_REGISTER_F(RegionQueryFixture, TTree)->Args({0})->Args({3})->Args({6})->Args({9})->Unit(benchmark::kSecond); - -BENCHMARK_REGISTER_F(RegionQueryFixture, RNTuple)->Args({0})->Args({3})->Args({6})->Args({9})->Unit(benchmark::kSecond); +BENCHMARK_REGISTER_F(RegionQueryFixture, TTree)->DenseRange(0, 17, 1)->Unit(benchmark::kSecond); +BENCHMARK_REGISTER_F(RegionQueryFixture, RNTuple)->DenseRange(0, 17, 1)->Unit(benchmark::kSecond); BENCHMARK_MAIN(); diff --git a/inc/ramcore/SamToNTuple.h b/inc/ramcore/SamToNTuple.h index 1b986c0..dd8a404 100644 --- a/inc/ramcore/SamToNTuple.h +++ b/inc/ramcore/SamToNTuple.h @@ -3,6 +3,7 @@ #include #include +#include void samtoramntuple(const char *datafile, const char *treefile, @@ -11,6 +12,7 @@ void samtoramntuple(const char *datafile, uint32_t quality_policy); void samtoramntuple_split_by_chromosome(const char *datafile, const char *output_prefix, int compression_algorithm, - uint32_t quality_policy, int num_threads = 4); + uint32_t quality_policy, int num_threads = 4, + const std::vector &only_chromosomes = {}); #endif diff --git a/src/ramcore/RAMNTupleView.cxx b/src/ramcore/RAMNTupleView.cxx index dcb18f0..5f8e7e3 100644 --- a/src/ramcore/RAMNTupleView.cxx +++ b/src/ramcore/RAMNTupleView.cxx @@ -1,6 +1,11 @@ #include "ramcore/RAMNTupleView.h" -#include + +#include #include +#include +#include +#include + #include #include #include @@ -21,64 +26,159 @@ Long64_t ramntupleview(const char *file, const char *query, bool cache, bool per } std::string region = query; + if (region.empty() || region == "*") { + stopwatch.Print(); + return reader->GetNEntries(); + } + + TString rname; + Int_t range_start = 1; + Int_t range_end = std::numeric_limits::max(); + int chrDelimiterPos = region.find(":"); if (chrDelimiterPos == std::string::npos) { - std::cerr << "Invalid region format. Use rname:start-end\n"; - return 0; + rname = region; + } else { + rname = region.substr(0, chrDelimiterPos); + int rangeDelimiterPos = region.find("-", chrDelimiterPos); + if (rangeDelimiterPos == std::string::npos) { + try { + range_start = std::stoi(region.substr(chrDelimiterPos + 1)); + range_end = range_start; + } catch (...) { + std::cerr << "Invalid region format. Use rname:start-end or rname:position\n"; + return 0; + } + } else { + range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos - 1)); + range_end = std::stoi(region.substr(rangeDelimiterPos + 1)); + } } - TString rname = region.substr(0, chrDelimiterPos); - int rangeDelimiterPos = region.find("-", chrDelimiterPos); - if (rangeDelimiterPos == std::string::npos) { - std::cerr << "Invalid region format. Use rname:start-end\n"; + + auto refs = RAMNTupleRecord::GetRnameRefs(); + auto refid = refs->GetRefId(rname.Data()); + + if (refid < 0) { + if (rname.BeginsWith("chr")) { + TString stripped_rname = rname(3, rname.Length() - 3); + refid = refs->GetRefId(stripped_rname.Data()); + } + if (refid < 0 && (rname == "chrM" || rname == "M")) { + refid = refs->GetRefId("MT"); + } + } + + if (refid < 0) { + std::cerr << "Error: Reference name '" << rname.Data() << "' not found in file.\n"; return 0; } - Int_t range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos - 1)); - Int_t range_end = std::stoi(region.substr(rangeDelimiterPos + 1)); - auto refid = RAMNTupleRecord::GetRnameRefs()->GetRefId(rname.Data()); auto index = RAMNTupleRecord::GetIndex(); - - auto recordView = reader->GetView("record"); + auto flagView = reader->GetView("record.flag"); + auto refidView = reader->GetView("record.refid"); + auto posView = reader->GetView("record.pos"); + auto cigarView = reader->GetView>("record.cigar"); Long64_t count = 0; + const Long64_t totalEntries = reader->GetNEntries(); + + const int FLAG_FILTER = 0x904; + const Int_t rs0 = (range_start > 0) ? (range_start - 1) : 0; + const Int_t re0 = (range_end > 0) ? (range_end - 1) : 0; + constexpr int kMaxRefSpanHeuristic = 10000; + + auto computeRefSpan = [](const std::vector &cigarOps) -> int { + int span = 0; + for (uint32_t op : cigarOps) { + int len = static_cast(op >> 4); + int code = static_cast(op & 0xF); + + if (code == 0 || code == 2 || code == 3 || code == 7 || code == 8) { + span += len; + } + } + return span; + }; if (!index || index->Size() == 0) { + bool seenRef = false; for (auto i : reader->GetEntryRange()) { - const auto &rec = recordView(i); - if (rec.refid == refid && rec.pos >= range_start - 1 && rec.pos <= range_end - 1) { + const auto flag = flagView(i); + if (flag & FLAG_FILTER) + continue; + + const auto curRef = refidView(i); + if (curRef == refid) { + seenRef = true; + } else { + if (seenRef && curRef > refid) + break; + continue; + } + const auto curPos = posView(i); + if (curPos > re0) + break; + + int read_start = curPos; + if (read_start >= rs0) { + count++; + continue; + } + if (read_start + kMaxRefSpanHeuristic < rs0) { + continue; + } + const auto &cigarOps = cigarView(i); + int refSpan = computeRefSpan(cigarOps); + int read_end = (refSpan > 0) ? (read_start + refSpan - 1) : read_start; + + if (read_start <= re0 && read_end >= rs0) { count++; } } } else { - auto start_entry = index->GetRow(refid, range_start); - auto end_entry = index->GetRow(refid, range_end); + auto start_entry = index->GetRow(refid, rs0); + + if (start_entry < 0) + start_entry = index->GetRow(refid, 0); if (start_entry < 0) start_entry = 0; - if (end_entry < 0) - end_entry = reader->GetNEntries(); - for (; start_entry < end_entry; start_entry++) { - const auto &rec = recordView(start_entry); - int seqlen = rec.GetSEQLEN(); - if (rec.pos + seqlen > range_start - 1) { - break; - } - } + for (Long64_t j = start_entry; j < totalEntries; j++) { + const auto flag = flagView(j); + if (flag & FLAG_FILTER) + continue; - Long64_t j; - for (j = start_entry; j < reader->GetNEntries(); j++) { - const auto &rec = recordView(j); + const auto curRef = refidView(j); + if (curRef < refid) + continue; + if (curRef > refid) + break; - if (rec.pos >= range_end) { + const auto curPos = posView(j); + if (curPos > re0) break; + + int read_start = curPos; + if (read_start >= rs0) { + count++; + continue; + } + if (read_start + kMaxRefSpanHeuristic < rs0) { + continue; + } + const auto &cigarOps = cigarView(j); + int refSpan = computeRefSpan(cigarOps); + int read_end = (refSpan > 0) ? (read_start + refSpan - 1) : read_start; + + if (read_end >= rs0) { + count++; } - count++; } } stopwatch.Print(); - + std::cout << "Found " << static_cast(count) << " records in region " << (query ? query : "") << std::endl; return count; } diff --git a/src/ramcore/SamToNTuple.cxx b/src/ramcore/SamToNTuple.cxx index 1b4807f..ab776fd 100644 --- a/src/ramcore/SamToNTuple.cxx +++ b/src/ramcore/SamToNTuple.cxx @@ -2,6 +2,17 @@ #include "ramcore/SamParser.h" #include "rntuple/RAMNTupleRecord.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include #include #include @@ -13,124 +24,130 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -void samtoramntuple(const char *datafile, - const char *treefile, - bool index, bool split, bool cache, +void samtoramntuple(const char *datafile, const char *treefile, bool index, bool split, bool cache, int compression_algorithm, - uint32_t quality_policy) + uint32_t quality_policy) // NOLINT(misc-use-internal-linkage) - public API function { - TStopwatch stopwatch; - stopwatch.Start(); - - auto rootFile = std::unique_ptr(TFile::Open(treefile, "RECREATE")); - if (!rootFile || !rootFile->IsOpen()) { - printf("Failed to create RAM file %s\n", treefile); - return; - } - - RAMNTupleRecord::InitializeRefs(); - - auto model = RAMNTupleRecord::MakeModel(); - - ROOT::RNTupleWriteOptions writeOptions; - writeOptions.SetCompression(compression_algorithm); - writeOptions.SetMaxUnzippedPageSize(64000); - - auto writer = ROOT::RNTupleWriter::Append(std::move(model), "RAM", *rootFile, writeOptions); - auto defaultEntry = writer->GetModel().CreateEntry(); - auto recordPtr = defaultEntry->GetPtr("record"); - - TList headers; - headers.SetName("headers"); - - ramcore::SamParser parser; - - auto header_callback = [&headers](const std::string& tag, const std::string& content) { - headers.Add(new TNamed(tag.c_str(), content.c_str())); - - if (tag == "@SQ") { - size_t sn_pos = content.find("SN:"); - if (sn_pos != std::string::npos) { - sn_pos += 3; - size_t tab_pos = content.find('\t', sn_pos); - std::string ref_name = content.substr(sn_pos, - tab_pos != std::string::npos ? tab_pos - sn_pos : std::string::npos); - RAMNTupleRecord::GetRnameRefs()->GetRefId(ref_name.c_str()); - } - } - }; - - auto record_callback = [&](const ramcore::SamRecord &sam_record, size_t record_num) { - recordPtr->SetBit(quality_policy); - - recordPtr->SetQNAME(sam_record.qname.c_str()); - recordPtr->SetFLAG(sam_record.flag); - recordPtr->SetREFID(sam_record.rname.c_str()); - recordPtr->SetPOS(sam_record.pos); - recordPtr->SetMAPQ(sam_record.mapq); - recordPtr->SetCIGAR(sam_record.cigar.c_str()); - recordPtr->SetREFNEXT(sam_record.rnext.c_str()); - recordPtr->SetPNEXT(sam_record.pnext); - recordPtr->SetTLEN(sam_record.tlen); - recordPtr->SetSEQ(sam_record.seq.c_str()); - recordPtr->SetQUAL(sam_record.qual.c_str()); - - recordPtr->ResetNOPT(); - for (const auto &opt : sam_record.optional_fields) { - recordPtr->SetOPT(opt.c_str()); - } - - writer->Fill(*defaultEntry); - - if (index && record_num % 1000 == 0) { - RAMNTupleRecord::GetIndex()->AddItem(recordPtr->GetREFID(), recordPtr->GetPOS() - 1, record_num); - } - }; - - if (!parser.ParseFile(datafile, header_callback, record_callback)) { - printf("Failed to parse SAM file %s\n", datafile); - return; - } - - writer.reset(); - - if (index) { - RAMNTupleRecord::WriteIndex(*rootFile); - } - RAMNTupleRecord::WriteAllRefs(*rootFile); - - headers.Write(); - rootFile->Close(); - - printf("\nRAM file created: %s\n", treefile); - printf("Number of entries: %zu\n", parser.GetRecordsProcessed()); - - RAMNTupleRecord::GetRnameRefs()->Print(); - RAMNTupleRecord::GetRnextRefs()->Print(); - - if (index) { - printf("\nIndex entries: %zu\n", RAMNTupleRecord::GetIndex()->Size()); - } - - printf("\nProcessed %zu SAM headers\n", - parser.GetLinesProcessed() - parser.GetRecordsProcessed()); - printf("Processed %zu SAM records\n\n", parser.GetRecordsProcessed()); - - stopwatch.Print(); -} + (void)split; + (void)cache; + TStopwatch stopwatch; + stopwatch.Start(); + + auto rootFile = std::unique_ptr(TFile::Open(treefile, "RECREATE")); + if (!rootFile || !rootFile->IsOpen()) { + std::cerr << "Failed to create RAM file " << treefile << std::endl; + return; + } + + RAMNTupleRecord::InitializeRefs(); + + auto model = RAMNTupleRecord::MakeModel(); + + ROOT::RNTupleWriteOptions writeOptions; + writeOptions.SetCompression(compression_algorithm); + writeOptions.SetMaxUnzippedPageSize(64000); + + auto writer = ROOT::RNTupleWriter::Append(std::move(model), "RAM", *rootFile, writeOptions); + auto defaultEntry = writer->GetModel().CreateEntry(); + auto recordPtr = defaultEntry->GetPtr("record"); + + TList headers; + headers.SetName("headers"); + + ramcore::SamParser parser; + + int64_t mapped_count = 0; + int32_t last_refid = -2; + int32_t last_indexed_pos = -1000000; + const int32_t POS_INTERVAL = 10000; + const int64_t MAPPED_INTERVAL = 100; + + auto header_callback = [&headers](const std::string &tag, const std::string &content) { + headers.Add(new TNamed(tag.c_str(), content.c_str())); + + if (tag == "@SQ") { + size_t sn_pos = content.find("SN:"); + if (sn_pos != std::string::npos) { + sn_pos += 3; + size_t tab_pos = content.find('\t', sn_pos); + std::string ref_name = + content.substr(sn_pos, tab_pos != std::string::npos ? tab_pos - sn_pos : std::string::npos); + RAMNTupleRecord::GetRnameRefs()->GetRefId(ref_name.c_str()); + } + } + }; + auto record_callback = [&](const ramcore::SamRecord &sam_record, size_t record_num) { + recordPtr->SetBit(quality_policy); + + recordPtr->SetQNAME(sam_record.qname.c_str()); + recordPtr->SetFLAG(sam_record.flag); + recordPtr->SetREFID(sam_record.rname.c_str()); + recordPtr->SetPOS(sam_record.pos); + recordPtr->SetMAPQ(sam_record.mapq); + recordPtr->SetCIGAR(sam_record.cigar.c_str()); + recordPtr->SetREFNEXT(sam_record.rnext.c_str()); + recordPtr->SetPNEXT(sam_record.pnext); + recordPtr->SetTLEN(sam_record.tlen); + recordPtr->SetSEQ(sam_record.seq.c_str()); + recordPtr->SetQUAL(sam_record.qual.c_str()); + + recordPtr->ResetNOPT(); + for (const auto &opt : sam_record.optional_fields) { + recordPtr->SetOPT(opt.c_str()); + } + + writer->Fill(*defaultEntry); + + if (index && !(sam_record.flag & 0x4) && recordPtr->GetREFID() >= 0) { + mapped_count++; + bool should_index = false; + + int current_refid = recordPtr->GetREFID(); + int current_pos = recordPtr->GetPOS() - 1; + + if (current_refid != last_refid) { + should_index = true; + last_refid = current_refid; + last_indexed_pos = current_pos; + } + + else if (current_pos - last_indexed_pos >= POS_INTERVAL) { + should_index = true; + last_indexed_pos = current_pos; + } + + else if (mapped_count % MAPPED_INTERVAL == 0) { + should_index = true; + } + + if (should_index) { + RAMNTupleRecord::GetIndex()->AddItem(current_refid, current_pos, record_num); + } + } + }; + + if (!parser.ParseFile(datafile, header_callback, record_callback)) { + std::cerr << "Failed to parse SAM file " << datafile << std::endl; + return; + } + + writer.reset(); + + if (index) { + RAMNTupleRecord::WriteIndex(*rootFile); + } + RAMNTupleRecord::WriteAllRefs(*rootFile); + + headers.Write(); + rootFile->Close(); + + (void)mapped_count; + (void)stopwatch; +} void samtoramntuple_split_by_chromosome(const char *datafile, const char *output_prefix, int compression_algorithm, - uint32_t quality_policy, int num_threads) + uint32_t quality_policy, int num_threads, + const std::vector &only_chromosomes) { ROOT::EnableThreadSafety(); RAMNTupleRecord::InitializeRefs(); @@ -140,6 +157,9 @@ void samtoramntuple_split_by_chromosome(const char *datafile, const char *output ramcore::SamParser parser; + std::unordered_set chromosome_filter(only_chromosomes.begin(), only_chromosomes.end()); + const bool filter_active = !chromosome_filter.empty(); + auto header_callback = [&](const std::string &tag, const std::string &content) { headers.push_back({tag, content}); @@ -157,6 +177,9 @@ void samtoramntuple_split_by_chromosome(const char *datafile, const char *output auto record_callback = [&](const ramcore::SamRecord &sam_record, size_t record_num) { if (sam_record.rname != "*") { + if (filter_active && chromosome_filter.find(sam_record.rname) == chromosome_filter.end()) { + return; + } chromosome_records[sam_record.rname].push_back(sam_record); } }; diff --git a/src/rntuple/RAMNTupleRecord.cxx b/src/rntuple/RAMNTupleRecord.cxx index fe60c5b..31dd160 100644 --- a/src/rntuple/RAMNTupleRecord.cxx +++ b/src/rntuple/RAMNTupleRecord.cxx @@ -519,7 +519,6 @@ std::string FormatCIGAR(const std::vector &cigar_ops) void RAMNTupleConverter::ConvertSAMToRAMNTuple(const std::string &sam_file, const std::string &ram_file, uint32_t compression_flags) { - RAMNTupleRecord::InitializeRefs(); auto file = std::unique_ptr(TFile::Open(ram_file.c_str(), "RECREATE")); @@ -546,9 +545,14 @@ void RAMNTupleConverter::ConvertSAMToRAMNTuple(const std::string &sam_file, cons std::string line; int64_t entry_number = 0; + int64_t mapped_count = 0; + int32_t last_refid = -2; + int32_t last_indexed_pos = -1000000; + const int32_t POS_INTERVAL = 10000; + const int64_t MAPPED_INTERVAL = 100; + while (std::getline(sam, line)) { if (line.empty() || line[0] == '@') { - if (line.substr(0, 3) == "@SQ") { size_t sn_pos = line.find("SN:"); if (sn_pos != std::string::npos) { @@ -591,8 +595,25 @@ void RAMNTupleConverter::ConvertSAMToRAMNTuple(const std::string &sam_file, cons rec.AddTag(tag); } - if (rec.refid >= 0 && rec.pos >= 0) { - RAMNTupleRecord::GetIndex()->AddItem(rec.refid, rec.pos, entry_number); + if (!(flag_int & 0x4) && rec.refid >= 0) { + mapped_count++; + bool should_index = false; + + if (rec.refid != last_refid) { + should_index = true; + last_refid = rec.refid; + last_indexed_pos = rec.pos; + std::cout << "Indexing chromosome " << rec.refid << " at entry " << entry_number << std::endl; + } else if (rec.pos - last_indexed_pos >= POS_INTERVAL) { + should_index = true; + last_indexed_pos = rec.pos; + } else if (mapped_count % MAPPED_INTERVAL == 0) { + should_index = true; + } + + if (should_index) { + RAMNTupleRecord::GetIndex()->AddItem(rec.refid, rec.pos, entry_number); + } } *recordPtr = std::move(rec); @@ -611,6 +632,8 @@ void RAMNTupleConverter::ConvertSAMToRAMNTuple(const std::string &sam_file, cons RAMNTupleRecord::WriteIndex(*file); std::cout << "Conversion complete. Processed " << entry_number << " records." << std::endl; + std::cout << "Index entries: " << RAMNTupleRecord::GetIndex()->Size() << std::endl; + std::cout << "Mapped reads: " << mapped_count << std::endl; } void RAMNTupleConverter::ConvertRAMNTupleToSAM(const std::string &ram_file, const std::string &sam_file) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 16c5358..f96a6ee 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -8,6 +8,7 @@ function(add_ramcore_test test_name) ramcore ROOT::Core ROOT::Tree + ROOT::TreePlayer ROOT::ROOTNTuple ROOT::RIO sam_generator diff --git a/test/ramcoretests.cxx b/test/ramcoretests.cxx index ac4df24..cb44232 100644 --- a/test/ramcoretests.cxx +++ b/test/ramcoretests.cxx @@ -9,71 +9,69 @@ #include #include #include -#include +#include -class ramcoreTest : public ::testing::Test { -protected: - void SetUp() override { - const int num_reads_for_test = 100; - if (!std::filesystem::exists("samexample.sam")) { - GenerateSAMFile("samexample.sam", num_reads_for_test); - } - - std::remove("test_ttree.root"); - std::remove("test_rntuple.root"); - } - - void TearDown() override { - std::remove("test_ttree.root"); - std::remove("test_rntuple.root"); - } -}; +#include "../tools/ramview.cxx" -TEST_F(ramcoreTest, ConversionProducesEqualEntries) { - const char* samFile = "samexample.sam"; - const char* ttreeFile = "test_ttree.root"; - const char* rntupleFile = "test_rntuple.root"; +namespace { - samtoram(samFile, ttreeFile, true, true, true, 1, 0); - samtoramntuple(samFile, rntupleFile, true, true, true, 505, 0); +constexpr int kNumReadsForTest = 100; +constexpr std::string_view kSamFile{"samexample.sam"}; +constexpr std::string_view kTTreeFile{"test_ttree.root"}; +constexpr std::string_view kRNTupleFile{"test_rntuple.root"}; - auto ft = std::unique_ptr(TFile::Open(ttreeFile)); - ASSERT_TRUE(ft && !ft->IsZombie()) << "Failed to open TTree file"; - - auto ttree = dynamic_cast(ft->Get("RAM")); - ASSERT_NE(ttree, nullptr) << "Failed to get TTree"; - Long64_t ttreeEntries = ttree->GetEntries(); +class ramcoreTest : public ::testing::Test { +protected: + void SetUp() override + { + GenerateSAMFile(kSamFile.data(), kNumReadsForTest); - auto reader = ROOT::RNTupleReader::Open("RAM", rntupleFile); - ASSERT_NE(reader, nullptr) << "Failed to open RNTuple"; - Long64_t rntupleEntries = reader->GetNEntries(); + std::remove(kTTreeFile.data()); + std::remove(kRNTupleFile.data()); + } - EXPECT_EQ(ttreeEntries, rntupleEntries) - << "Entry count mismatch - TTree: " << ttreeEntries - << ", RNTuple: " << rntupleEntries; - EXPECT_GT(ttreeEntries, 0) << "No entries found"; -} + void TearDown() override + { + std::remove(kTTreeFile.data()); + std::remove(kRNTupleFile.data()); + } +}; -TEST_F(ramcoreTest, RNTupleView) +TEST_F(ramcoreTest, ConversionProducesEqualEntries) { - const char *samFile = "samexample.sam"; - const char *rntupleFile = "test_rntuple.root"; + samtoram(kSamFile.data(), kTTreeFile.data(), true, true, true, 1, 0); + samtoramntuple(kSamFile.data(), kRNTupleFile.data(), true, true, true, 505, 0); - samtoramntuple(samFile, rntupleFile, true, true, true, 505, 0); + auto ft = std::unique_ptr(TFile::Open(kTTreeFile.data())); + ASSERT_TRUE(ft && !ft->IsZombie()) << "Failed to open TTree file"; - const char *regions[] = {"chr1:100-200", "chr2:500-1000", "chr5:1000-5000", "chr10:50000-100000", "chrX:1-1000"}; + auto ttree = dynamic_cast(ft->Get("RAM")); + ASSERT_NE(ttree, nullptr) << "Failed to get TTree"; + Long64_t ttreeEntries = ttree->GetEntries(); - for (const char *region : regions) { + auto reader = ROOT::RNTupleReader::Open("RAM", kRNTupleFile.data()); + ASSERT_NE(reader, nullptr) << "Failed to open RNTuple"; + Long64_t rntupleEntries = reader->GetNEntries(); - testing::internal::CaptureStdout(); + EXPECT_EQ(ttreeEntries, rntupleEntries) + << "Entry count mismatch - TTree: " << ttreeEntries << ", RNTuple: " << rntupleEntries; + EXPECT_GT(ttreeEntries, 0) << "No entries found"; - Long64_t count = ramntupleview(rntupleFile, region, true, false, nullptr); + const char *region = "chrM:1-100000000"; - testing::internal::GetCapturedStdout(); + testing::internal::CaptureStdout(); + ramview(kTTreeFile.data(), region, /*cache=*/true, /*perfstats=*/false, /*perfstatsfilename=*/nullptr); + std::string ramview_output{}; + ramview_output = testing::internal::GetCapturedStdout(); - EXPECT_GE(count, 0) << "ramntupleview returned negative count for region: " << region; - } + testing::internal::CaptureStdout(); + ramntupleview(kRNTupleFile.data(), region, /*cache=*/true, /*perfstats=*/false, /*perfstatsfilename=*/nullptr); + std::string ramntupleview_output{}; + ramntupleview_output = testing::internal::GetCapturedStdout(); - auto reader = ROOT::RNTupleReader::Open("RAM", rntupleFile); - ASSERT_NE(reader, nullptr) << "RNTuple file corrupted after viewing"; + EXPECT_TRUE(ramview_output.find("Found") != std::string::npos); + EXPECT_TRUE(ramntupleview_output.find("Found") != std::string::npos); + EXPECT_TRUE(ramntupleview_output.find("records in region") != std::string::npos); } + +} // namespace diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index b67568f..e561071 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -22,10 +22,6 @@ ROOT_EXECUTABLE(ramntupleview ROOT::ROOTNTuple ) -install(TARGETS samtoram samtoramntuple ramntupleview - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} -) - set(ROOT_MACROS checkindex.cxx rammerge.cxx diff --git a/tools/ramview.cxx b/tools/ramview.cxx index f2fa9e9..692ae46 100644 --- a/tools/ramview.cxx +++ b/tools/ramview.cxx @@ -56,14 +56,16 @@ Long64_t ramview(const char *file, const char *query, bool cache = true, bool pe Int_t range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos)); Int_t range_end = std::stoi(region.substr(rangeDelimiterPos + 1, region.size() - rangeDelimiterPos)); + Int_t rs0 = range_start > 0 ? range_start - 1 : 0; + Int_t re0 = range_end > 0 ? range_end - 1 : 0; auto refid = RAMRecord::GetRnameRefs()->GetRefId(rname); + const UInt_t FLAG_FILTER = 0x904; - auto start_entry = RAMRecord::GetIndex()->GetRow(refid, range_start); - auto end_entry = RAMRecord::GetIndex()->GetRow(refid, range_end); + auto start_entry = RAMRecord::GetIndex()->GetRow(refid, rs0); + auto end_entry = RAMRecord::GetIndex()->GetRow(refid, re0); - printf("ramview: %s:%d (%lld) - %d (%lld)\n", rname.Data(), range_start, start_entry, - range_end, end_entry); + printf("ramview: %s:%d (%lld) - %d (%lld)\n", rname.Data(), range_start, start_entry, range_end, end_entry); if (b->GetSplitLevel() > 0) t->SetBranchStatus("RAMRecord.*", 0); @@ -76,10 +78,14 @@ Long64_t ramview(const char *file, const char *query, bool cache = true, bool pe for (; start_entry < t->GetEntries(); start_entry++) { t->GetEntry(start_entry); - if (r->GetPOS() + r->GetSEQLEN() > range_start) { - + if (r->GetREFID() < refid) + continue; + if (r->GetREFID() > refid) + break; + if (r->GetPOS() > re0) + break; + if (r->GetPOS() + r->GetSEQLEN() > rs0) break; - } } if (b->GetSplitLevel() > 0) @@ -90,14 +96,30 @@ Long64_t ramview(const char *file, const char *query, bool cache = true, bool pe for (j = start_entry; j < t->GetEntries(); j++) { t->GetEntry(j); - if (r->GetPOS() >= range_end) { + if (r->GetREFID() < refid) + continue; + if (r->GetREFID() > refid) + break; + if (r->GetFLAG() & FLAG_FILTER) + continue; + if (r->GetPOS() > re0) break; + const Int_t read_start = r->GetPOS(); + if (read_start >= rs0) { + count++; + continue; } - count++; + const Int_t read_end = r->GetSEQLEN() > 0 ? (read_start + r->GetSEQLEN() - 1) : read_start; + if (read_end >= rs0) + count++; } stopwatch.Print(); + std::string region_str = + std::string(rname.Data()) + ":" + std::to_string(range_start) + "-" + std::to_string(range_end); + std::cout << "Found " << static_cast(count) << " records in region " << region_str << std::endl; + if (perfstats) { ps->SaveAs(perfstatsfilename); delete ps; diff --git a/tools/samtoramntuple.cxx b/tools/samtoramntuple.cxx index 55d0be1..0498c8b 100644 --- a/tools/samtoramntuple.cxx +++ b/tools/samtoramntuple.cxx @@ -1,67 +1,82 @@ #include "ramcore/SamToNTuple.h" #include "rntuple/RAMNTupleRecord.h" +#include +#include +#include #include #include -#include +#include -int main(int argc, char* argv[]) { - if (argc < 2) { - std::cout << "Usage: " << argv[0] << " [output]\n"; - std::cout << "Options:\n"; - std::cout << " -split Split by chromosome\n"; - std::cout << " -noindex Disable indexing\n"; - std::cout << " -illumina Use Illumina quality binning\n"; - std::cout << " -dropqual Drop quality scores\n"; - return 1; - } - - const char* input = argv[1]; - const char* output = nullptr; +int main(int argc, char *argv[]) +{ + std::vector args; + args.reserve(static_cast(argc)); + std::for_each_n(argv, static_cast(argc), [&](char *arg) { args.emplace_back(arg); }); - bool do_split = false; - bool do_index = true; - uint32_t quality_mode = RAMNTupleRecord::kPhred33; - - for (int i = 2; i < argc; i++) { - std::string arg = argv[i]; - if (arg == "-split") { - do_split = true; - } else if (arg == "-noindex") { - do_index = false; - } else if (arg == "-illumina") { - quality_mode = RAMNTupleRecord::kIlluminaBinning; - } else if (arg == "-dropqual") { - quality_mode = RAMNTupleRecord::kDrop; - } else if (arg[0] != '-') { - output = argv[i]; - } - } + if (args.size() < 2) { + std::cout << "Usage: " << args[0] << " [output]\n"; + std::cout << "Options:\n"; + std::cout << " -split Split by chromosome\n"; + std::cout << " -only When used with -split, emit only the specified chromosome (repeatable)\n"; + std::cout << " -noindex Disable indexing\n"; + std::cout << " -illumina Use Illumina quality binning\n"; + std::cout << " -dropqual Drop quality scores\n"; + return 1; + } - std::string outfile; - if (!output) { - outfile = std::string(input); - size_t pos = outfile.rfind(".sam"); - if (pos != std::string::npos) { - outfile.erase(pos); - } - output = outfile.c_str(); - } + const char *input = args[1].c_str(); + const char *output = nullptr; - try { - if (do_split) { - samtoramntuple_split_by_chromosome(input, output, 505, quality_mode, 4); - } else { - std::string ramfile = std::string(output); - if (ramfile.find(".root") == std::string::npos && ramfile.find(".ram") == std::string::npos) { - ramfile += ".ram"; - } - samtoramntuple(input, ramfile.c_str(), do_index, true, true, 505, quality_mode); - } - } catch (const std::exception& e) { - std::cerr << "Error: " << e.what() << std::endl; - return 1; - } - - return 0; -} + bool do_split = false; + bool do_index = true; + uint32_t quality_mode = RAMNTupleRecord::kPhred33; + std::vector only_chromosomes; + + for (int i = 2; i < static_cast(args.size()); i++) { + std::string arg = args[static_cast(i)]; + if (arg == "-split") { + do_split = true; + } else if (arg == "-only") { + if (i + 1 < static_cast(args.size())) { + only_chromosomes.emplace_back(args[static_cast(++i)]); + } else { + std::cerr << "-only expects a chromosome name\n"; + return 1; + } + } else if (arg == "-noindex") { + do_index = false; + } else if (arg == "-illumina" || arg == "-dropqual") { + quality_mode = (arg == "-illumina") ? RAMNTupleRecord::kIlluminaBinning : RAMNTupleRecord::kDrop; + } else if (!arg.empty() && arg[0] != '-') { + output = args[static_cast(i)].c_str(); + } + } + + std::string outfile{}; + if (!output) { + outfile = std::string(input); + size_t pos = outfile.rfind(".sam"); + if (pos != std::string::npos) { + outfile.erase(pos); + } + output = outfile.c_str(); + } + + try { + if (do_split) { + samtoramntuple_split_by_chromosome(input, output, 505, quality_mode, 4, only_chromosomes); + } else { + std::string ramfile = std::string(output); + if (ramfile.find(".root") == std::string::npos && ramfile.find(".ram") == std::string::npos) { + ramfile += ".ram"; + } + samtoramntuple(input, ramfile.c_str(), do_index, true, true, 505, quality_mode); + } + } catch (const std::exception &e) { + std::cerr << "Error: " << e.what() << std::endl; + return 1; + } + + return 0; +}