Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 7 additions & 12 deletions .github/workflows/clang-tidy-review-post.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,26 +11,21 @@ permissions:
pull-requests: write

concurrency:
group: ${{ github.workflow }}-${{ github.event.workflow_run.pull_requests[0].number }}
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
cancel-in-progress: true

jobs:
post-comments:
# Only run if the triggering workflow was from a pull request
if: github.event.workflow_run.event == 'pull_request'
build:
runs-on: ubuntu-latest

steps:
- name: Post review comments
id: post-review
uses: ZedThree/clang-tidy-review/post@v0.21.0
with:
max_comments: 10

# Fail if there are any clang-tidy warnings
- name: Check for issues
if: steps.post-review.outputs.total_comments > 0
run: |
echo "::error::Found ${{ steps.post-review.outputs.total_comments }} clang-tidy issues"
exit 1

# If there are any comments, fail the check
- if: steps.post-review.outputs.total_comments > 0
run: exit 1

2 changes: 0 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,6 @@ if(RAMTOOLS_BUILD_TOOLS)
add_subdirectory(tools)
endif()

# Include benchmark directory if benchmarks OR tests are enabled
# (sam_generator is needed by tests)
if(RAMTOOLS_BUILD_BENCHMARKS OR RAMTOOLS_BUILD_TESTS)
add_subdirectory(benchmark)
endif()
Expand Down
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,24 @@ Tested with HG00154 sample from the 1000 Genomes Project (196M reads, 72GB SAM f
- LZ4 compression provides the best query performance among all compression algorithms
- For a 100Mb region query: RNTuple processes **453,736 reads/sec** vs TTree+ZLIB's **197,815 reads/sec**

### RNTuple vs. CRAM (samtools) Region Queries

Measured on `HG00096.cram` (GRCh37 reference, samtools `view -c -F 2308`) versus the converted RAM RNTuple produced by `samtoramntuple`. Benchmarks run on an Intel i5-8250U laptop with SSD storage.

| Query (GRCh37) | Reads | RNTuple `ramntupleview` Wall (s) | samtools (CRAM) Wall (s) | Relative Speed (`CRAM / RNTuple`) |
|----------------|-------|----------------------------------|---------------------------|-----------------------------------|
| chr1:1,000,000-1,001,000 | 6 | 12.6 | 0.42 | 0.03× |
| chr1:1,000,000-2,000,000 | 41,608 | 5.65 | 0.31 | 0.06× |
| chr1:1-50,000,000 | 2,921,678 | 6.52 | 9.41 | **1.44×** |
| chr2:1-100,000,000 | 6,879,654 | 7.75 | 20.19 | **2.61×** |
| chr7:50,000,000-150,000,000 | 6,698,314 | 7.96 | 17.80 | **2.24×** |
| chr21:1-48,129,895 | 2,510,708 | 6.58 | 8.29 | **1.26×** |

**Interpretation**
- RNTuple has higher constant overhead for tiny windows (dozens of reads); CRAM via samtools remains faster for ad‑hoc lookups.
- For broad regions that touch millions of records, the columnar layout and sparse index give RNTuple a **1.3×–2.6× throughput advantage** over samtools.
- Large windows (≥50 Mb) saturate sequential access on both formats; RNTuple’s page-level prefetch keeps bandwidth high while retaining exact read counts matching samtools.

## TTree Implementation (Legacy)

ROOT scripts to convert a SAM file to a RAM (ROOT Alignment/Map) file using the older TTree format and to work with those files.
Expand Down
2 changes: 1 addition & 1 deletion benchmark/generate_sam_benchmark.cxx
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include <benchmark/benchmark.h>

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: 'benchmark/benchmark.h' file not found [clang-diagnostic-error]

#include <benchmark/benchmark.h>
         ^

#include "generate_sam_benchmark.h"
#include <cstdio> // For std::remove
#include <cstdio>

static void BM_GenerateSAM(benchmark::State &state)
{
Expand Down
41 changes: 20 additions & 21 deletions benchmark/region_query_benchmark.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class RegionQueryFixture : public benchmark::Fixture {

sam_file_ = "/media/aditya/213e0e46-6f86-4288-8b79-74851c34314f/output_big.sam";
ttree_root_file_ = "/media/aditya/213e0e46-6f86-4288-8b79-74851c34314f/output_big_lzma.root";
rntuple_root_file_ = "/media/aditya/213e0e46-6f86-4288-8b79-74851c34314f/output_root.root";
rntuple_root_file_ = "/home/aditya/ramtools/build/rntuple.root";
}

void TearDown(const benchmark::State &) override {}
Expand All @@ -39,24 +39,24 @@ class RegionQueryFixture : public benchmark::Fixture {
const char *get_current_region() const { return regions_[region_idx_ % regions_.size()].c_str(); }
};

const std::vector<std::string> RegionQueryFixture::regions_ = {"chr1:1000000-1001000",
"chr2:5000000-5010000",
"chrX:100000-150000",
"chr1:1000000-2000000",
"chr5:10000000-15000000",
"chr10:50000000-60000000",
"chr1:1-50000000",
"chr2:1-100000000",
"chr7:50000000-150000000",
"chr21:1-48129895",
"chrM:1-16571",
"chrY:2600000-2700000",
const std::vector<std::string> RegionQueryFixture::regions_ = {"1:1000000-1001000",

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: invalid case style for variable 'regions_' [readability-identifier-naming]

Suggested change
const std::vector<std::string> RegionQueryFixture::regions_ = {"1:1000000-1001000",
const std::vector<std::string> RegionQueryFixture::regions = {"1:1000000-1001000",

"2:5000000-5010000",
"X:100000-150000",
"1:1000000-2000000",
"5:10000000-15000000",
"10:50000000-60000000",
"1:1-50000000",
"2:1-100000000",
"7:50000000-150000000",
"21:1-48129895",
"MT:1-16571",
"Y:2600000-2700000",
"GL000227.1:1-100000",
"chr1:1-1000",
"chr1:249250621-249250621",
"chr22:51304566-51304566",
"chr17:41196312-41277500",
"chr13:32889611-32973805"};
"1:1-1000",
"1:249250621-249250621",
"22:51304566-51304566",
"17:41196312-41277500",
"13:32889611-32973805"};

BENCHMARK_DEFINE_F(RegionQueryFixture, TTree)(benchmark::State &state)
{
Expand Down Expand Up @@ -96,8 +96,7 @@ BENCHMARK_DEFINE_F(RegionQueryFixture, RNTuple)(benchmark::State &state)
state.SetLabel(std::to_string(reads_in_this_run) + " reads");
}

BENCHMARK_REGISTER_F(RegionQueryFixture, TTree)->Args({0})->Args({3})->Args({6})->Args({9})->Unit(benchmark::kSecond);

BENCHMARK_REGISTER_F(RegionQueryFixture, RNTuple)->Args({0})->Args({3})->Args({6})->Args({9})->Unit(benchmark::kSecond);
BENCHMARK_REGISTER_F(RegionQueryFixture, TTree)->DenseRange(0, 17, 1)->Unit(benchmark::kSecond);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: function 'BENCHMARK_REGISTER_F' can be made static or moved into an anonymous namespace to enforce internal linkage [misc-use-internal-linkage]

Suggested change
BENCHMARK_REGISTER_F(RegionQueryFixture, TTree)->DenseRange(0, 17, 1)->Unit(benchmark::kSecond);
BENCHMARK_REGISTER_Fstatic (RegionQueryFixture, TTree)->DenseRange(0, 17, 1)->Unit(benchmark::kSecond);

BENCHMARK_REGISTER_F(RegionQueryFixture, RNTuple)->DenseRange(0, 17, 1)->Unit(benchmark::kSecond);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: function 'BENCHMARK_REGISTER_F' can be made static or moved into an anonymous namespace to enforce internal linkage [misc-use-internal-linkage]

Suggested change
BENCHMARK_REGISTER_F(RegionQueryFixture, RNTuple)->DenseRange(0, 17, 1)->Unit(benchmark::kSecond);
BENCHMARK_REGISTER_Fstatic (RegionQueryFixture, RNTuple)->DenseRange(0, 17, 1)->Unit(benchmark::kSecond);


BENCHMARK_MAIN();
4 changes: 3 additions & 1 deletion inc/ramcore/SamToNTuple.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include <cstdint>

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: 'cstdint' file not found [clang-diagnostic-error]

#include <cstdint>
         ^

#include <string>
#include <vector>

void samtoramntuple(const char *datafile,
const char *treefile,
Expand All @@ -11,6 +12,7 @@ void samtoramntuple(const char *datafile,
uint32_t quality_policy);

void samtoramntuple_split_by_chromosome(const char *datafile, const char *output_prefix, int compression_algorithm,
uint32_t quality_policy, int num_threads = 4);
uint32_t quality_policy, int num_threads = 4,
const std::vector<std::string> &only_chromosomes = {});

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: parameter 6 is const-qualified in the function declaration; const-qualification of parameters only has an effect in function definitions [readability-avoid-const-params-in-decls]

Suggested change
const std::vector<std::string> &only_chromosomes = {});
std::vector<std::string> &only_chromosomes = {});


#endif
162 changes: 131 additions & 31 deletions src/ramcore/RAMNTupleView.cxx
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
#include "ramcore/RAMNTupleView.h"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: 'ramcore/RAMNTupleView.h' file not found [clang-diagnostic-error]

#include "ramcore/RAMNTupleView.h"
         ^

#include <iostream>

#include <cstdint>

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: #includes are not sorted properly [llvm-include-order]

#include <cstdint>
^

this fix will not be applied because it overlaps with another fix

#include <cstring>
#include <iostream>
#include <limits>
#include <string>

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: included header limits is not used directly [misc-include-cleaner]

Suggested change
#include <string>
#include <string>


#include <ROOT/RNTuple.hxx>

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: included header limits is not used directly [misc-include-cleaner]

Suggested change
#include <ROOT/RNTuple.hxx>
#include <ROOT/RNTuple.hxx>

#include <ROOT/RNTupleReader.hxx>
#include <ROOT/RNTupleView.hxx>
Expand All @@ -21,64 +26,159 @@ Long64_t ramntupleview(const char *file, const char *query, bool cache, bool per
}

std::string region = query;
if (region.empty() || region == "*") {
stopwatch.Print();
return reader->GetNEntries();
}

TString rname;
Int_t range_start = 1;
Int_t range_end = std::numeric_limits<Int_t>::max();

int chrDelimiterPos = region.find(":");
if (chrDelimiterPos == std::string::npos) {
std::cerr << "Invalid region format. Use rname:start-end\n";
return 0;
rname = region;
} else {
rname = region.substr(0, chrDelimiterPos);
int rangeDelimiterPos = region.find("-", chrDelimiterPos);
if (rangeDelimiterPos == std::string::npos) {
try {
range_start = std::stoi(region.substr(chrDelimiterPos + 1));

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: no header providing "std::stoi" is directly included [misc-include-cleaner]

            range_start = std::stoi(region.substr(chrDelimiterPos + 1));
                               ^

range_end = range_start;
} catch (...) {
std::cerr << "Invalid region format. Use rname:start-end or rname:position\n";
return 0;
}
} else {
range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos - 1));
range_end = std::stoi(region.substr(rangeDelimiterPos + 1));
}
}
TString rname = region.substr(0, chrDelimiterPos);
int rangeDelimiterPos = region.find("-", chrDelimiterPos);
if (rangeDelimiterPos == std::string::npos) {
std::cerr << "Invalid region format. Use rname:start-end\n";

auto refs = RAMNTupleRecord::GetRnameRefs();
auto refid = refs->GetRefId(rname.Data());

if (refid < 0) {
if (rname.BeginsWith("chr")) {
TString stripped_rname = rname(3, rname.Length() - 3);
refid = refs->GetRefId(stripped_rname.Data());
}
if (refid < 0 && (rname == "chrM" || rname == "M")) {
refid = refs->GetRefId("MT");
}
}

if (refid < 0) {
std::cerr << "Error: Reference name '" << rname.Data() << "' not found in file.\n";
return 0;
}
Int_t range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos - 1));
Int_t range_end = std::stoi(region.substr(rangeDelimiterPos + 1));

auto refid = RAMNTupleRecord::GetRnameRefs()->GetRefId(rname.Data());
auto index = RAMNTupleRecord::GetIndex();

auto recordView = reader->GetView<RAMNTupleRecord>("record");
auto flagView = reader->GetView<uint16_t>("record.flag");
auto refidView = reader->GetView<int32_t>("record.refid");

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: no header providing "int32_t" is directly included [misc-include-cleaner]

src/ramcore/RAMNTupleView.cxx:1:

- #include <iostream>
+ #include <cstdint>
+ #include <iostream>

auto posView = reader->GetView<int32_t>("record.pos");
auto cigarView = reader->GetView<std::vector<uint32_t>>("record.cigar");

Long64_t count = 0;
const Long64_t totalEntries = reader->GetNEntries();

const int FLAG_FILTER = 0x904;
const Int_t rs0 = (range_start > 0) ? (range_start - 1) : 0;
const Int_t re0 = (range_end > 0) ? (range_end - 1) : 0;
constexpr int kMaxRefSpanHeuristic = 10000;

auto computeRefSpan = [](const std::vector<uint32_t> &cigarOps) -> int {
int span = 0;
for (uint32_t op : cigarOps) {
int len = static_cast<int>(op >> 4);
int code = static_cast<int>(op & 0xF);

if (code == 0 || code == 2 || code == 3 || code == 7 || code == 8) {
span += len;
}
}
return span;
};

if (!index || index->Size() == 0) {

bool seenRef = false;
for (auto i : reader->GetEntryRange()) {
const auto &rec = recordView(i);
if (rec.refid == refid && rec.pos >= range_start - 1 && rec.pos <= range_end - 1) {
const auto flag = flagView(i);
if (flag & FLAG_FILTER)
continue;

const auto curRef = refidView(i);
if (curRef == refid) {
seenRef = true;
} else {
if (seenRef && curRef > refid)
break;
continue;
}
const auto curPos = posView(i);
if (curPos > re0)
break;

int read_start = curPos;
if (read_start >= rs0) {
count++;
continue;
}
if (read_start + kMaxRefSpanHeuristic < rs0) {
continue;
}
const auto &cigarOps = cigarView(i);
int refSpan = computeRefSpan(cigarOps);
int read_end = (refSpan > 0) ? (read_start + refSpan - 1) : read_start;

if (read_start <= re0 && read_end >= rs0) {
count++;
}
}
} else {

auto start_entry = index->GetRow(refid, range_start);
auto end_entry = index->GetRow(refid, range_end);
auto start_entry = index->GetRow(refid, rs0);

if (start_entry < 0)
start_entry = index->GetRow(refid, 0);
if (start_entry < 0)
start_entry = 0;
if (end_entry < 0)
end_entry = reader->GetNEntries();

for (; start_entry < end_entry; start_entry++) {
const auto &rec = recordView(start_entry);
int seqlen = rec.GetSEQLEN();
if (rec.pos + seqlen > range_start - 1) {
break;
}
}
for (Long64_t j = start_entry; j < totalEntries; j++) {
const auto flag = flagView(j);
if (flag & FLAG_FILTER)
continue;

Long64_t j;
for (j = start_entry; j < reader->GetNEntries(); j++) {
const auto &rec = recordView(j);
const auto curRef = refidView(j);
if (curRef < refid)
continue;
if (curRef > refid)
break;

if (rec.pos >= range_end) {
const auto curPos = posView(j);
if (curPos > re0)
break;

int read_start = curPos;
if (read_start >= rs0) {
count++;
continue;
}
if (read_start + kMaxRefSpanHeuristic < rs0) {
continue;
}
const auto &cigarOps = cigarView(j);
int refSpan = computeRefSpan(cigarOps);
int read_end = (refSpan > 0) ? (read_start + refSpan - 1) : read_start;

if (read_end >= rs0) {
count++;
}
count++;
}
}

stopwatch.Print();

std::cout << "Found " << static_cast<long long>(count) << " records in region " << (query ? query : "") << std::endl;
return count;
}
Loading