-
Notifications
You must be signed in to change notification settings - Fork 1
Index Repairing of RNTuple ROOT file #13
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -20,7 +20,7 @@ class RegionQueryFixture : public benchmark::Fixture { | |||||
|
|
||||||
| sam_file_ = "/media/aditya/213e0e46-6f86-4288-8b79-74851c34314f/output_big.sam"; | ||||||
| ttree_root_file_ = "/media/aditya/213e0e46-6f86-4288-8b79-74851c34314f/output_big_lzma.root"; | ||||||
| rntuple_root_file_ = "/media/aditya/213e0e46-6f86-4288-8b79-74851c34314f/output_root.root"; | ||||||
| rntuple_root_file_ = "/home/aditya/ramtools/build/rntuple.root"; | ||||||
| } | ||||||
|
|
||||||
| void TearDown(const benchmark::State &) override {} | ||||||
|
|
@@ -39,24 +39,24 @@ class RegionQueryFixture : public benchmark::Fixture { | |||||
| const char *get_current_region() const { return regions_[region_idx_ % regions_.size()].c_str(); } | ||||||
| }; | ||||||
|
|
||||||
| const std::vector<std::string> RegionQueryFixture::regions_ = {"chr1:1000000-1001000", | ||||||
| "chr2:5000000-5010000", | ||||||
| "chrX:100000-150000", | ||||||
| "chr1:1000000-2000000", | ||||||
| "chr5:10000000-15000000", | ||||||
| "chr10:50000000-60000000", | ||||||
| "chr1:1-50000000", | ||||||
| "chr2:1-100000000", | ||||||
| "chr7:50000000-150000000", | ||||||
| "chr21:1-48129895", | ||||||
| "chrM:1-16571", | ||||||
| "chrY:2600000-2700000", | ||||||
| const std::vector<std::string> RegionQueryFixture::regions_ = {"1:1000000-1001000", | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: invalid case style for variable 'regions_' [readability-identifier-naming]
Suggested change
|
||||||
| "2:5000000-5010000", | ||||||
| "X:100000-150000", | ||||||
| "1:1000000-2000000", | ||||||
| "5:10000000-15000000", | ||||||
| "10:50000000-60000000", | ||||||
| "1:1-50000000", | ||||||
| "2:1-100000000", | ||||||
| "7:50000000-150000000", | ||||||
| "21:1-48129895", | ||||||
| "MT:1-16571", | ||||||
| "Y:2600000-2700000", | ||||||
| "GL000227.1:1-100000", | ||||||
| "chr1:1-1000", | ||||||
| "chr1:249250621-249250621", | ||||||
| "chr22:51304566-51304566", | ||||||
| "chr17:41196312-41277500", | ||||||
| "chr13:32889611-32973805"}; | ||||||
| "1:1-1000", | ||||||
| "1:249250621-249250621", | ||||||
| "22:51304566-51304566", | ||||||
| "17:41196312-41277500", | ||||||
| "13:32889611-32973805"}; | ||||||
|
|
||||||
| BENCHMARK_DEFINE_F(RegionQueryFixture, TTree)(benchmark::State &state) | ||||||
| { | ||||||
|
|
@@ -96,8 +96,7 @@ BENCHMARK_DEFINE_F(RegionQueryFixture, RNTuple)(benchmark::State &state) | |||||
| state.SetLabel(std::to_string(reads_in_this_run) + " reads"); | ||||||
| } | ||||||
|
|
||||||
| BENCHMARK_REGISTER_F(RegionQueryFixture, TTree)->Args({0})->Args({3})->Args({6})->Args({9})->Unit(benchmark::kSecond); | ||||||
|
|
||||||
| BENCHMARK_REGISTER_F(RegionQueryFixture, RNTuple)->Args({0})->Args({3})->Args({6})->Args({9})->Unit(benchmark::kSecond); | ||||||
| BENCHMARK_REGISTER_F(RegionQueryFixture, TTree)->DenseRange(0, 17, 1)->Unit(benchmark::kSecond); | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: function 'BENCHMARK_REGISTER_F' can be made static or moved into an anonymous namespace to enforce internal linkage [misc-use-internal-linkage]
Suggested change
|
||||||
| BENCHMARK_REGISTER_F(RegionQueryFixture, RNTuple)->DenseRange(0, 17, 1)->Unit(benchmark::kSecond); | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: function 'BENCHMARK_REGISTER_F' can be made static or moved into an anonymous namespace to enforce internal linkage [misc-use-internal-linkage]
Suggested change
|
||||||
|
|
||||||
| BENCHMARK_MAIN(); | ||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |||||
|
|
||||||
| #include <cstdint> | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: 'cstdint' file not found [clang-diagnostic-error] #include <cstdint>
^ |
||||||
| #include <string> | ||||||
| #include <vector> | ||||||
|
|
||||||
| void samtoramntuple(const char *datafile, | ||||||
| const char *treefile, | ||||||
|
|
@@ -11,6 +12,7 @@ void samtoramntuple(const char *datafile, | |||||
| uint32_t quality_policy); | ||||||
|
|
||||||
| void samtoramntuple_split_by_chromosome(const char *datafile, const char *output_prefix, int compression_algorithm, | ||||||
| uint32_t quality_policy, int num_threads = 4); | ||||||
| uint32_t quality_policy, int num_threads = 4, | ||||||
| const std::vector<std::string> &only_chromosomes = {}); | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: parameter 6 is const-qualified in the function declaration; const-qualification of parameters only has an effect in function definitions [readability-avoid-const-params-in-decls]
Suggested change
|
||||||
|
|
||||||
| #endif | ||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,6 +1,11 @@ | ||||||
| #include "ramcore/RAMNTupleView.h" | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: 'ramcore/RAMNTupleView.h' file not found [clang-diagnostic-error] #include "ramcore/RAMNTupleView.h"
^ |
||||||
| #include <iostream> | ||||||
|
|
||||||
| #include <cstdint> | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: #includes are not sorted properly [llvm-include-order] #include <cstdint>
^this fix will not be applied because it overlaps with another fix |
||||||
| #include <cstring> | ||||||
| #include <iostream> | ||||||
| #include <limits> | ||||||
| #include <string> | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: included header limits is not used directly [misc-include-cleaner]
Suggested change
|
||||||
|
|
||||||
| #include <ROOT/RNTuple.hxx> | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: included header limits is not used directly [misc-include-cleaner]
Suggested change
|
||||||
| #include <ROOT/RNTupleReader.hxx> | ||||||
| #include <ROOT/RNTupleView.hxx> | ||||||
|
|
@@ -21,64 +26,159 @@ Long64_t ramntupleview(const char *file, const char *query, bool cache, bool per | |||||
| } | ||||||
|
|
||||||
| std::string region = query; | ||||||
| if (region.empty() || region == "*") { | ||||||
| stopwatch.Print(); | ||||||
| return reader->GetNEntries(); | ||||||
| } | ||||||
|
|
||||||
| TString rname; | ||||||
| Int_t range_start = 1; | ||||||
| Int_t range_end = std::numeric_limits<Int_t>::max(); | ||||||
|
|
||||||
| int chrDelimiterPos = region.find(":"); | ||||||
| if (chrDelimiterPos == std::string::npos) { | ||||||
| std::cerr << "Invalid region format. Use rname:start-end\n"; | ||||||
| return 0; | ||||||
| rname = region; | ||||||
| } else { | ||||||
| rname = region.substr(0, chrDelimiterPos); | ||||||
| int rangeDelimiterPos = region.find("-", chrDelimiterPos); | ||||||
| if (rangeDelimiterPos == std::string::npos) { | ||||||
| try { | ||||||
| range_start = std::stoi(region.substr(chrDelimiterPos + 1)); | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: no header providing "std::stoi" is directly included [misc-include-cleaner] range_start = std::stoi(region.substr(chrDelimiterPos + 1));
^ |
||||||
| range_end = range_start; | ||||||
| } catch (...) { | ||||||
| std::cerr << "Invalid region format. Use rname:start-end or rname:position\n"; | ||||||
| return 0; | ||||||
| } | ||||||
| } else { | ||||||
| range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos - 1)); | ||||||
| range_end = std::stoi(region.substr(rangeDelimiterPos + 1)); | ||||||
| } | ||||||
| } | ||||||
| TString rname = region.substr(0, chrDelimiterPos); | ||||||
| int rangeDelimiterPos = region.find("-", chrDelimiterPos); | ||||||
| if (rangeDelimiterPos == std::string::npos) { | ||||||
| std::cerr << "Invalid region format. Use rname:start-end\n"; | ||||||
|
|
||||||
| auto refs = RAMNTupleRecord::GetRnameRefs(); | ||||||
| auto refid = refs->GetRefId(rname.Data()); | ||||||
|
|
||||||
| if (refid < 0) { | ||||||
| if (rname.BeginsWith("chr")) { | ||||||
| TString stripped_rname = rname(3, rname.Length() - 3); | ||||||
| refid = refs->GetRefId(stripped_rname.Data()); | ||||||
| } | ||||||
| if (refid < 0 && (rname == "chrM" || rname == "M")) { | ||||||
| refid = refs->GetRefId("MT"); | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| if (refid < 0) { | ||||||
| std::cerr << "Error: Reference name '" << rname.Data() << "' not found in file.\n"; | ||||||
| return 0; | ||||||
| } | ||||||
| Int_t range_start = std::stoi(region.substr(chrDelimiterPos + 1, rangeDelimiterPos - chrDelimiterPos - 1)); | ||||||
| Int_t range_end = std::stoi(region.substr(rangeDelimiterPos + 1)); | ||||||
|
|
||||||
| auto refid = RAMNTupleRecord::GetRnameRefs()->GetRefId(rname.Data()); | ||||||
| auto index = RAMNTupleRecord::GetIndex(); | ||||||
|
|
||||||
| auto recordView = reader->GetView<RAMNTupleRecord>("record"); | ||||||
| auto flagView = reader->GetView<uint16_t>("record.flag"); | ||||||
| auto refidView = reader->GetView<int32_t>("record.refid"); | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: no header providing "int32_t" is directly included [misc-include-cleaner] src/ramcore/RAMNTupleView.cxx:1: - #include <iostream>
+ #include <cstdint>
+ #include <iostream> |
||||||
| auto posView = reader->GetView<int32_t>("record.pos"); | ||||||
| auto cigarView = reader->GetView<std::vector<uint32_t>>("record.cigar"); | ||||||
|
|
||||||
| Long64_t count = 0; | ||||||
| const Long64_t totalEntries = reader->GetNEntries(); | ||||||
|
|
||||||
| const int FLAG_FILTER = 0x904; | ||||||
| const Int_t rs0 = (range_start > 0) ? (range_start - 1) : 0; | ||||||
| const Int_t re0 = (range_end > 0) ? (range_end - 1) : 0; | ||||||
| constexpr int kMaxRefSpanHeuristic = 10000; | ||||||
|
|
||||||
| auto computeRefSpan = [](const std::vector<uint32_t> &cigarOps) -> int { | ||||||
| int span = 0; | ||||||
| for (uint32_t op : cigarOps) { | ||||||
| int len = static_cast<int>(op >> 4); | ||||||
| int code = static_cast<int>(op & 0xF); | ||||||
|
|
||||||
| if (code == 0 || code == 2 || code == 3 || code == 7 || code == 8) { | ||||||
| span += len; | ||||||
| } | ||||||
| } | ||||||
| return span; | ||||||
| }; | ||||||
|
|
||||||
| if (!index || index->Size() == 0) { | ||||||
|
|
||||||
| bool seenRef = false; | ||||||
| for (auto i : reader->GetEntryRange()) { | ||||||
| const auto &rec = recordView(i); | ||||||
| if (rec.refid == refid && rec.pos >= range_start - 1 && rec.pos <= range_end - 1) { | ||||||
| const auto flag = flagView(i); | ||||||
| if (flag & FLAG_FILTER) | ||||||
| continue; | ||||||
|
|
||||||
| const auto curRef = refidView(i); | ||||||
| if (curRef == refid) { | ||||||
| seenRef = true; | ||||||
| } else { | ||||||
| if (seenRef && curRef > refid) | ||||||
| break; | ||||||
| continue; | ||||||
| } | ||||||
| const auto curPos = posView(i); | ||||||
| if (curPos > re0) | ||||||
| break; | ||||||
|
|
||||||
| int read_start = curPos; | ||||||
| if (read_start >= rs0) { | ||||||
| count++; | ||||||
| continue; | ||||||
| } | ||||||
| if (read_start + kMaxRefSpanHeuristic < rs0) { | ||||||
| continue; | ||||||
| } | ||||||
| const auto &cigarOps = cigarView(i); | ||||||
| int refSpan = computeRefSpan(cigarOps); | ||||||
| int read_end = (refSpan > 0) ? (read_start + refSpan - 1) : read_start; | ||||||
|
|
||||||
| if (read_start <= re0 && read_end >= rs0) { | ||||||
| count++; | ||||||
| } | ||||||
| } | ||||||
| } else { | ||||||
|
|
||||||
| auto start_entry = index->GetRow(refid, range_start); | ||||||
| auto end_entry = index->GetRow(refid, range_end); | ||||||
| auto start_entry = index->GetRow(refid, rs0); | ||||||
|
|
||||||
| if (start_entry < 0) | ||||||
| start_entry = index->GetRow(refid, 0); | ||||||
| if (start_entry < 0) | ||||||
| start_entry = 0; | ||||||
| if (end_entry < 0) | ||||||
| end_entry = reader->GetNEntries(); | ||||||
|
|
||||||
| for (; start_entry < end_entry; start_entry++) { | ||||||
| const auto &rec = recordView(start_entry); | ||||||
| int seqlen = rec.GetSEQLEN(); | ||||||
| if (rec.pos + seqlen > range_start - 1) { | ||||||
| break; | ||||||
| } | ||||||
| } | ||||||
| for (Long64_t j = start_entry; j < totalEntries; j++) { | ||||||
| const auto flag = flagView(j); | ||||||
| if (flag & FLAG_FILTER) | ||||||
| continue; | ||||||
|
|
||||||
| Long64_t j; | ||||||
| for (j = start_entry; j < reader->GetNEntries(); j++) { | ||||||
| const auto &rec = recordView(j); | ||||||
| const auto curRef = refidView(j); | ||||||
| if (curRef < refid) | ||||||
| continue; | ||||||
| if (curRef > refid) | ||||||
| break; | ||||||
|
|
||||||
| if (rec.pos >= range_end) { | ||||||
| const auto curPos = posView(j); | ||||||
| if (curPos > re0) | ||||||
| break; | ||||||
|
|
||||||
| int read_start = curPos; | ||||||
| if (read_start >= rs0) { | ||||||
| count++; | ||||||
| continue; | ||||||
| } | ||||||
| if (read_start + kMaxRefSpanHeuristic < rs0) { | ||||||
| continue; | ||||||
| } | ||||||
| const auto &cigarOps = cigarView(j); | ||||||
| int refSpan = computeRefSpan(cigarOps); | ||||||
| int read_end = (refSpan > 0) ? (read_start + refSpan - 1) : read_start; | ||||||
|
|
||||||
| if (read_end >= rs0) { | ||||||
| count++; | ||||||
| } | ||||||
| count++; | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| stopwatch.Print(); | ||||||
|
|
||||||
| std::cout << "Found " << static_cast<long long>(count) << " records in region " << (query ? query : "") << std::endl; | ||||||
| return count; | ||||||
| } | ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
warning: 'benchmark/benchmark.h' file not found [clang-diagnostic-error]