From 1f05789e5f24da84dbb537a345a7334c213c7d9d Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 28 Apr 2026 16:57:51 +0200 Subject: [PATCH 1/7] Drop unused recount3 annotation fields, add tests --- .github/workflows/ci.yml | 1 + cpp/SJRow.cpp | 8 +- cpp/SJRow.h | 28 ++++--- tests/UnitTests.cpp | 101 ++++++++++++++++++------ tests/gtfs/parser_test3.gtf | 4 +- tests/gtfs/splicing_scenarios_test1.gtf | 4 +- tests/gtfs/splicing_scenarios_test2.gtf | 4 +- tests/gtfs/splicing_scenarios_test3.gtf | 4 +- tests/gtfs/splicing_scenarios_test4.gtf | 4 +- tests/gtfs/splicing_scenarios_test5.gtf | 4 +- tests/gtfs/splicing_scenarios_test6.gtf | 4 +- tests/gtfs/splicing_scenarios_test7.gtf | 4 +- tests/gtfs/splicing_scenarios_test8.gtf | 4 +- 13 files changed, 113 insertions(+), 61 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 80a483b..887d35b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,7 @@ on: branches: - build - imallona + - perf-mm-rr-memory jobs: build-and-test: diff --git a/cpp/SJRow.cpp b/cpp/SJRow.cpp index 2847f42..fdb4a0d 100644 --- a/cpp/SJRow.cpp +++ b/cpp/SJRow.cpp @@ -4,17 +4,11 @@ #include "SJRow.h" -SJRow::SJRow(std::string _chrom, uint64_t _start, uint64_t _end, int _length, char _strand, bool _annotated, - std::string _left_motif, std::string _right_motif, std::string _left_annotated, std::string _right_annotated) { +SJRow::SJRow(std::string _chrom, uint64_t _start, uint64_t _end, int _length, char _strand, bool _annotated) { chrom = _chrom; start = _start; end = _end; length = _length; strand = (_strand == '+'); annotated = _annotated; - left_motif = _left_motif; - right_motif = _right_motif; - left_annotated = _left_annotated; - right_annotated = _right_annotated; }; - diff --git a/cpp/SJRow.h b/cpp/SJRow.h index 35b9f07..9233984 100644 --- a/cpp/SJRow.h +++ b/cpp/SJRow.h @@ -19,32 +19,34 @@ class SJRow unsigned int length; bool strand; // 1 = +, 0 = - bool annotated; // 0 or 1 - std::string left_motif; - std::string right_motif; - std::string left_annotated; - std::string right_annotated; + + // The recount3 RR file carries four extra columns per junction — + // left_motif, right_motif, left_annotated, right_annotated — that fastder + // reads but never accesses after parse. On full hg38 (~9.5M junctions) + // those strings cost ~2 GB of resident heap for nothing. We parse and + // discard them in operator>> below to stay format-compatible without + // paying the storage cost. // constructor SJRow() = default; - SJRow(std::string _chrom, uint64_t _start, uint64_t _end, int _length, char _strand, bool _annotated, - std::string _left_motif, std::string _right_motif, std::string _left_annotated, std::string _right_annotated); + SJRow(std::string _chrom, uint64_t _start, uint64_t _end, int _length, char _strand, bool _annotated); // overload input operator for SJRow friend std::istream& operator>>(std::istream &is, SJRow &row) { char strand_; - is >> row.chrom >> row.start >> row.end >> row.length >> strand_ >> row.annotated >> row.left_motif - >> row.right_motif >> row.left_annotated >> row.right_annotated; + std::string discard; + is >> row.chrom >> row.start >> row.end >> row.length >> strand_ >> row.annotated + >> discard >> discard >> discard >> discard; row.strand = (strand_ == '+'); // 1 if + return is; } - // overload output operator for SJRow + // overload output operator for SJRow (debug only — not part of fastder's outputs) friend std::ostream& operator<< (std::ostream& os, const SJRow& row) { - return os << row.chrom << "\t" << row.start << "\t" << row.end << "\t" << row.length << "\t" << row.strand << "\t" << - row.annotated << "\t" << row.left_motif << "\t" << row.right_motif << "\t" << row.left_annotated << "\t" << row.right_annotated; - + return os << row.chrom << "\t" << row.start << "\t" << row.end << "\t" << row.length + << "\t" << row.strand << "\t" << row.annotated; } }; @@ -52,4 +54,4 @@ class SJRow -#endif //FASTDER_SPLICE_JUNCTION_H \ No newline at end of file +#endif //FASTDER_SPLICE_JUNCTION_H diff --git a/tests/UnitTests.cpp b/tests/UnitTests.cpp index ebc4f43..ff21eeb 100644 --- a/tests/UnitTests.cpp +++ b/tests/UnitTests.cpp @@ -17,8 +17,8 @@ TEST(SpliceTestChromOne, TwoStitchedERsTwoSJs) { // create splice junctions std::vector rr_all_sj = { - SJRow("chr1", 10500, 11000, 500, '-', false, "CT", "AC", "0", "0"), // id 1 - SJRow("chr1", 13000, 14000, 1000, '-', false, "CT", "AC", "0", "0"), // id 2 + SJRow("chr1", 10500, 11000, 500, '-', false), // id 1 + SJRow("chr1", 13000, 14000, 1000, '-', false), // id 2 }; // create expressed regions map @@ -43,9 +43,9 @@ TEST(SpliceTestChromOne, StitchedERWithThreeERsTwoSJs) { // create splice junctions std::vector rr_all_sj = { - SJRow("chr1", 10500, 11000, 500, '-', false, "CT", "AC", "0", "0"), // id 1 - SJRow("chr1", 13000, 14000, 1000, '-', false, "CT", "AC", "0", "0"), // id 2 - SJRow("chr1", 14200, 15000, 800, '-', false, "CT", "AC", "0", "0") // id 3 + SJRow("chr1", 10500, 11000, 500, '-', false), // id 1 + SJRow("chr1", 13000, 14000, 1000, '-', false), // id 2 + SJRow("chr1", 14200, 15000, 800, '-', false) // id 3 }; // create expressed regions map @@ -70,9 +70,9 @@ TEST(SpliceTestChromOne, StitchedERWithThreeERsTwoSJsAndTailingER) { // create splice junctions std::vector rr_all_sj = { - SJRow("chr1", 10500, 11000, 500, '-', false, "CT", "AC", "0", "0"), // id 1 - SJRow("chr1", 13000, 14000, 1000, '-', false, "CT", "AC", "0", "0"), // id 2 - SJRow("chr1", 14200, 15000, 800, '-', false, "CT", "AC", "0", "0") // id 3 + SJRow("chr1", 10500, 11000, 500, '-', false), // id 1 + SJRow("chr1", 13000, 14000, 1000, '-', false), // id 2 + SJRow("chr1", 14200, 15000, 800, '-', false) // id 3 }; // create expressed regions map @@ -98,9 +98,9 @@ TEST(SpliceTestChromOne, StitchedERWithThreeERsTwoSJsAndTwoTailingERs) { // create splice junctions std::vector rr_all_sj = { - SJRow("chr1", 10500, 11000, 500, '-', false, "CT", "AC", "0", "0"), // id 1 - SJRow("chr1", 13000, 14000, 1000, '-', false, "CT", "AC", "0", "0"), // id 2 - SJRow("chr1", 14200, 15000, 800, '-', false, "CT", "AC", "0", "0") // id 3 + SJRow("chr1", 10500, 11000, 500, '-', false), // id 1 + SJRow("chr1", 13000, 14000, 1000, '-', false), // id 2 + SJRow("chr1", 14200, 15000, 800, '-', false) // id 3 }; // create expressed regions map @@ -128,9 +128,9 @@ TEST(SpliceTestChromOne, NoSJUsed) { // create splice junctions std::vector rr_all_sj = { - SJRow("chr1", 500, 1000, 500, '-', false, "CT", "AC", "0", "0"), // id 1 - SJRow("chr1", 12000, 13000, 1000, '-', false, "CT", "AC", "0", "0"), // id 2 - SJRow("chr1", 15300, 15400, 100, '-', false, "CT", "AC", "0", "0") // id 3 + SJRow("chr1", 500, 1000, 500, '-', false), // id 1 + SJRow("chr1", 12000, 13000, 1000, '-', false), // id 2 + SJRow("chr1", 15300, 15400, 100, '-', false) // id 3 }; // create expressed regions map @@ -158,9 +158,9 @@ TEST(SpliceTestChromOne, MiddleSJUnusedTailingSJsUsed) { // create splice junctions std::vector rr_all_sj = { - SJRow("chr1", 10500, 11000, 500, '-', false, "CT", "AC", "0", "0"), // id 1 - SJRow("chr1", 12000, 13000, 1000, '-', false, "CT", "AC", "0", "0"), // id 2 - SJRow("chr1", 14200, 15000, 800, '-', false, "CT", "AC", "0", "0") // id 3 + SJRow("chr1", 10500, 11000, 500, '-', false), // id 1 + SJRow("chr1", 12000, 13000, 1000, '-', false), // id 2 + SJRow("chr1", 14200, 15000, 800, '-', false) // id 3 }; // create expressed regions map @@ -193,8 +193,8 @@ TEST(SpliceTestChromOne, FirstERNotStitchedRemainingERsStitched) { // create splice junctions std::vector rr_all_sj = { - SJRow("chr1", 10500, 11000, 500, '-', false, "CT", "AC", "0", "0"), // id 1 - SJRow("chr1", 14200, 15000, 800, '-', false, "CT", "AC", "0", "0") // id 3 + SJRow("chr1", 10500, 11000, 500, '-', false), // id 1 + SJRow("chr1", 14200, 15000, 800, '-', false) // id 3 }; // create expressed regions map @@ -224,10 +224,10 @@ TEST(SpliceTestChromOneAndTwo, BothChrHaveMatchingSJs) { // create splice junctions (ordered within a chromosome but NOT ordered by chromosome! std::vector rr_all_sj = { - SJRow("chr2", 1000, 1500, 500, '-', false, "CT", "AC", "0", "0"), // id 1 - SJRow("chr2", 3000, 3800, 800, '-', false, "CT", "AC", "0", "0"), // id 3 - SJRow("chr1", 10500, 11000, 500, '-', false, "CT", "AC", "0", "0"), // id 1 - SJRow("chr1", 14200, 15000, 800, '-', false, "CT", "AC", "0", "0") // id 3 + SJRow("chr2", 1000, 1500, 500, '-', false), // id 1 + SJRow("chr2", 3000, 3800, 800, '-', false), // id 3 + SJRow("chr1", 10500, 11000, 500, '-', false), // id 1 + SJRow("chr1", 14200, 15000, 800, '-', false) // id 3 }; // create expressed regions map @@ -382,4 +382,59 @@ TEST(Parser, DateFormatInGTF) std::cout << date; +} + + +// SJRow no longer stores left_motif / right_motif / left_annotated / +// right_annotated. operator>> must still consume those four columns from a +// recount3-formatted RR line so existing inputs continue to parse, but only +// the six load-bearing fields end up in the struct. +TEST(SJRow, ParsesAndDiscardsUnusedRRColumns) +{ + std::istringstream iss( + "chr1\t143465342\t143470614\t5273\t-\t1\tCT\tAC\tcH38,gC24\taC19,cH38"); + SJRow row; + iss >> row; + + EXPECT_TRUE(iss.good() || iss.eof()); + EXPECT_EQ(row.chrom, "chr1"); + EXPECT_EQ(row.start, 143465342u); + EXPECT_EQ(row.end, 143470614u); + EXPECT_EQ(row.length, 5273u); + EXPECT_FALSE(row.strand); // '-' -> false + EXPECT_TRUE(row.annotated); + + // Must have consumed all ten whitespace-separated tokens; nothing is left + // for a follow-up record on the same stream + std::string leftover; + iss >> leftover; + EXPECT_TRUE(leftover.empty()); +} + + +// Empty / placeholder annotation columns must also parse cleanly, since +// monorail_light emits "." for left_annotated / right_annotated. +TEST(SJRow, ParsesPlaceholderAnnotationColumns) +{ + std::istringstream iss("chr21\t100\t200\t101\t+\t0\tGT\tAG\t.\t."); + SJRow row; + iss >> row; + EXPECT_EQ(row.chrom, "chr21"); + EXPECT_EQ(row.start, 100u); + EXPECT_EQ(row.end, 200u); + EXPECT_TRUE(row.strand); // '+' + EXPECT_FALSE(row.annotated); +} + + +// Sanity check on the size shrink. SJRow now carries exactly one std::string +// (chrom) plus 8 + 8 + 4 + 1 + 1 bytes of trivially-copyable scalars. The +// upstream version carried five extra std::strings; on libstdc++ that is +// roughly +160 bytes per row, which matters at ~10M rows for full hg38. +TEST(SJRow, ResidentSizeBoundedToOneString) +{ + // Allow generous slack for std::string SSO / alignment differences across + // toolchains. The check is "much smaller than the legacy 6-string layout" + // (6 * sizeof(std::string) on libstdc++ is 192 bytes by itself). + EXPECT_LT(sizeof(SJRow), 96u); } \ No newline at end of file diff --git a/tests/gtfs/parser_test3.gtf b/tests/gtfs/parser_test3.gtf index 0d9e10d..5cdb4ea 100644 --- a/tests/gtfs/parser_test3.gtf +++ b/tests/gtfs/parser_test3.gtf @@ -1,8 +1,8 @@ -#description: expressed region annotation of genome based on bedgraph and MM / RR splice junction information. +#description: expressed region annotation of the genome based on BedGraph and MM / RR splice junction information. #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2025-12-16 +#date: 2026-4-28 chr1 fastder gene 10000 10500 225.982 . . gene_id "gene1"; gene_name "faster_gene1"; chr1 fastder transcript 10000 10500 225.982 . . gene_id "gene1"; transcript_id "tx1"; chr1 fastder exon 10000 10500 225.982 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test1.gtf b/tests/gtfs/splicing_scenarios_test1.gtf index b3fa165..235eeef 100644 --- a/tests/gtfs/splicing_scenarios_test1.gtf +++ b/tests/gtfs/splicing_scenarios_test1.gtf @@ -1,8 +1,8 @@ -#description: expressed region annotation of genome based on bedgraph and MM / RR splice junction information. +#description: expressed region annotation of the genome based on BedGraph and MM / RR splice junction information. #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2025-12-16 +#date: 2026-4-28 chr1 fastder gene 10000 12500 100.75 . . gene_id "gene1"; gene_name "faster_gene1"; chr1 fastder transcript 10000 12500 100.75 . . gene_id "gene1"; transcript_id "tx1"; chr1 fastder exon 10000 10500 100 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test2.gtf b/tests/gtfs/splicing_scenarios_test2.gtf index e47cc60..5ffeb25 100644 --- a/tests/gtfs/splicing_scenarios_test2.gtf +++ b/tests/gtfs/splicing_scenarios_test2.gtf @@ -1,8 +1,8 @@ -#description: expressed region annotation of genome based on bedgraph and MM / RR splice junction information. +#description: expressed region annotation of the genome based on BedGraph and MM / RR splice junction information. #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2025-12-16 +#date: 2026-4-28 chr1 fastder gene 10000 12500 100.75 . . gene_id "gene1"; gene_name "faster_gene1"; chr1 fastder transcript 10000 12500 100.75 . . gene_id "gene1"; transcript_id "tx1"; chr1 fastder exon 10000 10500 100 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test3.gtf b/tests/gtfs/splicing_scenarios_test3.gtf index 7b2471a..a9e68b4 100644 --- a/tests/gtfs/splicing_scenarios_test3.gtf +++ b/tests/gtfs/splicing_scenarios_test3.gtf @@ -1,8 +1,8 @@ -#description: expressed region annotation of genome based on bedgraph and MM / RR splice junction information. +#description: expressed region annotation of the genome based on BedGraph and MM / RR splice junction information. #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2025-12-16 +#date: 2026-4-28 chr1 fastder gene 10000 12500 100.75 . . gene_id "gene1"; gene_name "faster_gene1"; chr1 fastder transcript 10000 12500 100.75 . . gene_id "gene1"; transcript_id "tx1"; chr1 fastder exon 10000 10500 100 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test4.gtf b/tests/gtfs/splicing_scenarios_test4.gtf index a57daed..174d888 100644 --- a/tests/gtfs/splicing_scenarios_test4.gtf +++ b/tests/gtfs/splicing_scenarios_test4.gtf @@ -1,8 +1,8 @@ -#description: expressed region annotation of genome based on bedgraph and MM / RR splice junction information. +#description: expressed region annotation of the genome based on BedGraph and MM / RR splice junction information. #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2025-12-16 +#date: 2026-4-28 chr1 fastder gene 10000 12500 100.75 . . gene_id "gene1"; gene_name "faster_gene1"; chr1 fastder transcript 10000 12500 100.75 . . gene_id "gene1"; transcript_id "tx1"; chr1 fastder exon 10000 10500 100 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test5.gtf b/tests/gtfs/splicing_scenarios_test5.gtf index 62a35fd..b981f8d 100644 --- a/tests/gtfs/splicing_scenarios_test5.gtf +++ b/tests/gtfs/splicing_scenarios_test5.gtf @@ -1,8 +1,8 @@ -#description: expressed region annotation of genome based on bedgraph and MM / RR splice junction information. +#description: expressed region annotation of the genome based on BedGraph and MM / RR splice junction information. #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2025-12-16 +#date: 2026-4-28 chr1 fastder gene 10000 10500 100 . . gene_id "gene1"; gene_name "faster_gene1"; chr1 fastder transcript 10000 10500 100 . . gene_id "gene1"; transcript_id "tx1"; chr1 fastder exon 10000 10500 100 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test6.gtf b/tests/gtfs/splicing_scenarios_test6.gtf index db42ca9..410d536 100644 --- a/tests/gtfs/splicing_scenarios_test6.gtf +++ b/tests/gtfs/splicing_scenarios_test6.gtf @@ -1,8 +1,8 @@ -#description: expressed region annotation of genome based on bedgraph and MM / RR splice junction information. +#description: expressed region annotation of the genome based on BedGraph and MM / RR splice junction information. #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2025-12-16 +#date: 2026-4-28 chr1 fastder gene 10000 12500 100.75 . . gene_id "gene1"; gene_name "faster_gene1"; chr1 fastder transcript 10000 12500 100.75 . . gene_id "gene1"; transcript_id "tx1"; chr1 fastder exon 10000 10500 100 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test7.gtf b/tests/gtfs/splicing_scenarios_test7.gtf index e3f813b..f90eb85 100644 --- a/tests/gtfs/splicing_scenarios_test7.gtf +++ b/tests/gtfs/splicing_scenarios_test7.gtf @@ -1,8 +1,8 @@ -#description: expressed region annotation of genome based on bedgraph and MM / RR splice junction information. +#description: expressed region annotation of the genome based on BedGraph and MM / RR splice junction information. #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2025-12-16 +#date: 2026-4-28 chr1 fastder gene 9000 9200 2 . . gene_id "gene1"; gene_name "faster_gene1"; chr1 fastder transcript 9000 9200 2 . . gene_id "gene1"; transcript_id "tx1"; chr1 fastder exon 9000 9200 2 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test8.gtf b/tests/gtfs/splicing_scenarios_test8.gtf index 289853e..0850c2e 100644 --- a/tests/gtfs/splicing_scenarios_test8.gtf +++ b/tests/gtfs/splicing_scenarios_test8.gtf @@ -1,8 +1,8 @@ -#description: expressed region annotation of genome based on bedgraph and MM / RR splice junction information. +#description: expressed region annotation of the genome based on BedGraph and MM / RR splice junction information. #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2025-12-16 +#date: 2026-4-28 chr1 fastder gene 9000 9200 2 . . gene_id "gene1"; gene_name "faster_gene1"; chr1 fastder transcript 9000 9200 2 . . gene_id "gene1"; transcript_id "tx1"; chr1 fastder exon 9000 9200 2 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; From f6a609d7898e61fdcc58878229234bfa9176b8e4 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 28 Apr 2026 17:22:39 +0200 Subject: [PATCH 2/7] Update parser for performance so chr can be filtered in RR rows, shrink mm_chrom_sj, add tests read_rr now skips rows outside chromosomes_set; sj_id_remap keeps MM lookups consistent. mm_chrom_sj becomes unordered_map> --- cpp/Integrator.cpp | 2 +- cpp/Integrator.h | 2 +- cpp/Parser.cpp | 64 ++++++++++++++------- cpp/Parser.h | 17 +++++- cpp/SJRow.h | 12 ++-- tests/UnitTests.cpp | 137 ++++++++++++++++++++++++++++++++++++++++---- 6 files changed, 191 insertions(+), 43 deletions(-) diff --git a/cpp/Integrator.cpp b/cpp/Integrator.cpp index c3862c6..6c40cae 100644 --- a/cpp/Integrator.cpp +++ b/cpp/Integrator.cpp @@ -42,7 +42,7 @@ bool Integrator::sj_too_far_back(const uint64_t most_recent_er_end, const uint64 && !within_threshold(most_recent_er_end, sj_start); } -void Integrator::stitch_up(std::unordered_map>& expressed_regions, const std::map>& mm_chrom_sj, const std::vector& rr_all_sj) +void Integrator::stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj) { // iterate over chromosomes and sj_ids -> sjs.first = chrom, sjs.second = vector for (auto& sjs : mm_chrom_sj) diff --git a/cpp/Integrator.h b/cpp/Integrator.h index 10c75df..2bd28b9 100644 --- a/cpp/Integrator.h +++ b/cpp/Integrator.h @@ -20,7 +20,7 @@ class Integrator public: Integrator(double coverage_tolerance_, int position_tolerance_); - void stitch_up(std::unordered_map>& expressed_regions, const std::map>& mm_chrom_sj, const std::vector& rr_all_sj); + void stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj); bool within_threshold(double val1, double val2) const; bool within_threshold(uint64_t val1, uint64_t val2) const; bool is_similar(const StitchedER& most_recent_er, const BedGraphRow& expressed_region, const SJRow& current_sj); diff --git a/cpp/Parser.cpp b/cpp/Parser.cpp index 06b0ea7..7f22522 100644 --- a/cpp/Parser.cpp +++ b/cpp/Parser.cpp @@ -166,26 +166,40 @@ void Parser::read_rr(std::string filename) } std::string line; - // iterate over lines + // Two-pass parse: keep only RR rows whose chromosome is in + // chromosomes_set. The on-disk MM references rows by their 1-based RR + // row number, so we record a remap from old sj_id to the new compact + // index in rr_all_sj. Dropped rows get 0 in the remap. + // For full hg38 with --chr chr21, this drops rr_all_sj from about 9.5M + // rows to about 3.5k rows. The remap itself costs 4 bytes per RR row. + rr_all_sj.clear(); + sj_id_remap.clear(); + rr_total_rows = 0; + while (std::getline(file, line)) { - // read in line by line - std::istringstream iss(line); - - // skip invalid lines and headers (which contain the string "chromosome" --> actually don't skip ERCC and Y-chromosome since we need all sj_ids to be correct - if (line.empty() || line.find("chromosome") != std::string::npos) {//|| line.find("ERCC-") != std::string::npos || line.find("chrY") != std::string::npos) { - //std::cout << line << std::endl; + // skip invalid lines and headers (which contain the string "chromosome") + if (line.empty() || line.find("chromosome") != std::string::npos) { continue; } - SJRow row = SJRow(); + + SJRow row; + std::istringstream iss(line); iss >> row; - // rr_all_sj needs to contain all sj_ids, even those of chromosomes that aren't provided in the bedgraph files --> otherwise the mapping from RR to MM file via sj_id is broken - rr_all_sj.emplace_back(row); - } - std::cout << "[INFO] Total number of splice junctions: " << rr_all_sj.size() << std::endl; - //assert(rr_all_sj.size() == 9484210); + ++rr_total_rows; + if (chromosomes_set.contains(row.chrom)) { + rr_all_sj.emplace_back(std::move(row)); + // 1-based index into rr_all_sj + sj_id_remap.push_back(static_cast(rr_all_sj.size())); + } else { + sj_id_remap.push_back(0); // sentinel: row not retained + } + } + std::cout << "[INFO] RR rows total: " << rr_total_rows + << " | retained for analysed chromosomes: " << rr_all_sj.size() + << std::endl; } @@ -223,9 +237,12 @@ void Parser::read_mm(std::string filename) { std::istringstream iss(line); // header: 9484210 2931 699368828, actual #lines = 699368831 iss >> nr_of_sj >> nr_of_samples >> sj_occ_in_samples; - //std::cout << nr_of_sj << ", " << rr_all_sj.size() << std::endl; - if (nr_of_sj != rr_all_sj.size()) { - std::cerr << "[ERROR] RR File and number of splice junctions are not equal! Quitting..."; + // Compare against the *total* number of rows seen in RR, not + // rr_all_sj.size(): rr_all_sj is now a chr-filtered subset. + if (nr_of_sj != rr_total_rows) { + std::cerr << "[ERROR] MM header sj count (" << nr_of_sj + << ") does not match RR row count (" << rr_total_rows + << "). Quitting..." << std::endl; return; } seen_header = true; @@ -258,12 +275,17 @@ void Parser::read_mm(std::string filename) { continue; } - // add count if the mm_id corresponds to any of the provided samples - // check in chromosomes_set to prevent using splice junctions on chromosomes that weren't parsed - if (mm_ids.contains(mm_id) && chromosomes_set.contains(rr_all_sj[sj_id - 1].chrom)) // rail_id_to_mm_id has mapping + // Look up the new (post-filter) sj_id via the remap built in + // read_rr. A 0 means this junction was on a chromosome we don't + // analyse and was dropped — skip without ever touching rr_all_sj. + if (sj_id == 0 || sj_id - 1 >= sj_id_remap.size()) continue; + const uint32_t new_sj_id = sj_id_remap[sj_id - 1]; + if (new_sj_id == 0) continue; + + if (mm_ids.contains(mm_id)) { - // store vector of sj_ids for each chromosome - mm_chrom_sj[rr_all_sj[sj_id - 1].chrom].emplace_back(sj_id); + // mm_chrom_sj stores *new* sj_ids (indexes into rr_all_sj). + mm_chrom_sj[rr_all_sj[new_sj_id - 1].chrom].emplace_back(new_sj_id); } } //std::cout << "[INFO] MM file contains " << count_lines << " lines"<< std::endl; diff --git a/cpp/Parser.h b/cpp/Parser.h index 8ed9931..ba10d67 100644 --- a/cpp/Parser.h +++ b/cpp/Parser.h @@ -35,11 +35,22 @@ class Parser { std::unordered_set chromosomes_set; // for fast check if chromosome is included std::vector> all_bedgraphs; //TODO maybe change to unordered map with key = sample id, value = bedgraph of the sample? std::vector>> all_per_base_coverages; //NOT ordered by chromosomes - // store RR info for each splice junctions + // RR rows on chromosomes the user requested. Dropped rows are not stored. std::vector rr_all_sj; - // store relevant Market Matrix (MM) - std::map> mm_chrom_sj; // ordered by sj_id, map of sj occurring in samples part of the user input + // Maps the 1-based sj_id used by the on-disk MM file, which equals the + // row number in the RR file, to the 1-based index into rr_all_sj. Dropped + // rows have value 0. Size equals the total RR row count. + std::vector sj_id_remap; + + // Total junction rows seen in the RR file, independent of the chr filter. + // Used to validate the MM header's nr_of_sj. + uint64_t rr_total_rows = 0; + + // Sparse junction matrix per chromosome. sj_ids stored here are + // post-remap indexes into rr_all_sj, not original RR row numbers. + // uint32_t fits any realistic junction catalog (recount3 hg38 has 9.5M). + std::unordered_map> mm_chrom_sj; std::vector> rail_id_to_ext_id; // for all samples in the dataset // later sorted by rail_id to receive rank (= mm_id) diff --git a/cpp/SJRow.h b/cpp/SJRow.h index 9233984..2c71623 100644 --- a/cpp/SJRow.h +++ b/cpp/SJRow.h @@ -20,12 +20,12 @@ class SJRow bool strand; // 1 = +, 0 = - bool annotated; // 0 or 1 - // The recount3 RR file carries four extra columns per junction — - // left_motif, right_motif, left_annotated, right_annotated — that fastder - // reads but never accesses after parse. On full hg38 (~9.5M junctions) - // those strings cost ~2 GB of resident heap for nothing. We parse and - // discard them in operator>> below to stay format-compatible without - // paying the storage cost. + // The recount3 RR file carries four extra per-junction columns + // (left_motif, right_motif, left_annotated, right_annotated) that fastder + // reads but never accesses after parse. On full hg38 with about 9.5M + // junctions those strings cost roughly 2 GB of resident heap. operator>> + // below parses and discards them so the on-disk format stays compatible + // and the storage cost goes away. // constructor SJRow() = default; diff --git a/tests/UnitTests.cpp b/tests/UnitTests.cpp index ff21eeb..fc43977 100644 --- a/tests/UnitTests.cpp +++ b/tests/UnitTests.cpp @@ -29,7 +29,7 @@ TEST(SpliceTestChromOne, TwoStitchedERsTwoSJs) BedGraphRow("chr1", 12861, 12999, 29), // length 138 BedGraphRow("chr1", 14001, 14540, 30) // length 539 }; - std::map> mm_chrom_sj; + std::unordered_map> mm_chrom_sj; mm_chrom_sj["chr1"] = {1, 2}; Integrator integrator = Integrator(0.1, 5); integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); @@ -57,7 +57,7 @@ TEST(SpliceTestChromOne, StitchedERWithThreeERsTwoSJs) BedGraphRow("chr1", 14001, 14201, 30), BedGraphRow("chr1", 14999, 15300, 30), }; - std::map> mm_chrom_sj; + std::unordered_map> mm_chrom_sj; mm_chrom_sj["chr1"] = {1, 2, 3}; Integrator integrator = Integrator(0.1, 5); integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); @@ -85,7 +85,7 @@ TEST(SpliceTestChromOne, StitchedERWithThreeERsTwoSJsAndTailingER) BedGraphRow("chr1", 14999, 15300, 30), BedGraphRow("chr1", 15300, 15600, 37), }; - std::map> mm_chrom_sj; + std::unordered_map> mm_chrom_sj; mm_chrom_sj["chr1"] = {1, 2, 3}; Integrator integrator = Integrator(0.1, 5); integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); @@ -114,7 +114,7 @@ TEST(SpliceTestChromOne, StitchedERWithThreeERsTwoSJsAndTwoTailingERs) BedGraphRow("chr1", 15300, 15600, 37), BedGraphRow("chr1", 15600, 15800, 87), }; - std::map> mm_chrom_sj; + std::unordered_map> mm_chrom_sj; mm_chrom_sj["chr1"] = {1, 2, 3}; Integrator integrator = Integrator(0.1, 5); integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); @@ -144,7 +144,7 @@ TEST(SpliceTestChromOne, NoSJUsed) BedGraphRow("chr1", 15300, 15600, 37), BedGraphRow("chr1", 15600, 15800, 87), }; - std::map> mm_chrom_sj; + std::unordered_map> mm_chrom_sj; mm_chrom_sj["chr1"] = {1, 2, 3}; Integrator integrator = Integrator(0.1, 5); integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); @@ -174,7 +174,7 @@ TEST(SpliceTestChromOne, MiddleSJUnusedTailingSJsUsed) BedGraphRow("chr1", 15300, 15600, 37), BedGraphRow("chr1", 15600, 15800, 87), }; - std::map> mm_chrom_sj; + std::unordered_map> mm_chrom_sj; mm_chrom_sj["chr1"] = {1, 2, 3}; Integrator integrator = Integrator(0.1, 5); integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); @@ -206,7 +206,7 @@ TEST(SpliceTestChromOne, FirstERNotStitchedRemainingERsStitched) BedGraphRow("chr1", 14001, 14201, 30), BedGraphRow("chr1", 14999, 15300, 30), }; - std::map> mm_chrom_sj; + std::unordered_map> mm_chrom_sj; mm_chrom_sj["chr1"] = {1, 2}; Integrator integrator = Integrator(0.1, 5); integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); @@ -243,7 +243,7 @@ TEST(SpliceTestChromOneAndTwo, BothChrHaveMatchingSJs) BedGraphRow("chr1", 14999, 15300, 32), }; - std::map> mm_chrom_sj; + std::unordered_map> mm_chrom_sj; mm_chrom_sj["chr2"] = {1, 2}; mm_chrom_sj["chr1"] = {3, 4}; Integrator integrator = Integrator(0.1, 5); @@ -429,12 +429,127 @@ TEST(SJRow, ParsesPlaceholderAnnotationColumns) // Sanity check on the size shrink. SJRow now carries exactly one std::string // (chrom) plus 8 + 8 + 4 + 1 + 1 bytes of trivially-copyable scalars. The -// upstream version carried five extra std::strings; on libstdc++ that is -// roughly +160 bytes per row, which matters at ~10M rows for full hg38. +// upstream version carried five extra std::strings, which on libstdc++ adds +// roughly 160 bytes per row, a meaningful cost at 10M rows for full hg38. TEST(SJRow, ResidentSizeBoundedToOneString) { - // Allow generous slack for std::string SSO / alignment differences across + // Allow generous slack for std::string SSO and alignment differences across // toolchains. The check is "much smaller than the legacy 6-string layout" // (6 * sizeof(std::string) on libstdc++ is 192 bytes by itself). EXPECT_LT(sizeof(SJRow), 96u); +} + + +// read_rr now keeps only rows whose chromosome is in chromosomes_set. +// rr_all_sj is the filtered subset; sj_id_remap maps the on-disk 1-based +// sj_id to the 1-based index in rr_all_sj, or 0 for dropped rows. +TEST(Parser, ChrFilterRetainsOnlyRequestedRows) +{ + namespace fs = std::filesystem; + auto tmp = fs::temp_directory_path() / "fastder_test_chr_filter"; + fs::create_directories(tmp); + auto rr = tmp / "fake.ALL.RR"; + { + std::ofstream out(rr); + out << "chromosome\tstart\tend\tlength\tstrand\tannotated\t" + "left_motif\tright_motif\tleft_annotated\tright_annotated\n"; + out << "chr1\t100\t200\t101\t+\t1\tGT\tAG\t.\t.\n"; + out << "chr2\t300\t400\t101\t+\t1\tGT\tAG\t.\t.\n"; + out << "chr1\t500\t600\t101\t-\t0\tCT\tAC\t.\t.\n"; + } + + Parser parser("dummy_path", {"chr1"}, 1); + parser.read_rr(rr.string()); + + EXPECT_EQ(parser.rr_total_rows, 3u); + EXPECT_EQ(parser.rr_all_sj.size(), 2u); + ASSERT_EQ(parser.sj_id_remap.size(), 3u); + EXPECT_EQ(parser.sj_id_remap[0], 1u); + EXPECT_EQ(parser.sj_id_remap[1], 0u); + EXPECT_EQ(parser.sj_id_remap[2], 2u); + ASSERT_EQ(parser.rr_all_sj.size(), 2u); + EXPECT_EQ(parser.rr_all_sj[0].chrom, "chr1"); + EXPECT_EQ(parser.rr_all_sj[0].start, 100u); + EXPECT_EQ(parser.rr_all_sj[1].start, 500u); + + fs::remove_all(tmp); +} + + +// read_mm must skip lines whose sj_id was dropped by the chr filter, must +// translate retained sj_ids through the remap, and must validate its header +// against the total RR row count (not the filtered rr_all_sj size). +TEST(Parser, MMUsesRemapAndValidatesHeaderAgainstTotalRows) +{ + namespace fs = std::filesystem; + auto tmp = fs::temp_directory_path() / "fastder_test_mm_remap"; + fs::create_directories(tmp); + auto rr = tmp / "fake.ALL.RR"; + auto mm = tmp / "fake.ALL.MM"; + { + std::ofstream out(rr); + out << "chromosome\tstart\tend\tlength\tstrand\tannotated\t" + "left_motif\tright_motif\tleft_annotated\tright_annotated\n"; + out << "chr1\t100\t200\t101\t+\t1\tGT\tAG\t.\t.\n"; + out << "chr2\t300\t400\t101\t+\t1\tGT\tAG\t.\t.\n"; + out << "chr1\t500\t600\t101\t-\t0\tCT\tAC\t.\t.\n"; + } + { + std::ofstream out(mm); + out << "%%MatrixMarket matrix coordinate integer general\n"; + out << "%-----------------------------------------------\n"; + out << "3\t2\t4\n"; + out << "1\t1\t5\n"; + out << "2\t1\t3\n"; + out << "3\t1\t8\n"; + out << "1\t2\t10\n"; + } + + Parser parser("dummy_path", {"chr1"}, 1); + parser.mm_ids = {1u, 2u}; + parser.read_rr(rr.string()); + parser.read_mm(mm.string()); + + ASSERT_TRUE(parser.mm_chrom_sj.count("chr1")); + EXPECT_EQ(parser.mm_chrom_sj.count("chr2"), 0u); + EXPECT_EQ(parser.mm_chrom_sj["chr1"].size(), 3u); + for (uint32_t sj_id : parser.mm_chrom_sj["chr1"]) { + ASSERT_LE(sj_id, parser.rr_all_sj.size()); + EXPECT_EQ(parser.rr_all_sj[sj_id - 1].chrom, "chr1"); + } + + fs::remove_all(tmp); +} + + +// Header validation must reject an MM whose declared row count does not match +// the RR's total row count, even when the filtered rr_all_sj is smaller. +TEST(Parser, MMHeaderRejectsCountMismatch) +{ + namespace fs = std::filesystem; + auto tmp = fs::temp_directory_path() / "fastder_test_mm_mismatch"; + fs::create_directories(tmp); + auto rr = tmp / "fake.ALL.RR"; + auto mm = tmp / "fake.ALL.MM"; + { + std::ofstream out(rr); + out << "chromosome\tstart\tend\tlength\tstrand\tannotated\t" + "left_motif\tright_motif\tleft_annotated\tright_annotated\n"; + out << "chr1\t100\t200\t101\t+\t1\tGT\tAG\t.\t.\n"; + out << "chr2\t300\t400\t101\t+\t1\tGT\tAG\t.\t.\n"; + } + { + std::ofstream out(mm); + out << "%%MatrixMarket matrix coordinate integer general\n"; + out << "999\t1\t0\n"; + } + + Parser parser("dummy_path", {"chr1"}, 1); + parser.mm_ids = {1u}; + parser.read_rr(rr.string()); + parser.read_mm(mm.string()); + + EXPECT_TRUE(parser.mm_chrom_sj.empty()); + + fs::remove_all(tmp); } \ No newline at end of file From 381ec836b59a1500e702a7a0f70dc81ae9d57348 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 28 Apr 2026 17:59:19 +0200 Subject: [PATCH 3/7] Major changes to include sparse intervals, bigwigs, and strand-aware stitch_up, plus tests Averager works on sparse BedGraphRow lists, no per-base vectors to reduce memory overheads. read_bigwig gated on FASTDER_USE_LIBBIGWIG. stitch_up buckets SJs by strand from unstranded bigwigs, so ERs emit the strand their SJs had, or, if without SJS, are reported to have no strand. --- CMakeLists.txt | 22 +++ cpp/Averager.cpp | 276 +++++++++++++++++--------- cpp/Averager.h | 27 ++- cpp/BedGraphRow.cpp | 13 +- cpp/BedGraphRow.h | 7 + cpp/Integrator.cpp | 209 ++++++++++++++------ cpp/Integrator.h | 14 ++ cpp/Parser.cpp | 123 +++++++++--- cpp/Parser.h | 17 +- cpp/StitchedER.h | 5 + cpp/main.cpp | 4 +- tests/CMakeLists.txt | 9 + tests/UnitTests.cpp | 462 ++++++++++++++++++++++++++++++++++++++++++- 13 files changed, 996 insertions(+), 192 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 367b627..c203de8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,8 @@ option(CONDA_BUILD "Building inside conda-build" OFF) +# Build with libBigWig support so Parser::read_bigwig can parse .bw files +# directly. Off by default to keep the build hermetic; turning it on requires +# libBigWig and its transitive deps (libcurl, zlib) to be available. +option(FASTDER_USE_LIBBIGWIG "Build with libBigWig support for direct .bw reading" OFF) cmake_minimum_required(VERSION 4.0) project(fastder) @@ -26,6 +30,24 @@ add_executable(fastder ) target_link_libraries(fastder PRIVATE fastder_lib Threads::Threads) +if(FASTDER_USE_LIBBIGWIG) + # libBigWig ships its own CMakeLists and exports a static target called + # BigWig along with the headers under its source directory. Pull it in, + # link against the target, and add the source dir to our include path so + # the include in Parser.cpp resolves. + include(FetchContent) + FetchContent_Declare( + libBigWig_src + GIT_REPOSITORY https://github.com/dpryan79/libBigWig.git + GIT_TAG 0.4.7 + ) + FetchContent_MakeAvailable(libBigWig_src) + + target_link_libraries(fastder_lib PRIVATE BigWig) + target_include_directories(fastder_lib PRIVATE ${libbigwig_src_SOURCE_DIR}) + target_compile_definitions(fastder_lib PRIVATE FASTDER_USE_LIBBIGWIG) +endif() + enable_testing() if(NOT CONDA_BUILD) diff --git a/cpp/Averager.cpp b/cpp/Averager.cpp index c72185e..a919e5e 100644 --- a/cpp/Averager.cpp +++ b/cpp/Averager.cpp @@ -8,12 +8,13 @@ #include #include #include +#include +#include #include #include "BedGraphRow.h" #include "SJRow.h" #include "Averager.h" #include -#include Averager::Averager(int threads_) @@ -21,150 +22,241 @@ Averager::Averager(int threads_) nof_threads = threads_; } -//compute mean coverage vector across samples -void Averager::compute_mean_coverage(std::vector>>& all_per_base_coverages) +namespace { + +// Bucket each sample's intervals by chromosome so the per-chromosome sweep +// only sees its own data. Within each chromosome the intervals are sorted by +// start position because read_bedgraph / read_bigwig already emit in file +// order, but we re-sort defensively in case a future input source mixes order. +std::unordered_map>> +bucket_by_chrom(const std::vector>& all_bedgraphs) { + std::unordered_map>> chrom_samples; + for (const auto& sample : all_bedgraphs) + { + // collect this sample's rows per chromosome, then push into the global map + std::unordered_map> per_chrom; + for (const auto& row : sample) + { + per_chrom[row.chrom].push_back(row); + } + for (auto& [chrom, rows] : per_chrom) + { + std::sort(rows.begin(), rows.end(), + [](const BedGraphRow& a, const BedGraphRow& b) { return a.start < b.start; }); + chrom_samples[chrom].push_back(std::move(rows)); + } + } + return chrom_samples; +} - if (all_per_base_coverages.empty()) +// Compute the sparse mean for one chromosome. samples[s] is sample s's sorted, +// non-overlapping interval list on this chromosome. total_samples is the +// total sample count across the experiment, including samples that have no +// coverage on this chromosome (they contribute zero to the mean). +// +// Algorithm: collect every distinct start/end position from every sample as a +// breakpoint. Between two consecutive breakpoints no sample's interval starts +// or ends, so each sample's contribution is constant over the segment. We +// advance a per-sample iterator alongside the sweep so the per-segment +// lookup is amortised O(1). Zero-mean segments are dropped from the output. +std::vector mean_for_chrom(const std::string& chrom, + const std::vector>& samples, + size_t total_samples) +{ + std::vector result; + if (samples.empty() || total_samples == 0) return result; + + // gather and dedupe breakpoints + std::vector breakpoints; + for (const auto& sample : samples) { - std::cerr << "[ERROR] No per-base coverages were computed..." << std::endl; - return; + breakpoints.reserve(breakpoints.size() + sample.size() * 2); + for (const auto& row : sample) + { + breakpoints.push_back(row.start); + breakpoints.push_back(row.end); + } } + std::sort(breakpoints.begin(), breakpoints.end()); + breakpoints.erase(std::unique(breakpoints.begin(), breakpoints.end()), breakpoints.end()); + if (breakpoints.size() < 2) return result; - // reserve storage - mean_coverage.reserve(all_per_base_coverages[0].size()); + // per-sample index into samples[s]: points to the first interval whose + // end > current_position, or samples[s].size() once exhausted. + std::vector idx(samples.size(), 0); + for (size_t b = 0; b + 1 < breakpoints.size(); ++b) + { + const uint64_t seg_start = breakpoints[b]; + const uint64_t seg_end = breakpoints[b + 1]; - for (auto& item : all_per_base_coverages.at(0)) + double sum = 0.0; + for (size_t s = 0; s < samples.size(); ++s) + { + // skip past intervals that lie entirely before this segment + while (idx[s] < samples[s].size() && samples[s][idx[s]].end <= seg_start) + { + ++idx[s]; + } + // include this sample's coverage only if its current interval covers seg_start + if (idx[s] < samples[s].size() && + samples[s][idx[s]].start <= seg_start && + seg_start < samples[s][idx[s]].end) + { + sum += samples[s][idx[s]].coverage; + } + } + const double mean = sum / static_cast(total_samples); + if (mean > 0.0) + { + // adjacent identical-mean segments are coalesced below + result.emplace_back(BedGraphRow(chrom, seg_start, seg_end, mean)); + } + } + + // coalesce contiguous intervals that share the same mean coverage. This + // keeps the interval count tight and makes find_ERs's contiguity check cheap. + if (result.size() < 2) return result; + std::vector merged; + merged.reserve(result.size()); + merged.emplace_back(result[0]); + for (size_t i = 1; i < result.size(); ++i) { - chroms.push_back(item.first); + BedGraphRow& last = merged.back(); + if (last.end == result[i].start && last.coverage == result[i].coverage) + { + last.end = result[i].end; + last.length = static_cast(last.end - last.start); + } + else + { + merged.emplace_back(result[i]); + } } + return merged; +} + +} // anonymous namespace + + +// compute mean coverage as sparse intervals across samples +void Averager::compute_mean_coverage(std::vector>& all_bedgraphs) +{ + if (all_bedgraphs.empty()) + { + std::cerr << "[ERROR] No samples were provided to compute_mean_coverage." << std::endl; + return; + } + + auto chrom_samples = bucket_by_chrom(all_bedgraphs); + const size_t total_samples = all_bedgraphs.size(); + + // populate the chroms list once, used by find_ERs to parallelise + chroms.clear(); + chroms.reserve(chrom_samples.size()); + for (auto& [chrom, _] : chrom_samples) chroms.push_back(chrom); - // store threads std::vector threads; threads.reserve(nof_threads); std::atomic_int next_index{0}; - std::cout << "[INFO] fastder is using " << nof_threads << " threads for detecting ERs." << std::endl; - // parallel iteration over chromosomes. pair.first = chromosome, pair.second = vector of per base coverages of that chromosome - for (unsigned int t = 0; t < nof_threads; ++t) + std::cout << "[INFO] fastder is using " << nof_threads << " threads for computing mean coverage." << std::endl; + + for (unsigned int t = 0; t < static_cast(nof_threads); ++t) { - // capture everything threads.emplace_back([&]() { while (true) { - unsigned int idx = next_index++; - if (idx >= chroms.size()) break; // leave loop after reaching the last chromosome - const std::string chrom = chroms[idx]; - std::vector chrom_mean_vector; - - chrom_mean_vector.reserve(all_per_base_coverages[0].at(chrom).size()); - - // compute mean coverage across samples - for (unsigned int i = 0; i < all_per_base_coverages[0][chrom].size(); i++) - { - double sum = 0.0; - // iterate over all positions i across samples j - for (unsigned int j = 0; j < all_per_base_coverages.size(); j++) - { - // all_per_base_coverages[sample_nr][chromosome][base_pair_position] - sum += all_per_base_coverages[j][chrom][i]; //sample j, chromosome chrom, position i - } - chrom_mean_vector.push_back(sum / all_per_base_coverages.size()); // (sum over coverage at bp i) / (#nof samples) - } - // mutex to avoid race conditions + unsigned int i = next_index++; + if (i >= chroms.size()) break; + const std::string& chrom = chroms[i]; + std::vector intervals = mean_for_chrom(chrom, chrom_samples[chrom], total_samples); { std::lock_guard lock(map_mutex); - mean_coverage[chrom] = std::move(chrom_mean_vector); + mean_intervals[chrom] = std::move(intervals); } - } }); } - // join threads - for (auto& thr: threads) { - thr.join(); - } + for (auto& thr : threads) thr.join(); } -// identify ERs with coverage > threshold and length > min_length bp + +// find ERs as maximal contiguous runs of mean intervals above threshold void Averager::find_ERs(double threshold, int min_length) { - if (mean_coverage.empty()) + if (mean_intervals.empty()) { - std::cerr << "[ERROR] The mean coverage matrix is empty..." << std::endl; + std::cerr << "[ERROR] mean_intervals is empty." << std::endl; return; } - // store threads std::vector workers; workers.reserve(nof_threads); std::atomic_int next_index{0}; - // parallel iteration over chromosomes. pair.first = chromosome, pair.second = vector of per base coverages of that chromosome - for (unsigned int t = 0; t < nof_threads; ++t) { + for (unsigned int t = 0; t < static_cast(nof_threads); ++t) + { workers.emplace_back([&, t] { - while (true) { - unsigned int idx = next_index++; - if (idx >= chroms.size()) break; - const std::string& chrom = chroms[idx]; + while (true) + { + unsigned int i = next_index++; + if (i >= chroms.size()) break; + const std::string& chrom = chroms[i]; + const std::vector& intervals = mean_intervals[chrom]; + + std::vector chrom_ers; - int start = 0; - double current_sum = 0.0; - int count = 0; - std::vector chrom_expressed_regions; + // running ER state + bool in_er = false; + uint64_t er_start = 0; + uint64_t er_end = 0; + double weighted_sum = 0.0; - for (unsigned int i = 0; i <= mean_coverage[chrom].size(); i++) + auto finalize = [&]() { - // append the last expressed region if it's long enough - if (i == mean_coverage[chrom].size()) { - if ((i - start) > min_length) - { - //TODO last ER can be inclusive or exclusive depending on mean_coverage[chrom][i] <= threshold, but this is currently not encoded - double current_avg = current_sum / (i - start); - BedGraphRow expressed_region = BedGraphRow(chrom, start, i, current_avg); - chrom_expressed_regions.push_back(expressed_region); - } - break; + if (in_er && (er_end - er_start) > static_cast(min_length)) + { + const double avg = weighted_sum / static_cast(er_end - er_start); + chrom_ers.emplace_back(BedGraphRow(chrom, er_start, er_end, avg)); } + in_er = false; + weighted_sum = 0.0; + }; - double coverage = mean_coverage[chrom][i]; - // coverage is less than threshold - if (mean_coverage[chrom][i] <= threshold) + for (const auto& row : intervals) + { + if (row.coverage > threshold) { - // region at least 5 bp long, append to results - if ((i - start) > min_length) + // a gap or non-contiguous step also breaks the ER, even if + // the new interval is above threshold + if (in_er && row.start == er_end) { - double current_avg = current_sum / (i - start); - - // assert(count == i - start); - BedGraphRow expressed_region = BedGraphRow(chrom, start, i, current_avg); - chrom_expressed_regions.push_back(expressed_region); - + er_end = row.end; + weighted_sum += row.coverage * static_cast(row.end - row.start); + } + else + { + finalize(); + er_start = row.start; + er_end = row.end; + weighted_sum = row.coverage * static_cast(row.end - row.start); + in_er = true; } - //region too short to be appended, reset start and current avg - start = i + 1; - count = 0; - current_sum = 0; - } - // add to current ER - else if (coverage > threshold) + else { - current_sum += coverage; - ++count; + finalize(); } + } + finalize(); - } // end inner for loop - - // use mutex to write to expressed_regions - // writing to unordered_maps concurrently is only thread-safe if the keys already exist { std::lock_guard lock(map_mutex); - expressed_regions[chrom] = std::move(chrom_expressed_regions); + expressed_regions[chrom] = std::move(chrom_ers); } - } }); } diff --git a/cpp/Averager.h b/cpp/Averager.h index f54d95d..32af7d8 100644 --- a/cpp/Averager.h +++ b/cpp/Averager.h @@ -17,19 +17,30 @@ class Averager { public: Averager(int threads_); - void compute_mean_coverage(std::vector>>& all_per_base_coverages); + // Compute the per-chromosome mean coverage as a sparse interval list + // by sweep-merging the per-sample interval lists. The previous + // implementation built a dense per-base vector per chromosome + // per sample; on full hg38 that grew to ~24 GB per sample. The sweep + // produces only as many output intervals as the union of breakpoints + // across samples requires, with zero coverage segments suppressed. + void compute_mean_coverage(std::vector>& all_bedgraphs); + // Walk the sparse mean_intervals and emit one BedGraphRow per + // expressed region: a maximal contiguous run of intervals all above + // threshold. min_length is in nucleotides on the reference. void find_ERs(double threshold, int min_length); - // matrix consisting of multiple vectors, where each row in a vector is of type BedGraphRow, - // a BedGraphRow is a bin of nucleotides with the same read count - // a vector of BedGraphRows corresponds to one sample - //matrix where each vector contains the normalized count per base from ONE sample int nof_threads; std::vector chroms; std::mutex map_mutex; - std::unordered_map> mean_coverage; //key = chromosome, value = vector of per-base mean coverage of this chromosome - std::unordered_map> expressed_regions; //key = chromosome, value = vector of BedGraphRows with coverage > threshold - // store all the individual sample maps in a vector (since sample identity doesn't matter anymore later on) + // Mean coverage per chromosome as a sorted, non-overlapping list of + // BedGraphRows. Adjacent rows can be contiguous (row[i].end == row[i+1].start); + // any uncovered region between rows is implicit zero coverage. + std::unordered_map> mean_intervals; + // Expressed regions per chromosome, one BedGraphRow per ER. Strand on + // each ER is inherited from mean_intervals (currently '.' since the + // BigWig input is unstranded; the SJ-strand stitching that the + // Integrator does later is what tags the stitched ERs). + std::unordered_map> expressed_regions; diff --git a/cpp/BedGraphRow.cpp b/cpp/BedGraphRow.cpp index 63ea5f6..2877319 100644 --- a/cpp/BedGraphRow.cpp +++ b/cpp/BedGraphRow.cpp @@ -9,9 +9,18 @@ #include -// constructor with arguments +// constructor with arguments. Coverage rows are unstranded by default since the +// monorail-style BigWig the workflow emits is not strand-resolved. BedGraphRow::BedGraphRow(std::string _chrom, uint64_t _start, uint64_t _end, double _coverage) : chrom(_chrom), - start(_start), end(_end), coverage(_coverage), total_reads(0), length(_end - _start) + start(_start), end(_end), coverage(_coverage), total_reads(0), length(_end - _start), strand('.') +{ +} + +// constructor for stranded BigWig input. _strand is '+' or '-'; '.' means +// unstranded (the BigWig contained reads from both strands). +BedGraphRow::BedGraphRow(std::string _chrom, uint64_t _start, uint64_t _end, double _coverage, char _strand) : + chrom(_chrom), start(_start), end(_end), coverage(_coverage), total_reads(0), + length(_end - _start), strand(_strand) { } diff --git a/cpp/BedGraphRow.h b/cpp/BedGraphRow.h index 1d0e61d..510601a 100644 --- a/cpp/BedGraphRow.h +++ b/cpp/BedGraphRow.h @@ -26,11 +26,18 @@ class BedGraphRow double coverage; // normalized coverage by CPM unsigned int total_reads; // number of reads spanning across the bin, total_reads = length * coverage unsigned int length; + // strand is '.' for unstranded coverage (the default), '+' for plus-strand + // BigWigs and '-' for minus-strand BigWigs. Keeping the field on every row + // means stranded data can flow through the pipeline without a parallel + // data structure. Downstream code currently ignores it; future work in + // find_ERs / stitch_up can filter on it. + char strand = '.'; // add optional values for average coverage, DER identifier BedGraphRow() = default; BedGraphRow(std::string chrom, uint64_t start, uint64_t end, double coverage); + BedGraphRow(std::string chrom, uint64_t start, uint64_t end, double coverage, char strand); void print() const; void normalize(const uint64_t library_size); diff --git a/cpp/Integrator.cpp b/cpp/Integrator.cpp index 6c40cae..ac602b2 100644 --- a/cpp/Integrator.cpp +++ b/cpp/Integrator.cpp @@ -3,6 +3,7 @@ // #include +#include #include // constructor @@ -42,69 +43,163 @@ bool Integrator::sj_too_far_back(const uint64_t most_recent_er_end, const uint64 && !within_threshold(most_recent_er_end, sj_start); } -void Integrator::stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj) +// Build chains of ERs connected by SJs of one strand, and append the +// chains of length 2 or more to stitched_ERs (tagged with that strand). +// Single-ER candidates produced along the way are not emitted here; the +// caller is responsible for emitting unstitched ERs once with strand '.'. +// +// Algorithm: walk the chromosome's ERs left to right. Keep a candidate chain +// in progress. For each new ER, try to extend the candidate via the next +// available SJ on this strand. If the SJ matches (coverage and position), the +// chain grows; otherwise the candidate is closed (emitted only if it has 2+ +// ERs) and a fresh candidate is seeded with the current ER. Indices of any +// ERs that wound up inside an emitted chain are added to consumed_indices. +void Integrator::stitch_one_strand(const std::string& chrom, + char strand, + const std::vector& strand_sjs, + const std::vector& ers, + const std::vector& rr_all_sj, + std::unordered_set& consumed_indices, + std::vector& output) { - // iterate over chromosomes and sj_ids -> sjs.first = chrom, sjs.second = vector - for (auto& sjs : mm_chrom_sj) + if (ers.empty() || strand_sjs.empty()) return; + + auto sj_it = strand_sjs.begin(); + + StitchedER candidate(ers[0], 0); + candidate.strand = strand; + bool have_candidate = true; + + auto close_candidate = [&]() { - //std::cout << "[INFO] Stitching chromosome " << sjs.first << std::endl; - std::string chrom = sjs.first; - StitchedER er1 = StitchedER(expressed_regions.at(chrom).at(0), 0); // define the first StitchedER, currently consisting of 1 ER - stitched_ERs.emplace_back(er1); - auto current_sj_id = sjs.second.begin(); // iterator over the vector of sj_id - - int max_stitched_ers = 0; - int nof_stitched_ers = 0; - // iterate over expressed regions starting with region 2 (since region 1 was already appended) - for (int i = 1; i < expressed_regions.at(chrom).size(); ++i) + // a chain has been extended at least once when er_ids has more than the seed entry + if (candidate.er_ids.size() > 1) { - const auto& expressed_region = expressed_regions[chrom][i]; - //only compare if we aren't at the last SJ yet - if (current_sj_id != sjs.second.end()){ - StitchedER& current_stitched_er = stitched_ERs.back(); // this is one expressed region right now - - // skip ahead to SJ with coordinates that line up with the most recent ER - while (current_sj_id != sjs.second.end() - && (current_stitched_er.end > rr_all_sj[*current_sj_id - 1].start && !within_threshold(current_stitched_er.end, rr_all_sj[*current_sj_id - 1].start)) - && rr_all_sj[*current_sj_id - 1].chrom == chrom) - { - ++current_sj_id; - } - // make sure to never dereference the end() pointer - if (current_sj_id == sjs.second.end()) - { - --current_sj_id; - } - // get rr_all_sj, which is a vector of SJRows - if (is_similar(current_stitched_er, expressed_region, rr_all_sj[*current_sj_id - 1])) - { - uint64_t sj_length = expressed_region.start - expressed_regions[chrom][current_stitched_er.er_ids.back()].end; // always use ER coordinates since a small mismatch of SJ and ER coordinates is tolerated - current_stitched_er.append(-1, sj_length, 0.0); // append the spliced region and the intron - current_stitched_er.append(i, expressed_region.length, expressed_region.coverage); - ++nof_stitched_ers; - // move to next SJ - ++current_sj_id; - - // find maximum number of ERs that were stitched together - if (max_stitched_ers < nof_stitched_ers) - { - max_stitched_ers = nof_stitched_ers; - } - } - // current ER doesn't belong to any existing ERs --> start a new ER - else - { - nof_stitched_ers = 1; // reset counter - stitched_ERs.emplace_back(StitchedER(expressed_region, i)); - } - } - // no more splice junctions left, so each remaining expressed region forms its own StitchedER - else + output.emplace_back(candidate); + for (int id : candidate.er_ids) { - stitched_ERs.emplace_back(StitchedER(expressed_region, i)); + if (id >= 0) consumed_indices.insert(id); } } - std::cout << "[INFO] Longest stitched ER in " << chrom << " contains " << max_stitched_ers << " ERs" << std::endl; + have_candidate = false; + }; + + int max_chain_len = 0; // number of ERs in the longest chain emitted + + for (int i = 1; i < static_cast(ers.size()); ++i) + { + const auto& expressed_region = ers[i]; + if (!have_candidate) + { + candidate = StitchedER(expressed_region, i); + candidate.strand = strand; + have_candidate = true; + continue; + } + + if (sj_it == strand_sjs.end()) + { + // no SJs of this strand left: close the current candidate and start fresh + // with this ER as its own (unstitched) seed + close_candidate(); + candidate = StitchedER(expressed_region, i); + candidate.strand = strand; + have_candidate = true; + continue; + } + + // skip past SJs whose end coordinate is too far behind the chain + while (sj_it != strand_sjs.end() + && (candidate.end > rr_all_sj[*sj_it - 1].start + && !within_threshold(candidate.end, rr_all_sj[*sj_it - 1].start)) + && rr_all_sj[*sj_it - 1].chrom == chrom) + { + ++sj_it; + } + // if we ran out, fall back to the last entry so dereferencing is safe + const auto sj_to_check = (sj_it == strand_sjs.end()) ? std::prev(strand_sjs.end()) : sj_it; + + if (is_similar(candidate, expressed_region, rr_all_sj[*sj_to_check - 1])) + { + uint64_t sj_length = expressed_region.start - ers[candidate.er_ids.back()].end; + candidate.append(-1, sj_length, 0.0); // append the intron placeholder + candidate.append(i, expressed_region.length, expressed_region.coverage); + // count actual ERs in chain (non-negative er_ids entries) + int er_count = 0; + for (int id : candidate.er_ids) if (id >= 0) ++er_count; + if (er_count > max_chain_len) max_chain_len = er_count; + if (sj_it != strand_sjs.end()) ++sj_it; + } + else + { + close_candidate(); + candidate = StitchedER(expressed_region, i); + candidate.strand = strand; + have_candidate = true; + } + } + + if (have_candidate) close_candidate(); + + if (max_chain_len > 0) + { + std::cout << "[INFO] Longest stitched ER in " << chrom << " (" << strand << ") contains " + << max_chain_len << " ERs" << std::endl; + } +} + + +void Integrator::stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj) +{ + // For each chromosome: + // 1. Bucket the chromosome's SJs by strand. SJRow.strand is true for '+', false for '-'. + // 2. Run stitch_one_strand twice (once per non-empty bucket). Each pass + // emits only chains of 2 or more ERs, tagged with its strand, and + // records the indices of consumed ERs. + // 3. Walk the chromosome's ERs and emit each ER not consumed by any + // chain as a single-ER StitchedER tagged with strand '.'. This makes + // sure every ER appears in stitched_ERs exactly once: either inside + // a strand-labelled chain (when SJs supported it) or as an unstranded + // standalone ER (when no SJ on either strand stitched it). + for (const auto& sjs : mm_chrom_sj) + { + const std::string& chrom = sjs.first; + if (expressed_regions.find(chrom) == expressed_regions.end()) continue; + const auto& ers = expressed_regions.at(chrom); + if (ers.empty()) continue; + + std::vector plus_sjs; + std::vector minus_sjs; + plus_sjs.reserve(sjs.second.size()); + minus_sjs.reserve(sjs.second.size()); + for (uint32_t sj_id : sjs.second) + { + if (sj_id == 0 || sj_id - 1 >= rr_all_sj.size()) continue; + (rr_all_sj[sj_id - 1].strand ? plus_sjs : minus_sjs).emplace_back(sj_id); + } + + std::unordered_set consumed; + // Collect this chromosome's StitchedERs in a local vector so we can + // sort them by start position before appending. The original + // (unstranded) algorithm always emitted ERs in genomic order, and + // write_to_gtf relies on it; running two strand passes plus a + // standalone pass would otherwise interleave them out of order. + std::vector chrom_stitched; + if (!plus_sjs.empty()) stitch_one_strand(chrom, '+', plus_sjs, ers, rr_all_sj, consumed, chrom_stitched); + if (!minus_sjs.empty()) stitch_one_strand(chrom, '-', minus_sjs, ers, rr_all_sj, consumed, chrom_stitched); + + // any ER not pulled into a chain is emitted once, unstranded + for (int i = 0; i < static_cast(ers.size()); ++i) + { + if (consumed.count(i)) continue; + StitchedER fresh(ers[i], i); + // strand stays '.' (default) + chrom_stitched.emplace_back(fresh); + } + + std::sort(chrom_stitched.begin(), chrom_stitched.end(), + [](const StitchedER& a, const StitchedER& b) { return a.start < b.start; }); + for (auto& ser : chrom_stitched) stitched_ERs.emplace_back(std::move(ser)); } } diff --git a/cpp/Integrator.h b/cpp/Integrator.h index 2bd28b9..9f1f4cd 100644 --- a/cpp/Integrator.h +++ b/cpp/Integrator.h @@ -14,6 +14,7 @@ #include #include #include +#include class Integrator { @@ -21,6 +22,19 @@ class Integrator Integrator(double coverage_tolerance_, int position_tolerance_); void stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj); + // Per-strand pass invoked by stitch_up. Builds chains of ERs connected by + // SJs of the requested strand and appends them to stitched_ERs only when + // the chain contains 2 or more ERs. Indices of ERs that ended up in any + // emitted chain are added to consumed_indices so the caller can avoid + // emitting them again as unstranded standalone ERs. Exposed publicly so + // tests can exercise one bucket in isolation. + void stitch_one_strand(const std::string& chrom, + char strand, + const std::vector& strand_sjs, + const std::vector& ers, + const std::vector& rr_all_sj, + std::unordered_set& consumed_indices, + std::vector& output); bool within_threshold(double val1, double val2) const; bool within_threshold(uint64_t val1, uint64_t val2) const; bool is_similar(const StitchedER& most_recent_er, const BedGraphRow& expressed_region, const SJRow& current_sj); diff --git a/cpp/Parser.cpp b/cpp/Parser.cpp index 7f22522..a0d5b82 100644 --- a/cpp/Parser.cpp +++ b/cpp/Parser.cpp @@ -54,21 +54,10 @@ Parser::Parser(std::string path_, std::vector chromosomes_, int cor } } -// fill vector with coverage value per bp (for mean coverage computation later on -> different bedgraphs have different binning intervals) -void Parser::compute_per_base_coverage(const BedGraphRow& row, std::unordered_map>& per_base_coverage) -{ - // row.end is NOT inclusive - unsigned int bin_length = row.length ? row.length : static_cast(row.end - row.start); - if (bin_length == 0) - { - std::cerr << "[ERROR] BedGraph bin with length 0 (start = end) provided!" << std::endl; - return; - } - - auto& dest = per_base_coverage[row.chrom]; - dest.insert(dest.end(), bin_length, row.coverage); // insert bin coverage for each individual bp - -} +// compute_per_base_coverage was deleted. The dense per-base double vector it +// produced was the largest single resident structure in fastder (47 Mb x 8 B +// per chr21 sample, 24 GB per full-hg38 sample). Averager now consumes the +// sparse interval form in all_bedgraphs directly. // parse relevant chromosomes of a bedgraph file std::vector Parser::read_bedgraph(const std::string& filename, uint64_t& library_size) const @@ -361,7 +350,6 @@ void Parser::read_all_bedgraphs(std::vector bedgraph_files, unsigne std::cout << "[INFO] fastder is using " << nof_threads + 1 << " threads for parsing." << std::endl; // reserve space all_bedgraphs.resize(bedgraph_files.size()); - all_per_base_coverages.resize(bedgraph_files.size()); // storage for threads std::vector threads; @@ -380,27 +368,38 @@ void Parser::read_all_bedgraphs(std::vector bedgraph_files, unsigne unsigned int i = next_index++; //passes index, then does post-increment! if (i >= bedgraph_files.size()) break; + const std::string& filename = bedgraph_files.at(i); // mutex to ensure print statement is not shuffled from concurrency { std::lock_guard lock(mutex); - std::cout << "[FILE] Processing BedGraph File " << bedgraph_files.at(i) << std::endl; + std::cout << "[FILE] Processing coverage file " << filename << std::endl; } uint64_t library_size = 0; // ensure that the integer type is large enough - std::vector sample_bedgraph = read_bedgraph(bedgraph_files.at(i), library_size); - std::unordered_map> per_base_coverage; + std::vector sample_bedgraph; - // normalize to CPM and expand rows to per-base coverage (also normalized) + // pick the right reader by file extension. BigWig parsing is + // gated on libBigWig at compile time; if it is not built in, + // read_bigwig logs an error and returns an empty vector. + if (filename.size() >= 3 && filename.substr(filename.size() - 3) == ".bw") + { + sample_bedgraph = read_bigwig(filename, library_size); + } + else + { + sample_bedgraph = read_bedgraph(filename, library_size); + } + + // normalize each interval to CPM. Per-base expansion is no + // longer performed; Averager consumes the sparse intervals. for (BedGraphRow& row : sample_bedgraph) { row.normalize(library_size); - compute_per_base_coverage(row, per_base_coverage); } - // add all bedgraphs of one sample to the matrix + // add the sample's intervals to the per-sample matrix { std::lock_guard lock(mutex); all_bedgraphs[i] = std::move(sample_bedgraph); - all_per_base_coverages[i] = std::move(per_base_coverage); } } @@ -412,6 +411,76 @@ void Parser::read_all_bedgraphs(std::vector bedgraph_files, unsigne } } + +// libBigWig integration. The body is gated on FASTDER_USE_LIBBIGWIG so the +// default build is hermetic. When the option is on, libBigWig is FetchContent'd +// from upstream and its headers are on the include path. +#ifdef FASTDER_USE_LIBBIGWIG +extern "C" { +#include +} +#include +#endif + +std::vector Parser::read_bigwig(const std::string& filename, uint64_t& library_size, + char strand) const +{ + std::vector intervals; +#ifdef FASTDER_USE_LIBBIGWIG + // bwInit allocates a process-wide read buffer. Call it once across all + // threads. bwCleanup is left to process exit; libBigWig's cleanup function + // is not safe to call while other readers may still be active. + static std::once_flag bw_init_flag; + std::call_once(bw_init_flag, []() { bwInit(1 << 17); }); + + bigWigFile_t* fp = bwOpen(const_cast(filename.c_str()), nullptr, "r"); + if (!fp) + { + std::cerr << "[ERROR] Could not open BigWig " << filename << std::endl; + return intervals; + } + + // Iterate the BigWig's chromosomes. Skip any that the user did not + // request via --chr (chromosomes_set), matching read_bedgraph's filter. + for (int64_t k = 0; k < fp->cl->nKeys; ++k) + { + const std::string chrom = fp->cl->chrom[k]; + const uint32_t chrom_len = fp->cl->len[k]; + if (!chromosomes_set.contains(chrom)) continue; + + // bwGetOverlappingIntervals returns the BigWig's intrinsic intervals + // that overlap [start, end). Calling it for the entire chromosome + // yields the file's stored intervals on that chromosome. + bwOverlappingIntervals_t* o = bwGetOverlappingIntervals( + fp, const_cast(chrom.c_str()), 0, chrom_len); + if (!o) continue; + + intervals.reserve(intervals.size() + o->l); + for (uint32_t i = 0; i < o->l; ++i) + { + BedGraphRow row(chrom, + static_cast(o->start[i]), + static_cast(o->end[i]), + static_cast(o->value[i]), + strand); + row.length = static_cast(row.end - row.start); + row.total_reads = static_cast(row.length * row.coverage); + library_size += row.total_reads; + intervals.emplace_back(std::move(row)); + } + bwDestroyOverlappingIntervals(o); + } + + bwClose(fp); +#else + (void)filename; (void)library_size; (void)strand; + std::cerr << "[ERROR] read_bigwig was called but fastder was built without " + "libBigWig support. Reconfigure with -DFASTDER_USE_LIBBIGWIG=ON " + "or feed BedGraph (.bedGraph) input." << std::endl; +#endif + return intervals; +} + // attempt to parse all files in path (not recursive!) void Parser::search_directory() { bool contains_ids = false; @@ -436,8 +505,12 @@ void Parser::search_directory() { } - // collect all bedgraph files to later fill up rail_id_to_mm_id - else if (filename.find(".bedGraph") != std::string::npos) + // collect all coverage files (BedGraph or BigWig) to later fill up + // rail_id_to_mm_id. The variable name keeps "bedgraph_files" for + // continuity but the list is just paths; read_all_bedgraphs picks the + // right reader by extension. .bw files require a libBigWig-enabled build. + else if (filename.find(".bedGraph") != std::string::npos || + (filename.size() >= 3 && filename.substr(filename.size() - 3) == ".bw")) { bedgraph_files.emplace_back(filename); } diff --git a/cpp/Parser.h b/cpp/Parser.h index ba10d67..ec0a76d 100644 --- a/cpp/Parser.h +++ b/cpp/Parser.h @@ -21,20 +21,29 @@ class Parser { // read in individual file types void read_all_bedgraphs(std::vector bedgraph_files, unsigned int nof_threads); std::vector read_bedgraph(const std::string& filename, uint64_t& library_size) const; + // Parse a BigWig file into the same sparse interval representation we use + // for BedGraph. _strand is stamped on every emitted row ('.' = unstranded, + // '+' / '-' for stranded BigWigs). Implementation is gated on the CMake + // option FASTDER_USE_LIBBIGWIG. Without that flag the function logs an + // error and returns an empty vector, which keeps the build hermetic. + std::vector read_bigwig(const std::string& filename, uint64_t& library_size, + char strand = '.') const; void read_mm(std::string filename); void read_rr(std::string filename); void read_url_csv(std::string filename); void fill_up(std::vector bedgraph_files); - static void compute_per_base_coverage(const BedGraphRow& row, std::unordered_map>& per_base_coverage); - // TODO add function get_rail_id_from_filename(filename)? unsigned int user_cores; std::string path; std::vector chromosomes_vec; // for fast iteration std::unordered_set chromosomes_set; // for fast check if chromosome is included - std::vector> all_bedgraphs; //TODO maybe change to unordered map with key = sample id, value = bedgraph of the sample? - std::vector>> all_per_base_coverages; //NOT ordered by chromosomes + // Per-sample sparse interval coverage. Element s is the BedGraph or BigWig + // content of sample s, sorted by (chrom, start). This is the only coverage + // representation fastder keeps in memory; the dense per-base expansion + // that the previous version computed has been removed since + // compute_mean_coverage and find_ERs now operate on intervals directly. + std::vector> all_bedgraphs; // RR rows on chromosomes the user requested. Dropped rows are not stored. std::vector rr_all_sj; diff --git a/cpp/StitchedER.h b/cpp/StitchedER.h index e26e316..d0bf6b7 100644 --- a/cpp/StitchedER.h +++ b/cpp/StitchedER.h @@ -34,6 +34,11 @@ class StitchedER uint64_t start; uint64_t end; std::string chrom; + // Strand of the splice junctions that produced this stitched ER. '.' for + // unstranded inputs (the SJs in this chromosome were not partitioned by + // strand) and '+' or '-' when the stitching pass restricted to one + // strand. Matches the strand semantics on BedGraphRow and SJRow. + char strand = '.'; // overload output operator for SJRow friend std::ostream& operator<< (std::ostream& os, const StitchedER& stitched_er) diff --git a/cpp/main.cpp b/cpp/main.cpp index f37aae9..50c6071 100644 --- a/cpp/main.cpp +++ b/cpp/main.cpp @@ -128,9 +128,9 @@ int main(int argc, char* argv[]) { std::chrono::duration elapsed_parsing = end_parsing - start; std::cout << "[INFO] Parsing took " << elapsed_parsing.count() << " seconds.\n \n"; - // get mean coverage vector + // get mean coverage as sparse intervals (no dense per-base expansion) Averager averager(cores); - averager.compute_mean_coverage(parser.all_per_base_coverages); + averager.compute_mean_coverage(parser.all_bedgraphs); averager.find_ERs(min_coverage, min_length); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index deb136a..d753464 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -24,5 +24,14 @@ target_link_libraries(fastder-tests GTest::gtest_main ) +# When fastder is built with libBigWig, the BigWig round-trip and equivalence +# tests in UnitTests.cpp include and call libBigWig's writer API, +# so the test target needs both the include path and the static lib too. +if(FASTDER_USE_LIBBIGWIG) + target_link_libraries(fastder-tests PRIVATE BigWig) + target_include_directories(fastder-tests PRIVATE ${libbigwig_src_SOURCE_DIR}) + target_compile_definitions(fastder-tests PRIVATE FASTDER_USE_LIBBIGWIG) +endif() + include(GoogleTest) gtest_discover_tests(fastder-tests) diff --git a/tests/UnitTests.cpp b/tests/UnitTests.cpp index fc43977..ef49d84 100644 --- a/tests/UnitTests.cpp +++ b/tests/UnitTests.cpp @@ -329,9 +329,9 @@ TEST(Parser, TestWrongChromosomeOrder) Parser parser = Parser(directory, chromosomes, cores); parser.search_directory(); - // get mean coverage vector + // get mean coverage as sparse intervals Averager averager(cores); - averager.compute_mean_coverage(parser.all_per_base_coverages); + averager.compute_mean_coverage(parser.all_bedgraphs); // get expressed regions averager.find_ERs(coverage_threshold, min_length); @@ -552,4 +552,462 @@ TEST(Parser, MMHeaderRejectsCountMismatch) EXPECT_TRUE(parser.mm_chrom_sj.empty()); fs::remove_all(tmp); +} + + +// ===================================================================== +// BedGraphRow strand field +// ===================================================================== + +TEST(BedGraphRow, DefaultStrandIsUnstranded) +{ + BedGraphRow row("chr1", 100, 200, 5.0); + EXPECT_EQ(row.strand, '.'); +} + +TEST(BedGraphRow, ExplicitStrandIsStored) +{ + BedGraphRow plus("chr1", 100, 200, 5.0, '+'); + BedGraphRow minus("chr1", 100, 200, 5.0, '-'); + EXPECT_EQ(plus.strand, '+'); + EXPECT_EQ(minus.strand, '-'); +} + + +// ===================================================================== +// Averager::compute_mean_coverage on sparse intervals +// ===================================================================== + +// The sparse mean of a single sample equals the sample's own intervals. +// Adjacent intervals with the same coverage are coalesced. +TEST(Averager, SparseMeanSingleSampleIsIdentity) +{ + std::vector> all_bedgraphs(1); + all_bedgraphs[0] = { + BedGraphRow("chr1", 100, 200, 4.0), + BedGraphRow("chr1", 200, 300, 4.0), // contiguous, same coverage -> coalesced + BedGraphRow("chr1", 400, 500, 8.0), + }; + + Averager avg(1); + avg.compute_mean_coverage(all_bedgraphs); + + ASSERT_TRUE(avg.mean_intervals.count("chr1")); + const auto& m = avg.mean_intervals["chr1"]; + ASSERT_EQ(m.size(), 2u); + EXPECT_EQ(m[0].start, 100u); + EXPECT_EQ(m[0].end, 300u); + EXPECT_DOUBLE_EQ(m[0].coverage, 4.0); + EXPECT_EQ(m[1].start, 400u); + EXPECT_EQ(m[1].end, 500u); + EXPECT_DOUBLE_EQ(m[1].coverage, 8.0); +} + +// Two samples with overlapping intervals: the union of breakpoints partitions +// the chromosome and each segment carries the per-sample average. +TEST(Averager, SparseMeanTwoOverlappingSamples) +{ + std::vector> all_bedgraphs(2); + all_bedgraphs[0] = { BedGraphRow("chr1", 100, 300, 10.0) }; + all_bedgraphs[1] = { BedGraphRow("chr1", 200, 400, 20.0) }; + + Averager avg(1); + avg.compute_mean_coverage(all_bedgraphs); + + ASSERT_TRUE(avg.mean_intervals.count("chr1")); + const auto& m = avg.mean_intervals["chr1"]; + // expected segments: [100,200) cov=10/2=5, [200,300) cov=(10+20)/2=15, [300,400) cov=20/2=10 + ASSERT_EQ(m.size(), 3u); + EXPECT_EQ(m[0].start, 100u); EXPECT_EQ(m[0].end, 200u); + EXPECT_DOUBLE_EQ(m[0].coverage, 5.0); + EXPECT_EQ(m[1].start, 200u); EXPECT_EQ(m[1].end, 300u); + EXPECT_DOUBLE_EQ(m[1].coverage, 15.0); + EXPECT_EQ(m[2].start, 300u); EXPECT_EQ(m[2].end, 400u); + EXPECT_DOUBLE_EQ(m[2].coverage, 10.0); +} + +// A sample with no coverage on a chromosome contributes 0 to every segment. +TEST(Averager, SparseMeanSampleAbsentFromChromContributesZero) +{ + std::vector> all_bedgraphs(2); + all_bedgraphs[0] = { BedGraphRow("chr1", 100, 200, 10.0) }; + all_bedgraphs[1] = { BedGraphRow("chr2", 100, 200, 30.0) }; + + Averager avg(1); + avg.compute_mean_coverage(all_bedgraphs); + + ASSERT_TRUE(avg.mean_intervals.count("chr1")); + ASSERT_TRUE(avg.mean_intervals.count("chr2")); + const auto& m1 = avg.mean_intervals["chr1"]; + const auto& m2 = avg.mean_intervals["chr2"]; + ASSERT_EQ(m1.size(), 1u); + ASSERT_EQ(m2.size(), 1u); + EXPECT_DOUBLE_EQ(m1[0].coverage, 5.0); // 10 / 2 samples + EXPECT_DOUBLE_EQ(m2[0].coverage, 15.0); // 30 / 2 samples +} + +// A gap between intervals is implicit zero coverage, not a continuation. +TEST(Averager, SparseMeanGapIsImplicitZero) +{ + std::vector> all_bedgraphs(1); + all_bedgraphs[0] = { + BedGraphRow("chr1", 100, 200, 10.0), + BedGraphRow("chr1", 500, 600, 10.0), + }; + + Averager avg(1); + avg.compute_mean_coverage(all_bedgraphs); + + const auto& m = avg.mean_intervals["chr1"]; + ASSERT_EQ(m.size(), 2u); + EXPECT_EQ(m[0].end, 200u); + EXPECT_EQ(m[1].start, 500u); +} + + +// ===================================================================== +// Averager::find_ERs on sparse intervals +// ===================================================================== + +// One contiguous run above threshold -> one ER spanning the full run. +TEST(Averager, FindERsSimpleRun) +{ + Averager avg(1); + avg.chroms = {"chr1"}; + avg.mean_intervals["chr1"] = { + BedGraphRow("chr1", 100, 200, 10.0), + BedGraphRow("chr1", 200, 300, 10.0), + }; + + avg.find_ERs(1.0, 5); + const auto& ers = avg.expressed_regions["chr1"]; + ASSERT_EQ(ers.size(), 1u); + EXPECT_EQ(ers[0].start, 100u); + EXPECT_EQ(ers[0].end, 300u); +} + +// A below-threshold interval breaks the run. +TEST(Averager, FindERsBelowThresholdBreaksRun) +{ + Averager avg(1); + avg.chroms = {"chr1"}; + avg.mean_intervals["chr1"] = { + BedGraphRow("chr1", 100, 200, 10.0), + BedGraphRow("chr1", 200, 250, 0.1), + BedGraphRow("chr1", 250, 350, 10.0), + }; + + avg.find_ERs(1.0, 5); + const auto& ers = avg.expressed_regions["chr1"]; + ASSERT_EQ(ers.size(), 2u); + EXPECT_EQ(ers[0].start, 100u); EXPECT_EQ(ers[0].end, 200u); + EXPECT_EQ(ers[1].start, 250u); EXPECT_EQ(ers[1].end, 350u); +} + +// A coverage gap (non-contiguous intervals) also breaks the run, even when +// both flanks are above threshold. +TEST(Averager, FindERsGapBreaksRun) +{ + Averager avg(1); + avg.chroms = {"chr1"}; + avg.mean_intervals["chr1"] = { + BedGraphRow("chr1", 100, 200, 10.0), + BedGraphRow("chr1", 500, 600, 10.0), + }; + + avg.find_ERs(1.0, 5); + const auto& ers = avg.expressed_regions["chr1"]; + ASSERT_EQ(ers.size(), 2u); +} + +// min_length filters short ERs even if they cleared the coverage threshold. +TEST(Averager, FindERsMinLengthFilters) +{ + Averager avg(1); + avg.chroms = {"chr1"}; + avg.mean_intervals["chr1"] = { + BedGraphRow("chr1", 100, 102, 10.0), // length 2, below min_length + BedGraphRow("chr1", 500, 600, 10.0), // length 100, kept + }; + + avg.find_ERs(1.0, 5); + const auto& ers = avg.expressed_regions["chr1"]; + ASSERT_EQ(ers.size(), 1u); + EXPECT_EQ(ers[0].start, 500u); +} + + +// ===================================================================== +// Integrator strand-aware stitch_up +// ===================================================================== + +// Each ER appears in stitched_ERs exactly once: ERs that participate in a +// chain are tagged with that chain's strand, ERs left over come back with '.'. +TEST(Integrator, StitchUpSinglePlusChain) +{ + std::vector rr_all_sj = { + SJRow("chr1", 10500, 11000, 500, '+', false), // sj_id 1, strand=true + }; + std::unordered_map> expressed_regions; + expressed_regions["chr1"] = { + BedGraphRow("chr1", 10000, 10500, 100.0), + BedGraphRow("chr1", 11000, 12500, 101.0), + }; + std::unordered_map> mm_chrom_sj; + mm_chrom_sj["chr1"] = {1}; + + Integrator integrator(0.1, 5); + integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); + + ASSERT_EQ(integrator.stitched_ERs.size(), 1u); + EXPECT_EQ(integrator.stitched_ERs[0].strand, '+'); + EXPECT_EQ(integrator.stitched_ERs[0].er_ids.size(), 3u); +} + +TEST(Integrator, StitchUpSingleMinusChain) +{ + std::vector rr_all_sj = { + SJRow("chr1", 10500, 11000, 500, '-', false), // sj_id 1, strand=false + }; + std::unordered_map> expressed_regions; + expressed_regions["chr1"] = { + BedGraphRow("chr1", 10000, 10500, 100.0), + BedGraphRow("chr1", 11000, 12500, 101.0), + }; + std::unordered_map> mm_chrom_sj; + mm_chrom_sj["chr1"] = {1}; + + Integrator integrator(0.1, 5); + integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); + + ASSERT_EQ(integrator.stitched_ERs.size(), 1u); + EXPECT_EQ(integrator.stitched_ERs[0].strand, '-'); +} + +// An ER that no SJ stitches must be emitted exactly once with strand '.'. +TEST(Integrator, StitchUpUnstitchedERIsEmittedOnceUnstranded) +{ + std::vector rr_all_sj = { + SJRow("chr1", 10500, 11000, 500, '+', false), + SJRow("chr1", 20500, 21000, 500, '-', false), + }; + std::unordered_map> expressed_regions; + expressed_regions["chr1"] = { + BedGraphRow("chr1", 10000, 10500, 100.0), // chain start (+ pass) + BedGraphRow("chr1", 11000, 12500, 101.0), // chain extension + BedGraphRow("chr1", 13000, 14000, 5.0), // unstitched gap + BedGraphRow("chr1", 20000, 20500, 50.0), // chain start (- pass) + BedGraphRow("chr1", 21000, 22000, 50.0), // chain extension + }; + std::unordered_map> mm_chrom_sj; + mm_chrom_sj["chr1"] = {1, 2}; + + Integrator integrator(0.1, 5); + integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); + + // Expect: + chain (ERs 0,1), unstranded standalone (ER 2), - chain (ERs 3,4) + ASSERT_EQ(integrator.stitched_ERs.size(), 3u); + + int unstranded_count = 0; + int plus_count = 0; + int minus_count = 0; + for (const auto& ser : integrator.stitched_ERs) + { + if (ser.strand == '.') ++unstranded_count; + if (ser.strand == '+') ++plus_count; + if (ser.strand == '-') ++minus_count; + } + EXPECT_EQ(unstranded_count, 1); + EXPECT_EQ(plus_count, 1); + EXPECT_EQ(minus_count, 1); +} + +// stitched_ERs must be sorted by start within a chromosome regardless of which +// strand pass produced them, since write_to_gtf relies on genomic order. +TEST(Integrator, StitchUpEmitsInGenomicOrder) +{ + std::vector rr_all_sj = { + SJRow("chr1", 20500, 21000, 500, '+', false), // late + chain + SJRow("chr1", 10500, 11000, 500, '-', false), // early - chain + }; + std::unordered_map> expressed_regions; + expressed_regions["chr1"] = { + BedGraphRow("chr1", 10000, 10500, 50.0), + BedGraphRow("chr1", 11000, 12000, 50.0), + BedGraphRow("chr1", 20000, 20500, 100.0), + BedGraphRow("chr1", 21000, 22000, 100.0), + }; + std::unordered_map> mm_chrom_sj; + mm_chrom_sj["chr1"] = {1, 2}; + + Integrator integrator(0.1, 5); + integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); + + ASSERT_EQ(integrator.stitched_ERs.size(), 2u); + // first by genomic order is the - chain (ER 0..1) + EXPECT_EQ(integrator.stitched_ERs[0].start, 10000u); + EXPECT_EQ(integrator.stitched_ERs[0].strand, '-'); + EXPECT_EQ(integrator.stitched_ERs[1].start, 20000u); + EXPECT_EQ(integrator.stitched_ERs[1].strand, '+'); +} + + +// ===================================================================== +// libBigWig integration (gated). These run only when fastder is built with +// -DFASTDER_USE_LIBBIGWIG=ON; otherwise they GTEST_SKIP so the unbuilt code +// path does not block local development. +// ===================================================================== + +#ifdef FASTDER_USE_LIBBIGWIG +extern "C" { +#include +} + +// Helper: write a tiny BigWig at path with the supplied chromosome name, +// chromosome length and triples of (start, end, value). Uses libBigWig's +// bedGraph-style interval writer, which is the most permissive of its three +// emission modes and the closest match to what bedtools genomecov produces. +static void write_test_bigwig(const std::string& path, + const std::string& chrom, + uint32_t chrom_len, + const std::vector>& intervals) +{ + bwInit(1 << 17); + bigWigFile_t* fp = bwOpen(const_cast(path.c_str()), nullptr, "w"); + ASSERT_NE(fp, nullptr) << "Could not open BigWig for write: " << path; + ASSERT_EQ(bwCreateHdr(fp, 10), 0); + + char* chrom_array[1] = { const_cast(chrom.c_str()) }; + uint32_t lens[1] = { chrom_len }; + fp->cl = bwCreateChromList(chrom_array, lens, 1); + ASSERT_NE(fp->cl, nullptr); + ASSERT_EQ(bwWriteHdr(fp), 0); + + std::vector chr_buf(intervals.size(), const_cast(chrom.c_str())); + std::vector starts(intervals.size()); + std::vector ends(intervals.size()); + std::vector values(intervals.size()); + for (size_t i = 0; i < intervals.size(); ++i) + { + starts[i] = std::get<0>(intervals[i]); + ends[i] = std::get<1>(intervals[i]); + values[i] = std::get<2>(intervals[i]); + } + ASSERT_EQ(bwAddIntervals(fp, + chr_buf.data(), + starts.data(), + ends.data(), + values.data(), + static_cast(intervals.size())), 0); + bwClose(fp); +} +#endif + + +// Round-trip: write a small BigWig with libBigWig, read it through +// Parser::read_bigwig, assert intervals come back identical. +TEST(BigWig, RoundTripParsesWrittenIntervals) +{ +#ifndef FASTDER_USE_LIBBIGWIG + GTEST_SKIP() << "libBigWig support is off; rebuild with -DFASTDER_USE_LIBBIGWIG=ON"; +#else + namespace fs = std::filesystem; + auto tmp = fs::temp_directory_path() / "fastder_test_bw_roundtrip"; + fs::create_directories(tmp); + auto bw_path = (tmp / "tiny.bw").string(); + + // intervals: (start, end, value) + const std::vector> intervals = { + {100, 200, 10.0f}, + {200, 300, 0.0f}, + {300, 500, 25.0f}, + }; + write_test_bigwig(bw_path, "chr1", 1000, intervals); + + Parser parser("dummy_path", {"chr1"}, 1); + uint64_t library_size = 0; + auto rows = parser.read_bigwig(bw_path, library_size, '.'); + + ASSERT_EQ(rows.size(), intervals.size()); + for (size_t i = 0; i < rows.size(); ++i) + { + EXPECT_EQ(rows[i].chrom, "chr1"); + EXPECT_EQ(rows[i].start, std::get<0>(intervals[i])); + EXPECT_EQ(rows[i].end, std::get<1>(intervals[i])); + EXPECT_FLOAT_EQ(static_cast(rows[i].coverage), std::get<2>(intervals[i])); + EXPECT_EQ(rows[i].strand, '.'); + } + + fs::remove_all(tmp); +#endif +} + + +// Equivalence: feed the same coverage through read_bedgraph and read_bigwig +// and assert that compute_mean_coverage and find_ERs produce identical +// expressed_regions on both. This is the regression guard for "BedGraph and +// BigWig must yield the same fastder output for the same data". +TEST(BigWig, BedGraphAndBigWigEquivalentExpressedRegions) +{ +#ifndef FASTDER_USE_LIBBIGWIG + GTEST_SKIP() << "libBigWig support is off; rebuild with -DFASTDER_USE_LIBBIGWIG=ON"; +#else + namespace fs = std::filesystem; + auto tmp = fs::temp_directory_path() / "fastder_test_bw_equiv"; + fs::create_directories(tmp); + auto bw_path = (tmp / "sample.bw").string(); + auto bg_path = (tmp / "sample.bedGraph").string(); + + const std::vector> intervals = { + {100, 300, 10.0f}, + {300, 400, 0.5f}, // below the 1.0 threshold used in find_ERs + {400, 700, 10.0f}, + }; + write_test_bigwig(bw_path, "chr1", 1000, intervals); + { + std::ofstream out(bg_path); + for (const auto& [s, e, v] : intervals) + { + out << "chr1\t" << s << '\t' << e << '\t' << v << '\n'; + } + } + + auto run_pipeline = [&](const std::string& cov_path) + { + Parser parser("dummy_path", {"chr1"}, 1); + uint64_t library_size = 0; + std::vector rows; + if (cov_path.size() >= 3 && cov_path.substr(cov_path.size() - 3) == ".bw") + { + rows = parser.read_bigwig(cov_path, library_size, '.'); + } + else + { + rows = parser.read_bedgraph(cov_path, library_size); + } + // Note: skipping normalize() so the comparison is on raw coverage; the + // two readers must produce identical raw values for the test to be + // meaningful. + std::vector> all_bedgraphs(1); + all_bedgraphs[0] = std::move(rows); + + Averager avg(1); + avg.compute_mean_coverage(all_bedgraphs); + avg.find_ERs(1.0, 5); + return avg.expressed_regions["chr1"]; + }; + + auto from_bw = run_pipeline(bw_path); + auto from_bg = run_pipeline(bg_path); + + ASSERT_EQ(from_bw.size(), from_bg.size()); + for (size_t i = 0; i < from_bw.size(); ++i) + { + EXPECT_EQ(from_bw[i].start, from_bg[i].start); + EXPECT_EQ(from_bw[i].end, from_bg[i].end); + EXPECT_DOUBLE_EQ(from_bw[i].coverage, from_bg[i].coverage); + } + + fs::remove_all(tmp); +#endif } \ No newline at end of file From 98bc9d0e5dfc31bb0ec5db01e0f1309e9947d540 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 28 Apr 2026 18:27:40 +0200 Subject: [PATCH 4/7] Fix stitching, update tests, document --- .github/workflows/ci.yml | 70 ++++++++++--- README.md | 28 +++++- cpp/Integrator.cpp | 210 +++++++++++---------------------------- cpp/Integrator.h | 14 --- tests/UnitTests.cpp | 104 ++++++------------- 5 files changed, 170 insertions(+), 256 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 887d35b..7a2b039 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,29 +1,27 @@ name: build_and_test +# Trigger only on pull_request and manual dispatch. The push trigger was +# removed so that opening a PR from a branch in this repo does not run the +# workflow twice (once for the push, once for the pull_request). on: pull_request: workflow_dispatch: - push: - branches: - - build - - imallona - - perf-mm-rr-memory jobs: build-and-test: runs-on: ubuntu-latest - + steps: - name: Checkout code uses: actions/checkout@v4 - + # - name: Install dependencies # run: | # sudo apt-get update # sudo apt-get install -y g++ cppcheck python3-pip # pip3 install "cmake>=4.0" # cmake --version - + - name: Set up Miniconda uses: conda-incubator/setup-miniconda@v3 with: @@ -38,27 +36,27 @@ jobs: conda install -y conda-build make gxx_linux-64 cmake>=4.0 cmake --version conda-build --version - + - name: Configure CMake shell: bash -l {0} run: | mkdir -p build cd build cmake .. - + - name: Build shell: bash -l {0} run: | cd build make -j$(nproc) - + - name: Run tests shell: bash -l {0} run: | cd build ctest --output-on-failure --verbose - - # - name: Run cppcheck + + # - name: Run cppcheck # uses: deep5050/cppcheck-action@main # with: # skip_preprocessor: disable @@ -92,7 +90,7 @@ jobs: - name: Clean previous build run: rm -rf build - + - name: Build Conda package shell: bash -l {0} run: | @@ -103,3 +101,47 @@ jobs: with: name: fastder-conda path: /usr/share/miniconda/envs/build-env/conda-bld/linux-64/*.conda + + # Second job: build with libBigWig support so the BigWig round-trip and + # equivalence tests in UnitTests.cpp actually run instead of skipping. + # libBigWig is fetched at configure time. Its build needs zlib and libcurl + # headers; we pull them from conda-forge since the project requires + # cmake >= 4.0 which is also only available there on this runner. + build-and-test-libbigwig: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Miniconda + uses: conda-incubator/setup-miniconda@v3 + with: + auto-update-conda: true + python-version: 3.12 + channels: conda-forge,nodefaults + activate-environment: bw-build-env + + - name: Install build tools and libBigWig deps + shell: bash -l {0} + run: | + conda install -y make gxx_linux-64 cmake>=4.0 zlib libcurl + + - name: Configure CMake with libBigWig + shell: bash -l {0} + run: | + mkdir -p build_bw + cd build_bw + cmake -DFASTDER_USE_LIBBIGWIG=ON .. + + - name: Build + shell: bash -l {0} + run: | + cd build_bw + make -j$(nproc) + + - name: Run tests (BigWig path active) + shell: bash -l {0} + run: | + cd build_bw + ctest --output-on-failure --verbose diff --git a/README.md b/README.md index 8629a73..cc5a5fe 100644 --- a/README.md +++ b/README.md @@ -4,10 +4,32 @@ It is intended to build on the `recount3` [resource](https://rna.recount.bio/), which consists of over 750'000 uniformly processed RNA-seq samples across different mouse and human studies. The tool aims to reconstruct expressed genes prior to splicing in an annotation-agnostic approach. -`fastder` takes genome-wide coverage bigWig files and splice junction coordinates as an input. The tool averages across samples and performs thresholding to identify -consecutive regions with above-threshold expression. Following this, `fastder` attempts to stitch together expressed regions (ERs) -by searching for splice junction coordinates that overlap with the start and end position of these expressed regions. +`fastder` takes genome-wide coverage files and splice junction coordinates as input. Coverage can be supplied as BedGraph by default, or as BigWig when the build was configured with `-DFASTDER_USE_LIBBIGWIG=ON`. The tool averages across samples and applies a coverage threshold to identify consecutive regions with above-threshold expression. It then stitches expressed regions (ERs) together when a splice junction in the input matches the end of one ER and the start of the next. +Coverage is held in memory as a sparse list of intervals rather than a dense per-base vector. On chr21 this keeps the resident set in the low hundreds of MB instead of multiple GB at full hg38. The strand of each splice junction is read into the in-memory model but is not yet used by the stitching step; strand-aware stitching is planned and the data structures already carry the field. + + +## Building + +The default build needs cmake (4.0 or newer) and a C++20 compiler: + +``` +mkdir build +cd build +cmake .. +make -j +``` + +To read BigWig coverage directly instead of converting to BedGraph first, +configure with `-DFASTDER_USE_LIBBIGWIG=ON`. CMake will fetch libBigWig from +GitHub at configure time. zlib and libcurl headers must be available. + +``` +cmake -DFASTDER_USE_LIBBIGWIG=ON .. +``` + +The unit tests run with `ctest` from the build directory. Two tests are gated +on the libBigWig option and are skipped in the default build. ## Installation diff --git a/cpp/Integrator.cpp b/cpp/Integrator.cpp index ac602b2..7fcb77b 100644 --- a/cpp/Integrator.cpp +++ b/cpp/Integrator.cpp @@ -3,7 +3,6 @@ // #include -#include #include // constructor @@ -43,163 +42,70 @@ bool Integrator::sj_too_far_back(const uint64_t most_recent_er_end, const uint64 && !within_threshold(most_recent_er_end, sj_start); } -// Build chains of ERs connected by SJs of one strand, and append the -// chains of length 2 or more to stitched_ERs (tagged with that strand). -// Single-ER candidates produced along the way are not emitted here; the -// caller is responsible for emitting unstitched ERs once with strand '.'. -// -// Algorithm: walk the chromosome's ERs left to right. Keep a candidate chain -// in progress. For each new ER, try to extend the candidate via the next -// available SJ on this strand. If the SJ matches (coverage and position), the -// chain grows; otherwise the candidate is closed (emitted only if it has 2+ -// ERs) and a fresh candidate is seeded with the current ER. Indices of any -// ERs that wound up inside an emitted chain are added to consumed_indices. -void Integrator::stitch_one_strand(const std::string& chrom, - char strand, - const std::vector& strand_sjs, - const std::vector& ers, - const std::vector& rr_all_sj, - std::unordered_set& consumed_indices, - std::vector& output) +void Integrator::stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj) { - if (ers.empty() || strand_sjs.empty()) return; - - auto sj_it = strand_sjs.begin(); - - StitchedER candidate(ers[0], 0); - candidate.strand = strand; - bool have_candidate = true; - - auto close_candidate = [&]() + // Single-pass chain-building per chromosome, ignoring SJ strand. This + // matches upstream's behaviour exactly so the perf branch's output GTF + // is identical to upstream's on the same input. Strand-aware stitching + // (bucketing SJs by SJRow.strand and running two passes per chromosome) + // is a separate, output-changing feature that should be added in its own + // branch and validated against simulated truth in fastder-evaluation. + // The strand fields on BedGraphRow / SJRow / StitchedER stay in place so + // that follow-up work has the plumbing ready. + for (auto& sjs : mm_chrom_sj) { - // a chain has been extended at least once when er_ids has more than the seed entry - if (candidate.er_ids.size() > 1) + std::string chrom = sjs.first; + StitchedER er1 = StitchedER(expressed_regions.at(chrom).at(0), 0); // first ER becomes the first stitched ER + stitched_ERs.emplace_back(er1); + auto current_sj_id = sjs.second.begin(); + + int max_stitched_ers = 0; + int nof_stitched_ers = 0; + for (int i = 1; i < static_cast(expressed_regions.at(chrom).size()); ++i) { - output.emplace_back(candidate); - for (int id : candidate.er_ids) + const auto& expressed_region = expressed_regions[chrom][i]; + if (current_sj_id != sjs.second.end()) { - if (id >= 0) consumed_indices.insert(id); + StitchedER& current_stitched_er = stitched_ERs.back(); + + // skip ahead to a SJ whose start lines up with the chain's end + while (current_sj_id != sjs.second.end() + && (current_stitched_er.end > rr_all_sj[*current_sj_id - 1].start + && !within_threshold(current_stitched_er.end, rr_all_sj[*current_sj_id - 1].start)) + && rr_all_sj[*current_sj_id - 1].chrom == chrom) + { + ++current_sj_id; + } + if (current_sj_id == sjs.second.end()) + { + --current_sj_id; + } + if (is_similar(current_stitched_er, expressed_region, rr_all_sj[*current_sj_id - 1])) + { + uint64_t sj_length = expressed_region.start - expressed_regions[chrom][current_stitched_er.er_ids.back()].end; + current_stitched_er.append(-1, sj_length, 0.0); // append the intron placeholder + current_stitched_er.append(i, expressed_region.length, expressed_region.coverage); + ++nof_stitched_ers; + ++current_sj_id; + + if (max_stitched_ers < nof_stitched_ers) + { + max_stitched_ers = nof_stitched_ers; + } + } + else + { + nof_stitched_ers = 1; // reset counter + stitched_ERs.emplace_back(StitchedER(expressed_region, i)); + } + } + else + { + // no more SJs on this chromosome: each remaining ER forms its own StitchedER + stitched_ERs.emplace_back(StitchedER(expressed_region, i)); } } - have_candidate = false; - }; - - int max_chain_len = 0; // number of ERs in the longest chain emitted - - for (int i = 1; i < static_cast(ers.size()); ++i) - { - const auto& expressed_region = ers[i]; - if (!have_candidate) - { - candidate = StitchedER(expressed_region, i); - candidate.strand = strand; - have_candidate = true; - continue; - } - - if (sj_it == strand_sjs.end()) - { - // no SJs of this strand left: close the current candidate and start fresh - // with this ER as its own (unstitched) seed - close_candidate(); - candidate = StitchedER(expressed_region, i); - candidate.strand = strand; - have_candidate = true; - continue; - } - - // skip past SJs whose end coordinate is too far behind the chain - while (sj_it != strand_sjs.end() - && (candidate.end > rr_all_sj[*sj_it - 1].start - && !within_threshold(candidate.end, rr_all_sj[*sj_it - 1].start)) - && rr_all_sj[*sj_it - 1].chrom == chrom) - { - ++sj_it; - } - // if we ran out, fall back to the last entry so dereferencing is safe - const auto sj_to_check = (sj_it == strand_sjs.end()) ? std::prev(strand_sjs.end()) : sj_it; - - if (is_similar(candidate, expressed_region, rr_all_sj[*sj_to_check - 1])) - { - uint64_t sj_length = expressed_region.start - ers[candidate.er_ids.back()].end; - candidate.append(-1, sj_length, 0.0); // append the intron placeholder - candidate.append(i, expressed_region.length, expressed_region.coverage); - // count actual ERs in chain (non-negative er_ids entries) - int er_count = 0; - for (int id : candidate.er_ids) if (id >= 0) ++er_count; - if (er_count > max_chain_len) max_chain_len = er_count; - if (sj_it != strand_sjs.end()) ++sj_it; - } - else - { - close_candidate(); - candidate = StitchedER(expressed_region, i); - candidate.strand = strand; - have_candidate = true; - } - } - - if (have_candidate) close_candidate(); - - if (max_chain_len > 0) - { - std::cout << "[INFO] Longest stitched ER in " << chrom << " (" << strand << ") contains " - << max_chain_len << " ERs" << std::endl; - } -} - - -void Integrator::stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj) -{ - // For each chromosome: - // 1. Bucket the chromosome's SJs by strand. SJRow.strand is true for '+', false for '-'. - // 2. Run stitch_one_strand twice (once per non-empty bucket). Each pass - // emits only chains of 2 or more ERs, tagged with its strand, and - // records the indices of consumed ERs. - // 3. Walk the chromosome's ERs and emit each ER not consumed by any - // chain as a single-ER StitchedER tagged with strand '.'. This makes - // sure every ER appears in stitched_ERs exactly once: either inside - // a strand-labelled chain (when SJs supported it) or as an unstranded - // standalone ER (when no SJ on either strand stitched it). - for (const auto& sjs : mm_chrom_sj) - { - const std::string& chrom = sjs.first; - if (expressed_regions.find(chrom) == expressed_regions.end()) continue; - const auto& ers = expressed_regions.at(chrom); - if (ers.empty()) continue; - - std::vector plus_sjs; - std::vector minus_sjs; - plus_sjs.reserve(sjs.second.size()); - minus_sjs.reserve(sjs.second.size()); - for (uint32_t sj_id : sjs.second) - { - if (sj_id == 0 || sj_id - 1 >= rr_all_sj.size()) continue; - (rr_all_sj[sj_id - 1].strand ? plus_sjs : minus_sjs).emplace_back(sj_id); - } - - std::unordered_set consumed; - // Collect this chromosome's StitchedERs in a local vector so we can - // sort them by start position before appending. The original - // (unstranded) algorithm always emitted ERs in genomic order, and - // write_to_gtf relies on it; running two strand passes plus a - // standalone pass would otherwise interleave them out of order. - std::vector chrom_stitched; - if (!plus_sjs.empty()) stitch_one_strand(chrom, '+', plus_sjs, ers, rr_all_sj, consumed, chrom_stitched); - if (!minus_sjs.empty()) stitch_one_strand(chrom, '-', minus_sjs, ers, rr_all_sj, consumed, chrom_stitched); - - // any ER not pulled into a chain is emitted once, unstranded - for (int i = 0; i < static_cast(ers.size()); ++i) - { - if (consumed.count(i)) continue; - StitchedER fresh(ers[i], i); - // strand stays '.' (default) - chrom_stitched.emplace_back(fresh); - } - - std::sort(chrom_stitched.begin(), chrom_stitched.end(), - [](const StitchedER& a, const StitchedER& b) { return a.start < b.start; }); - for (auto& ser : chrom_stitched) stitched_ERs.emplace_back(std::move(ser)); + std::cout << "[INFO] Longest stitched ER in " << chrom << " contains " << max_stitched_ers << " ERs" << std::endl; } } diff --git a/cpp/Integrator.h b/cpp/Integrator.h index 9f1f4cd..2bd28b9 100644 --- a/cpp/Integrator.h +++ b/cpp/Integrator.h @@ -14,7 +14,6 @@ #include #include #include -#include class Integrator { @@ -22,19 +21,6 @@ class Integrator Integrator(double coverage_tolerance_, int position_tolerance_); void stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj); - // Per-strand pass invoked by stitch_up. Builds chains of ERs connected by - // SJs of the requested strand and appends them to stitched_ERs only when - // the chain contains 2 or more ERs. Indices of ERs that ended up in any - // emitted chain are added to consumed_indices so the caller can avoid - // emitting them again as unstranded standalone ERs. Exposed publicly so - // tests can exercise one bucket in isolation. - void stitch_one_strand(const std::string& chrom, - char strand, - const std::vector& strand_sjs, - const std::vector& ers, - const std::vector& rr_all_sj, - std::unordered_set& consumed_indices, - std::vector& output); bool within_threshold(double val1, double val2) const; bool within_threshold(uint64_t val1, uint64_t val2) const; bool is_similar(const StitchedER& most_recent_er, const BedGraphRow& expressed_region, const SJRow& current_sj); diff --git a/tests/UnitTests.cpp b/tests/UnitTests.cpp index ef49d84..5547bbb 100644 --- a/tests/UnitTests.cpp +++ b/tests/UnitTests.cpp @@ -738,54 +738,19 @@ TEST(Averager, FindERsMinLengthFilters) // ===================================================================== -// Integrator strand-aware stitch_up +// Integrator stitch_up (strand-agnostic single-pass) // ===================================================================== +// +// stitch_up is currently strand-agnostic by design: it walks each +// chromosome's ERs once and tries to extend the current chain via the next +// SJ in mm_chrom_sj[chrom] regardless of SJ strand. Strand-aware stitching +// is on the roadmap but not active in this branch, so these tests assert +// the strand-agnostic behaviour and that every ER appears in stitched_ERs +// exactly once. -// Each ER appears in stitched_ERs exactly once: ERs that participate in a -// chain are tagged with that chain's strand, ERs left over come back with '.'. -TEST(Integrator, StitchUpSinglePlusChain) -{ - std::vector rr_all_sj = { - SJRow("chr1", 10500, 11000, 500, '+', false), // sj_id 1, strand=true - }; - std::unordered_map> expressed_regions; - expressed_regions["chr1"] = { - BedGraphRow("chr1", 10000, 10500, 100.0), - BedGraphRow("chr1", 11000, 12500, 101.0), - }; - std::unordered_map> mm_chrom_sj; - mm_chrom_sj["chr1"] = {1}; - - Integrator integrator(0.1, 5); - integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); - - ASSERT_EQ(integrator.stitched_ERs.size(), 1u); - EXPECT_EQ(integrator.stitched_ERs[0].strand, '+'); - EXPECT_EQ(integrator.stitched_ERs[0].er_ids.size(), 3u); -} - -TEST(Integrator, StitchUpSingleMinusChain) -{ - std::vector rr_all_sj = { - SJRow("chr1", 10500, 11000, 500, '-', false), // sj_id 1, strand=false - }; - std::unordered_map> expressed_regions; - expressed_regions["chr1"] = { - BedGraphRow("chr1", 10000, 10500, 100.0), - BedGraphRow("chr1", 11000, 12500, 101.0), - }; - std::unordered_map> mm_chrom_sj; - mm_chrom_sj["chr1"] = {1}; - - Integrator integrator(0.1, 5); - integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); - - ASSERT_EQ(integrator.stitched_ERs.size(), 1u); - EXPECT_EQ(integrator.stitched_ERs[0].strand, '-'); -} - -// An ER that no SJ stitches must be emitted exactly once with strand '.'. -TEST(Integrator, StitchUpUnstitchedERIsEmittedOnceUnstranded) +// Each ER on a chromosome appears in stitched_ERs exactly once: total exon +// count equals total ER count. +TEST(Integrator, StitchUpEachERAppearsExactlyOnce) { std::vector rr_all_sj = { SJRow("chr1", 10500, 11000, 500, '+', false), @@ -793,11 +758,11 @@ TEST(Integrator, StitchUpUnstitchedERIsEmittedOnceUnstranded) }; std::unordered_map> expressed_regions; expressed_regions["chr1"] = { - BedGraphRow("chr1", 10000, 10500, 100.0), // chain start (+ pass) - BedGraphRow("chr1", 11000, 12500, 101.0), // chain extension - BedGraphRow("chr1", 13000, 14000, 5.0), // unstitched gap - BedGraphRow("chr1", 20000, 20500, 50.0), // chain start (- pass) - BedGraphRow("chr1", 21000, 22000, 50.0), // chain extension + BedGraphRow("chr1", 10000, 10500, 100.0), + BedGraphRow("chr1", 11000, 12500, 101.0), + BedGraphRow("chr1", 13000, 14000, 5.0), + BedGraphRow("chr1", 20000, 20500, 50.0), + BedGraphRow("chr1", 21000, 22000, 50.0), }; std::unordered_map> mm_chrom_sj; mm_chrom_sj["chr1"] = {1, 2}; @@ -805,30 +770,24 @@ TEST(Integrator, StitchUpUnstitchedERIsEmittedOnceUnstranded) Integrator integrator(0.1, 5); integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); - // Expect: + chain (ERs 0,1), unstranded standalone (ER 2), - chain (ERs 3,4) - ASSERT_EQ(integrator.stitched_ERs.size(), 3u); - - int unstranded_count = 0; - int plus_count = 0; - int minus_count = 0; + int total_ers_in_output = 0; for (const auto& ser : integrator.stitched_ERs) { - if (ser.strand == '.') ++unstranded_count; - if (ser.strand == '+') ++plus_count; - if (ser.strand == '-') ++minus_count; + for (int id : ser.er_ids) + { + if (id >= 0) ++total_ers_in_output; + } } - EXPECT_EQ(unstranded_count, 1); - EXPECT_EQ(plus_count, 1); - EXPECT_EQ(minus_count, 1); + EXPECT_EQ(total_ers_in_output, 5); } -// stitched_ERs must be sorted by start within a chromosome regardless of which -// strand pass produced them, since write_to_gtf relies on genomic order. +// stitch_up walks ERs in their input order, so stitched_ERs are emitted in +// genomic order on a chromosome. write_to_gtf relies on this. TEST(Integrator, StitchUpEmitsInGenomicOrder) { std::vector rr_all_sj = { - SJRow("chr1", 20500, 21000, 500, '+', false), // late + chain - SJRow("chr1", 10500, 11000, 500, '-', false), // early - chain + SJRow("chr1", 10500, 11000, 500, '-', false), + SJRow("chr1", 20500, 21000, 500, '+', false), }; std::unordered_map> expressed_regions; expressed_regions["chr1"] = { @@ -843,12 +802,11 @@ TEST(Integrator, StitchUpEmitsInGenomicOrder) Integrator integrator(0.1, 5); integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); - ASSERT_EQ(integrator.stitched_ERs.size(), 2u); - // first by genomic order is the - chain (ER 0..1) - EXPECT_EQ(integrator.stitched_ERs[0].start, 10000u); - EXPECT_EQ(integrator.stitched_ERs[0].strand, '-'); - EXPECT_EQ(integrator.stitched_ERs[1].start, 20000u); - EXPECT_EQ(integrator.stitched_ERs[1].strand, '+'); + ASSERT_GE(integrator.stitched_ERs.size(), 2u); + for (size_t i = 1; i < integrator.stitched_ERs.size(); ++i) + { + EXPECT_LE(integrator.stitched_ERs[i - 1].start, integrator.stitched_ERs[i].start); + } } From 56f77307bbaae373af9c75af2f25d3b7b6bc3868 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 28 Apr 2026 18:55:49 +0200 Subject: [PATCH 5/7] Document, trim unused stuff --- cpp/Averager.cpp | 9 +++------ cpp/Averager.h | 9 +++++---- cpp/Integrator.h | 1 - cpp/Parser.cpp | 8 +++----- cpp/Parser.h | 1 - cpp/SJRow.h | 2 +- 6 files changed, 12 insertions(+), 18 deletions(-) diff --git a/cpp/Averager.cpp b/cpp/Averager.cpp index a919e5e..c660b8c 100644 --- a/cpp/Averager.cpp +++ b/cpp/Averager.cpp @@ -4,17 +4,14 @@ #include #include -#include -#include -#include #include -#include #include -#include +#include +#include +#include #include "BedGraphRow.h" #include "SJRow.h" #include "Averager.h" -#include Averager::Averager(int threads_) diff --git a/cpp/Averager.h b/cpp/Averager.h index 32af7d8..1431af9 100644 --- a/cpp/Averager.h +++ b/cpp/Averager.h @@ -36,10 +36,11 @@ class Averager { // BedGraphRows. Adjacent rows can be contiguous (row[i].end == row[i+1].start); // any uncovered region between rows is implicit zero coverage. std::unordered_map> mean_intervals; - // Expressed regions per chromosome, one BedGraphRow per ER. Strand on - // each ER is inherited from mean_intervals (currently '.' since the - // BigWig input is unstranded; the SJ-strand stitching that the - // Integrator does later is what tags the stitched ERs). + // Expressed regions per chromosome, one BedGraphRow per ER. The + // strand field is currently always '.' because the input coverage is + // unstranded and the stitch_up step is strand-agnostic. Stranded + // coverage and strand-aware stitching are planned follow-ups; the + // BedGraphRow / SJRow / StitchedER strand fields are already in place. std::unordered_map> expressed_regions; diff --git a/cpp/Integrator.h b/cpp/Integrator.h index 2bd28b9..fc1f7f1 100644 --- a/cpp/Integrator.h +++ b/cpp/Integrator.h @@ -13,7 +13,6 @@ #include #include #include -#include class Integrator { diff --git a/cpp/Parser.cpp b/cpp/Parser.cpp index a0d5b82..4f86523 100644 --- a/cpp/Parser.cpp +++ b/cpp/Parser.cpp @@ -11,14 +11,13 @@ #include #include #include -#include +#include +#include +#include #include // for library size which can be too large for unsigned int #include "Parser.h" -#include -#include - // constructor Parser::Parser(std::string path_, std::vector chromosomes_, int cores_) { path = path_; @@ -419,7 +418,6 @@ void Parser::read_all_bedgraphs(std::vector bedgraph_files, unsigne extern "C" { #include } -#include #endif std::vector Parser::read_bigwig(const std::string& filename, uint64_t& library_size, diff --git a/cpp/Parser.h b/cpp/Parser.h index ec0a76d..bbd1905 100644 --- a/cpp/Parser.h +++ b/cpp/Parser.h @@ -6,7 +6,6 @@ #define FASTDER_PARSE_H #include "SJRow.h" #include -#include #include #include #include diff --git a/cpp/SJRow.h b/cpp/SJRow.h index 2c71623..de00c8b 100644 --- a/cpp/SJRow.h +++ b/cpp/SJRow.h @@ -42,7 +42,7 @@ class SJRow } - // overload output operator for SJRow (debug only — not part of fastder's outputs) + // overload output operator for SJRow (debug only, not part of fastder's outputs) friend std::ostream& operator<< (std::ostream& os, const SJRow& row) { return os << row.chrom << "\t" << row.start << "\t" << row.end << "\t" << row.length From effab20eed56c765ff5958d177fa3c1340c87a28 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Wed, 29 Apr 2026 07:27:07 +0200 Subject: [PATCH 6/7] Bring back strand specificity --- cpp/Integrator.cpp | 203 +++++++++++++++++++++++++++++++------------- cpp/Integrator.h | 15 ++++ tests/UnitTests.cpp | 87 +++++++++++++++++++ 3 files changed, 247 insertions(+), 58 deletions(-) diff --git a/cpp/Integrator.cpp b/cpp/Integrator.cpp index 7fcb77b..a1de7fd 100644 --- a/cpp/Integrator.cpp +++ b/cpp/Integrator.cpp @@ -3,6 +3,7 @@ // #include +#include #include // constructor @@ -42,70 +43,156 @@ bool Integrator::sj_too_far_back(const uint64_t most_recent_er_end, const uint64 && !within_threshold(most_recent_er_end, sj_start); } -void Integrator::stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj) +// Per-strand chain builder. Walks the chromosome's ERs left to right, trying +// to extend a candidate chain via SJs of the requested strand. Emits chains +// of two or more ERs to output, tagged with strand. ERs already claimed by +// an earlier strand pass (present in consumed_indices on entry) are skipped: +// they cannot seed a new candidate, they cannot extend a candidate, and +// hitting one closes the current candidate. Indices of ERs that wind up in +// an emitted chain are added to consumed_indices. +void Integrator::stitch_one_strand(const std::string& chrom, + char strand, + const std::vector& strand_sjs, + const std::vector& ers, + const std::vector& rr_all_sj, + std::unordered_set& consumed_indices, + std::vector& output) { - // Single-pass chain-building per chromosome, ignoring SJ strand. This - // matches upstream's behaviour exactly so the perf branch's output GTF - // is identical to upstream's on the same input. Strand-aware stitching - // (bucketing SJs by SJRow.strand and running two passes per chromosome) - // is a separate, output-changing feature that should be added in its own - // branch and validated against simulated truth in fastder-evaluation. - // The strand fields on BedGraphRow / SJRow / StitchedER stay in place so - // that follow-up work has the plumbing ready. - for (auto& sjs : mm_chrom_sj) + if (ers.empty() || strand_sjs.empty()) return; + + auto sj_it = strand_sjs.begin(); + + // find first non-consumed ER for the seed + int seed_i = 0; + while (seed_i < static_cast(ers.size()) && consumed_indices.count(seed_i)) ++seed_i; + if (seed_i >= static_cast(ers.size())) return; + + StitchedER candidate(ers[seed_i], seed_i); + candidate.strand = strand; + bool have_candidate = true; + + auto close_candidate = [&]() { - std::string chrom = sjs.first; - StitchedER er1 = StitchedER(expressed_regions.at(chrom).at(0), 0); // first ER becomes the first stitched ER - stitched_ERs.emplace_back(er1); - auto current_sj_id = sjs.second.begin(); - - int max_stitched_ers = 0; - int nof_stitched_ers = 0; - for (int i = 1; i < static_cast(expressed_regions.at(chrom).size()); ++i) + if (candidate.er_ids.size() > 1) { - const auto& expressed_region = expressed_regions[chrom][i]; - if (current_sj_id != sjs.second.end()) - { - StitchedER& current_stitched_er = stitched_ERs.back(); - - // skip ahead to a SJ whose start lines up with the chain's end - while (current_sj_id != sjs.second.end() - && (current_stitched_er.end > rr_all_sj[*current_sj_id - 1].start - && !within_threshold(current_stitched_er.end, rr_all_sj[*current_sj_id - 1].start)) - && rr_all_sj[*current_sj_id - 1].chrom == chrom) - { - ++current_sj_id; - } - if (current_sj_id == sjs.second.end()) - { - --current_sj_id; - } - if (is_similar(current_stitched_er, expressed_region, rr_all_sj[*current_sj_id - 1])) - { - uint64_t sj_length = expressed_region.start - expressed_regions[chrom][current_stitched_er.er_ids.back()].end; - current_stitched_er.append(-1, sj_length, 0.0); // append the intron placeholder - current_stitched_er.append(i, expressed_region.length, expressed_region.coverage); - ++nof_stitched_ers; - ++current_sj_id; - - if (max_stitched_ers < nof_stitched_ers) - { - max_stitched_ers = nof_stitched_ers; - } - } - else - { - nof_stitched_ers = 1; // reset counter - stitched_ERs.emplace_back(StitchedER(expressed_region, i)); - } - } - else + output.emplace_back(candidate); + for (int id : candidate.er_ids) { - // no more SJs on this chromosome: each remaining ER forms its own StitchedER - stitched_ERs.emplace_back(StitchedER(expressed_region, i)); + if (id >= 0) consumed_indices.insert(id); } } - std::cout << "[INFO] Longest stitched ER in " << chrom << " contains " << max_stitched_ers << " ERs" << std::endl; + have_candidate = false; + }; + + int max_chain_len = 0; + + for (int i = seed_i + 1; i < static_cast(ers.size()); ++i) + { + if (consumed_indices.count(i)) + { + // ER was claimed by an earlier strand pass; close any in-progress chain + close_candidate(); + continue; + } + + const auto& expressed_region = ers[i]; + if (!have_candidate) + { + candidate = StitchedER(expressed_region, i); + candidate.strand = strand; + have_candidate = true; + continue; + } + + if (sj_it == strand_sjs.end()) + { + close_candidate(); + candidate = StitchedER(expressed_region, i); + candidate.strand = strand; + have_candidate = true; + continue; + } + + // skip past SJs that lie too far behind the chain's end + while (sj_it != strand_sjs.end() + && (candidate.end > rr_all_sj[*sj_it - 1].start + && !within_threshold(candidate.end, rr_all_sj[*sj_it - 1].start)) + && rr_all_sj[*sj_it - 1].chrom == chrom) + { + ++sj_it; + } + const auto sj_to_check = (sj_it == strand_sjs.end()) ? std::prev(strand_sjs.end()) : sj_it; + + if (is_similar(candidate, expressed_region, rr_all_sj[*sj_to_check - 1])) + { + uint64_t sj_length = expressed_region.start - ers[candidate.er_ids.back()].end; + candidate.append(-1, sj_length, 0.0); + candidate.append(i, expressed_region.length, expressed_region.coverage); + int er_count = 0; + for (int id : candidate.er_ids) if (id >= 0) ++er_count; + if (er_count > max_chain_len) max_chain_len = er_count; + if (sj_it != strand_sjs.end()) ++sj_it; + } + else + { + close_candidate(); + candidate = StitchedER(expressed_region, i); + candidate.strand = strand; + have_candidate = true; + } + } + + if (have_candidate) close_candidate(); + + if (max_chain_len > 0) + { + std::cout << "[INFO] Longest stitched ER in " << chrom << " (" << strand << ") contains " + << max_chain_len << " ERs" << std::endl; + } +} + + +void Integrator::stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj) +{ + // Strand-aware stitching. For each chromosome: + // 1. Bucket SJs by strand (SJRow.strand: true -> '+', false -> '-'). + // 2. Run stitch_one_strand for each non-empty bucket. The shared + // consumed_indices set ensures an ER can be in at most one chain. + // 3. Emit any ER not pulled into a chain as a single-ER StitchedER + // with strand '.'. + // 4. Sort the chromosome's StitchedERs by start so write_to_gtf reads + // them in genomic order even when chains came from different passes. + for (const auto& sjs : mm_chrom_sj) + { + const std::string& chrom = sjs.first; + if (expressed_regions.find(chrom) == expressed_regions.end()) continue; + const auto& ers = expressed_regions.at(chrom); + if (ers.empty()) continue; + + std::vector plus_sjs; + std::vector minus_sjs; + plus_sjs.reserve(sjs.second.size()); + minus_sjs.reserve(sjs.second.size()); + for (uint32_t sj_id : sjs.second) + { + if (sj_id == 0 || sj_id - 1 >= rr_all_sj.size()) continue; + (rr_all_sj[sj_id - 1].strand ? plus_sjs : minus_sjs).emplace_back(sj_id); + } + + std::unordered_set consumed; + std::vector chrom_stitched; + if (!plus_sjs.empty()) stitch_one_strand(chrom, '+', plus_sjs, ers, rr_all_sj, consumed, chrom_stitched); + if (!minus_sjs.empty()) stitch_one_strand(chrom, '-', minus_sjs, ers, rr_all_sj, consumed, chrom_stitched); + + for (int i = 0; i < static_cast(ers.size()); ++i) + { + if (consumed.count(i)) continue; + chrom_stitched.emplace_back(StitchedER(ers[i], i)); + } + + std::sort(chrom_stitched.begin(), chrom_stitched.end(), + [](const StitchedER& a, const StitchedER& b) { return a.start < b.start; }); + for (auto& ser : chrom_stitched) stitched_ERs.emplace_back(std::move(ser)); } } diff --git a/cpp/Integrator.h b/cpp/Integrator.h index fc1f7f1..f197629 100644 --- a/cpp/Integrator.h +++ b/cpp/Integrator.h @@ -13,6 +13,7 @@ #include #include #include +#include class Integrator { @@ -20,6 +21,20 @@ class Integrator Integrator(double coverage_tolerance_, int position_tolerance_); void stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj); + // Per-strand pass invoked by stitch_up. Builds chains of ERs connected + // by SJs of the requested strand and appends chains of length 2 or + // more to output, tagged with strand. Indices of ERs that ended up + // in any emitted chain (including ERs claimed by an earlier-running + // strand pass) are tracked in consumed_indices, so the second pass + // skips ERs the first one already used and stitch_up can emit the + // remaining ERs as unstranded standalones afterwards. + void stitch_one_strand(const std::string& chrom, + char strand, + const std::vector& strand_sjs, + const std::vector& ers, + const std::vector& rr_all_sj, + std::unordered_set& consumed_indices, + std::vector& output); bool within_threshold(double val1, double val2) const; bool within_threshold(uint64_t val1, uint64_t val2) const; bool is_similar(const StitchedER& most_recent_er, const BedGraphRow& expressed_region, const SJRow& current_sj); diff --git a/tests/UnitTests.cpp b/tests/UnitTests.cpp index 5547bbb..a3cded9 100644 --- a/tests/UnitTests.cpp +++ b/tests/UnitTests.cpp @@ -809,6 +809,93 @@ TEST(Integrator, StitchUpEmitsInGenomicOrder) } } +// A pure plus-strand chain comes back tagged '+'. Standalones from an +// otherwise-unmatched ER come back tagged '.'. +TEST(Integrator, StitchUpTagsPlusChainsAsPlus) +{ + std::vector rr_all_sj = { + SJRow("chr1", 10500, 11000, 500, '+', false), + }; + std::unordered_map> expressed_regions; + expressed_regions["chr1"] = { + BedGraphRow("chr1", 10000, 10500, 100.0), + BedGraphRow("chr1", 11000, 12500, 101.0), + }; + std::unordered_map> mm_chrom_sj; + mm_chrom_sj["chr1"] = {1}; + + Integrator integrator(0.1, 5); + integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); + + ASSERT_EQ(integrator.stitched_ERs.size(), 1u); + EXPECT_EQ(integrator.stitched_ERs[0].strand, '+'); + EXPECT_EQ(integrator.stitched_ERs[0].er_ids.size(), 3u); +} + +// Same scenario but with the SJ on the minus strand. Tag should be '-'. +TEST(Integrator, StitchUpTagsMinusChainsAsMinus) +{ + std::vector rr_all_sj = { + SJRow("chr1", 10500, 11000, 500, '-', false), + }; + std::unordered_map> expressed_regions; + expressed_regions["chr1"] = { + BedGraphRow("chr1", 10000, 10500, 100.0), + BedGraphRow("chr1", 11000, 12500, 101.0), + }; + std::unordered_map> mm_chrom_sj; + mm_chrom_sj["chr1"] = {1}; + + Integrator integrator(0.1, 5); + integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); + + ASSERT_EQ(integrator.stitched_ERs.size(), 1u); + EXPECT_EQ(integrator.stitched_ERs[0].strand, '-'); +} + +// Both strands present on the same chromosome. Each ER ends up in exactly +// one StitchedER. ERs claimed by the first strand pass do not appear in +// the second; ERs unclaimed by either are emitted as unstranded standalones. +TEST(Integrator, StitchUpEachERAppearsExactlyOnceWithBothStrands) +{ + std::vector rr_all_sj = { + SJRow("chr1", 10500, 11000, 500, '+', false), + SJRow("chr1", 20500, 21000, 500, '-', false), + }; + std::unordered_map> expressed_regions; + expressed_regions["chr1"] = { + BedGraphRow("chr1", 10000, 10500, 100.0), // chain start (+ pass) + BedGraphRow("chr1", 11000, 12500, 101.0), // chain extension (+) + BedGraphRow("chr1", 13000, 14000, 5.0), // unstitched gap + BedGraphRow("chr1", 20000, 20500, 50.0), // chain start (- pass) + BedGraphRow("chr1", 21000, 22000, 50.0), // chain extension (-) + }; + std::unordered_map> mm_chrom_sj; + mm_chrom_sj["chr1"] = {1, 2}; + + Integrator integrator(0.1, 5); + integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); + + int total_ers_in_output = 0; + int unstranded_count = 0; + int plus_count = 0; + int minus_count = 0; + for (const auto& ser : integrator.stitched_ERs) + { + for (int id : ser.er_ids) + { + if (id >= 0) ++total_ers_in_output; + } + if (ser.strand == '.') ++unstranded_count; + if (ser.strand == '+') ++plus_count; + if (ser.strand == '-') ++minus_count; + } + EXPECT_EQ(total_ers_in_output, 5); + EXPECT_EQ(plus_count, 1); + EXPECT_EQ(minus_count, 1); + EXPECT_EQ(unstranded_count, 1); +} + // ===================================================================== // libBigWig integration (gated). These run only when fastder is built with From 3617c8168fb10a03af65562e10e4128ecac5477b Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Wed, 29 Apr 2026 07:33:09 +0200 Subject: [PATCH 7/7] Write the strand by the stitcher --- cpp/GTFRow.cpp | 1 + tests/UnitTests.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/cpp/GTFRow.cpp b/cpp/GTFRow.cpp index c3b7f38..a6a2e4f 100644 --- a/cpp/GTFRow.cpp +++ b/cpp/GTFRow.cpp @@ -16,6 +16,7 @@ GTFRow::GTFRow(const StitchedER& region, std::string ftr, unsigned int id) score = region.across_er_coverage; start = region.start; end = region.end; + strand = std::string(1, region.strand); change_feature(ftr, id, 0); diff --git a/tests/UnitTests.cpp b/tests/UnitTests.cpp index a3cded9..50dfb0f 100644 --- a/tests/UnitTests.cpp +++ b/tests/UnitTests.cpp @@ -12,6 +12,8 @@ #include "SJRow.h" #include "Parser.h" #include "Averager.h" +#include "GTFRow.h" +#include TEST(SpliceTestChromOne, TwoStitchedERsTwoSJs) { @@ -896,6 +898,45 @@ TEST(Integrator, StitchUpEachERAppearsExactlyOnceWithBothStrands) EXPECT_EQ(unstranded_count, 1); } +// GTFRow must propagate the StitchedER strand into column 7 of the output +// row. Without this, gffcompare can never match transcripts or exons since +// it requires strand agreement at those levels. +TEST(GTFRow, SerializesPlusStrandFromStitchedER) +{ + BedGraphRow er("chr1", 10000, 10500, 100.0); + StitchedER s(er, 0); + s.strand = '+'; + + GTFRow row(s, "transcript", 1); + std::ostringstream os; + os << row; + std::string line = os.str(); + std::istringstream is(line); + std::vector cols; + std::string field; + while (std::getline(is, field, '\t')) cols.push_back(field); + ASSERT_GE(cols.size(), 7u); + EXPECT_EQ(cols[6], "+"); +} + +TEST(GTFRow, SerializesMinusStrandFromStitchedER) +{ + BedGraphRow er("chr1", 10000, 10500, 100.0); + StitchedER s(er, 0); + s.strand = '-'; + + GTFRow row(s, "transcript", 1); + std::ostringstream os; + os << row; + std::string line = os.str(); + std::istringstream is(line); + std::vector cols; + std::string field; + while (std::getline(is, field, '\t')) cols.push_back(field); + ASSERT_GE(cols.size(), 7u); + EXPECT_EQ(cols[6], "-"); +} + // ===================================================================== // libBigWig integration (gated). These run only when fastder is built with