diff --git a/README.md b/README.md index cc5a5fe..854d6e8 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The tool aims to reconstruct expressed genes prior to splicing in an annotation- `fastder` takes genome-wide coverage files and splice junction coordinates as input. Coverage can be supplied as BedGraph by default, or as BigWig when the build was configured with `-DFASTDER_USE_LIBBIGWIG=ON`. The tool averages across samples and applies a coverage threshold to identify consecutive regions with above-threshold expression. It then stitches expressed regions (ERs) together when a splice junction in the input matches the end of one ER and the start of the next. -Coverage is held in memory as a sparse list of intervals rather than a dense per-base vector. On chr21 this keeps the resident set in the low hundreds of MB instead of multiple GB at full hg38. The strand of each splice junction is read into the in-memory model but is not yet used by the stitching step; strand-aware stitching is planned and the data structures already carry the field. +Coverage is held in memory as a sparse list of intervals rather than a dense per-base vector. On chr21 this keeps the resident set in the low hundreds of MB instead of multiple GB at full hg38. Splice junctions are partitioned by strand at the Integrator: each chromosome's expressed regions are walked once per strand, and chains built from junction-linked ERs are tagged `+` or `-`. Standalone ERs that no junction connected to a neighbour stay unstranded (`.`). ## Building diff --git a/cpp/Averager.cpp b/cpp/Averager.cpp index 8ae87f4..64b4482 100644 --- a/cpp/Averager.cpp +++ b/cpp/Averager.cpp @@ -146,6 +146,11 @@ void Averager::compute_mean_coverage(std::vector>& all_ return; } + // Reset state in case the same Averager instance is being reused. Stale + // chromosomes from a previous run would otherwise leak into find_ERs. + mean_intervals.clear(); + expressed_regions.clear(); + auto chrom_samples = bucket_by_chrom(all_bedgraphs); const size_t total_samples = all_bedgraphs.size(); diff --git a/cpp/Integrator.cpp b/cpp/Integrator.cpp index 2d9921b..f6a1853 100644 --- a/cpp/Integrator.cpp +++ b/cpp/Integrator.cpp @@ -154,6 +154,11 @@ void Integrator::stitch_one_strand(const std::string& chrom, void Integrator::stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj) { + // Reset stitched_ERs in case the same Integrator instance is being + // reused; otherwise results from a previous run would accumulate and + // write_to_gtf would emit duplicates. + stitched_ERs.clear(); + // Strand-aware stitching. For each chromosome with at least one ER: // 1. Bucket SJs by strand (SJRow.strand: true -> '+', false -> '-'). // A chromosome with no SJs at all skips both strand passes and diff --git a/cpp/Parser.cpp b/cpp/Parser.cpp index e6866df..8b9f7ef 100644 --- a/cpp/Parser.cpp +++ b/cpp/Parser.cpp @@ -196,6 +196,11 @@ void Parser::read_rr(std::string filename) // read MM file // IMPORTANT: the RR file is not sorted by chromosomes! void Parser::read_mm(std::string filename) { + // Reset MM-derived state in case the same Parser instance reads more + // than one MM file. Junction ids would otherwise accumulate across + // calls and downstream stitching would see duplicates. + mm_chrom_sj.clear(); + //read in file from path std::ifstream file(filename); // diff --git a/cpp/SJRow.h b/cpp/SJRow.h index de00c8b..c8d17ea 100644 --- a/cpp/SJRow.h +++ b/cpp/SJRow.h @@ -35,8 +35,10 @@ class SJRow friend std::istream& operator>>(std::istream &is, SJRow &row) { char strand_; std::string discard; - is >> row.chrom >> row.start >> row.end >> row.length >> strand_ >> row.annotated - >> discard >> discard >> discard >> discard; + if (!(is >> row.chrom >> row.start >> row.end >> row.length >> strand_ >> row.annotated + >> discard >> discard >> discard >> discard)) { + return is; + } row.strand = (strand_ == '+'); // 1 if + return is; } @@ -46,7 +48,7 @@ class SJRow friend std::ostream& operator<< (std::ostream& os, const SJRow& row) { return os << row.chrom << "\t" << row.start << "\t" << row.end << "\t" << row.length - << "\t" << row.strand << "\t" << row.annotated; + << "\t" << (row.strand ? '+' : '-') << "\t" << row.annotated; } }; diff --git a/tests/UnitTests.cpp b/tests/UnitTests.cpp index 8dad82c..68d971b 100644 --- a/tests/UnitTests.cpp +++ b/tests/UnitTests.cpp @@ -740,15 +740,15 @@ TEST(Averager, FindERsMinLengthFilters) // ===================================================================== -// Integrator stitch_up (strand-agnostic single-pass) +// Integrator stitch_up (strand-aware) // ===================================================================== // -// stitch_up is currently strand-agnostic by design: it walks each -// chromosome's ERs once and tries to extend the current chain via the next -// SJ in mm_chrom_sj[chrom] regardless of SJ strand. Strand-aware stitching -// is on the roadmap but not active in this branch, so these tests assert -// the strand-agnostic behaviour and that every ER appears in stitched_ERs -// exactly once. +// stitch_up partitions splice junctions by strand. Each chromosome's ERs +// are walked once per strand, and chains built from junction-linked ERs +// are tagged '+' or '-'. ERs that no junction stitched to a neighbour +// stay unstranded ('.'). The tests below verify the bookkeeping +// guarantees, in particular that every ER appears in stitched_ERs +// exactly once across the strand passes. // Each ER on a chromosome appears in stitched_ERs exactly once: total exon // count equals total ER count. @@ -1070,7 +1070,7 @@ TEST(BigWig, BedGraphAndBigWigEquivalentExpressedRegions) const std::vector> intervals = { {100, 300, 10.0f}, - {300, 400, 0.5f}, // below the 1.0 threshold used in find_ERs + {300, 400, 0.0f}, // below the 1.0 threshold used in find_ERs (integer-parseable for the BedGraph reader) {400, 700, 10.0f}, }; write_test_bigwig(bw_path, "chr1", 1000, intervals);