diff --git a/cpp/Averager.cpp b/cpp/Averager.cpp index c660b8c..c1ba4f0 100644 --- a/cpp/Averager.cpp +++ b/cpp/Averager.cpp @@ -202,7 +202,17 @@ void Averager::find_ERs(double threshold, int min_length) unsigned int i = next_index++; if (i >= chroms.size()) break; const std::string& chrom = chroms[i]; - const std::vector& intervals = mean_intervals[chrom]; + // Read-only lookup: operator[] would silently insert a default + // entry under concurrent access from worker threads, which is + // undefined behavior on a shared map. + const auto intervals_it = mean_intervals.find(chrom); + if (intervals_it == mean_intervals.end()) + { + std::cerr << "[ERROR] Missing mean_intervals entry for chromosome: " + << chrom << std::endl; + continue; + } + const std::vector& intervals = intervals_it->second; std::vector chrom_ers; diff --git a/cpp/Integrator.cpp b/cpp/Integrator.cpp index a1de7fd..2626390 100644 --- a/cpp/Integrator.cpp +++ b/cpp/Integrator.cpp @@ -154,29 +154,34 @@ void Integrator::stitch_one_strand(const std::string& chrom, void Integrator::stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj) { - // Strand-aware stitching. For each chromosome: + // Strand-aware stitching. For each chromosome with at least one ER: // 1. Bucket SJs by strand (SJRow.strand: true -> '+', false -> '-'). + // A chromosome with no SJs at all skips both strand passes and + // every ER is emitted as a single-ER StitchedER with strand '.'. // 2. Run stitch_one_strand for each non-empty bucket. The shared // consumed_indices set ensures an ER can be in at most one chain. // 3. Emit any ER not pulled into a chain as a single-ER StitchedER // with strand '.'. // 4. Sort the chromosome's StitchedERs by start so write_to_gtf reads // them in genomic order even when chains came from different passes. - for (const auto& sjs : mm_chrom_sj) + for (const auto& chrom_ers : expressed_regions) { - const std::string& chrom = sjs.first; - if (expressed_regions.find(chrom) == expressed_regions.end()) continue; - const auto& ers = expressed_regions.at(chrom); + const std::string& chrom = chrom_ers.first; + const auto& ers = chrom_ers.second; if (ers.empty()) continue; std::vector plus_sjs; std::vector minus_sjs; - plus_sjs.reserve(sjs.second.size()); - minus_sjs.reserve(sjs.second.size()); - for (uint32_t sj_id : sjs.second) + const auto sjs_it = mm_chrom_sj.find(chrom); + if (sjs_it != mm_chrom_sj.end()) { - if (sj_id == 0 || sj_id - 1 >= rr_all_sj.size()) continue; - (rr_all_sj[sj_id - 1].strand ? plus_sjs : minus_sjs).emplace_back(sj_id); + plus_sjs.reserve(sjs_it->second.size()); + minus_sjs.reserve(sjs_it->second.size()); + for (uint32_t sj_id : sjs_it->second) + { + if (sj_id == 0 || sj_id - 1 >= rr_all_sj.size()) continue; + (rr_all_sj[sj_id - 1].strand ? plus_sjs : minus_sjs).emplace_back(sj_id); + } } std::unordered_set consumed; diff --git a/tests/UnitTests.cpp b/tests/UnitTests.cpp index 50dfb0f..8dad82c 100644 --- a/tests/UnitTests.cpp +++ b/tests/UnitTests.cpp @@ -898,6 +898,30 @@ TEST(Integrator, StitchUpEachERAppearsExactlyOnceWithBothStrands) EXPECT_EQ(unstranded_count, 1); } +// A chromosome with expressed regions but no entry in mm_chrom_sj must still +// have its ERs emitted as single-ER unstranded StitchedERs. The previous +// implementation iterated only over mm_chrom_sj keys and silently dropped +// ERs from chromosomes with no junctions. +TEST(Integrator, StitchUpEmitsERsOnChromosomesWithNoSJs) +{ + std::vector rr_all_sj = {}; + std::unordered_map> expressed_regions; + expressed_regions["chr1"] = { + BedGraphRow("chr1", 10000, 10500, 100.0), + BedGraphRow("chr1", 11000, 12500, 101.0), + }; + std::unordered_map> mm_chrom_sj; + + Integrator integrator(0.1, 5); + integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); + + ASSERT_EQ(integrator.stitched_ERs.size(), 2u); + for (const auto& ser : integrator.stitched_ERs) { + EXPECT_EQ(ser.strand, '.'); + EXPECT_EQ(ser.er_ids.size(), 1u); + } +} + // GTFRow must propagate the StitchedER strand into column 7 of the output // row. Without this, gffcompare can never match transcripts or exons since // it requires strand agreement at those levels.