From e3a0da1ece1afb0cd2ca7f53d52299e6eada7838 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 18 May 2026 18:59:34 +0200 Subject: [PATCH 1/7] Fix off-by-one error --- CHANGELOG.md | 14 ++++++++ cpp/GTFRow.h | 6 +++- tests/gtfs/parser_test3.gtf | 30 ++++++++--------- tests/gtfs/splicing_scenarios_test1.gtf | 18 +++++----- tests/gtfs/splicing_scenarios_test2.gtf | 20 +++++------ tests/gtfs/splicing_scenarios_test3.gtf | 26 +++++++-------- tests/gtfs/splicing_scenarios_test4.gtf | 32 +++++++++--------- tests/gtfs/splicing_scenarios_test5.gtf | 44 ++++++++++++------------- tests/gtfs/splicing_scenarios_test6.gtf | 36 ++++++++++---------- tests/gtfs/splicing_scenarios_test7.gtf | 24 +++++++------- tests/gtfs/splicing_scenarios_test8.gtf | 26 +++++++-------- 11 files changed, 147 insertions(+), 129 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..d7cd519 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,14 @@ +# Changelog + +All notable changes to fastder are recorded in this file. + +## [Unreleased] + +### Fixed + +- GTF output is now 1-based. fastder's internal coordinates are 0-based + half-open (the BedGraph and BigWig convention), but the GTF writer emitted + the start column without converting it, so every gene, transcript and exon + start was one base too low. The start is now shifted by one on output. The + end needs no shift, because a 0-based half-open end equals the 1-based + inclusive end of the same interval. diff --git a/cpp/GTFRow.h b/cpp/GTFRow.h index 714378c..aa69813 100644 --- a/cpp/GTFRow.h +++ b/cpp/GTFRow.h @@ -28,7 +28,11 @@ class GTFRow // overload output operator for GTFRow friend std::ostream& operator<< (std::ostream& os, const GTFRow& row) { - return os << row.seqname << "\t" << row.source << "\t" << row.feature << "\t" << row.start << "\t" << row.end << "\t" << + // Internal coordinates are 0-based half-open (BedGraph / BigWig + // convention). GTF is 1-based inclusive, so the start is shifted by + // one. The end needs no shift: a 0-based half-open end equals the + // 1-based inclusive end of the same interval. + return os << row.seqname << "\t" << row.source << "\t" << row.feature << "\t" << (row.start + 1) << "\t" << row.end << "\t" << row.score << "\t" << row.strand << "\t" << row.frame << "\t" << row.attribute; } diff --git a/tests/gtfs/parser_test3.gtf b/tests/gtfs/parser_test3.gtf index 5cdb4ea..15e3ad4 100644 --- a/tests/gtfs/parser_test3.gtf +++ b/tests/gtfs/parser_test3.gtf @@ -2,18 +2,18 @@ #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2026-4-28 -chr1 fastder gene 10000 10500 225.982 . . gene_id "gene1"; gene_name "faster_gene1"; -chr1 fastder transcript 10000 10500 225.982 . . gene_id "gene1"; transcript_id "tx1"; -chr1 fastder exon 10000 10500 225.982 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; -chr1 fastder gene 11000 12500 216.566 . . gene_id "gene2"; gene_name "faster_gene2"; -chr1 fastder transcript 11000 12500 216.566 . . gene_id "gene2"; transcript_id "tx2"; -chr1 fastder exon 11000 12500 216.566 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; -chr1 fastder gene 12861 14540 183.631 . . gene_id "gene3"; gene_name "faster_gene3"; -chr1 fastder transcript 12861 14540 183.631 . . gene_id "gene3"; transcript_id "tx3"; -chr1 fastder exon 12861 12999 172.148 . . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; -chr1 fastder exon 14001 14540 186.572 . . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; -chr2 fastder gene 10000 12500 218.92 . . gene_id "gene4"; gene_name "faster_gene4"; -chr2 fastder transcript 10000 12500 218.92 . . gene_id "gene4"; transcript_id "tx4"; -chr2 fastder exon 10000 10500 225.982 . . gene_id "gene4"; transcript_id "tx4"; exon_number "1"; -chr2 fastder exon 11000 12500 216.566 . . gene_id "gene4"; transcript_id "tx4"; exon_number "2"; +#date: 2026-5-18 +chr1 fastder gene 10001 10500 225.982 . . gene_id "gene1"; gene_name "faster_gene1"; +chr1 fastder transcript 10001 10500 225.982 . . gene_id "gene1"; transcript_id "tx1"; +chr1 fastder exon 10001 10500 225.982 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; +chr1 fastder gene 11001 12500 216.566 . . gene_id "gene2"; gene_name "faster_gene2"; +chr1 fastder transcript 11001 12500 216.566 . . gene_id "gene2"; transcript_id "tx2"; +chr1 fastder exon 11001 12500 216.566 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; +chr1 fastder gene 12862 14540 183.631 - . gene_id "gene3"; gene_name "faster_gene3"; +chr1 fastder transcript 12862 14540 183.631 - . gene_id "gene3"; transcript_id "tx3"; +chr1 fastder exon 12862 12999 172.148 - . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; +chr1 fastder exon 14002 14540 186.572 - . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; +chr2 fastder gene 10001 12500 218.92 - . gene_id "gene4"; gene_name "faster_gene4"; +chr2 fastder transcript 10001 12500 218.92 - . gene_id "gene4"; transcript_id "tx4"; +chr2 fastder exon 10001 10500 225.982 - . gene_id "gene4"; transcript_id "tx4"; exon_number "1"; +chr2 fastder exon 11001 12500 216.566 - . gene_id "gene4"; transcript_id "tx4"; exon_number "2"; diff --git a/tests/gtfs/splicing_scenarios_test1.gtf b/tests/gtfs/splicing_scenarios_test1.gtf index 235eeef..da85229 100644 --- a/tests/gtfs/splicing_scenarios_test1.gtf +++ b/tests/gtfs/splicing_scenarios_test1.gtf @@ -2,12 +2,12 @@ #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2026-4-28 -chr1 fastder gene 10000 12500 100.75 . . gene_id "gene1"; gene_name "faster_gene1"; -chr1 fastder transcript 10000 12500 100.75 . . gene_id "gene1"; transcript_id "tx1"; -chr1 fastder exon 10000 10500 100 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; -chr1 fastder exon 11000 12500 101 . . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; -chr1 fastder gene 12861 14540 29.7962 . . gene_id "gene2"; gene_name "faster_gene2"; -chr1 fastder transcript 12861 14540 29.7962 . . gene_id "gene2"; transcript_id "tx2"; -chr1 fastder exon 12861 12999 29 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; -chr1 fastder exon 14001 14540 30 . . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; +#date: 2026-5-18 +chr1 fastder gene 10001 12500 100.75 - . gene_id "gene1"; gene_name "faster_gene1"; +chr1 fastder transcript 10001 12500 100.75 - . gene_id "gene1"; transcript_id "tx1"; +chr1 fastder exon 10001 10500 100 - . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; +chr1 fastder exon 11001 12500 101 - . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; +chr1 fastder gene 12862 14540 29.7962 - . gene_id "gene2"; gene_name "faster_gene2"; +chr1 fastder transcript 12862 14540 29.7962 - . gene_id "gene2"; transcript_id "tx2"; +chr1 fastder exon 12862 12999 29 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; +chr1 fastder exon 14002 14540 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; diff --git a/tests/gtfs/splicing_scenarios_test2.gtf b/tests/gtfs/splicing_scenarios_test2.gtf index 5ffeb25..e97733b 100644 --- a/tests/gtfs/splicing_scenarios_test2.gtf +++ b/tests/gtfs/splicing_scenarios_test2.gtf @@ -2,13 +2,13 @@ #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2026-4-28 -chr1 fastder gene 10000 12500 100.75 . . gene_id "gene1"; gene_name "faster_gene1"; -chr1 fastder transcript 10000 12500 100.75 . . gene_id "gene1"; transcript_id "tx1"; -chr1 fastder exon 10000 10500 100 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; -chr1 fastder exon 11000 12500 101 . . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; -chr1 fastder gene 12861 15300 29.784 . . gene_id "gene2"; gene_name "faster_gene2"; -chr1 fastder transcript 12861 15300 29.784 . . gene_id "gene2"; transcript_id "tx2"; -chr1 fastder exon 12861 12999 29 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; -chr1 fastder exon 14001 14201 30 . . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; -chr1 fastder exon 14999 15300 30 . . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; +#date: 2026-5-18 +chr1 fastder gene 10001 12500 100.75 - . gene_id "gene1"; gene_name "faster_gene1"; +chr1 fastder transcript 10001 12500 100.75 - . gene_id "gene1"; transcript_id "tx1"; +chr1 fastder exon 10001 10500 100 - . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; +chr1 fastder exon 11001 12500 101 - . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; +chr1 fastder gene 12862 15300 29.784 - . gene_id "gene2"; gene_name "faster_gene2"; +chr1 fastder transcript 12862 15300 29.784 - . gene_id "gene2"; transcript_id "tx2"; +chr1 fastder exon 12862 12999 29 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; +chr1 fastder exon 14002 14201 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; +chr1 fastder exon 15000 15300 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; diff --git a/tests/gtfs/splicing_scenarios_test3.gtf b/tests/gtfs/splicing_scenarios_test3.gtf index a9e68b4..0bc7a31 100644 --- a/tests/gtfs/splicing_scenarios_test3.gtf +++ b/tests/gtfs/splicing_scenarios_test3.gtf @@ -2,16 +2,16 @@ #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2026-4-28 -chr1 fastder gene 10000 12500 100.75 . . gene_id "gene1"; gene_name "faster_gene1"; -chr1 fastder transcript 10000 12500 100.75 . . gene_id "gene1"; transcript_id "tx1"; -chr1 fastder exon 10000 10500 100 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; -chr1 fastder exon 11000 12500 101 . . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; -chr1 fastder gene 12861 15300 29.784 . . gene_id "gene2"; gene_name "faster_gene2"; -chr1 fastder transcript 12861 15300 29.784 . . gene_id "gene2"; transcript_id "tx2"; -chr1 fastder exon 12861 12999 29 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; -chr1 fastder exon 14001 14201 30 . . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; -chr1 fastder exon 14999 15300 30 . . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; -chr1 fastder gene 15300 15600 37 . . gene_id "gene3"; gene_name "faster_gene3"; -chr1 fastder transcript 15300 15600 37 . . gene_id "gene3"; transcript_id "tx3"; -chr1 fastder exon 15300 15600 37 . . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; +#date: 2026-5-18 +chr1 fastder gene 10001 12500 100.75 - . gene_id "gene1"; gene_name "faster_gene1"; +chr1 fastder transcript 10001 12500 100.75 - . gene_id "gene1"; transcript_id "tx1"; +chr1 fastder exon 10001 10500 100 - . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; +chr1 fastder exon 11001 12500 101 - . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; +chr1 fastder gene 12862 15300 29.784 - . gene_id "gene2"; gene_name "faster_gene2"; +chr1 fastder transcript 12862 15300 29.784 - . gene_id "gene2"; transcript_id "tx2"; +chr1 fastder exon 12862 12999 29 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; +chr1 fastder exon 14002 14201 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; +chr1 fastder exon 15000 15300 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; +chr1 fastder gene 15301 15600 37 . . gene_id "gene3"; gene_name "faster_gene3"; +chr1 fastder transcript 15301 15600 37 . . gene_id "gene3"; transcript_id "tx3"; +chr1 fastder exon 15301 15600 37 . . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test4.gtf b/tests/gtfs/splicing_scenarios_test4.gtf index 174d888..8e5a5bd 100644 --- a/tests/gtfs/splicing_scenarios_test4.gtf +++ b/tests/gtfs/splicing_scenarios_test4.gtf @@ -2,19 +2,19 @@ #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2026-4-28 -chr1 fastder gene 10000 12500 100.75 . . gene_id "gene1"; gene_name "faster_gene1"; -chr1 fastder transcript 10000 12500 100.75 . . gene_id "gene1"; transcript_id "tx1"; -chr1 fastder exon 10000 10500 100 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; -chr1 fastder exon 11000 12500 101 . . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; -chr1 fastder gene 12861 15300 29.784 . . gene_id "gene2"; gene_name "faster_gene2"; -chr1 fastder transcript 12861 15300 29.784 . . gene_id "gene2"; transcript_id "tx2"; -chr1 fastder exon 12861 12999 29 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; -chr1 fastder exon 14001 14201 30 . . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; -chr1 fastder exon 14999 15300 30 . . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; -chr1 fastder gene 15300 15600 37 . . gene_id "gene3"; gene_name "faster_gene3"; -chr1 fastder transcript 15300 15600 37 . . gene_id "gene3"; transcript_id "tx3"; -chr1 fastder exon 15300 15600 37 . . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; -chr1 fastder gene 15600 15800 87 . . gene_id "gene4"; gene_name "faster_gene4"; -chr1 fastder transcript 15600 15800 87 . . gene_id "gene4"; transcript_id "tx4"; -chr1 fastder exon 15600 15800 87 . . gene_id "gene4"; transcript_id "tx4"; exon_number "1"; +#date: 2026-5-18 +chr1 fastder gene 10001 12500 100.75 - . gene_id "gene1"; gene_name "faster_gene1"; +chr1 fastder transcript 10001 12500 100.75 - . gene_id "gene1"; transcript_id "tx1"; +chr1 fastder exon 10001 10500 100 - . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; +chr1 fastder exon 11001 12500 101 - . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; +chr1 fastder gene 12862 15300 29.784 - . gene_id "gene2"; gene_name "faster_gene2"; +chr1 fastder transcript 12862 15300 29.784 - . gene_id "gene2"; transcript_id "tx2"; +chr1 fastder exon 12862 12999 29 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; +chr1 fastder exon 14002 14201 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; +chr1 fastder exon 15000 15300 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; +chr1 fastder gene 15301 15600 37 . . gene_id "gene3"; gene_name "faster_gene3"; +chr1 fastder transcript 15301 15600 37 . . gene_id "gene3"; transcript_id "tx3"; +chr1 fastder exon 15301 15600 37 . . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; +chr1 fastder gene 15601 15800 87 . . gene_id "gene4"; gene_name "faster_gene4"; +chr1 fastder transcript 15601 15800 87 . . gene_id "gene4"; transcript_id "tx4"; +chr1 fastder exon 15601 15800 87 . . gene_id "gene4"; transcript_id "tx4"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test5.gtf b/tests/gtfs/splicing_scenarios_test5.gtf index b981f8d..07cefdf 100644 --- a/tests/gtfs/splicing_scenarios_test5.gtf +++ b/tests/gtfs/splicing_scenarios_test5.gtf @@ -2,25 +2,25 @@ #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2026-4-28 -chr1 fastder gene 10000 10500 100 . . gene_id "gene1"; gene_name "faster_gene1"; -chr1 fastder transcript 10000 10500 100 . . gene_id "gene1"; transcript_id "tx1"; -chr1 fastder exon 10000 10500 100 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; -chr1 fastder gene 11000 12500 101 . . gene_id "gene2"; gene_name "faster_gene2"; -chr1 fastder transcript 11000 12500 101 . . gene_id "gene2"; transcript_id "tx2"; -chr1 fastder exon 11000 12500 101 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; -chr1 fastder gene 12861 12999 29 . . gene_id "gene3"; gene_name "faster_gene3"; -chr1 fastder transcript 12861 12999 29 . . gene_id "gene3"; transcript_id "tx3"; -chr1 fastder exon 12861 12999 29 . . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; -chr1 fastder gene 14001 14201 30 . . gene_id "gene4"; gene_name "faster_gene4"; -chr1 fastder transcript 14001 14201 30 . . gene_id "gene4"; transcript_id "tx4"; -chr1 fastder exon 14001 14201 30 . . gene_id "gene4"; transcript_id "tx4"; exon_number "1"; -chr1 fastder gene 14999 15300 30 . . gene_id "gene5"; gene_name "faster_gene5"; -chr1 fastder transcript 14999 15300 30 . . gene_id "gene5"; transcript_id "tx5"; -chr1 fastder exon 14999 15300 30 . . gene_id "gene5"; transcript_id "tx5"; exon_number "1"; -chr1 fastder gene 15300 15600 37 . . gene_id "gene6"; gene_name "faster_gene6"; -chr1 fastder transcript 15300 15600 37 . . gene_id "gene6"; transcript_id "tx6"; -chr1 fastder exon 15300 15600 37 . . gene_id "gene6"; transcript_id "tx6"; exon_number "1"; -chr1 fastder gene 15600 15800 87 . . gene_id "gene7"; gene_name "faster_gene7"; -chr1 fastder transcript 15600 15800 87 . . gene_id "gene7"; transcript_id "tx7"; -chr1 fastder exon 15600 15800 87 . . gene_id "gene7"; transcript_id "tx7"; exon_number "1"; +#date: 2026-5-18 +chr1 fastder gene 10001 10500 100 . . gene_id "gene1"; gene_name "faster_gene1"; +chr1 fastder transcript 10001 10500 100 . . gene_id "gene1"; transcript_id "tx1"; +chr1 fastder exon 10001 10500 100 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; +chr1 fastder gene 11001 12500 101 . . gene_id "gene2"; gene_name "faster_gene2"; +chr1 fastder transcript 11001 12500 101 . . gene_id "gene2"; transcript_id "tx2"; +chr1 fastder exon 11001 12500 101 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; +chr1 fastder gene 12862 12999 29 . . gene_id "gene3"; gene_name "faster_gene3"; +chr1 fastder transcript 12862 12999 29 . . gene_id "gene3"; transcript_id "tx3"; +chr1 fastder exon 12862 12999 29 . . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; +chr1 fastder gene 14002 14201 30 . . gene_id "gene4"; gene_name "faster_gene4"; +chr1 fastder transcript 14002 14201 30 . . gene_id "gene4"; transcript_id "tx4"; +chr1 fastder exon 14002 14201 30 . . gene_id "gene4"; transcript_id "tx4"; exon_number "1"; +chr1 fastder gene 15000 15300 30 . . gene_id "gene5"; gene_name "faster_gene5"; +chr1 fastder transcript 15000 15300 30 . . gene_id "gene5"; transcript_id "tx5"; +chr1 fastder exon 15000 15300 30 . . gene_id "gene5"; transcript_id "tx5"; exon_number "1"; +chr1 fastder gene 15301 15600 37 . . gene_id "gene6"; gene_name "faster_gene6"; +chr1 fastder transcript 15301 15600 37 . . gene_id "gene6"; transcript_id "tx6"; +chr1 fastder exon 15301 15600 37 . . gene_id "gene6"; transcript_id "tx6"; exon_number "1"; +chr1 fastder gene 15601 15800 87 . . gene_id "gene7"; gene_name "faster_gene7"; +chr1 fastder transcript 15601 15800 87 . . gene_id "gene7"; transcript_id "tx7"; +chr1 fastder exon 15601 15800 87 . . gene_id "gene7"; transcript_id "tx7"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test6.gtf b/tests/gtfs/splicing_scenarios_test6.gtf index 410d536..1ba9610 100644 --- a/tests/gtfs/splicing_scenarios_test6.gtf +++ b/tests/gtfs/splicing_scenarios_test6.gtf @@ -2,21 +2,21 @@ #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2026-4-28 -chr1 fastder gene 10000 12500 100.75 . . gene_id "gene1"; gene_name "faster_gene1"; -chr1 fastder transcript 10000 12500 100.75 . . gene_id "gene1"; transcript_id "tx1"; -chr1 fastder exon 10000 10500 100 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; -chr1 fastder exon 11000 12500 101 . . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; -chr1 fastder gene 12861 12999 29 . . gene_id "gene2"; gene_name "faster_gene2"; -chr1 fastder transcript 12861 12999 29 . . gene_id "gene2"; transcript_id "tx2"; -chr1 fastder exon 12861 12999 29 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; -chr1 fastder gene 14001 15300 30 . . gene_id "gene3"; gene_name "faster_gene3"; -chr1 fastder transcript 14001 15300 30 . . gene_id "gene3"; transcript_id "tx3"; -chr1 fastder exon 14001 14201 30 . . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; -chr1 fastder exon 14999 15300 30 . . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; -chr1 fastder gene 15300 15600 37 . . gene_id "gene4"; gene_name "faster_gene4"; -chr1 fastder transcript 15300 15600 37 . . gene_id "gene4"; transcript_id "tx4"; -chr1 fastder exon 15300 15600 37 . . gene_id "gene4"; transcript_id "tx4"; exon_number "1"; -chr1 fastder gene 15600 15800 87 . . gene_id "gene5"; gene_name "faster_gene5"; -chr1 fastder transcript 15600 15800 87 . . gene_id "gene5"; transcript_id "tx5"; -chr1 fastder exon 15600 15800 87 . . gene_id "gene5"; transcript_id "tx5"; exon_number "1"; +#date: 2026-5-18 +chr1 fastder gene 10001 12500 100.75 - . gene_id "gene1"; gene_name "faster_gene1"; +chr1 fastder transcript 10001 12500 100.75 - . gene_id "gene1"; transcript_id "tx1"; +chr1 fastder exon 10001 10500 100 - . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; +chr1 fastder exon 11001 12500 101 - . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; +chr1 fastder gene 12862 12999 29 . . gene_id "gene2"; gene_name "faster_gene2"; +chr1 fastder transcript 12862 12999 29 . . gene_id "gene2"; transcript_id "tx2"; +chr1 fastder exon 12862 12999 29 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; +chr1 fastder gene 14002 15300 30 - . gene_id "gene3"; gene_name "faster_gene3"; +chr1 fastder transcript 14002 15300 30 - . gene_id "gene3"; transcript_id "tx3"; +chr1 fastder exon 14002 14201 30 - . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; +chr1 fastder exon 15000 15300 30 - . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; +chr1 fastder gene 15301 15600 37 . . gene_id "gene4"; gene_name "faster_gene4"; +chr1 fastder transcript 15301 15600 37 . . gene_id "gene4"; transcript_id "tx4"; +chr1 fastder exon 15301 15600 37 . . gene_id "gene4"; transcript_id "tx4"; exon_number "1"; +chr1 fastder gene 15601 15800 87 . . gene_id "gene5"; gene_name "faster_gene5"; +chr1 fastder transcript 15601 15800 87 . . gene_id "gene5"; transcript_id "tx5"; +chr1 fastder exon 15601 15800 87 . . gene_id "gene5"; transcript_id "tx5"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test7.gtf b/tests/gtfs/splicing_scenarios_test7.gtf index f90eb85..b0b942f 100644 --- a/tests/gtfs/splicing_scenarios_test7.gtf +++ b/tests/gtfs/splicing_scenarios_test7.gtf @@ -2,15 +2,15 @@ #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2026-4-28 -chr1 fastder gene 9000 9200 2 . . gene_id "gene1"; gene_name "faster_gene1"; -chr1 fastder transcript 9000 9200 2 . . gene_id "gene1"; transcript_id "tx1"; -chr1 fastder exon 9000 9200 2 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; -chr1 fastder gene 10000 12500 100.75 . . gene_id "gene2"; gene_name "faster_gene2"; -chr1 fastder transcript 10000 12500 100.75 . . gene_id "gene2"; transcript_id "tx2"; -chr1 fastder exon 10000 10500 100 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; -chr1 fastder exon 11000 12500 101 . . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; -chr1 fastder gene 14001 15300 30 . . gene_id "gene3"; gene_name "faster_gene3"; -chr1 fastder transcript 14001 15300 30 . . gene_id "gene3"; transcript_id "tx3"; -chr1 fastder exon 14001 14201 30 . . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; -chr1 fastder exon 14999 15300 30 . . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; +#date: 2026-5-18 +chr1 fastder gene 9001 9200 2 . . gene_id "gene1"; gene_name "faster_gene1"; +chr1 fastder transcript 9001 9200 2 . . gene_id "gene1"; transcript_id "tx1"; +chr1 fastder exon 9001 9200 2 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; +chr1 fastder gene 10001 12500 100.75 - . gene_id "gene2"; gene_name "faster_gene2"; +chr1 fastder transcript 10001 12500 100.75 - . gene_id "gene2"; transcript_id "tx2"; +chr1 fastder exon 10001 10500 100 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; +chr1 fastder exon 11001 12500 101 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; +chr1 fastder gene 14002 15300 30 - . gene_id "gene3"; gene_name "faster_gene3"; +chr1 fastder transcript 14002 15300 30 - . gene_id "gene3"; transcript_id "tx3"; +chr1 fastder exon 14002 14201 30 - . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; +chr1 fastder exon 15000 15300 30 - . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; diff --git a/tests/gtfs/splicing_scenarios_test8.gtf b/tests/gtfs/splicing_scenarios_test8.gtf index 0850c2e..34b6fbe 100644 --- a/tests/gtfs/splicing_scenarios_test8.gtf +++ b/tests/gtfs/splicing_scenarios_test8.gtf @@ -2,16 +2,16 @@ #provider: FASTDER #contact: martina.lavanya@gmail.com #format: gtf -#date: 2026-4-28 -chr1 fastder gene 9000 9200 2 . . gene_id "gene1"; gene_name "faster_gene1"; -chr1 fastder transcript 9000 9200 2 . . gene_id "gene1"; transcript_id "tx1"; -chr1 fastder exon 9000 9200 2 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; -chr1 fastder gene 10000 15300 30.9503 . . gene_id "gene2"; gene_name "faster_gene2"; -chr1 fastder transcript 10000 15300 30.9503 . . gene_id "gene2"; transcript_id "tx2"; -chr1 fastder exon 10000 10500 30 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; -chr1 fastder exon 11000 14201 31 . . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; -chr1 fastder exon 14999 15300 32 . . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; -chr2 fastder gene 1501 4000 10.1188 . . gene_id "gene3"; gene_name "faster_gene3"; -chr2 fastder transcript 1501 4000 10.1188 . . gene_id "gene3"; transcript_id "tx3"; -chr2 fastder exon 1501 2999 10 . . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; -chr2 fastder exon 3798 4000 11 . . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; +#date: 2026-5-18 +chr1 fastder gene 9001 9200 2 . . gene_id "gene1"; gene_name "faster_gene1"; +chr1 fastder transcript 9001 9200 2 . . gene_id "gene1"; transcript_id "tx1"; +chr1 fastder exon 9001 9200 2 . . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; +chr1 fastder gene 10001 15300 30.9503 - . gene_id "gene2"; gene_name "faster_gene2"; +chr1 fastder transcript 10001 15300 30.9503 - . gene_id "gene2"; transcript_id "tx2"; +chr1 fastder exon 10001 10500 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; +chr1 fastder exon 11001 14201 31 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; +chr1 fastder exon 15000 15300 32 - . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; +chr2 fastder gene 1502 4000 10.1188 - . gene_id "gene3"; gene_name "faster_gene3"; +chr2 fastder transcript 1502 4000 10.1188 - . gene_id "gene3"; transcript_id "tx3"; +chr2 fastder exon 1502 2999 10 - . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; +chr2 fastder exon 3799 4000 11 - . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; From ebad7f069084c83ead4cf60fa1f4b2e5f181fe99 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 18 May 2026 19:17:20 +0200 Subject: [PATCH 2/7] Update coverage thresholds, understitching, and exon boundaries (now by SJ) --- CHANGELOG.md | 14 ++++++ cpp/Integrator.cpp | 63 +++++++++++++++++-------- cpp/StitchedER.cpp | 8 +++- cpp/StitchedER.h | 9 +++- cpp/main.cpp | 16 +++++-- tests/gtfs/parser_test3.gtf | 4 +- tests/gtfs/splicing_scenarios_test1.gtf | 4 +- tests/gtfs/splicing_scenarios_test2.gtf | 6 +-- tests/gtfs/splicing_scenarios_test3.gtf | 6 +-- tests/gtfs/splicing_scenarios_test4.gtf | 6 +-- tests/gtfs/splicing_scenarios_test6.gtf | 4 +- tests/gtfs/splicing_scenarios_test7.gtf | 4 +- tests/gtfs/splicing_scenarios_test8.gtf | 8 ++-- 13 files changed, 104 insertions(+), 48 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d7cd519..7bd3e40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,3 +12,17 @@ All notable changes to fastder are recorded in this file. start was one base too low. The start is now shifted by one on output. The end needs no shift, because a 0-based half-open end equals the 1-based inclusive end of the same interval. + +### Changed + +- Stitching is no longer gated on coverage similarity. The default + `--coverage-tolerance` is raised to 1000, which makes the gate effectively + off. A splice junction is the evidence that two regions belong to one + transcript, and exon coverage varies widely within a transcript, so the + previous default (0.8) wrongly rejected legitimate stitches. The flag is + kept as a knob. +- Exon boundaries are snapped to splice junctions. When a stitched chain + crosses a junction, the exon edges on either side are set to the junction's + donor and acceptor coordinates instead of the coverage-derived expressed- + region edges. Edges with no adjacent junction, namely the outer ends of a + chain and both ends of a monoexonic region, keep the coverage extent. diff --git a/cpp/Integrator.cpp b/cpp/Integrator.cpp index f6a1853..be45a45 100644 --- a/cpp/Integrator.cpp +++ b/cpp/Integrator.cpp @@ -125,9 +125,14 @@ void Integrator::stitch_one_strand(const std::string& chrom, if (is_similar(candidate, expressed_region, rr_all_sj[*sj_to_check - 1])) { + const SJRow& used_sj = rr_all_sj[*sj_to_check - 1]; uint64_t sj_length = expressed_region.start - ers[candidate.er_ids.back()].end; - candidate.append(-1, sj_length, 0.0); - candidate.append(i, expressed_region.length, expressed_region.coverage); + // record the splice junction's [donor, acceptor] for the spliced + // region, and the ER's coverage extent for the appended exon, so + // write_to_gtf can snap exon edges to the splice sites. + candidate.append(-1, sj_length, 0.0, used_sj.start, used_sj.end); + candidate.append(i, expressed_region.length, expressed_region.coverage, + expressed_region.start, expressed_region.end); int er_count = 0; for (int id : candidate.er_ids) if (id >= 0) ++er_count; if (er_count > max_chain_len) max_chain_len = er_count; @@ -240,28 +245,46 @@ void Integrator::write_to_gtf(const std::string& output_path) for (unsigned int i = 0; i < this->stitched_ERs.size(); ++i) { - // each stitched_er is both a gene and a transcript - GTFRow gtf_row = GTFRow(stitched_ERs[i], "gene", i + 1); + const StitchedER& ser = stitched_ERs[i]; + + // Resolve each exon's [start, end]. The default is the ER's + // coverage-derived extent; an edge that abuts a splice junction is + // snapped to the junction coordinate (the donor for an exon end, the + // acceptor for the next exon start), since the junction marks the + // exact splice site. Edges with no adjacent junction, namely the + // outer ends of a chain and both ends of a monoexonic ER, keep the + // coverage extent. + std::vector> exons; + std::vector exon_scores; + for (unsigned int k = 0; k < ser.er_ids.size(); ++k) + { + if (ser.er_ids.at(k) == -1) continue; // spliced region, not an exon + uint64_t ex_start = ser.er_bounds.at(k).first; + uint64_t ex_end = ser.er_bounds.at(k).second; + if (k > 0 && ser.er_ids.at(k - 1) == -1) + ex_start = ser.er_bounds.at(k - 1).second; // splice acceptor + if (k + 1 < ser.er_ids.size() && ser.er_ids.at(k + 1) == -1) + ex_end = ser.er_bounds.at(k + 1).first; // splice donor + exons.emplace_back(ex_start, ex_end); + exon_scores.push_back(ser.all_coverages.at(k).second); + } + if (exons.empty()) continue; + + // each stitched_er is both a gene and a transcript; their span runs + // from the first exon start to the last exon end + GTFRow gtf_row = GTFRow(ser, "gene", i + 1); + gtf_row.start = exons.front().first; + gtf_row.end = exons.back().second; out << gtf_row << std::endl; gtf_row.change_feature("transcript", i + 1, 0); out << gtf_row << std::endl; - int exon_nr = 1; - // add the ERs within the stitched_er - for (unsigned int k = 0; k < stitched_ERs[i].er_ids.size(); ++k) + for (unsigned int e = 0; e < exons.size(); ++e) { - if (stitched_ERs[i].er_ids.at(k) != -1){ - gtf_row.change_feature("exon", i + 1, exon_nr); - // need to include SJ length as well - gtf_row.end = gtf_row.start + stitched_ERs.at(i).all_coverages.at(k).first; // start + length = end - gtf_row.score = stitched_ERs.at(i).all_coverages.at(k).second; // use the per-exon average coverage here instead of the overall coverage - out << gtf_row << std::endl; - gtf_row.start = gtf_row.end; - ++exon_nr; - } - else - { - gtf_row.start += stitched_ERs.at(i).all_coverages.at(k).first; // add length of the SJ - } + gtf_row.change_feature("exon", i + 1, e + 1); + gtf_row.start = exons.at(e).first; + gtf_row.end = exons.at(e).second; + gtf_row.score = exon_scores.at(e); + out << gtf_row << std::endl; } } out.close(); diff --git a/cpp/StitchedER.cpp b/cpp/StitchedER.cpp index 869bb45..b4d1caa 100644 --- a/cpp/StitchedER.cpp +++ b/cpp/StitchedER.cpp @@ -14,6 +14,7 @@ StitchedER::StitchedER(const BedGraphRow& expressed_region, int er_id) er_ids = {er_id}; across_er_coverage = expressed_region.coverage; all_coverages = {std::make_pair(expressed_region.length, expressed_region.coverage)}; + er_bounds = {std::make_pair(expressed_region.start, expressed_region.end)}; total_length = expressed_region.length; } @@ -39,11 +40,14 @@ double StitchedER::get_avg_coverage() const return sum / full_length; } -// note that er_id is 0-indexed -void StitchedER::append(int er_id, unsigned int length, double coverage) +// note that er_id is 0-indexed. lo/hi is the ER's coverage extent, or the +// splice junction's [donor, acceptor] when er_id is -1 (a spliced region). +void StitchedER::append(int er_id, unsigned int length, double coverage, + uint64_t lo, uint64_t hi) { er_ids.emplace_back(er_id); // -1 for spliced regions all_coverages.push_back({std::make_pair(length, coverage)}); + er_bounds.push_back(std::make_pair(lo, hi)); // only update avg coverage if an exon was added if (er_id != -1) { diff --git a/cpp/StitchedER.h b/cpp/StitchedER.h index 9ee8147..aa0ff77 100644 --- a/cpp/StitchedER.h +++ b/cpp/StitchedER.h @@ -16,7 +16,8 @@ class StitchedER StitchedER() = default; StitchedER(const BedGraphRow& expressed_region, int er_id); - void append(int er_id, unsigned int length, double coverage); + void append(int er_id, unsigned int length, double coverage, + uint64_t lo, uint64_t hi); double get_avg_coverage() const; //bool is_similar(double val1, double val2); @@ -28,6 +29,12 @@ class StitchedER // example: stitched_ER consists of er_ids 45, 46, 47, 49 == vector indices of expressed_regions double across_er_coverage; // avg (weighted) coverage of all exons that are part of the stitched ER so far std::vector> all_coverages; // stores a pair of er length (= weight) + normalized average coverage of the er + // Genomic [start, end) of each entry in er_ids, aligned by index. For a + // real ER it is the coverage-derived extent; for a spliced region (-1) it + // is the splice junction's [donor, acceptor]. write_to_gtf snaps an exon + // edge to the adjacent junction coordinate and falls back to the coverage + // extent where an edge has no junction. + std::vector> er_bounds; unsigned int total_length; // combined length of all ERs (excluding spliced regions) uint64_t start; uint64_t end; diff --git a/cpp/main.cpp b/cpp/main.cpp index 50c6071..501cb42 100644 --- a/cpp/main.cpp +++ b/cpp/main.cpp @@ -18,7 +18,12 @@ int main(int argc, char* argv[]) { // default values (if not provided by user) int position_tolerance = 5; int min_length = 10; - double coverage_tolerance = 0.8; + // coverage_tolerance gates stitching on how similar two regions' coverage + // is. A splice junction is the real evidence that two exons belong to one + // transcript, and the coverage of exons within a transcript varies widely, + // so this gate defaults effectively off (any fold-difference passes) and + // is left as a knob only for deliberate experiments. + double coverage_tolerance = 1000.0; double min_coverage = 0.05; std::string directory; int cores = 10; @@ -45,9 +50,12 @@ int main(int argc, char* argv[]) { << " Example: --min-coverage 0.25\n\n" << " --position-tolerance Maximum permitted position deviation of splice junction and ER coordinates, in [nt]. Default = 5 nt.\n" << " Example: --position-tolerance 5\n\n" - << " --coverage-tolerance Permitted coverage deviation within a stitched ER, as a proportion (e.g. 0.1 = 10 %).\n" - << " The value is not strictly bound in (0,1). Default = 0.7.\n" - << " Example: --coverage-tolerance 0.8\n\n" + << " --coverage-tolerance Permitted coverage deviation when stitching, as a proportion: a\n" + << " neighbour passes if its coverage is within (1 +/- tolerance)\n" + << " of the running average. Not bound to (0,1); 2.0 allows a 3x\n" + << " spread. Default = 1000 (the gate is effectively off, stitching\n" + << " is driven by splice junctions).\n" + << " Example: --coverage-tolerance 2.0\n\n" << " --cores Number of cores that fastder may use. Default = 10 cores.\n" << " Example: --cores 23\n\n" << "Example:\n" diff --git a/tests/gtfs/parser_test3.gtf b/tests/gtfs/parser_test3.gtf index 15e3ad4..5208271 100644 --- a/tests/gtfs/parser_test3.gtf +++ b/tests/gtfs/parser_test3.gtf @@ -11,8 +11,8 @@ chr1 fastder transcript 11001 12500 216.566 . . gene_id "gene2"; transcript_id " chr1 fastder exon 11001 12500 216.566 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; chr1 fastder gene 12862 14540 183.631 - . gene_id "gene3"; gene_name "faster_gene3"; chr1 fastder transcript 12862 14540 183.631 - . gene_id "gene3"; transcript_id "tx3"; -chr1 fastder exon 12862 12999 172.148 - . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; -chr1 fastder exon 14002 14540 186.572 - . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; +chr1 fastder exon 12862 13000 172.148 - . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; +chr1 fastder exon 14001 14540 186.572 - . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; chr2 fastder gene 10001 12500 218.92 - . gene_id "gene4"; gene_name "faster_gene4"; chr2 fastder transcript 10001 12500 218.92 - . gene_id "gene4"; transcript_id "tx4"; chr2 fastder exon 10001 10500 225.982 - . gene_id "gene4"; transcript_id "tx4"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test1.gtf b/tests/gtfs/splicing_scenarios_test1.gtf index da85229..04034ac 100644 --- a/tests/gtfs/splicing_scenarios_test1.gtf +++ b/tests/gtfs/splicing_scenarios_test1.gtf @@ -9,5 +9,5 @@ chr1 fastder exon 10001 10500 100 - . gene_id "gene1"; transcript_id "tx1"; exon chr1 fastder exon 11001 12500 101 - . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; chr1 fastder gene 12862 14540 29.7962 - . gene_id "gene2"; gene_name "faster_gene2"; chr1 fastder transcript 12862 14540 29.7962 - . gene_id "gene2"; transcript_id "tx2"; -chr1 fastder exon 12862 12999 29 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; -chr1 fastder exon 14002 14540 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; +chr1 fastder exon 12862 13000 29 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; +chr1 fastder exon 14001 14540 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; diff --git a/tests/gtfs/splicing_scenarios_test2.gtf b/tests/gtfs/splicing_scenarios_test2.gtf index e97733b..c270bb0 100644 --- a/tests/gtfs/splicing_scenarios_test2.gtf +++ b/tests/gtfs/splicing_scenarios_test2.gtf @@ -9,6 +9,6 @@ chr1 fastder exon 10001 10500 100 - . gene_id "gene1"; transcript_id "tx1"; exon chr1 fastder exon 11001 12500 101 - . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; chr1 fastder gene 12862 15300 29.784 - . gene_id "gene2"; gene_name "faster_gene2"; chr1 fastder transcript 12862 15300 29.784 - . gene_id "gene2"; transcript_id "tx2"; -chr1 fastder exon 12862 12999 29 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; -chr1 fastder exon 14002 14201 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; -chr1 fastder exon 15000 15300 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; +chr1 fastder exon 12862 13000 29 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; +chr1 fastder exon 14001 14200 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; +chr1 fastder exon 15001 15300 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; diff --git a/tests/gtfs/splicing_scenarios_test3.gtf b/tests/gtfs/splicing_scenarios_test3.gtf index 0bc7a31..5177cae 100644 --- a/tests/gtfs/splicing_scenarios_test3.gtf +++ b/tests/gtfs/splicing_scenarios_test3.gtf @@ -9,9 +9,9 @@ chr1 fastder exon 10001 10500 100 - . gene_id "gene1"; transcript_id "tx1"; exon chr1 fastder exon 11001 12500 101 - . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; chr1 fastder gene 12862 15300 29.784 - . gene_id "gene2"; gene_name "faster_gene2"; chr1 fastder transcript 12862 15300 29.784 - . gene_id "gene2"; transcript_id "tx2"; -chr1 fastder exon 12862 12999 29 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; -chr1 fastder exon 14002 14201 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; -chr1 fastder exon 15000 15300 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; +chr1 fastder exon 12862 13000 29 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; +chr1 fastder exon 14001 14200 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; +chr1 fastder exon 15001 15300 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; chr1 fastder gene 15301 15600 37 . . gene_id "gene3"; gene_name "faster_gene3"; chr1 fastder transcript 15301 15600 37 . . gene_id "gene3"; transcript_id "tx3"; chr1 fastder exon 15301 15600 37 . . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test4.gtf b/tests/gtfs/splicing_scenarios_test4.gtf index 8e5a5bd..40845c5 100644 --- a/tests/gtfs/splicing_scenarios_test4.gtf +++ b/tests/gtfs/splicing_scenarios_test4.gtf @@ -9,9 +9,9 @@ chr1 fastder exon 10001 10500 100 - . gene_id "gene1"; transcript_id "tx1"; exon chr1 fastder exon 11001 12500 101 - . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; chr1 fastder gene 12862 15300 29.784 - . gene_id "gene2"; gene_name "faster_gene2"; chr1 fastder transcript 12862 15300 29.784 - . gene_id "gene2"; transcript_id "tx2"; -chr1 fastder exon 12862 12999 29 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; -chr1 fastder exon 14002 14201 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; -chr1 fastder exon 15000 15300 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; +chr1 fastder exon 12862 13000 29 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; +chr1 fastder exon 14001 14200 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; +chr1 fastder exon 15001 15300 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; chr1 fastder gene 15301 15600 37 . . gene_id "gene3"; gene_name "faster_gene3"; chr1 fastder transcript 15301 15600 37 . . gene_id "gene3"; transcript_id "tx3"; chr1 fastder exon 15301 15600 37 . . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test6.gtf b/tests/gtfs/splicing_scenarios_test6.gtf index 1ba9610..a19baf5 100644 --- a/tests/gtfs/splicing_scenarios_test6.gtf +++ b/tests/gtfs/splicing_scenarios_test6.gtf @@ -12,8 +12,8 @@ chr1 fastder transcript 12862 12999 29 . . gene_id "gene2"; transcript_id "tx2"; chr1 fastder exon 12862 12999 29 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; chr1 fastder gene 14002 15300 30 - . gene_id "gene3"; gene_name "faster_gene3"; chr1 fastder transcript 14002 15300 30 - . gene_id "gene3"; transcript_id "tx3"; -chr1 fastder exon 14002 14201 30 - . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; -chr1 fastder exon 15000 15300 30 - . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; +chr1 fastder exon 14002 14200 30 - . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; +chr1 fastder exon 15001 15300 30 - . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; chr1 fastder gene 15301 15600 37 . . gene_id "gene4"; gene_name "faster_gene4"; chr1 fastder transcript 15301 15600 37 . . gene_id "gene4"; transcript_id "tx4"; chr1 fastder exon 15301 15600 37 . . gene_id "gene4"; transcript_id "tx4"; exon_number "1"; diff --git a/tests/gtfs/splicing_scenarios_test7.gtf b/tests/gtfs/splicing_scenarios_test7.gtf index b0b942f..f0e8112 100644 --- a/tests/gtfs/splicing_scenarios_test7.gtf +++ b/tests/gtfs/splicing_scenarios_test7.gtf @@ -12,5 +12,5 @@ chr1 fastder exon 10001 10500 100 - . gene_id "gene2"; transcript_id "tx2"; exon chr1 fastder exon 11001 12500 101 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; chr1 fastder gene 14002 15300 30 - . gene_id "gene3"; gene_name "faster_gene3"; chr1 fastder transcript 14002 15300 30 - . gene_id "gene3"; transcript_id "tx3"; -chr1 fastder exon 14002 14201 30 - . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; -chr1 fastder exon 15000 15300 30 - . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; +chr1 fastder exon 14002 14200 30 - . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; +chr1 fastder exon 15001 15300 30 - . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; diff --git a/tests/gtfs/splicing_scenarios_test8.gtf b/tests/gtfs/splicing_scenarios_test8.gtf index 34b6fbe..f781ce8 100644 --- a/tests/gtfs/splicing_scenarios_test8.gtf +++ b/tests/gtfs/splicing_scenarios_test8.gtf @@ -9,9 +9,9 @@ chr1 fastder exon 9001 9200 2 . . gene_id "gene1"; transcript_id "tx1"; exon_num chr1 fastder gene 10001 15300 30.9503 - . gene_id "gene2"; gene_name "faster_gene2"; chr1 fastder transcript 10001 15300 30.9503 - . gene_id "gene2"; transcript_id "tx2"; chr1 fastder exon 10001 10500 30 - . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; -chr1 fastder exon 11001 14201 31 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; -chr1 fastder exon 15000 15300 32 - . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; +chr1 fastder exon 11001 14200 31 - . gene_id "gene2"; transcript_id "tx2"; exon_number "2"; +chr1 fastder exon 15001 15300 32 - . gene_id "gene2"; transcript_id "tx2"; exon_number "3"; chr2 fastder gene 1502 4000 10.1188 - . gene_id "gene3"; gene_name "faster_gene3"; chr2 fastder transcript 1502 4000 10.1188 - . gene_id "gene3"; transcript_id "tx3"; -chr2 fastder exon 1502 2999 10 - . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; -chr2 fastder exon 3799 4000 11 - . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; +chr2 fastder exon 1502 3000 10 - . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; +chr2 fastder exon 3801 4000 11 - . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; From ceff7f5ea95fe887617c7eb3675f97cee9e02457 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 18 May 2026 19:59:47 +0200 Subject: [PATCH 3/7] Patch stitching exons shorter than tolerances --- cpp/Integrator.cpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/cpp/Integrator.cpp b/cpp/Integrator.cpp index be45a45..0550e8a 100644 --- a/cpp/Integrator.cpp +++ b/cpp/Integrator.cpp @@ -123,6 +123,13 @@ void Integrator::stitch_one_strand(const std::string& chrom, } const auto sj_to_check = (sj_it == strand_sjs.end()) ? std::prev(strand_sjs.end()) : sj_it; + // TODO reject geometrically inconsistent stitches here, rather than + // only repairing them in write_to_gtf. When position_tolerance is + // larger than a short ER, the junctions matched to its two edges can + // snap past each other and imply a negative-length exon. write_to_gtf + // currently falls back to that ER's coverage extent; a cleaner fix is + // to refuse the stitch when position_tolerance exceeds the ER length + // or when the flanking junctions would produce a non-positive exon. if (is_similar(candidate, expressed_region, rr_all_sj[*sj_to_check - 1])) { const SJRow& used_sj = rr_all_sj[*sj_to_check - 1]; @@ -259,12 +266,23 @@ void Integrator::write_to_gtf(const std::string& output_path) for (unsigned int k = 0; k < ser.er_ids.size(); ++k) { if (ser.er_ids.at(k) == -1) continue; // spliced region, not an exon - uint64_t ex_start = ser.er_bounds.at(k).first; - uint64_t ex_end = ser.er_bounds.at(k).second; + const uint64_t cov_start = ser.er_bounds.at(k).first; + const uint64_t cov_end = ser.er_bounds.at(k).second; + uint64_t ex_start = cov_start; + uint64_t ex_end = cov_end; if (k > 0 && ser.er_ids.at(k - 1) == -1) ex_start = ser.er_bounds.at(k - 1).second; // splice acceptor if (k + 1 < ser.er_ids.size() && ser.er_ids.at(k + 1) == -1) ex_end = ser.er_bounds.at(k + 1).first; // splice donor + // Keep the snapped edges only if they still form a valid exon. + // With a large position_tolerance two junctions flanking a short + // expressed region can snap past each other (start >= end); fall + // back to the coverage extent, which is always a valid interval. + if (ex_start >= ex_end) + { + ex_start = cov_start; + ex_end = cov_end; + } exons.emplace_back(ex_start, ex_end); exon_scores.push_back(ser.all_coverages.at(k).second); } From 785a8d8d279f09632acca1b7d0cd9e476d7f0598 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 18 May 2026 20:35:12 +0200 Subject: [PATCH 4/7] Offset-by-one harmonization for SJ files --- CHANGELOG.md | 5 +++++ cpp/SJRow.h | 7 +++++++ tests/UnitTests.cpp | 11 +++++++---- tests/gtfs/parser_test3.gtf | 4 ++-- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7bd3e40..c6a85a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,11 @@ All notable changes to fastder are recorded in this file. start was one base too low. The start is now shifted by one on output. The end needs no shift, because a 0-based half-open end equals the 1-based inclusive end of the same interval. +- RR splice junction coordinates are converted to 0-based on read. The RR + file carries 1-based inclusive intron coordinates (the STAR SJ.out.tab + convention), but fastder works in 0-based half-open coordinates like the + coverage. The mismatch left junction-snapped exon ends one base too long. + The RR parser now shifts the junction start by one on read. ### Changed diff --git a/cpp/SJRow.h b/cpp/SJRow.h index c8d17ea..03ac3b2 100644 --- a/cpp/SJRow.h +++ b/cpp/SJRow.h @@ -40,6 +40,13 @@ class SJRow return is; } row.strand = (strand_ == '+'); // 1 if + + // The RR file gives 1-based inclusive intron coordinates (STAR + // SJ.out.tab convention). fastder works in 0-based half-open + // coordinates, like the BedGraph / BigWig coverage, so shift the + // start by one; the end already equals the 0-based half-open end. + // This keeps junction-to-ER comparisons and the exon-boundary + // snapping consistent with the coverage coordinates. + row.start -= 1; return is; } diff --git a/tests/UnitTests.cpp b/tests/UnitTests.cpp index 68d971b..d12c9f3 100644 --- a/tests/UnitTests.cpp +++ b/tests/UnitTests.cpp @@ -400,7 +400,8 @@ TEST(SJRow, ParsesAndDiscardsUnusedRRColumns) EXPECT_TRUE(iss.good() || iss.eof()); EXPECT_EQ(row.chrom, "chr1"); - EXPECT_EQ(row.start, 143465342u); + // The RR start is 1-based; the parser shifts it to 0-based half-open. + EXPECT_EQ(row.start, 143465341u); EXPECT_EQ(row.end, 143470614u); EXPECT_EQ(row.length, 5273u); EXPECT_FALSE(row.strand); // '-' -> false @@ -422,7 +423,8 @@ TEST(SJRow, ParsesPlaceholderAnnotationColumns) SJRow row; iss >> row; EXPECT_EQ(row.chrom, "chr21"); - EXPECT_EQ(row.start, 100u); + // 1-based RR start shifted to 0-based half-open by the parser. + EXPECT_EQ(row.start, 99u); EXPECT_EQ(row.end, 200u); EXPECT_TRUE(row.strand); // '+' EXPECT_FALSE(row.annotated); @@ -471,8 +473,9 @@ TEST(Parser, ChrFilterRetainsOnlyRequestedRows) EXPECT_EQ(parser.sj_id_remap[2], 2u); ASSERT_EQ(parser.rr_all_sj.size(), 2u); EXPECT_EQ(parser.rr_all_sj[0].chrom, "chr1"); - EXPECT_EQ(parser.rr_all_sj[0].start, 100u); - EXPECT_EQ(parser.rr_all_sj[1].start, 500u); + // RR starts are 1-based; the parser shifts them to 0-based half-open. + EXPECT_EQ(parser.rr_all_sj[0].start, 99u); + EXPECT_EQ(parser.rr_all_sj[1].start, 499u); fs::remove_all(tmp); } diff --git a/tests/gtfs/parser_test3.gtf b/tests/gtfs/parser_test3.gtf index 5208271..1c4ee1b 100644 --- a/tests/gtfs/parser_test3.gtf +++ b/tests/gtfs/parser_test3.gtf @@ -11,9 +11,9 @@ chr1 fastder transcript 11001 12500 216.566 . . gene_id "gene2"; transcript_id " chr1 fastder exon 11001 12500 216.566 . . gene_id "gene2"; transcript_id "tx2"; exon_number "1"; chr1 fastder gene 12862 14540 183.631 - . gene_id "gene3"; gene_name "faster_gene3"; chr1 fastder transcript 12862 14540 183.631 - . gene_id "gene3"; transcript_id "tx3"; -chr1 fastder exon 12862 13000 172.148 - . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; +chr1 fastder exon 12862 12999 172.148 - . gene_id "gene3"; transcript_id "tx3"; exon_number "1"; chr1 fastder exon 14001 14540 186.572 - . gene_id "gene3"; transcript_id "tx3"; exon_number "2"; chr2 fastder gene 10001 12500 218.92 - . gene_id "gene4"; gene_name "faster_gene4"; chr2 fastder transcript 10001 12500 218.92 - . gene_id "gene4"; transcript_id "tx4"; -chr2 fastder exon 10001 10500 225.982 - . gene_id "gene4"; transcript_id "tx4"; exon_number "1"; +chr2 fastder exon 10001 10499 225.982 - . gene_id "gene4"; transcript_id "tx4"; exon_number "1"; chr2 fastder exon 11001 12500 216.566 - . gene_id "gene4"; transcript_id "tx4"; exon_number "2"; From 786c482499935ca3401c8ef2de8ca548e87ba401 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 19 May 2026 08:25:38 +0200 Subject: [PATCH 5/7] Add leaky intron splitting --- CHANGELOG.md | 11 ++++++ cpp/Integrator.cpp | 95 +++++++++++++++++++++++++++++++++++++++++++++ cpp/Integrator.h | 9 +++++ cpp/main.cpp | 16 ++++++++ tests/UnitTests.cpp | 71 +++++++++++++++++++++++++++++++++ 5 files changed, 202 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c6a85a8..bb77cb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,17 @@ All notable changes to fastder are recorded in this file. ## [Unreleased] +### Added + +- `--split-leaky-introns` flag. When set, an expressed region is split + wherever a splice junction lies fully inside it, meaning both the junction + donor and acceptor fall strictly within the region. Such a junction is + evidence that coverage merged two exons across a leaky or retained intron. + The intron is removed and the region becomes its flanking pieces. Off by + default, so the region caller's output is unchanged unless the flag is + given. The split runs before stitching, so chain building and the GTF + writer see the corrected boundaries. + ### Fixed - GTF output is now 1-based. fastder's internal coordinates are 0-based diff --git a/cpp/Integrator.cpp b/cpp/Integrator.cpp index 0550e8a..b401c2d 100644 --- a/cpp/Integrator.cpp +++ b/cpp/Integrator.cpp @@ -164,6 +164,101 @@ void Integrator::stitch_one_strand(const std::string& chrom, } +// Split expressed regions at fully contained splice junctions. A junction +// whose donor and acceptor both fall strictly inside one ER means the ER +// merged two exons across a leaky or retained intron. The contained introns +// are collected, overlapping ones merged so alternative junctions inside the +// same ER collapse to disjoint removal intervals, and the ER is replaced by +// the exonic pieces between them. Each piece inherits the parent ER's +// coverage and strand. ERs with no contained junction are left untouched. +void Integrator::split_leaky_introns( + std::unordered_map>& expressed_regions, + const std::unordered_map>& mm_chrom_sj, + const std::vector& rr_all_sj) +{ + uint64_t added_regions = 0; + for (auto& chrom_ers : expressed_regions) + { + const std::string& chrom = chrom_ers.first; + std::vector& ers = chrom_ers.second; + if (ers.empty()) continue; + + const auto sjs_it = mm_chrom_sj.find(chrom); + if (sjs_it == mm_chrom_sj.end() || sjs_it->second.empty()) continue; + + // intron [donor, acceptor) for every junction observed on this chrom, + // sorted by donor so the scan per ER can stop early + std::vector> introns; + introns.reserve(sjs_it->second.size()); + for (uint32_t sj_id : sjs_it->second) + { + if (sj_id == 0 || sj_id - 1 >= rr_all_sj.size()) continue; + const SJRow& sj = rr_all_sj[sj_id - 1]; + introns.emplace_back(sj.start, sj.end); + } + std::sort(introns.begin(), introns.end()); + + std::vector refined; + refined.reserve(ers.size()); + for (const BedGraphRow& er : ers) + { + // an intron is contained when its donor and acceptor both fall + // strictly inside the ER, leaving an exonic piece on either side + std::vector> contained; + for (const auto& intron : introns) + { + if (intron.first >= er.end) break; + if (intron.first > er.start && intron.second < er.end) + { + contained.emplace_back(intron); + } + } + if (contained.empty()) + { + refined.emplace_back(er); + continue; + } + // merge overlapping or touching introns into disjoint intervals + std::sort(contained.begin(), contained.end()); + std::vector> merged; + for (const auto& intron : contained) + { + if (!merged.empty() && intron.first <= merged.back().second) + { + merged.back().second = std::max(merged.back().second, intron.second); + } + else + { + merged.emplace_back(intron); + } + } + // emit the exonic pieces between the removed introns + uint64_t piece_start = er.start; + for (const auto& intron : merged) + { + if (intron.first > piece_start) + { + refined.emplace_back(BedGraphRow(chrom, piece_start, intron.first, + er.coverage, er.strand)); + } + piece_start = intron.second; + } + if (er.end > piece_start) + { + refined.emplace_back(BedGraphRow(chrom, piece_start, er.end, + er.coverage, er.strand)); + } + } + added_regions += refined.size() - ers.size(); + std::sort(refined.begin(), refined.end(), + [](const BedGraphRow& a, const BedGraphRow& b) { return a.start < b.start; }); + ers = std::move(refined); + } + std::cout << "[INFO] split-leaky-introns split expressed regions into " + << added_regions << " additional pieces." << std::endl; +} + + void Integrator::stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj) { // Reset stitched_ERs in case the same Integrator instance is being diff --git a/cpp/Integrator.h b/cpp/Integrator.h index 16ee3c7..1c798aa 100644 --- a/cpp/Integrator.h +++ b/cpp/Integrator.h @@ -32,6 +32,15 @@ class Integrator const std::vector& rr_all_sj, std::unordered_set& consumed_indices, std::vector& output); + // Splits expressed regions at fully contained splice junctions. A + // junction whose donor and acceptor both fall strictly inside one ER is + // evidence the ER merged two exons across a leaky or retained intron; + // the intron is dropped and the ER is replaced by its flanking pieces. + // Runs before stitch_up so the chain builder and write_to_gtf see the + // corrected boundaries. ERs with no contained junction are left as is. + void split_leaky_introns(std::unordered_map>& expressed_regions, + const std::unordered_map>& mm_chrom_sj, + const std::vector& rr_all_sj); bool within_threshold(double val1, double val2) const; bool within_threshold(uint64_t val1, uint64_t val2) const; bool is_similar(const StitchedER& most_recent_er, const BedGraphRow& expressed_region, const SJRow& current_sj); diff --git a/cpp/main.cpp b/cpp/main.cpp index 501cb42..d82a406 100644 --- a/cpp/main.cpp +++ b/cpp/main.cpp @@ -27,6 +27,9 @@ int main(int argc, char* argv[]) { double min_coverage = 0.05; std::string directory; int cores = 10; + // when true, split an expressed region at any splice junction that lies + // fully inside it (leaky / retained intron). Off by default. + bool do_split_leaky_introns = false; std::cout << "\n " @@ -56,6 +59,11 @@ int main(int argc, char* argv[]) { << " spread. Default = 1000 (the gate is effectively off, stitching\n" << " is driven by splice junctions).\n" << " Example: --coverage-tolerance 2.0\n\n" + << " --split-leaky-introns Split an expressed region wherever a splice junction lies\n" + << " fully inside it; such a junction is evidence the region\n" + << " merged two exons across a leaky or retained intron. The\n" + << " intron is dropped and the region becomes its flanking\n" + << " pieces. Takes no value; off by default.\n\n" << " --cores Number of cores that fastder may use. Default = 10 cores.\n" << " Example: --cores 23\n\n" << "Example:\n" @@ -112,6 +120,10 @@ int main(int argc, char* argv[]) { { directory = argv[++i]; } + else if (arg == "--split-leaky-introns") + { + do_split_leaky_introns = true; + } else { std::cout << "[ERROR] Unknown argument '" << argv[i] << "'" << std::endl; @@ -144,6 +156,10 @@ int main(int argc, char* argv[]) { // use splice junctions to stitch together expressed regions Integrator integrator = Integrator(coverage_tolerance, position_tolerance); + if (do_split_leaky_introns) + { + integrator.split_leaky_introns(averager.expressed_regions, parser.mm_chrom_sj, parser.rr_all_sj); + } integrator.stitch_up(averager.expressed_regions, parser.mm_chrom_sj, parser.rr_all_sj); diff --git a/tests/UnitTests.cpp b/tests/UnitTests.cpp index d12c9f3..6c14ffa 100644 --- a/tests/UnitTests.cpp +++ b/tests/UnitTests.cpp @@ -1123,4 +1123,75 @@ TEST(BigWig, BedGraphAndBigWigEquivalentExpressedRegions) fs::remove_all(tmp); #endif +} + +TEST(LeakyIntronSplit, SplitsRegionAtContainedJunction) +{ + // one ER spans [10000, 12000); a junction intron [10500, 11000) sits + // fully inside it, so the ER splits into [10000, 10500) and [11000, 12000) + std::vector rr_all_sj = { + SJRow("chr1", 10500, 11000, 500, '+', false), // id 1 + }; + std::unordered_map> expressed_regions; + expressed_regions["chr1"] = { + BedGraphRow("chr1", 10000, 12000, 100), + }; + std::unordered_map> mm_chrom_sj; + mm_chrom_sj["chr1"] = {1}; + + Integrator integrator = Integrator(1000.0, 5); + integrator.split_leaky_introns(expressed_regions, mm_chrom_sj, rr_all_sj); + + const auto& ers = expressed_regions["chr1"]; + ASSERT_EQ(ers.size(), 2); + EXPECT_EQ(ers[0].start, 10000); + EXPECT_EQ(ers[0].end, 10500); + EXPECT_EQ(ers[1].start, 11000); + EXPECT_EQ(ers[1].end, 12000); +} + +TEST(LeakyIntronSplit, KeepsRegionWhenJunctionTouchesEdge) +{ + // the junction donor coincides with the ER start, so it is not strictly + // contained and the ER is left as a single region + std::vector rr_all_sj = { + SJRow("chr1", 10000, 10500, 500, '+', false), // id 1 + }; + std::unordered_map> expressed_regions; + expressed_regions["chr1"] = { + BedGraphRow("chr1", 10000, 12000, 100), + }; + std::unordered_map> mm_chrom_sj; + mm_chrom_sj["chr1"] = {1}; + + Integrator integrator = Integrator(1000.0, 5); + integrator.split_leaky_introns(expressed_regions, mm_chrom_sj, rr_all_sj); + + ASSERT_EQ(expressed_regions["chr1"].size(), 1); + EXPECT_EQ(expressed_regions["chr1"][0].start, 10000); + EXPECT_EQ(expressed_regions["chr1"][0].end, 12000); +} + +TEST(LeakyIntronSplit, SplitsAtTwoContainedJunctions) +{ + // two contained junctions split the ER into three exonic pieces + std::vector rr_all_sj = { + SJRow("chr1", 10500, 11000, 500, '+', false), // id 1 + SJRow("chr1", 11500, 12000, 500, '+', false), // id 2 + }; + std::unordered_map> expressed_regions; + expressed_regions["chr1"] = { + BedGraphRow("chr1", 10000, 13000, 100), + }; + std::unordered_map> mm_chrom_sj; + mm_chrom_sj["chr1"] = {1, 2}; + + Integrator integrator = Integrator(1000.0, 5); + integrator.split_leaky_introns(expressed_regions, mm_chrom_sj, rr_all_sj); + + const auto& ers = expressed_regions["chr1"]; + ASSERT_EQ(ers.size(), 3); + EXPECT_EQ(ers[0].start, 10000); EXPECT_EQ(ers[0].end, 10500); + EXPECT_EQ(ers[1].start, 11000); EXPECT_EQ(ers[1].end, 11500); + EXPECT_EQ(ers[2].start, 12000); EXPECT_EQ(ers[2].end, 13000); } \ No newline at end of file From 6256c417106b298fc58044171c0d3f31f7315865 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 19 May 2026 09:01:21 +0200 Subject: [PATCH 6/7] Update docs, remove intron chopping --- CHANGELOG.md | 11 ------ README.md | 32 ++++++++------- cpp/Integrator.cpp | 95 --------------------------------------------- cpp/Integrator.h | 9 ----- cpp/main.cpp | 18 +-------- tests/UnitTests.cpp | 71 --------------------------------- 6 files changed, 19 insertions(+), 217 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bb77cb9..c6a85a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,17 +4,6 @@ All notable changes to fastder are recorded in this file. ## [Unreleased] -### Added - -- `--split-leaky-introns` flag. When set, an expressed region is split - wherever a splice junction lies fully inside it, meaning both the junction - donor and acceptor fall strictly within the region. Such a junction is - evidence that coverage merged two exons across a leaky or retained intron. - The intron is removed and the region becomes its flanking pieces. Off by - default, so the region caller's output is unchanged unless the flag is - given. The split runs before stitching, so chain building and the GTF - writer see the corrected boundaries. - ### Fixed - GTF output is now 1-based. fastder's internal coordinates are 0-based diff --git a/README.md b/README.md index 854d6e8..eaa0925 100644 --- a/README.md +++ b/README.md @@ -91,8 +91,7 @@ that the tool will only output expressed regions on chromosome 1, and will ignor - `--min-length 5` describes the minimum length (in bp) that an ER must have. For instance, three consecutive base pairs with coverage > 0.25 CPM will be ignored if the min length is set to 5 bp. - `--position-tolerance 5` describes the maximum permitted offset of the end position of an exon and the starting position of a splice junction. If this tolerance is set to 5, an ER with end position = 1000 bp and a splice junction with start position = 1005 bp will be stitched together (if the coverage and end junction match). - - `--coverage-tolerance 0.1` describes the maximum permitted coverage deviation between two ER that are separated by a spliced region. For a coverage tolerance of 0.1, - two ERs with coverage = 10 CPM and 11 CPM will be stitched together (if there is a matching splice junction). + - `--coverage-tolerance 1000` describes the permitted coverage deviation between two ERs separated by a spliced region. It defaults to 1000, effectively off, so stitching is driven by splice junctions rather than coverage similarity. A visualization of the different parameters is provided below. @@ -103,13 +102,14 @@ Usage: --dir ... \ [--chr ...] \ [--min-coverage ] \ + [--min-length ] \ [--position-tolerance ] \ [--coverage-tolerance ] \ - [--help] + [--cores ] Required inputs: - --dir ... Relative path from the build directory to the directory containing the input files. + --dir ... Relative path from the build directory, or an absolute path, to the directory containing the input files. Example: --dir ../../data/test_exon_skipping Optional inputs: @@ -118,24 +118,26 @@ Optional inputs: Default: all (chr1-chr22, chrX) Example: --chr chr1 chr2 chr3 - --min-length Minimum length [#bp] required for a region to qualify as an expressed region (ER). - Default: 5 bp - Example: --min-length 5 + --min-length Minimum length [#bp] required for a region to qualify as an expressed region (ER). + Default: 10 bp + Example: --min-length 10 --min-coverage Minimum coverage [CPM] required for a region to qualify as an ER. Normalized in-place by library size. - Default: 0.25 CPM + Default: 0.05 CPM Example: --min-coverage 0.25 --position-tolerance Maximum allowed positional deviation between splice junction and ER coordinates [bp]. Default: 5 bp Example: --position-tolerance 5 - --coverage-tolerance Allowed relative deviation in coverage between stitched ERs (e.g. 0.1 = 10%). - Default: 0.1 - Example: --coverage-tolerance 0.1 + --coverage-tolerance Permitted coverage deviation between stitched ERs, as a proportion of the running average. + Default: 1000 (gate effectively off; stitching is junction-driven). + Example: --coverage-tolerance 2.0 - --help Show this help message. + --cores Number of cores fastder may use. + Default: 10 + Example: --cores 23 Example: @@ -144,9 +146,9 @@ Example: --dir ../../data/input \ --chr chr1 chr2 \ --position-tolerance 5 \ - --min-length 5 \ - --min-coverage 0.25 \ - --coverage-tolerance 0.1 + --min-length 10 \ + --min-coverage 0.05 \ + --cores 10 ``` diff --git a/cpp/Integrator.cpp b/cpp/Integrator.cpp index b401c2d..0550e8a 100644 --- a/cpp/Integrator.cpp +++ b/cpp/Integrator.cpp @@ -164,101 +164,6 @@ void Integrator::stitch_one_strand(const std::string& chrom, } -// Split expressed regions at fully contained splice junctions. A junction -// whose donor and acceptor both fall strictly inside one ER means the ER -// merged two exons across a leaky or retained intron. The contained introns -// are collected, overlapping ones merged so alternative junctions inside the -// same ER collapse to disjoint removal intervals, and the ER is replaced by -// the exonic pieces between them. Each piece inherits the parent ER's -// coverage and strand. ERs with no contained junction are left untouched. -void Integrator::split_leaky_introns( - std::unordered_map>& expressed_regions, - const std::unordered_map>& mm_chrom_sj, - const std::vector& rr_all_sj) -{ - uint64_t added_regions = 0; - for (auto& chrom_ers : expressed_regions) - { - const std::string& chrom = chrom_ers.first; - std::vector& ers = chrom_ers.second; - if (ers.empty()) continue; - - const auto sjs_it = mm_chrom_sj.find(chrom); - if (sjs_it == mm_chrom_sj.end() || sjs_it->second.empty()) continue; - - // intron [donor, acceptor) for every junction observed on this chrom, - // sorted by donor so the scan per ER can stop early - std::vector> introns; - introns.reserve(sjs_it->second.size()); - for (uint32_t sj_id : sjs_it->second) - { - if (sj_id == 0 || sj_id - 1 >= rr_all_sj.size()) continue; - const SJRow& sj = rr_all_sj[sj_id - 1]; - introns.emplace_back(sj.start, sj.end); - } - std::sort(introns.begin(), introns.end()); - - std::vector refined; - refined.reserve(ers.size()); - for (const BedGraphRow& er : ers) - { - // an intron is contained when its donor and acceptor both fall - // strictly inside the ER, leaving an exonic piece on either side - std::vector> contained; - for (const auto& intron : introns) - { - if (intron.first >= er.end) break; - if (intron.first > er.start && intron.second < er.end) - { - contained.emplace_back(intron); - } - } - if (contained.empty()) - { - refined.emplace_back(er); - continue; - } - // merge overlapping or touching introns into disjoint intervals - std::sort(contained.begin(), contained.end()); - std::vector> merged; - for (const auto& intron : contained) - { - if (!merged.empty() && intron.first <= merged.back().second) - { - merged.back().second = std::max(merged.back().second, intron.second); - } - else - { - merged.emplace_back(intron); - } - } - // emit the exonic pieces between the removed introns - uint64_t piece_start = er.start; - for (const auto& intron : merged) - { - if (intron.first > piece_start) - { - refined.emplace_back(BedGraphRow(chrom, piece_start, intron.first, - er.coverage, er.strand)); - } - piece_start = intron.second; - } - if (er.end > piece_start) - { - refined.emplace_back(BedGraphRow(chrom, piece_start, er.end, - er.coverage, er.strand)); - } - } - added_regions += refined.size() - ers.size(); - std::sort(refined.begin(), refined.end(), - [](const BedGraphRow& a, const BedGraphRow& b) { return a.start < b.start; }); - ers = std::move(refined); - } - std::cout << "[INFO] split-leaky-introns split expressed regions into " - << added_regions << " additional pieces." << std::endl; -} - - void Integrator::stitch_up(std::unordered_map>& expressed_regions, const std::unordered_map>& mm_chrom_sj, const std::vector& rr_all_sj) { // Reset stitched_ERs in case the same Integrator instance is being diff --git a/cpp/Integrator.h b/cpp/Integrator.h index 1c798aa..16ee3c7 100644 --- a/cpp/Integrator.h +++ b/cpp/Integrator.h @@ -32,15 +32,6 @@ class Integrator const std::vector& rr_all_sj, std::unordered_set& consumed_indices, std::vector& output); - // Splits expressed regions at fully contained splice junctions. A - // junction whose donor and acceptor both fall strictly inside one ER is - // evidence the ER merged two exons across a leaky or retained intron; - // the intron is dropped and the ER is replaced by its flanking pieces. - // Runs before stitch_up so the chain builder and write_to_gtf see the - // corrected boundaries. ERs with no contained junction are left as is. - void split_leaky_introns(std::unordered_map>& expressed_regions, - const std::unordered_map>& mm_chrom_sj, - const std::vector& rr_all_sj); bool within_threshold(double val1, double val2) const; bool within_threshold(uint64_t val1, uint64_t val2) const; bool is_similar(const StitchedER& most_recent_er, const BedGraphRow& expressed_region, const SJRow& current_sj); diff --git a/cpp/main.cpp b/cpp/main.cpp index d82a406..e6912eb 100644 --- a/cpp/main.cpp +++ b/cpp/main.cpp @@ -27,9 +27,6 @@ int main(int argc, char* argv[]) { double min_coverage = 0.05; std::string directory; int cores = 10; - // when true, split an expressed region at any splice junction that lies - // fully inside it (leaky / retained intron). Off by default. - bool do_split_leaky_introns = false; std::cout << "\n " @@ -51,6 +48,8 @@ int main(int argc, char* argv[]) { " Normalization is done in-place by library size. \n" " Default = 0.05 CPM.\n" << " Example: --min-coverage 0.25\n\n" + << " --min-length Minimum length, in [nt], for a region to qualify as an expressed region (ER). Default = 10 nt.\n" + << " Example: --min-length 10\n\n" << " --position-tolerance Maximum permitted position deviation of splice junction and ER coordinates, in [nt]. Default = 5 nt.\n" << " Example: --position-tolerance 5\n\n" << " --coverage-tolerance Permitted coverage deviation when stitching, as a proportion: a\n" @@ -59,11 +58,6 @@ int main(int argc, char* argv[]) { << " spread. Default = 1000 (the gate is effectively off, stitching\n" << " is driven by splice junctions).\n" << " Example: --coverage-tolerance 2.0\n\n" - << " --split-leaky-introns Split an expressed region wherever a splice junction lies\n" - << " fully inside it; such a junction is evidence the region\n" - << " merged two exons across a leaky or retained intron. The\n" - << " intron is dropped and the region becomes its flanking\n" - << " pieces. Takes no value; off by default.\n\n" << " --cores Number of cores that fastder may use. Default = 10 cores.\n" << " Example: --cores 23\n\n" << "Example:\n" @@ -120,10 +114,6 @@ int main(int argc, char* argv[]) { { directory = argv[++i]; } - else if (arg == "--split-leaky-introns") - { - do_split_leaky_introns = true; - } else { std::cout << "[ERROR] Unknown argument '" << argv[i] << "'" << std::endl; @@ -156,10 +146,6 @@ int main(int argc, char* argv[]) { // use splice junctions to stitch together expressed regions Integrator integrator = Integrator(coverage_tolerance, position_tolerance); - if (do_split_leaky_introns) - { - integrator.split_leaky_introns(averager.expressed_regions, parser.mm_chrom_sj, parser.rr_all_sj); - } integrator.stitch_up(averager.expressed_regions, parser.mm_chrom_sj, parser.rr_all_sj); diff --git a/tests/UnitTests.cpp b/tests/UnitTests.cpp index 6c14ffa..d12c9f3 100644 --- a/tests/UnitTests.cpp +++ b/tests/UnitTests.cpp @@ -1123,75 +1123,4 @@ TEST(BigWig, BedGraphAndBigWigEquivalentExpressedRegions) fs::remove_all(tmp); #endif -} - -TEST(LeakyIntronSplit, SplitsRegionAtContainedJunction) -{ - // one ER spans [10000, 12000); a junction intron [10500, 11000) sits - // fully inside it, so the ER splits into [10000, 10500) and [11000, 12000) - std::vector rr_all_sj = { - SJRow("chr1", 10500, 11000, 500, '+', false), // id 1 - }; - std::unordered_map> expressed_regions; - expressed_regions["chr1"] = { - BedGraphRow("chr1", 10000, 12000, 100), - }; - std::unordered_map> mm_chrom_sj; - mm_chrom_sj["chr1"] = {1}; - - Integrator integrator = Integrator(1000.0, 5); - integrator.split_leaky_introns(expressed_regions, mm_chrom_sj, rr_all_sj); - - const auto& ers = expressed_regions["chr1"]; - ASSERT_EQ(ers.size(), 2); - EXPECT_EQ(ers[0].start, 10000); - EXPECT_EQ(ers[0].end, 10500); - EXPECT_EQ(ers[1].start, 11000); - EXPECT_EQ(ers[1].end, 12000); -} - -TEST(LeakyIntronSplit, KeepsRegionWhenJunctionTouchesEdge) -{ - // the junction donor coincides with the ER start, so it is not strictly - // contained and the ER is left as a single region - std::vector rr_all_sj = { - SJRow("chr1", 10000, 10500, 500, '+', false), // id 1 - }; - std::unordered_map> expressed_regions; - expressed_regions["chr1"] = { - BedGraphRow("chr1", 10000, 12000, 100), - }; - std::unordered_map> mm_chrom_sj; - mm_chrom_sj["chr1"] = {1}; - - Integrator integrator = Integrator(1000.0, 5); - integrator.split_leaky_introns(expressed_regions, mm_chrom_sj, rr_all_sj); - - ASSERT_EQ(expressed_regions["chr1"].size(), 1); - EXPECT_EQ(expressed_regions["chr1"][0].start, 10000); - EXPECT_EQ(expressed_regions["chr1"][0].end, 12000); -} - -TEST(LeakyIntronSplit, SplitsAtTwoContainedJunctions) -{ - // two contained junctions split the ER into three exonic pieces - std::vector rr_all_sj = { - SJRow("chr1", 10500, 11000, 500, '+', false), // id 1 - SJRow("chr1", 11500, 12000, 500, '+', false), // id 2 - }; - std::unordered_map> expressed_regions; - expressed_regions["chr1"] = { - BedGraphRow("chr1", 10000, 13000, 100), - }; - std::unordered_map> mm_chrom_sj; - mm_chrom_sj["chr1"] = {1, 2}; - - Integrator integrator = Integrator(1000.0, 5); - integrator.split_leaky_introns(expressed_regions, mm_chrom_sj, rr_all_sj); - - const auto& ers = expressed_regions["chr1"]; - ASSERT_EQ(ers.size(), 3); - EXPECT_EQ(ers[0].start, 10000); EXPECT_EQ(ers[0].end, 10500); - EXPECT_EQ(ers[1].start, 11000); EXPECT_EQ(ers[1].end, 11500); - EXPECT_EQ(ers[2].start, 12000); EXPECT_EQ(ers[2].end, 13000); } \ No newline at end of file From 1d3b7ce4bbd0b568456dccb076543b6489e0cbd7 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Tue, 19 May 2026 09:34:25 +0200 Subject: [PATCH 7/7] Address code review, prepare 0.1.0 release, add Izaskun as cmake package contact too --- CHANGELOG.md | 11 ++- CMakeLists.txt | 4 +- cpp/SJRow.h | 7 ++ cpp/main.cpp | 2 +- tests/UnitTests.cpp | 118 +++++++++++++++++++++++++ tests/gtfs/write_gtf_fallback_test.gtf | 10 +++ tests/gtfs/write_gtf_snap_test.gtf | 10 +++ 7 files changed, 157 insertions(+), 5 deletions(-) create mode 100644 tests/gtfs/write_gtf_fallback_test.gtf create mode 100644 tests/gtfs/write_gtf_snap_test.gtf diff --git a/CHANGELOG.md b/CHANGELOG.md index c6a85a8..4e74f0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ All notable changes to fastder are recorded in this file. -## [Unreleased] +## [0.1.0] - 2026-05-19 ### Fixed @@ -16,7 +16,9 @@ All notable changes to fastder are recorded in this file. file carries 1-based inclusive intron coordinates (the STAR SJ.out.tab convention), but fastder works in 0-based half-open coordinates like the coverage. The mismatch left junction-snapped exon ends one base too long. - The RR parser now shifts the junction start by one on read. + The RR parser now shifts the junction start by one on read, and rejects a + junction start of 0, which is malformed under the 1-based convention, + rather than letting the unsigned subtraction wrap. ### Changed @@ -31,3 +33,8 @@ All notable changes to fastder are recorded in this file. donor and acceptor coordinates instead of the coverage-derived expressed- region edges. Edges with no adjacent junction, namely the outer ends of a chain and both ends of a monoexonic region, keep the coverage extent. + +## [Legacy] + +Baseline forked from the upstream fastder repository, which carried no tagged +release. Versioned history begins at 0.1.0 above. diff --git a/CMakeLists.txt b/CMakeLists.txt index c203de8..79cd70b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,8 +58,8 @@ endif() set(CPACK_PACKAGE_NAME "fastder") set(CPACK_PACKAGE_VENDOR "Martina Lehmann") set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "fastder: A Fast, Agnostic, Splice-Junction-Based Tool for the Detection of Expressed Regions") -set(CPACK_PACKAGE_VERSION "0.0.1") -set(CPACK_PACKAGE_CONTACT "martina.lavanya@gmail.com") +set(CPACK_PACKAGE_VERSION "0.1.0") +set(CPACK_PACKAGE_CONTACT "martina.lavanya@gmail.com, izaskun.mallona.work@gmail.com") # Debian-specific packaging settings set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Martina Lehmann") diff --git a/cpp/SJRow.h b/cpp/SJRow.h index 03ac3b2..b2ccbae 100644 --- a/cpp/SJRow.h +++ b/cpp/SJRow.h @@ -46,6 +46,13 @@ class SJRow // start by one; the end already equals the 0-based half-open end. // This keeps junction-to-ER comparisons and the exon-boundary // snapping consistent with the coverage coordinates. + // A valid 1-based start is at least 1; a 0 here means malformed or + // truncated input. Reject the row instead of underflowing uint64_t. + if (row.start == 0) + { + is.setstate(std::ios::failbit); + return is; + } row.start -= 1; return is; } diff --git a/cpp/main.cpp b/cpp/main.cpp index e6912eb..ede0787 100644 --- a/cpp/main.cpp +++ b/cpp/main.cpp @@ -62,7 +62,7 @@ int main(int argc, char* argv[]) { << " Example: --cores 23\n\n" << "Example:\n" << " ./fastder --dir ../data --chr chr1 chr2 --position-tolerance 5 " - "--min-coverage 0.05 --coverage-tolerance 0.7 --cores 23\n" + "--min-coverage 0.05 --coverage-tolerance 2.0 --cores 23\n" << std::endl; // parse command-line arguments diff --git a/tests/UnitTests.cpp b/tests/UnitTests.cpp index d12c9f3..6129a58 100644 --- a/tests/UnitTests.cpp +++ b/tests/UnitTests.cpp @@ -965,6 +965,124 @@ TEST(GTFRow, SerializesMinusStrandFromStitchedER) } +// operator<< converts the 0-based half-open internal coordinates to GTF's +// 1-based inclusive convention: the start column is start + 1, the end +// column equals end unchanged. +TEST(GTFRow, SerializesOneBasedInclusiveCoordinates) +{ + BedGraphRow er("chr1", 10000, 10500, 100.0); + StitchedER s(er, 0); + GTFRow row(s, "exon", 1); + row.start = 10000; + row.end = 10500; + + std::ostringstream os; + os << row; + std::istringstream is(os.str()); + std::vector cols; + std::string field; + while (std::getline(is, field, '\t')) cols.push_back(field); + ASSERT_GE(cols.size(), 5u); + EXPECT_EQ(cols[3], "10001"); // start + 1 + EXPECT_EQ(cols[4], "10500"); // end unchanged +} + + +// Helper: read a GTF written by write_to_gtf and return the exon rows as +// (start, end) pairs in file order. Columns 4 and 5 are 1-based inclusive. +static std::vector> read_gtf_exons(const std::string& path) +{ + std::vector> exons; + std::ifstream in(path); + std::string line; + while (std::getline(in, line)) + { + if (line.empty() || line[0] == '#') continue; + std::istringstream is(line); + std::vector cols; + std::string field; + while (std::getline(is, field, '\t')) cols.push_back(field); + if (cols.size() < 5 || cols[2] != "exon") continue; + exons.emplace_back(std::stoull(cols[3]), std::stoull(cols[4])); + } + return exons; +} + + +// write_to_gtf snaps exon edges to the flanking splice junctions. In a +// three-ER chain the internal edges must equal the junction donor / acceptor +// coordinates rather than the coverage extent; the outer edges keep coverage. +TEST(IntegratorWriteGtf, ExonEdgesSnapToSpliceJunctions) +{ + // SJ coordinates are 0-based half-open here, as fastder holds them after + // parsing; the SJRow constructor stores them verbatim. + std::vector rr_all_sj = { + SJRow("chr1", 10500, 11000, 500, '-', false), + SJRow("chr1", 13000, 14000, 1000, '-', false) + }; + std::unordered_map> expressed_regions; + expressed_regions["chr1"] = { + BedGraphRow("chr1", 10000, 10498, 100), + BedGraphRow("chr1", 11002, 12999, 101), + BedGraphRow("chr1", 14003, 14540, 30) + }; + std::unordered_map> mm_chrom_sj; + mm_chrom_sj["chr1"] = {1, 2}; + // coverage_tolerance off so the chain forms regardless of exon coverage. + Integrator integrator = Integrator(1000.0, 5); + integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); + const std::string path = "../../tests/gtfs/write_gtf_snap_test.gtf"; + integrator.write_to_gtf(path); + + auto exons = read_gtf_exons(path); + ASSERT_EQ(exons.size(), 3u); + // Internal edges snap to the junctions, serialized 1-based inclusive. + EXPECT_EQ(exons[0].second, 10500u); // first exon end -> donor 10500 + EXPECT_EQ(exons[1].first, 11001u); // second exon start -> acceptor 11000 + EXPECT_EQ(exons[1].second, 13000u); // second exon end -> donor 13000 + EXPECT_EQ(exons[2].first, 14001u); // third exon start -> acceptor 14000 +} + + +// When the junctions flanking a short middle exon snap past each other, so +// the snapped start lands at or beyond the snapped end, write_to_gtf must +// fall back to that ER's coverage extent so the emitted exon stays valid. +TEST(IntegratorWriteGtf, FallsBackToCoverageExtentWhenJunctionsSnapPast) +{ + // SJ coordinates are 0-based half-open. The right junction's donor (11004) + // lies before the left junction's acceptor (11008), so snapping the middle + // exon to them would invert it; the coverage extent is used instead. + std::vector rr_all_sj = { + SJRow("chr1", 10500, 11008, 508, '-', false), + SJRow("chr1", 11004, 12000, 996, '-', false) + }; + std::unordered_map> expressed_regions; + expressed_regions["chr1"] = { + BedGraphRow("chr1", 10000, 10500, 100), + BedGraphRow("chr1", 11000, 11012, 100), + BedGraphRow("chr1", 12000, 12500, 100) + }; + std::unordered_map> mm_chrom_sj; + mm_chrom_sj["chr1"] = {1, 2}; + Integrator integrator = Integrator(1000.0, 50); + integrator.stitch_up(expressed_regions, mm_chrom_sj, rr_all_sj); + const std::string path = "../../tests/gtfs/write_gtf_fallback_test.gtf"; + integrator.write_to_gtf(path); + + auto exons = read_gtf_exons(path); + ASSERT_EQ(exons.size(), 3u); + // Every emitted exon is a valid interval. + for (const auto& ex : exons) + { + EXPECT_LE(ex.first, ex.second); + } + // The middle exon falls back to its coverage extent 11000..11012, + // serialized 1-based inclusive as 11001..11012. + EXPECT_EQ(exons[1].first, 11001u); + EXPECT_EQ(exons[1].second, 11012u); +} + + // ===================================================================== // libBigWig integration (gated). These run only when fastder is built with // -DFASTDER_USE_LIBBIGWIG=ON; otherwise they GTEST_SKIP so the unbuilt code diff --git a/tests/gtfs/write_gtf_fallback_test.gtf b/tests/gtfs/write_gtf_fallback_test.gtf new file mode 100644 index 0000000..6b00bc4 --- /dev/null +++ b/tests/gtfs/write_gtf_fallback_test.gtf @@ -0,0 +1,10 @@ +#description: expressed region annotation of the genome based on BedGraph and MM / RR splice junction information. +#provider: FASTDER +#contact: martina.lavanya@gmail.com +#format: gtf +#date: 2026-5-19 +chr1 fastder gene 10001 12500 100 - . gene_id "gene1"; gene_name "faster_gene1"; +chr1 fastder transcript 10001 12500 100 - . gene_id "gene1"; transcript_id "tx1"; +chr1 fastder exon 10001 10500 100 - . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; +chr1 fastder exon 11001 11012 100 - . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; +chr1 fastder exon 12001 12500 100 - . gene_id "gene1"; transcript_id "tx1"; exon_number "3"; diff --git a/tests/gtfs/write_gtf_snap_test.gtf b/tests/gtfs/write_gtf_snap_test.gtf new file mode 100644 index 0000000..ed2a5a4 --- /dev/null +++ b/tests/gtfs/write_gtf_snap_test.gtf @@ -0,0 +1,10 @@ +#description: expressed region annotation of the genome based on BedGraph and MM / RR splice junction information. +#provider: FASTDER +#contact: martina.lavanya@gmail.com +#format: gtf +#date: 2026-5-19 +chr1 fastder gene 10001 14540 88.2609 - . gene_id "gene1"; gene_name "faster_gene1"; +chr1 fastder transcript 10001 14540 88.2609 - . gene_id "gene1"; transcript_id "tx1"; +chr1 fastder exon 10001 10500 100 - . gene_id "gene1"; transcript_id "tx1"; exon_number "1"; +chr1 fastder exon 11001 13000 101 - . gene_id "gene1"; transcript_id "tx1"; exon_number "2"; +chr1 fastder exon 14001 14540 30 - . gene_id "gene1"; transcript_id "tx1"; exon_number "3";