From 7df6aa50f6195ff19d7ddea3223e9fb24e6285dd Mon Sep 17 00:00:00 2001 From: Pascal Hertleif Date: Mon, 4 May 2026 15:56:56 +0200 Subject: [PATCH] reader: make FASTA reference optional Open BAM/SAM/CRAM without a FASTA: bases that don't need a reference decode as before, and `pileup()` reports `Base::Unknown` for the reference column. CRAM follows the writer's `RR` flag and per-slice `embedded_reference` to decide whether external reference bytes are needed; if a slice needs them and no FASTA was supplied, fetch returns `CramError::MissingReference` rather than silently producing N's. `Readers::open` now takes `impl Into>` for the FASTA argument, so `Readers::open(bam, &fa)`, `Readers::open(bam, Some(&fa))`, and `Readers::open(bam, None)` all work. Closes #46. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/seqair/examples/aligned_pairs_walk.rs | 4 +- crates/seqair/examples/mpileup.rs | 4 +- crates/seqair/examples/pileup_extras.rs | 5 +- .../seqair/examples/pileup_extras_simple.rs | 2 +- .../seqair/examples/simple_variant_caller.rs | 2 +- crates/seqair/src/cram/reader.rs | 78 +++++- crates/seqair/src/cram/slice.rs | 17 ++ crates/seqair/src/fasta/reader.rs | 3 + crates/seqair/src/reader/formats.rs | 7 - crates/seqair/src/reader/indexed.rs | 46 ++-- crates/seqair/src/reader/readers.rs | 133 ++++++---- crates/seqair/tests/cram_tlen.rs | 2 +- crates/seqair/tests/cram_version_matrix.rs | 3 +- crates/seqair/tests/mm_ml_roundtrip.rs | 3 +- crates/seqair/tests/no_reference.rs | 228 ++++++++++++++++++ crates/seqair/tests/unified_reader.rs | 16 +- docs/spec/1-1-unified_reader.md | 8 +- docs/spec/2-cram-1-reader.md | 7 +- docs/spec/4-pileup.md | 3 + 19 files changed, 466 insertions(+), 105 deletions(-) create mode 100644 crates/seqair/tests/no_reference.rs diff --git a/crates/seqair/examples/aligned_pairs_walk.rs b/crates/seqair/examples/aligned_pairs_walk.rs index d0d1f09b..6b65399a 100644 --- a/crates/seqair/examples/aligned_pairs_walk.rs +++ b/crates/seqair/examples/aligned_pairs_walk.rs @@ -84,8 +84,8 @@ struct Counts { fn main() -> anyhow::Result<()> { let args = Cli::parse(); - let mut readers = - Readers::open(&args.input, &args.reference).context("could not open BAM + FASTA")?; + let mut readers = Readers::open(&args.input, args.reference.as_path()) + .context("could not open BAM + FASTA")?; // Resolve the region against the BAM header. `RegionString` is parsed // by clap; here we lift its `(name, Pos1, Pos1)` into the engine's diff --git a/crates/seqair/examples/mpileup.rs b/crates/seqair/examples/mpileup.rs index f89a5c17..1632aa7a 100644 --- a/crates/seqair/examples/mpileup.rs +++ b/crates/seqair/examples/mpileup.rs @@ -61,8 +61,8 @@ const MAX_TILE_LEN: u32 = 1_000_000; fn main() -> anyhow::Result<()> { let args = Cli::parse(); - let mut readers = - Readers::open(&args.input, &args.reference).context("could not open alignment file")?; + let mut readers = Readers::open(&args.input, args.reference.as_path()) + .context("could not open alignment file")?; let mut output: Box = if let Some(out) = &args.out { let file = diff --git a/crates/seqair/examples/pileup_extras.rs b/crates/seqair/examples/pileup_extras.rs index d67f0462..4a3222ad 100644 --- a/crates/seqair/examples/pileup_extras.rs +++ b/crates/seqair/examples/pileup_extras.rs @@ -95,8 +95,9 @@ impl CustomizeRecordStore for ReadInfoBuilder { fn main() -> anyhow::Result<()> { let args = Cli::parse(); - let mut readers = Readers::open_customized(&args.input, &args.reference, ReadInfoBuilder) - .context("could not open BAM + FASTA")?; + let mut readers = + Readers::open_customized(&args.input, args.reference.as_path(), ReadInfoBuilder) + .context("could not open BAM + FASTA")?; // Tile the requested region into 100 kb segments. Unmapped/secondary // records are already dropped at push time by `ReadInfoBuilder::keep_record`, diff --git a/crates/seqair/examples/pileup_extras_simple.rs b/crates/seqair/examples/pileup_extras_simple.rs index 405f52eb..53c18c3e 100644 --- a/crates/seqair/examples/pileup_extras_simple.rs +++ b/crates/seqair/examples/pileup_extras_simple.rs @@ -25,7 +25,7 @@ fn main() -> anyhow::Result<()> { let mut readers = Readers::open_customized( &args.input, - &args.reference, + args.reference.as_path(), ReadInfoBuilder { min_mapq: args.min_mapq }, )?; let max_len = std::num::NonZeroU32::new(100_000).expect("non-zero literal"); diff --git a/crates/seqair/examples/simple_variant_caller.rs b/crates/seqair/examples/simple_variant_caller.rs index ff2bb9db..51debb84 100644 --- a/crates/seqair/examples/simple_variant_caller.rs +++ b/crates/seqair/examples/simple_variant_caller.rs @@ -144,7 +144,7 @@ fn main() -> anyhow::Result<()> { } let mut readers = - Readers::::open_customized(&args.input, &args.reference, DropUseless) + Readers::::open_customized(&args.input, args.reference.as_path(), DropUseless) .context("could not open BAM + FASTA")?; // ── Build VCF header ─────────────────────────────────────────────── diff --git a/crates/seqair/src/cram/reader.rs b/crates/seqair/src/cram/reader.rs index 54393a23..5d684c64 100644 --- a/crates/seqair/src/cram/reader.rs +++ b/crates/seqair/src/cram/reader.rs @@ -250,7 +250,11 @@ pub struct CramShared { /// re-scanning the header text. pub read_group_ids: Vec, pub cram_path: PathBuf, - pub fasta_path: PathBuf, + /// Path of the FASTA reference, if one was supplied at open time. + /// `None` when the reader was opened without a reference + /// (`r[cram.fasta.optional]`); `MissingReference` is then surfaced lazily + /// at fetch time only for slices that actually need an external reference. + pub fasta_path: Option, } /// Parse `@RG ID:` values from SAM header text in declaration order. @@ -272,7 +276,9 @@ fn parse_read_group_ids(header_text: &str) -> Vec { pub struct IndexedCramReader { file: R, - fasta: IndexedFastaReader, + /// Optional FASTA for sequence reconstruction. `None` when the reader was + /// opened without a reference (`r[cram.fasta.optional]`). + fasta: Option>, shared: Arc, // Scratch buffers reused across fetch_into calls. The per-record // ones (`name_buf`, `feature_byte_buf`, `cigar_ops_buf`) used to be @@ -311,6 +317,18 @@ impl IndexedCramReader { /// Open a CRAM file with a FASTA reference for sequence reconstruction. #[instrument(level = "debug", fields(cram = %cram_path.display(), fasta = %fasta_path.display()))] pub fn open(cram_path: &Path, fasta_path: &Path) -> Result { + Self::open_optional_fasta(cram_path, Some(fasta_path)) + } + + /// Open a CRAM file without a FASTA reference. See `r[cram.fasta.optional]`: + /// open always succeeds, and `MissingReference` is only surfaced at fetch + /// time for slices that actually need an external reference. + #[instrument(level = "debug", fields(cram = %cram_path.display()))] + pub fn open_without_reference(cram_path: &Path) -> Result { + Self::open_optional_fasta(cram_path, None) + } + + fn open_optional_fasta(cram_path: &Path, fasta_path: Option<&Path>) -> Result { let mut file = File::open(cram_path) .map_err(|source| CramError::Open { path: cram_path.to_path_buf(), source })?; @@ -342,8 +360,8 @@ impl IndexedCramReader { let cram_index = CramIndex::from_path(&crai_path)?; // r[impl cram.scope.reference_required] - // Open FASTA reader - let fasta = IndexedFastaReader::open(fasta_path)?; + // r[impl cram.fasta.optional] + let fasta = fasta_path.map(IndexedFastaReader::open).transpose()?; let read_group_ids = parse_read_group_ids(header.header_text()); @@ -355,7 +373,7 @@ impl IndexedCramReader { header, read_group_ids, cram_path: cram_path.to_path_buf(), - fasta_path: fasta_path.to_path_buf(), + fasta_path: fasta_path.map(Path::to_path_buf), }), container_buf: Vec::new(), cigar_buf: Vec::new(), @@ -371,10 +389,14 @@ impl IndexedCramReader { }) } + // r[impl unified.fork_cram] pub fn fork(&self) -> Result { let file = File::open(&self.shared.cram_path) .map_err(|source| CramError::Open { path: self.shared.cram_path.clone(), source })?; - let fasta = self.fasta.fork()?; + let fasta = match self.fasta.as_ref() { + Some(f) => Some(f.fork()?), + None => None, + }; Ok(IndexedCramReader { file, fasta, @@ -438,13 +460,13 @@ impl IndexedCramReader>> { Ok(IndexedCramReader { file: Cursor::new(cram_data), - fasta, + fasta: Some(fasta), shared: Arc::new(CramShared { index: cram_index, header, read_group_ids, cram_path: PathBuf::from(""), - fasta_path: PathBuf::from(""), + fasta_path: Some(PathBuf::from("")), }), container_buf: Vec::new(), cigar_buf: Vec::new(), @@ -650,9 +672,21 @@ impl IndexedCramReader { }; self.ref_seq_buf.clear(); - if ref_start < ref_end_clamped { + // r[impl cram.fasta.optional] + // Only fetch external reference bytes when this container's + // compression header says they're needed AND we actually have a + // FASTA. When `RR=false`, records carry their bases verbatim or + // as deltas against an embedded reference, so the FASTA fetch is + // pure waste. When the FASTA is missing, we leave `ref_seq_buf` + // empty and let `decode_slice` decide whether the slice can still + // be decoded (embedded ref or unmapped) or must error out + // (`r[cram.edge.missing_reference]`). + if ref_start < ref_end_clamped + && ch.preservation.reference_required + && let Some(fasta) = self.fasta.as_mut() + { // r[impl cram.edge.missing_reference] - self.fasta + fasta .fetch_seq_into( ref_name, Pos0::try_from(ref_start) @@ -673,6 +707,7 @@ impl IndexedCramReader { _ => CramError::from(e), })?; } + let fasta_available = self.fasta.is_some(); // Decode each slice listed by CRAI as overlapping our query. // CRAI's `slice_offset` matches the container's landmark value @@ -697,6 +732,7 @@ impl IndexedCramReader { slice_offset, &self.ref_seq_buf, ref_start.cast_signed(), + fasta_available, &self.shared.header, &self.shared.read_group_ids, tid, @@ -851,4 +887,26 @@ mod tests { "message: {msg}" ); } + + // r[verify cram.fasta.optional] + #[test] + fn open_without_reference_succeeds() { + let reader = IndexedCramReader::open_without_reference(cram_path()).unwrap(); + assert!(reader.header().target_count() > 0); + } + + // r[verify cram.edge.missing_reference] + #[test] + fn fetch_without_reference_errors_when_slice_needs_fasta() { + let mut reader = IndexedCramReader::open_without_reference(cram_path()).unwrap(); + let mut store = RecordStore::new(); + let tid = 0; + let err = reader + .fetch_into(tid, Pos0::new(0).unwrap(), Pos0::max_value(), &mut store) + .expect_err("CRAM with external-only ref must error without FASTA"); + assert!( + matches!(err, CramError::MissingReference { .. }), + "expected MissingReference, got {err:?}" + ); + } } diff --git a/crates/seqair/src/cram/slice.rs b/crates/seqair/src/cram/slice.rs index b37a208d..607ce833 100644 --- a/crates/seqair/src/cram/slice.rs +++ b/crates/seqair/src/cram/slice.rs @@ -114,6 +114,7 @@ pub(crate) fn decode_slice( slice_offset: usize, reference_seq: &[u8], ref_start_0based: i64, + fasta_available: bool, header: &BamHeader, read_group_ids: &[SmolStr], tid: u32, @@ -222,6 +223,21 @@ pub(crate) fn decode_slice( let effective_ref: &[u8] = if sh.embedded_reference >= 0 { &embedded_ref_data } else { reference_seq }; + // r[impl cram.fasta.optional] + // r[impl cram.edge.missing_reference] + // The reader was opened without a FASTA, the writer set RR=true, and this + // slice does not embed its own reference — bases cannot be reconstructed. + // Surface a typed error rather than silently producing N's. + if !fasta_available + && ch.preservation.reference_required + && sh.embedded_reference < 0 + && sh.ref_seq_id >= 0 + { + return Err(CramError::MissingReference { + contig: header.target_name(tid).unwrap_or("?").into(), + }); + } + let core_data = core_block.ok_or(CramError::MissingCoreDataBlock)?; let mut ctx = DecodeContext::new(&core_data.data, external_blocks); @@ -1358,6 +1374,7 @@ mod tests { first_landmark as usize, &fake_ref, ref_start, + true, &bam_header, &[], 0, diff --git a/crates/seqair/src/fasta/reader.rs b/crates/seqair/src/fasta/reader.rs index 82e8fca1..9b323ff2 100644 --- a/crates/seqair/src/fasta/reader.rs +++ b/crates/seqair/src/fasta/reader.rs @@ -62,6 +62,9 @@ pub enum FastaError { #[error("BGZF FASTA missing GZI index (internal error)")] MissingGzi, + + #[error("no FASTA reference attached; reopen with `Readers::open` to enable reference lookups")] + NotConfigured, } fn format_available_sequences(available: &[SmolStr]) -> String { diff --git a/crates/seqair/src/reader/formats.rs b/crates/seqair/src/reader/formats.rs index 91310d2c..2e7be17a 100644 --- a/crates/seqair/src/reader/formats.rs +++ b/crates/seqair/src/reader/formats.rs @@ -72,13 +72,6 @@ pub fn detect(path: &Path) -> Result { #[derive(Debug, thiserror::Error)] #[non_exhaustive] pub enum FormatDetectionError { - #[error( - "CRAM format detected but no FASTA reference provided. \ - Use Readers::open(alignment_path, fasta_path) instead, \ - or convert to BAM with `samtools view -b`" - )] - CramRequiresFasta, - #[error("file too short to determine format ({len} bytes)")] FileTooShort { path: PathBuf, len: u64 }, diff --git a/crates/seqair/src/reader/indexed.rs b/crates/seqair/src/reader/indexed.rs index 36f6a2bb..89c98064 100644 --- a/crates/seqair/src/reader/indexed.rs +++ b/crates/seqair/src/reader/indexed.rs @@ -1,5 +1,5 @@ use super::ReaderError; -use super::formats::{self, Format, FormatDetectionError}; +use super::formats::{self, Format}; use crate::{ bam::{ BamHeader, IndexedBamReader, @@ -106,29 +106,26 @@ impl IndexedReader { // r[impl unified.reader_api] impl IndexedReader { - /// Open a BAM or bgzf-compressed SAM file, auto-detecting the format. + /// Open a BAM, bgzf-compressed SAM, or CRAM file, auto-detecting the format. /// - /// CRAM files are detected but require a FASTA reference — use - /// [`crate::Readers::open`] instead. This method returns an error for CRAM - /// with a message directing the user to provide a reference. + /// For CRAM, opening without a FASTA always succeeds; missing-reference + /// errors only surface at fetch time when a slice actually needs an + /// external reference (`r[cram.fasta.optional]`). // r[impl unified.readers_backward_compat] #[instrument(level = "debug", fields(path = %path.display()))] pub fn open(path: &Path) -> Result { - match formats::detect(path)? { - Format::Bam => { - let reader = IndexedBamReader::open(path)?; - Ok(IndexedReader::Bam(reader)) - } - Format::Sam => { - let reader = IndexedSamReader::open(path)?; - Ok(IndexedReader::Sam(reader)) - } - Format::Cram => Err(FormatDetectionError::CramRequiresFasta.into()), - } + Self::open_with_optional_fasta(path, None) } /// Open any format, with a FASTA path for CRAM support. pub(crate) fn open_with_fasta(path: &Path, fasta_path: &Path) -> Result { + Self::open_with_optional_fasta(path, Some(fasta_path)) + } + + fn open_with_optional_fasta( + path: &Path, + fasta_path: Option<&Path>, + ) -> Result { match formats::detect(path)? { Format::Bam => { let reader = IndexedBamReader::open(path)?; @@ -139,7 +136,10 @@ impl IndexedReader { Ok(IndexedReader::Sam(reader)) } Format::Cram => { - let reader = IndexedCramReader::open(path, fasta_path)?; + let reader = match fasta_path { + Some(fp) => IndexedCramReader::open(path, fp)?, + None => IndexedCramReader::open_without_reference(path)?, + }; Ok(IndexedReader::Cram(Box::new(reader))) } } @@ -163,17 +163,19 @@ mod tests { use std::io::Write as _; use tempfile::NamedTempFile; + // r[verify cram.fasta.optional] + /// `IndexedReader::open` MUST accept CRAM files without a FASTA. The + /// reader fails with a downstream CRAM error (here, missing CRAI for our + /// stub file) rather than with `CramRequiresFasta`. #[test] - fn cram_requires_fasta() { - // A file starting with the CRAM magic bytes — IndexedReader::open (no FASTA) - // must return CramRequiresFasta. + fn cram_open_without_fasta_does_not_short_circuit() { let mut f = NamedTempFile::new().expect("tempfile"); f.write_all(b"CRAM\x03\x00").expect("write"); f.flush().expect("flush"); let err = IndexedReader::open(f.path()).unwrap_err(); assert!( - matches!(err, ReaderError::Format { source: FormatDetectionError::CramRequiresFasta }), - "unexpected error: {err}" + !matches!(&err, ReaderError::Format { .. }), + "open should reach the CRAM reader, not error during format detection: {err}" ); } } diff --git a/crates/seqair/src/reader/readers.rs b/crates/seqair/src/reader/readers.rs index 58987eb1..6b46b0a3 100644 --- a/crates/seqair/src/reader/readers.rs +++ b/crates/seqair/src/reader/readers.rs @@ -120,7 +120,10 @@ use tracing::instrument; // r[impl unified.readers_struct] pub struct Readers { pub(crate) alignment: IndexedReader, - pub(crate) fasta: IndexedFastaReader, + /// Optional FASTA reference — `None` when opened via + /// [`Readers::open_without_reference`] or + /// [`Readers::open_customized_without_reference`]. + pub(crate) fasta: Option, pub(crate) store: RecordStore, pub(crate) fasta_buf: Vec, pub(crate) customize: E, @@ -133,13 +136,34 @@ impl std::fmt::Debug for Readers { } impl Readers<()> { - /// Open an alignment file (BAM/SAM/CRAM) and a FASTA reference. + /// Open an alignment file (BAM/SAM/CRAM) with an optional FASTA reference. /// - /// Auto-detects the alignment format. For CRAM, the FASTA path is passed - /// to the CRAM reader for sequence reconstruction. + /// `fasta_path` accepts both `&Path` (FASTA attached) and `Option<&Path>` + /// (`Some(path)` for FASTA, `None` for none) — pick whichever reads + /// nicer at the call site: + /// + /// ```no_run + /// # use seqair::reader::Readers; + /// # use std::path::Path; + /// # fn ex() -> Result<(), Box> { + /// let _with = Readers::open(Path::new("a.bam"), Path::new("ref.fa"))?; + /// let _no_ref = Readers::open(Path::new("a.bam"), None)?; + /// let _maybe = Readers::open(Path::new("a.bam"), Some(Path::new("ref.fa")))?; + /// # Ok(()) + /// # } + /// ``` + /// + /// Auto-detects the alignment format. For CRAM, the FASTA path (if any) + /// is passed to the CRAM reader for sequence reconstruction; CRAM follows + /// `r[cram.fasta.optional]` and only errors at fetch time when a slice + /// actually needs an external reference. With no FASTA, `pileup()` skips + /// the FASTA fetch and every `PileupColumn::reference_base()` reports + /// `Base::Unknown`. // r[impl unified.readers_open] - #[instrument(level = "debug", fields(alignment = %alignment_path.display(), fasta = %fasta_path.display()))] - pub fn open(alignment_path: &Path, fasta_path: &Path) -> Result { + pub fn open<'a>( + alignment_path: &Path, + fasta_path: impl Into>, + ) -> Result { Self::open_customized(alignment_path, fasta_path, ()) } } @@ -148,15 +172,24 @@ impl Readers { // r[impl unified.readers_open_customized] /// Open like [`open`](Readers::open) but attach a [`CustomizeRecordStore`] /// value so per-record filtering and extras computation runs every time - /// records are loaded. - pub fn open_customized( + /// records are loaded. `fasta_path` accepts both `&Path` and + /// `Option<&Path>`, identically to [`open`](Readers::open). + #[instrument(level = "debug", skip(fasta_path, customize), fields(alignment = %alignment_path.display()))] + pub fn open_customized<'a>( alignment_path: &Path, - fasta_path: &Path, + fasta_path: impl Into>, customize: E, ) -> Result { - let fasta = IndexedFastaReader::open(fasta_path) - .map_err(|source| ReaderError::FastaOpen { source })?; - let alignment = IndexedReader::open_with_fasta(alignment_path, fasta_path)?; + let fasta_path: Option<&Path> = fasta_path.into(); + let (alignment, fasta) = match fasta_path { + Some(fp) => { + let fasta = IndexedFastaReader::open(fp) + .map_err(|source| ReaderError::FastaOpen { source })?; + let alignment = IndexedReader::open_with_fasta(alignment_path, fp)?; + (alignment, Some(fasta)) + } + None => (IndexedReader::open(alignment_path)?, None), + }; Ok(Readers { alignment, fasta, @@ -166,13 +199,17 @@ impl Readers { }) } - /// Fork both the alignment reader and the FASTA reader. + /// Fork both the alignment reader and the FASTA reader (if any). /// /// The customize value is cloned so each fork has its own copy. // r[impl unified.readers_fork] + // r[impl unified.fork_cram] pub fn fork(&self) -> Result { let alignment = self.alignment.fork()?; - let fasta = self.fasta.fork().map_err(|source| ReaderError::FastaFork { source })?; + let fasta = match self.fasta.as_ref() { + Some(f) => Some(f.fork().map_err(|source| ReaderError::FastaFork { source })?), + None => None, + }; Ok(Readers { alignment, fasta, @@ -329,24 +366,32 @@ impl Readers { let customize = &mut self.customize; alignment.fetch_into_customized(tid.as_u32(), start, end, store, customize)?; - // Fetch `[start, end]` (inclusive). FASTA APIs expect half-open [start, stop). - // Use the u64 path so `end == Pos0::max_value()` doesn't truncate the - // last reference base — `stop = end + 1` is i32::MAX + 1, which doesn't - // fit in a Pos0 but does fit comfortably in a u64. + // Fetch `[start, end]` (inclusive) when a FASTA is attached. With no + // FASTA the engine simply has no reference window — `reference_base` + // returns `Base::Unknown` per `r[pileup.reference_base.optional]`. + // FASTA APIs expect half-open [start, stop). Use the u64 path so + // `end == Pos0::max_value()` doesn't truncate the last reference + // base — `stop = end + 1` is i32::MAX + 1, which doesn't fit in a + // Pos0 but does fit comfortably in a u64. let contig_name = segment.contig(); - let stop_u64 = end.as_u64().saturating_add(1); - self.fasta - .fetch_seq_into_u64(contig_name, start.as_u64(), stop_u64, &mut self.fasta_buf) - .map_err(|source| ReaderError::FastaFetch { - contig: contig_name.clone(), - start: start.as_u64(), - end: end.as_u64(), - source, - })?; - // Convert in-place and copy into the Rc<[Base]> while keeping - // `fasta_buf` (and its capacity) for the next pileup call. - let bases: &[Base] = Base::convert_ascii_in_place_as_slice(&mut self.fasta_buf); - let ref_seq = RefSeq::new(Rc::from(bases), start); + let ref_seq = match self.fasta.as_mut() { + Some(fasta) => { + let stop_u64 = end.as_u64().saturating_add(1); + fasta + .fetch_seq_into_u64(contig_name, start.as_u64(), stop_u64, &mut self.fasta_buf) + .map_err(|source| ReaderError::FastaFetch { + contig: contig_name.clone(), + start: start.as_u64(), + end: end.as_u64(), + source, + })?; + // Convert in-place and copy into the Rc<[Base]> while keeping + // `fasta_buf` (and its capacity) for the next pileup call. + let bases: &[Base] = Base::convert_ascii_in_place_as_slice(&mut self.fasta_buf); + Some(RefSeq::new(Rc::from(bases), start)) + } + None => None, + }; // Move the populated store into the engine. After this `mem::take`, // `self.store` holds a default (empty) RecordStore — the slot the @@ -354,33 +399,37 @@ impl Readers { let store = std::mem::take(&mut self.store); let mut engine = PileupEngine::new(store, start, end); - engine.set_reference_seq(ref_seq); + if let Some(rs) = ref_seq { + engine.set_reference_seq(rs); + } Ok(PileupGuard::new(engine, &mut self.store)) } - pub fn fasta(&self) -> &IndexedFastaReader { - &self.fasta + /// Borrow the FASTA reader, if one is attached. + pub fn fasta(&self) -> Option<&IndexedFastaReader> { + self.fasta.as_ref() } - pub fn fasta_mut(&mut self) -> &mut IndexedFastaReader { - &mut self.fasta + /// Mutably borrow the FASTA reader, if one is attached. + pub fn fasta_mut(&mut self) -> Option<&mut IndexedFastaReader> { + self.fasta.as_mut() } // r[impl fasta.fetch.buffer_reuse] /// Fetch a reference sequence region and return it as `Rc<[Base]>`. /// - /// Uses an internal buffer whose capacity is retained across calls. The - /// returned `Rc<[Base]>` owns its own copy of the bases — the internal - /// buffer is converted in place and the slice is copied into the `Rc` - /// allocation, so we pay one allocation for the `Rc` and zero for the - /// buffer on subsequent calls. + /// Returns `Err(FastaError::NotConfigured)` when this `Readers` was + /// opened without a FASTA. Otherwise uses an internal buffer whose + /// capacity is retained across calls; the returned `Rc<[Base]>` owns its + /// own copy of the bases. pub fn fetch_base_seq( &mut self, name: &str, start: Pos0, stop: Pos0, ) -> Result, FastaError> { - self.fasta.fetch_seq_into(name, start, stop, &mut self.fasta_buf)?; + let fasta = self.fasta.as_mut().ok_or(FastaError::NotConfigured)?; + fasta.fetch_seq_into(name, start, stop, &mut self.fasta_buf)?; // Convert ASCII → Base in place; the &[Base] borrow keeps fasta_buf // alive (and its capacity). let bases: &[Base] = Base::convert_ascii_in_place_as_slice(&mut self.fasta_buf); diff --git a/crates/seqair/tests/cram_tlen.rs b/crates/seqair/tests/cram_tlen.rs index 4dfc1411..27737ac9 100644 --- a/crates/seqair/tests/cram_tlen.rs +++ b/crates/seqair/tests/cram_tlen.rs @@ -151,7 +151,7 @@ fn assert_cram_fields_with_tlen(name: &str) { .expect("samtools index"); assert!(status.success(), "samtools index failed for {name}"); - let mut readers = seqair::reader::Readers::open(&cram_copy, &ref_fasta) + let mut readers = seqair::reader::Readers::open(&cram_copy, ref_fasta.as_path()) .unwrap_or_else(|e| panic!("{name}: open failed: {e:#}")); let tid = readers.header().tid("ref").expect("contig 'ref'"); let mut store = RecordStore::new(); diff --git a/crates/seqair/tests/cram_version_matrix.rs b/crates/seqair/tests/cram_version_matrix.rs index ff19d11e..61daefb1 100644 --- a/crates/seqair/tests/cram_version_matrix.rs +++ b/crates/seqair/tests/cram_version_matrix.rs @@ -336,7 +336,8 @@ fn cram_embedded_reference() { assert!(samtools_count > 0, "embedded-ref CRAM should have records"); // seqair reads the same CRAM with the external FASTA - let mut readers = Readers::open(&cram, &htslib_fasta("c1.fa")).expect("seqair: open"); + let fasta = htslib_fasta("c1.fa"); + let mut readers = Readers::open(&cram, fasta.as_path()).expect("seqair: open"); let tid = readers.header().tid("c1").expect("tid"); let mut store = RecordStore::new(); readers diff --git a/crates/seqair/tests/mm_ml_roundtrip.rs b/crates/seqair/tests/mm_ml_roundtrip.rs index 4d0bc8a5..5c291810 100644 --- a/crates/seqair/tests/mm_ml_roundtrip.rs +++ b/crates/seqair/tests/mm_ml_roundtrip.rs @@ -223,7 +223,8 @@ fn mm_ml_cram_roundtrip() { assert!(status.success()); // Read CRAM with seqair - let mut readers = seqair::reader::Readers::open(&cram_path, &ref_path).expect("open CRAM"); + let mut readers = + seqair::reader::Readers::open(&cram_path, ref_path.as_path()).expect("open CRAM"); let tid = readers.header().tid("chr1").expect("tid"); let mut store = RecordStore::new(); readers diff --git a/crates/seqair/tests/no_reference.rs b/crates/seqair/tests/no_reference.rs new file mode 100644 index 00000000..1860cedf --- /dev/null +++ b/crates/seqair/tests/no_reference.rs @@ -0,0 +1,228 @@ +//! Reading BAM/SAM/CRAM without a FASTA reference. +//! +//! Covers `r[unified.readers_open]` (the no-FASTA half of the polymorphic +//! signature), `r[cram.fasta.optional]`, and `r[pileup.reference_base.optional]`. +#![allow( + clippy::unwrap_used, + clippy::expect_used, + clippy::panic, + clippy::indexing_slicing, + clippy::arithmetic_side_effects, + reason = "test code" +)] +#![allow( + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + reason = "test code with known small values" +)] + +use seqair::bam::{Pos0, RecordStore}; +use seqair::cram::reader::CramError; +use seqair::reader::{ReaderError, Readers, SegmentOptions}; +use seqair_types::Base; +use std::num::NonZeroU32; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; + +fn test_bam_path() -> &'static Path { + Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/../../tests/data/test.bam")) +} + +fn test_cram_v30_path() -> &'static Path { + Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/../../tests/data/test_v30.cram")) +} + +fn test_fasta_path() -> &'static Path { + Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/../../tests/data/test.fasta.gz")) +} + +fn htslib_sam(name: &str) -> PathBuf { + Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/../../tests/htslib/sam/")).join(name) +} + +fn htslib_fasta(name: &str) -> PathBuf { + Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/../../tests/htslib/fasta/")).join(name) +} + +/// Convert a SAM to CRAM, optionally embedding the reference. +/// Mirrors `tests/cram_version_matrix.rs::sam_to_cram`. +fn sam_to_cram( + dir: &Path, + sam_path: &Path, + fasta_path: &Path, + version: &str, + extra_opts: &[&str], +) -> PathBuf { + let bam_path = dir.join("sorted.bam"); + let cram_path = dir.join(format!("test_{version}.cram")); + + let status = Command::new("samtools") + .args(["sort", "-o"]) + .arg(&bam_path) + .arg(sam_path) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .expect("samtools not found"); + assert!(status.success(), "samtools sort failed for {}", sam_path.display()); + + let mut cmd = Command::new("samtools"); + cmd.args(["view", "-C"]).arg("--output-fmt-option").arg(format!("version={version}")); + for opt in extra_opts { + cmd.arg("--output-fmt-option").arg(opt); + } + cmd.arg("-T").arg(fasta_path).arg("-o").arg(&cram_path).arg(&bam_path); + let status = + cmd.stdout(Stdio::null()).stderr(Stdio::null()).status().expect("samtools view -C"); + assert!(status.success(), "samtools view -C failed"); + + let status = Command::new("samtools") + .arg("index") + .arg(&cram_path) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .expect("samtools index"); + assert!(status.success(), "samtools index failed"); + + cram_path +} + +// r[verify unified.readers_open] +/// Opening a BAM without a FASTA via `Readers::open(path, None)` must succeed +/// and let `fetch_into` work. +#[test] +fn bam_open_without_reference_fetches_records() { + let mut readers = Readers::open(test_bam_path(), None).expect("open BAM without FASTA"); + let tid = readers.header().tid("chr19").expect("test BAM has chr19"); + let mut store = RecordStore::new(); + let count = readers + .fetch_into(tid, Pos0::new(6_100_000).unwrap(), Pos0::new(6_200_000).unwrap(), &mut store) + .expect("fetch_into without FASTA"); + assert!(count > 0, "test BAM should yield records in chr19:6.1-6.2M"); + assert_eq!(store.len(), count); +} + +// r[verify pileup.reference_base.optional] +/// Pileup without a FASTA must succeed; every column reports `Base::Unknown` +/// for the reference base, but per-read bases are still present. +#[test] +fn bam_pileup_without_reference_returns_unknown_ref_base() { + let mut readers = Readers::open(test_bam_path(), None).expect("open BAM without FASTA"); + let opts = SegmentOptions::new(NonZeroU32::new(3_000).unwrap()); + let segments: Vec<_> = readers + .segments(("chr19", Pos0::new(6_103_500).unwrap(), Pos0::new(6_106_500).unwrap()), opts) + .expect("segments") + .collect(); + assert!(!segments.is_empty(), "test BAM should yield at least one segment"); + + let mut total_columns = 0usize; + let mut total_alignments = 0usize; + { + let mut p = readers.pileup(&segments[0]).expect("pileup without FASTA"); + while let Some(col) = p.pileups() { + assert_eq!( + col.reference_base(), + Base::Unknown, + "pileup without FASTA must report reference_base = Unknown" + ); + total_columns += 1; + total_alignments += col.alignments().count(); + } + } + assert!(total_columns > 0, "expected at least one pileup column"); + assert!(total_alignments > 0, "pileup columns should still expose per-read alignments"); +} + +// r[verify cram.fasta.optional] +/// CRAM with embedded reference (RR=true but `embedded_reference >= 0` per +/// slice) MUST decode without an external FASTA. samtools writes such files +/// with `--output-fmt-option embed_ref=2`. +#[test] +fn cram_embedded_reference_no_external_fasta() { + let dir = tempfile::tempdir().expect("tempdir"); + let cram = sam_to_cram( + dir.path(), + &htslib_sam("c1#clip.sam"), + &htslib_fasta("c1.fa"), + "3.0", + &["embed_ref=2"], + ); + + let fasta = htslib_fasta("c1.fa"); + let with_fasta_count = { + let mut readers = Readers::open(&cram, fasta.as_path()).expect("open with FASTA"); + let tid = readers.header().tid("c1").expect("tid"); + let mut store = RecordStore::new(); + readers + .fetch_into(tid, Pos0::new(0).unwrap(), Pos0::new(10).unwrap(), &mut store) + .expect("fetch with FASTA"); + store.len() + }; + + let mut readers = + Readers::open(cram.as_path(), None).expect("open embed-ref CRAM without FASTA"); + let tid = readers.header().tid("c1").expect("tid"); + let mut store = RecordStore::new(); + readers + .fetch_into(tid, Pos0::new(0).unwrap(), Pos0::new(10).unwrap(), &mut store) + .expect("fetch embed-ref CRAM without FASTA"); + assert!(!store.is_empty(), "embed-ref CRAM should still yield records without FASTA"); + assert_eq!( + store.len(), + with_fasta_count, + "embed-ref CRAM record count should match between with-FASTA and without-FASTA" + ); +} + +// r[verify cram.edge.missing_reference] +/// CRAM with a normal external reference (RR=true, no embedded ref) MUST +/// surface a `MissingReference` error when no FASTA was supplied. +#[test] +fn cram_external_reference_required_errors_without_fasta() { + let mut readers = Readers::open(test_cram_v30_path(), None) + .expect("open external-ref CRAM without FASTA (open is fine, only fetch errors)"); + let tid = readers.header().tid("chr19").expect("tid"); + let mut store = RecordStore::new(); + let err = readers + .fetch_into(tid, Pos0::new(6_100_000).unwrap(), Pos0::new(6_200_000).unwrap(), &mut store) + .expect_err("fetch must fail when CRAM needs an external reference and we have none"); + match err { + ReaderError::Cram { source: CramError::MissingReference { contig } } => { + assert_eq!(contig.as_str(), "chr19"); + } + other => panic!("expected CramError::MissingReference, got {other:?}"), + } +} + +// r[verify unified.readers_backward_compat] +/// `Readers::open` (with FASTA) must still work — backwards compatibility. +#[test] +fn readers_open_with_fasta_still_works() { + let mut readers = Readers::open(test_bam_path(), test_fasta_path()).expect("open with FASTA"); + let tid = readers.header().tid("chr19").expect("tid"); + let mut store = RecordStore::new(); + let count = readers + .fetch_into(tid, Pos0::new(6_100_000).unwrap(), Pos0::new(6_200_000).unwrap(), &mut store) + .expect("fetch with FASTA"); + assert!(count > 0); +} + +// r[verify unified.readers_fork] +/// Forking a `Readers` opened without a FASTA must also produce a fork +/// without a FASTA. A subsequent `pileup` on the fork must work the same way. +#[test] +fn fork_without_reference_keeps_no_reference() { + let readers = Readers::open(test_bam_path(), None).expect("open without FASTA"); + let mut forked = readers.fork().expect("fork preserves no-reference"); + let opts = SegmentOptions::new(NonZeroU32::new(3_000).unwrap()); + let segments: Vec<_> = forked + .segments(("chr19", Pos0::new(6_103_500).unwrap(), Pos0::new(6_106_500).unwrap()), opts) + .expect("segments on forked reader") + .collect(); + assert!(!segments.is_empty()); + let mut p = forked.pileup(&segments[0]).expect("pileup on forked reader without FASTA"); + while let Some(col) = p.pileups() { + assert_eq!(col.reference_base(), Base::Unknown); + } +} diff --git a/crates/seqair/tests/unified_reader.rs b/crates/seqair/tests/unified_reader.rs index e4cf04df..62a671e9 100644 --- a/crates/seqair/tests/unified_reader.rs +++ b/crates/seqair/tests/unified_reader.rs @@ -192,14 +192,16 @@ fn fork_works_for_both_formats() { // r[verify unified.detect_format] // r[verify unified.readers_backward_compat] +// r[verify cram.fasta.optional] +/// `IndexedReader::open` MUST accept a CRAM file without a FASTA. Open is +/// always allowed to succeed; missing-reference errors only surface at fetch +/// time. #[test] -fn indexed_reader_open_rejects_cram_without_fasta() { - let err = IndexedReader::open(test_cram_path()).unwrap_err(); - let msg = err.to_string(); - assert!( - msg.contains("CRAM") && msg.contains("reference"), - "error should mention CRAM and reference: {msg}" - ); +fn indexed_reader_open_accepts_cram_without_fasta() { + let reader = IndexedReader::open(test_cram_path()) + .expect("IndexedReader::open must accept CRAM without a FASTA"); + assert!(matches!(reader, IndexedReader::Cram(_))); + assert!(reader.header().target_count() > 0); } // ── Readers struct tests ───────────────────────────────────────────── diff --git a/docs/spec/1-1-unified_reader.md b/docs/spec/1-1-unified_reader.md index 599f6e3f..0796c54a 100644 --- a/docs/spec/1-1-unified_reader.md +++ b/docs/spec/1-1-unified_reader.md @@ -90,7 +90,7 @@ r[unified.fork_sam] SAM fork: shares `Arc` holding parsed tabix/CSI index + header, opens fresh `File` handle for BGZF reading. r[unified.fork_cram] -CRAM fork: shares `Arc` holding CRAI index + header, opens fresh `File` handle for container reading. Each fork MUST have its own FASTA reader (via `fasta_reader.fork()`) for thread-safe reference lookups. Reference caching (`r[cram.perf.reference_caching]`) MUST be per-fork, not shared. +CRAM fork: shares `Arc` holding CRAI index + header, opens fresh `File` handle for container reading. When the source reader carries a FASTA, each fork MUST also have its own FASTA reader (via `fasta_reader.fork()`) for thread-safe reference lookups. When the source reader was opened without a FASTA (`r[cram.fasta.optional]`), the fork MUST also have no FASTA. Reference caching (`r[cram.perf.reference_caching]`) MUST be per-fork, not shared. ## Readers: alignment + reference bundle @@ -98,10 +98,10 @@ r[unified.readers_struct] The `Readers` struct bundles an `IndexedReader` (alignment) with an `IndexedFastaReader` (reference) in a single type. This eliminates the need for separate FASTA path passing — CRAM can access the reference it needs for sequence reconstruction, and all formats have uniform open/fork semantics. `Readers` is parameterized on `E: CustomizeRecordStore = ()` so callers can attach a per-record customize value at open time (see `r[record_store.customize.trait]`); the customize value's `keep_record` runs at fetch time and `compute` runs by `pileup()` after every fetch. r[unified.readers_open] -`Readers::open(alignment_path, fasta_path)` MUST auto-detect the alignment format (via `r[unified.detect_format]`), open the appropriate reader, and open the FASTA reader. For CRAM, the fasta_path is passed to `IndexedCramReader::open()` for sequence reconstruction. For BAM/SAM, the FASTA reader is opened but not used by the alignment reader. `Readers::open` is only available when `E = ()`; for non-trivial customizers use `open_customized`. +`Readers::open(alignment_path, fasta_path)` MUST auto-detect the alignment format (via `r[unified.detect_format]`) and open the appropriate reader. The `fasta_path` parameter MUST accept both a `&Path` (FASTA attached) and `Option<&Path>` (`None` for no FASTA, `Some(path)` for FASTA) — concretely, the parameter type is `impl Into>`, so `Readers::open(bam, &ref_fa)`, `Readers::open(bam, Some(&ref_fa))`, and `Readers::open(bam, None)` MUST all compile and work. With a FASTA, the path is passed to `IndexedCramReader::open()` for CRAM sequence reconstruction; for BAM/SAM the FASTA reader is opened but not used during decoding. With no FASTA, the alignment reader is opened reference-free: BAM and SAM never need a reference for record decoding, and CRAM follows `r[cram.fasta.optional]`. The returned `Readers` then carries `fasta = None`, so `pileup()` MUST skip the FASTA fetch and the resulting `PileupColumn::reference_base()` MUST be `Base::Unknown` for every column; callers that later need bases for a CRAM slice without an embedded reference will see `CramError::MissingReference` propagated through `ReaderError::Cram`. `Readers::open` is only available when `E = ()`; for non-trivial customizers use `open_customized`. r[unified.readers_open_customized] -`Readers::::open_customized(alignment_path, fasta_path, customize)` MUST open the readers the same way as `Readers::open` but carry the user-supplied customize value along for `pileup()` to invoke on each fetched region. The customize value is stored by ownership inside the `Readers` struct. +`Readers::::open_customized(alignment_path, fasta_path, customize)` MUST open the readers the same way as `Readers::open` (including the `impl Into>` polymorphism for the FASTA argument) but carry the user-supplied customize value along for `pileup()` to invoke on each fetched region. The customize value is stored by ownership inside the `Readers` struct. r[unified.readers_fork] `Readers::fork()` MUST fork both the alignment reader and the FASTA reader, returning a new `Readers` with independent I/O handles but shared immutable state (indices, headers). The CRAM fork gets its own FASTA reader via `IndexedFastaReader::fork()`. When `E ≠ ()`, `fork()` MUST clone the customize value into the new `Readers` (the `Clone` bound on `CustomizeRecordStore` guarantees this is cheap). @@ -279,7 +279,7 @@ r[unified.fetch_counts] > - `customize() -> &E` / `customize_mut() -> &mut E` — direct access to inspect or reset the customize value's state between regions. r[unified.readers_backward_compat] -`IndexedReader::open(path)` MUST continue to work for BAM and SAM files without a FASTA path. CRAM detection in `IndexedReader::open()` MUST return an error explaining that CRAM requires a reference and suggesting `Readers::open()` instead. This preserves backward compatibility for code that only needs BAM/SAM. +`IndexedReader::open(path)` MUST continue to work for BAM, SAM, and CRAM files without a FASTA path. For CRAM the reader follows `r[cram.fasta.optional]`: opening always succeeds, and missing-reference errors only surface at fetch time when a slice actually needs an external reference. ## API surface diff --git a/docs/spec/2-cram-1-reader.md b/docs/spec/2-cram-1-reader.md index bebb16f1..152c0d6d 100644 --- a/docs/spec/2-cram-1-reader.md +++ b/docs/spec/2-cram-1-reader.md @@ -18,10 +18,13 @@ The reader MUST support CRAM v3.0 and v3.1. CRAM v2.0 and v2.1 MAY be supported The reader is read-only. CRAM writing is out of scope — rastair writes BAM (via htslib) for output. r[cram.scope.reference_required] -CRAM decoding requires access to the reference genome (FASTA). The reader MUST accept a FASTA reader (the one being built on the other branch) for sequence lookups. Embedded reference slices (where the reference is stored in the CRAM container itself) MUST also be supported as a fallback. +CRAM record decoding can require access to a reference genome (FASTA), but the FASTA is _optional_ at open time: many fields (header, CRAI, slice metadata, position, flags, MAPQ, CIGAR, qname, mate info, qualities, aux tags) decode without any reference. Bases need a reference _only_ when the writer set `RR=true` in the compression header _and_ the slice does not embed its own reference. The reader MUST therefore accept opening with no FASTA and only error out at fetch time when a slice actually needs an external reference (`RR=true` and `embedded_reference < 0` and `ref_seq_id >= 0`). When a FASTA is supplied, the reader MUST use it; embedded reference slices MUST always take precedence over the external FASTA, including when the FASTA is also present. Note: htslib supports automatic reference download via `REF_PATH`/`REF_CACHE` environment variables and the EBI reference server. This is out of scope for seqair, but when the FASTA reader is missing or the required contig is absent, the error message SHOULD mention `REF_PATH`/`REF_CACHE` as an alternative for users familiar with the htslib workflow. +r[cram.fasta.optional] +`IndexedCramReader::open(path)` and the FASTA-less constructor on the unified reader MUST succeed for any well-formed CRAM file regardless of whether bases will later be decodable. The decision "does this slice need an external reference?" is taken at fetch time by inspecting the container's compression-header `RR` flag and the slice's `embedded_reference` field. When `RR=false` the reader MUST skip the external FASTA fetch entirely (records carry their bases verbatim or as deltas against the embedded ref). When `embedded_reference >= 0` the reader MUST use the embedded block and MUST NOT consult the external FASTA. When `RR=true` _and_ `embedded_reference < 0` _and_ `ref_seq_id >= 0` _and_ no FASTA was supplied at open, the reader MUST return `CramError::MissingReference` for that slice — silently producing `N` bases is forbidden. + ## File structure > _[CRAM3] §6 "File definition" — CRAM magic, major/minor version, file ID_ @@ -504,7 +507,7 @@ r[cram.edge.reference_mismatch] If the reference MD5 in the slice header does not match the MD5 of the FASTA sequence for that region, the reader MUST return an error. The MD5 is computed over the uppercase reference sequence for the exact range `[alignment_start, alignment_start + alignment_span)`, with no newlines or other formatting. If the range extends beyond the reference length, this is a file/reference mismatch error. r[cram.edge.missing_reference] -If the FASTA reader cannot provide the reference sequence for a slice's region (e.g., contig not in FASTA), the reader MUST return an error unless the slice has an embedded reference. The error SHOULD mention `REF_PATH`/`REF_CACHE` as alternatives. +If the FASTA reader cannot provide the reference sequence for a slice's region (e.g., contig not in FASTA), or no FASTA was supplied at open (`r[cram.fasta.optional]`), the reader MUST return `CramError::MissingReference` _unless_ the slice has an embedded reference (`embedded_reference >= 0`) or the container's compression header has `RR=false`. The error SHOULD mention `REF_PATH`/`REF_CACHE` as alternatives. The check MUST happen before record reconstruction so that "missing reference" surfaces as a typed error rather than as silent `N` bases. r[cram.edge.unknown_read_names] When `RN=false` in the preservation map, read names are not available. The reader MUST log a warning (once per slice). Records without read names are stored with empty qnames. The pileup engine's overlapping-pair dedup MUST skip records with empty or `*` qnames, since mate matching requires real qnames. diff --git a/docs/spec/4-pileup.md b/docs/spec/4-pileup.md index ffcb72ef..c30ac54d 100644 --- a/docs/spec/4-pileup.md +++ b/docs/spec/4-pileup.md @@ -55,6 +55,9 @@ Reads with the unmapped flag (0x4) MUST be excluded from the pileup. htslib's `b r[pileup.empty_seq_unknown_base] Mapped records with `SEQ=*` (`seq_len == 0`) MUST appear in every pileup column they overlap, with `base = Base::Unknown` and `qual = BaseQuality::UNAVAILABLE` at every M/=/X/I CIGAR op. htslib's pileup includes such records — `bam_seqi` reads beyond the empty SEQ buffer and the `0xFF` qual sentinel decodes to `N`. Silently dropping them would produce a different `depth()` and `match_depth()` than htslib for any BAM containing secondary alignments without sequence (a common SAM/BAM convention). Insertions on empty-SEQ records use the same Unknown/UNAVAILABLE pair for the anchor base. +r[pileup.reference_base.optional] +`PileupColumn::reference_base()` MUST return `Base::Unknown` when no reference sequence is attached to the engine (no `set_reference_seq` call) or when the requested position falls outside the loaded reference window. This makes the `Readers::open_without_reference` flow (`r[unified.readers_open_without_reference]`) usable end-to-end: `pileup()` skips the FASTA fetch, every column reports `reference_base() == Base::Unknown`, and downstream logic can opt out of reference-based filtering instead of crashing or seeing arbitrary bases. + r[pileup.zero_refspan_reads] Reads with zero reference-consuming CIGAR operations (pure soft-clip, insertion-only) have `end_pos == pos` and MUST appear in exactly one pileup column at their `pos`. They MUST NOT be skipped or cause off-by-one errors.