Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions crates/seqair/examples/aligned_pairs_walk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@ struct Counts {
fn main() -> anyhow::Result<()> {
let args = Cli::parse();

let mut readers =
Readers::open(&args.input, &args.reference).context("could not open BAM + FASTA")?;
let mut readers = Readers::open(&args.input, args.reference.as_path())
.context("could not open BAM + FASTA")?;

// Resolve the region against the BAM header. `RegionString` is parsed
// by clap; here we lift its `(name, Pos1, Pos1)` into the engine's
Expand Down
4 changes: 2 additions & 2 deletions crates/seqair/examples/mpileup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ const MAX_TILE_LEN: u32 = 1_000_000;
fn main() -> anyhow::Result<()> {
let args = Cli::parse();

let mut readers =
Readers::open(&args.input, &args.reference).context("could not open alignment file")?;
let mut readers = Readers::open(&args.input, args.reference.as_path())
.context("could not open alignment file")?;

let mut output: Box<dyn Write> = if let Some(out) = &args.out {
let file =
Expand Down
5 changes: 3 additions & 2 deletions crates/seqair/examples/pileup_extras.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,9 @@ impl CustomizeRecordStore for ReadInfoBuilder {
fn main() -> anyhow::Result<()> {
let args = Cli::parse();

let mut readers = Readers::open_customized(&args.input, &args.reference, ReadInfoBuilder)
.context("could not open BAM + FASTA")?;
let mut readers =
Readers::open_customized(&args.input, args.reference.as_path(), ReadInfoBuilder)
.context("could not open BAM + FASTA")?;

// Tile the requested region into 100 kb segments. Unmapped/secondary
// records are already dropped at push time by `ReadInfoBuilder::keep_record`,
Expand Down
2 changes: 1 addition & 1 deletion crates/seqair/examples/pileup_extras_simple.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ fn main() -> anyhow::Result<()> {

let mut readers = Readers::open_customized(
&args.input,
&args.reference,
args.reference.as_path(),
ReadInfoBuilder { min_mapq: args.min_mapq },
)?;
let max_len = std::num::NonZeroU32::new(100_000).expect("non-zero literal");
Expand Down
2 changes: 1 addition & 1 deletion crates/seqair/examples/simple_variant_caller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ fn main() -> anyhow::Result<()> {
}

let mut readers =
Readers::<DropUseless>::open_customized(&args.input, &args.reference, DropUseless)
Readers::<DropUseless>::open_customized(&args.input, args.reference.as_path(), DropUseless)
.context("could not open BAM + FASTA")?;

// ── Build VCF header ───────────────────────────────────────────────
Expand Down
78 changes: 68 additions & 10 deletions crates/seqair/src/cram/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,11 @@ pub struct CramShared {
/// re-scanning the header text.
pub read_group_ids: Vec<SmolStr>,
pub cram_path: PathBuf,
pub fasta_path: PathBuf,
/// Path of the FASTA reference, if one was supplied at open time.
/// `None` when the reader was opened without a reference
/// (`r[cram.fasta.optional]`); `MissingReference` is then surfaced lazily
/// at fetch time only for slices that actually need an external reference.
pub fasta_path: Option<PathBuf>,
}

/// Parse `@RG ID:` values from SAM header text in declaration order.
Expand All @@ -272,7 +276,9 @@ fn parse_read_group_ids(header_text: &str) -> Vec<SmolStr> {

pub struct IndexedCramReader<R: Read + Seek = File> {
file: R,
fasta: IndexedFastaReader<R>,
/// Optional FASTA for sequence reconstruction. `None` when the reader was
/// opened without a reference (`r[cram.fasta.optional]`).
fasta: Option<IndexedFastaReader<R>>,
shared: Arc<CramShared>,
// Scratch buffers reused across fetch_into calls. The per-record
// ones (`name_buf`, `feature_byte_buf`, `cigar_ops_buf`) used to be
Expand Down Expand Up @@ -311,6 +317,18 @@ impl IndexedCramReader<File> {
/// Open a CRAM file with a FASTA reference for sequence reconstruction.
#[instrument(level = "debug", fields(cram = %cram_path.display(), fasta = %fasta_path.display()))]
pub fn open(cram_path: &Path, fasta_path: &Path) -> Result<Self, CramError> {
Self::open_optional_fasta(cram_path, Some(fasta_path))
}

/// Open a CRAM file without a FASTA reference. See `r[cram.fasta.optional]`:
/// open always succeeds, and `MissingReference` is only surfaced at fetch
/// time for slices that actually need an external reference.
#[instrument(level = "debug", fields(cram = %cram_path.display()))]
pub fn open_without_reference(cram_path: &Path) -> Result<Self, CramError> {
Self::open_optional_fasta(cram_path, None)
}

fn open_optional_fasta(cram_path: &Path, fasta_path: Option<&Path>) -> Result<Self, CramError> {
let mut file = File::open(cram_path)
.map_err(|source| CramError::Open { path: cram_path.to_path_buf(), source })?;

Expand Down Expand Up @@ -342,8 +360,8 @@ impl IndexedCramReader<File> {
let cram_index = CramIndex::from_path(&crai_path)?;

// r[impl cram.scope.reference_required]
// Open FASTA reader
let fasta = IndexedFastaReader::open(fasta_path)?;
// r[impl cram.fasta.optional]
let fasta = fasta_path.map(IndexedFastaReader::open).transpose()?;

let read_group_ids = parse_read_group_ids(header.header_text());

Expand All @@ -355,7 +373,7 @@ impl IndexedCramReader<File> {
header,
read_group_ids,
cram_path: cram_path.to_path_buf(),
fasta_path: fasta_path.to_path_buf(),
fasta_path: fasta_path.map(Path::to_path_buf),
}),
container_buf: Vec::new(),
cigar_buf: Vec::new(),
Expand All @@ -371,10 +389,14 @@ impl IndexedCramReader<File> {
})
}

// r[impl unified.fork_cram]
pub fn fork(&self) -> Result<Self, CramError> {
let file = File::open(&self.shared.cram_path)
.map_err(|source| CramError::Open { path: self.shared.cram_path.clone(), source })?;
let fasta = self.fasta.fork()?;
let fasta = match self.fasta.as_ref() {
Some(f) => Some(f.fork()?),
None => None,
};
Ok(IndexedCramReader {
file,
fasta,
Expand Down Expand Up @@ -438,13 +460,13 @@ impl IndexedCramReader<Cursor<Vec<u8>>> {

Ok(IndexedCramReader {
file: Cursor::new(cram_data),
fasta,
fasta: Some(fasta),
shared: Arc::new(CramShared {
index: cram_index,
header,
read_group_ids,
cram_path: PathBuf::from("<fuzz>"),
fasta_path: PathBuf::from("<fuzz>"),
fasta_path: Some(PathBuf::from("<fuzz>")),
}),
container_buf: Vec::new(),
cigar_buf: Vec::new(),
Expand Down Expand Up @@ -650,9 +672,21 @@ impl<R: Read + Seek> IndexedCramReader<R> {
};

self.ref_seq_buf.clear();
if ref_start < ref_end_clamped {
// r[impl cram.fasta.optional]
// Only fetch external reference bytes when this container's
// compression header says they're needed AND we actually have a
// FASTA. When `RR=false`, records carry their bases verbatim or
// as deltas against an embedded reference, so the FASTA fetch is
// pure waste. When the FASTA is missing, we leave `ref_seq_buf`
// empty and let `decode_slice` decide whether the slice can still
// be decoded (embedded ref or unmapped) or must error out
// (`r[cram.edge.missing_reference]`).
if ref_start < ref_end_clamped
&& ch.preservation.reference_required
&& let Some(fasta) = self.fasta.as_mut()
{
// r[impl cram.edge.missing_reference]
self.fasta
fasta
.fetch_seq_into(
ref_name,
Pos0::try_from(ref_start)
Expand All @@ -673,6 +707,7 @@ impl<R: Read + Seek> IndexedCramReader<R> {
_ => CramError::from(e),
})?;
}
let fasta_available = self.fasta.is_some();

// Decode each slice listed by CRAI as overlapping our query.
// CRAI's `slice_offset` matches the container's landmark value
Expand All @@ -697,6 +732,7 @@ impl<R: Read + Seek> IndexedCramReader<R> {
slice_offset,
&self.ref_seq_buf,
ref_start.cast_signed(),
fasta_available,
&self.shared.header,
&self.shared.read_group_ids,
tid,
Expand Down Expand Up @@ -851,4 +887,26 @@ mod tests {
"message: {msg}"
);
}

// r[verify cram.fasta.optional]
#[test]
fn open_without_reference_succeeds() {
let reader = IndexedCramReader::open_without_reference(cram_path()).unwrap();
assert!(reader.header().target_count() > 0);
}

// r[verify cram.edge.missing_reference]
#[test]
fn fetch_without_reference_errors_when_slice_needs_fasta() {
let mut reader = IndexedCramReader::open_without_reference(cram_path()).unwrap();
let mut store = RecordStore::new();
let tid = 0;
let err = reader
.fetch_into(tid, Pos0::new(0).unwrap(), Pos0::max_value(), &mut store)
.expect_err("CRAM with external-only ref must error without FASTA");
assert!(
matches!(err, CramError::MissingReference { .. }),
"expected MissingReference, got {err:?}"
);
}
}
17 changes: 17 additions & 0 deletions crates/seqair/src/cram/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ pub(crate) fn decode_slice<E: CustomizeRecordStore>(
slice_offset: usize,
reference_seq: &[u8],
ref_start_0based: i64,
fasta_available: bool,
header: &BamHeader,
read_group_ids: &[SmolStr],
tid: u32,
Expand Down Expand Up @@ -222,6 +223,21 @@ pub(crate) fn decode_slice<E: CustomizeRecordStore>(
let effective_ref: &[u8] =
if sh.embedded_reference >= 0 { &embedded_ref_data } else { reference_seq };

// r[impl cram.fasta.optional]
// r[impl cram.edge.missing_reference]
// The reader was opened without a FASTA, the writer set RR=true, and this
// slice does not embed its own reference — bases cannot be reconstructed.
// Surface a typed error rather than silently producing N's.
if !fasta_available
&& ch.preservation.reference_required
&& sh.embedded_reference < 0
&& sh.ref_seq_id >= 0
{
return Err(CramError::MissingReference {
contig: header.target_name(tid).unwrap_or("?").into(),
});
}

let core_data = core_block.ok_or(CramError::MissingCoreDataBlock)?;

let mut ctx = DecodeContext::new(&core_data.data, external_blocks);
Expand Down Expand Up @@ -1358,6 +1374,7 @@ mod tests {
first_landmark as usize,
&fake_ref,
ref_start,
true,
&bam_header,
&[],
0,
Expand Down
3 changes: 3 additions & 0 deletions crates/seqair/src/fasta/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ pub enum FastaError {

#[error("BGZF FASTA missing GZI index (internal error)")]
MissingGzi,

#[error("no FASTA reference attached; reopen with `Readers::open` to enable reference lookups")]
NotConfigured,
}

fn format_available_sequences(available: &[SmolStr]) -> String {
Expand Down
7 changes: 0 additions & 7 deletions crates/seqair/src/reader/formats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,6 @@ pub fn detect(path: &Path) -> Result<Format, ReaderError> {
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum FormatDetectionError {
#[error(
"CRAM format detected but no FASTA reference provided. \
Use Readers::open(alignment_path, fasta_path) instead, \
or convert to BAM with `samtools view -b`"
)]
CramRequiresFasta,

#[error("file too short to determine format ({len} bytes)")]
FileTooShort { path: PathBuf, len: u64 },

Expand Down
46 changes: 24 additions & 22 deletions crates/seqair/src/reader/indexed.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use super::ReaderError;
use super::formats::{self, Format, FormatDetectionError};
use super::formats::{self, Format};
use crate::{
bam::{
BamHeader, IndexedBamReader,
Expand Down Expand Up @@ -106,29 +106,26 @@ impl<R: Read + Seek> IndexedReader<R> {

// r[impl unified.reader_api]
impl IndexedReader<std::fs::File> {
/// Open a BAM or bgzf-compressed SAM file, auto-detecting the format.
/// Open a BAM, bgzf-compressed SAM, or CRAM file, auto-detecting the format.
///
/// CRAM files are detected but require a FASTA reference — use
/// [`crate::Readers::open`] instead. This method returns an error for CRAM
/// with a message directing the user to provide a reference.
/// For CRAM, opening without a FASTA always succeeds; missing-reference
/// errors only surface at fetch time when a slice actually needs an
/// external reference (`r[cram.fasta.optional]`).
// r[impl unified.readers_backward_compat]
#[instrument(level = "debug", fields(path = %path.display()))]
pub fn open(path: &Path) -> Result<Self, ReaderError> {
match formats::detect(path)? {
Format::Bam => {
let reader = IndexedBamReader::open(path)?;
Ok(IndexedReader::Bam(reader))
}
Format::Sam => {
let reader = IndexedSamReader::open(path)?;
Ok(IndexedReader::Sam(reader))
}
Format::Cram => Err(FormatDetectionError::CramRequiresFasta.into()),
}
Self::open_with_optional_fasta(path, None)
}

/// Open any format, with a FASTA path for CRAM support.
pub(crate) fn open_with_fasta(path: &Path, fasta_path: &Path) -> Result<Self, ReaderError> {
Self::open_with_optional_fasta(path, Some(fasta_path))
}

fn open_with_optional_fasta(
path: &Path,
fasta_path: Option<&Path>,
) -> Result<Self, ReaderError> {
match formats::detect(path)? {
Format::Bam => {
let reader = IndexedBamReader::open(path)?;
Expand All @@ -139,7 +136,10 @@ impl IndexedReader<std::fs::File> {
Ok(IndexedReader::Sam(reader))
}
Format::Cram => {
let reader = IndexedCramReader::open(path, fasta_path)?;
let reader = match fasta_path {
Some(fp) => IndexedCramReader::open(path, fp)?,
None => IndexedCramReader::open_without_reference(path)?,
};
Ok(IndexedReader::Cram(Box::new(reader)))
}
}
Expand All @@ -163,17 +163,19 @@ mod tests {
use std::io::Write as _;
use tempfile::NamedTempFile;

// r[verify cram.fasta.optional]
/// `IndexedReader::open` MUST accept CRAM files without a FASTA. The
/// reader fails with a downstream CRAM error (here, missing CRAI for our
/// stub file) rather than with `CramRequiresFasta`.
#[test]
fn cram_requires_fasta() {
// A file starting with the CRAM magic bytes — IndexedReader::open (no FASTA)
// must return CramRequiresFasta.
fn cram_open_without_fasta_does_not_short_circuit() {
let mut f = NamedTempFile::new().expect("tempfile");
f.write_all(b"CRAM\x03\x00").expect("write");
f.flush().expect("flush");
let err = IndexedReader::open(f.path()).unwrap_err();
assert!(
matches!(err, ReaderError::Format { source: FormatDetectionError::CramRequiresFasta }),
"unexpected error: {err}"
!matches!(&err, ReaderError::Format { .. }),
"open should reach the CRAM reader, not error during format detection: {err}"
);
}
}
Loading
Loading