Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions malariagen_data/anoph/aim_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,14 +254,24 @@ def plot_aim_heatmap(
gn = np.take(gn, ix_sorted, axis=1)
samples = np.take(samples, ix_sorted, axis=0)

species = aims.split("_vs_")

# Set up colors for genotypes
if palette is None:
assert self._aim_palettes is not None
if self._aim_palettes is None:
raise RuntimeError(
"AIM palettes are not available for this data resource. "
"Please provide the 'palette' parameter explicitly (4 colors)."
)
palette = self._aim_palettes[aims]
assert len(palette) == 4
if len(palette) != 4:
raise RuntimeError(
"Expected AIM palette to have 4 colors "
f"(missing, {species[0]}/{species[0]}, {species[0]}/{species[1]}, {species[1]}/{species[1]}), "
f"got {len(palette)}"
)
# Expect 4 colors, in the order:
# missing, hom taxon 1, het, hom taxon 2
species = aims.split("_vs_")

# Create subplots.
fig = go_make_subplots(
Expand Down
6 changes: 5 additions & 1 deletion malariagen_data/anoph/cnv_frq.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,7 +597,11 @@ def _gene_cnv_frequencies_advanced(
if nobs_mode == "called":
nobs[:, cohort_index] = np.repeat(cohort_n_called, 2)
else:
assert nobs_mode == "fixed"
if nobs_mode != "fixed":
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not really actionable (but it should never happen). Maybe asking the user to raise an issue on GitHub would be sensible.

raise RuntimeError(
f"Internal error: expected nobs_mode='fixed', got {nobs_mode!r}. "
"This should not happen; please open a GitHub issue."
)
nobs[:, cohort_index] = cohort.size

debug("compute frequency")
Expand Down
8 changes: 0 additions & 8 deletions malariagen_data/anoph/frq_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,14 +417,6 @@ def plot_frequencies_heatmap(
`aa_allele_frequencies_advanced()` or
`gene_cnv_frequencies_advanced()`.
""",
taxa="""
Taxon or list of taxa to include in the plot. If None,
all taxa are shown.
""",
areas="""
Area or list of areas to include in the plot. If None,
all areas are shown.
""",
kwargs="Passed through to `px.line()`.",
),
returns="""
Expand Down
9 changes: 7 additions & 2 deletions malariagen_data/anoph/genome_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,11 @@ def _genome_features_for_contig(self, *, contig: str, attributes: Tuple[str, ...

# Handle normal contigs in the reference genome.
else:
assert contig in self.contigs
if contig not in self.contigs:
raise ValueError(
f"Contig {contig!r} not found. "
f"Available contigs: {self.contigs}"
)
df = self._genome_features(attributes=attributes)

# Apply contig query.
Expand Down Expand Up @@ -561,7 +565,8 @@ def plot_genes(

# Increase the figure height by a certain factor, to accommodate labels.
height_increase_factor = 1.3
assert fig.height is not None
if fig.height is None:
raise RuntimeError("Figure height is unexpectedly None")
fig.height = int(fig.height * height_increase_factor)

# Get the original y_range.
Expand Down
6 changes: 5 additions & 1 deletion malariagen_data/anoph/genome_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,11 @@ def _genome_sequence_for_contig(self, *, contig, inline_array, chunks):

# Handle normal contigs in the reference genome.
else:
assert contig in self.contigs
if contig not in self.contigs:
raise ValueError(
f"Contig {contig!r} not found. "
f"Available contigs: {self.contigs}"
)
root = self.open_genome()
z = root[contig]
d = _da_from_zarr(z, inline_array=inline_array, chunks=chunks)
Expand Down
13 changes: 11 additions & 2 deletions malariagen_data/anoph/h1x.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,8 +403,17 @@ def _moving_h1x(ha, hb, size, start=0, stop=None, step=None):
H1X values (sum of squares of joint haplotype frequencies).
"""

assert ha.ndim == hb.ndim == 2
assert ha.shape[0] == hb.shape[0]
if ha.ndim != 2 or hb.ndim != 2:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a private function, so the user has no idea what `ha` and `hb` are in this context.

raise ValueError(
"Expected both haplotype arrays to be 2-dimensional "
"(n_variants, n_haplotypes), "
f"got ndim=({ha.ndim}, {hb.ndim})"
)
if ha.shape[0] != hb.shape[0]:
raise ValueError(
"Expected both haplotype arrays to have the same number of variants "
f"(axis 0), got ({ha.shape[0]}, {hb.shape[0]})"
)

# Construct moving windows.
windows = allel.index_windows(ha, size, start, stop, step)
Expand Down
18 changes: 15 additions & 3 deletions malariagen_data/anoph/hap_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,11 @@ def phasing_analysis_ids(self) -> Tuple[str, ...]:
def _prep_phasing_analysis_param(self, *, analysis: hap_params.analysis) -> str:
if analysis == base_params.DEFAULT:
# Use whatever is the default phasing analysis for this data resource.
assert self._default_phasing_analysis is not None
if self._default_phasing_analysis is None:
raise RuntimeError(
"No default phasing analysis configured. "
"Please specify the 'analysis' parameter explicitly."
)
return self._default_phasing_analysis
elif analysis in self.phasing_analysis_ids:
return analysis
Expand Down Expand Up @@ -118,7 +122,11 @@ def _haplotype_sites_for_contig(

# Handle contig in the reference genome.
else:
assert contig in self.contigs
if contig not in self.contigs:
raise ValueError(
f"Contig {contig!r} not found. "
f"Available contigs: {self.contigs}"
)
root = self.open_haplotype_sites(analysis=analysis)
z = root[f"{contig}/variants/{field}"]
ret = _da_from_zarr(z, inline_array=inline_array, chunks=chunks)
Expand Down Expand Up @@ -251,7 +259,11 @@ def _haplotypes_for_contig(

# Handle contig in the reference genome.
else:
assert contig in self.contigs
if contig not in self.contigs:
raise ValueError(
f"Contig {contig!r} not found. "
f"Available contigs: {self.contigs}"
)

# Open haplotypes zarr.
root = self.open_haplotypes(sample_set=sample_set, analysis=analysis)
Expand Down
12 changes: 10 additions & 2 deletions malariagen_data/anoph/hap_frq.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,11 @@ def haplotypes_frequencies(
hap_dict = {k: 0 for k in f_all.keys()}

n_samples = np.count_nonzero(loc_coh)
assert n_samples >= min_cohort_size
if n_samples < min_cohort_size:
raise ValueError(
f"Not enough samples ({n_samples}) for minimum "
f"cohort size ({min_cohort_size})"
)
gt_coh = gt.compress(loc_coh, axis=1)
gt_hap = gt_coh.to_haplotypes()
f, _, _ = _haplotype_frequencies(gt_hap)
Expand Down Expand Up @@ -224,7 +228,11 @@ def haplotypes_frequencies_advanced(
hap_freq = {k: 0 for k in f_all.keys()}
hap_count = {k: 0 for k in f_all.keys()}
hap_nob = {k: 2 * n_samples for k in f_all.keys()}
assert n_samples >= min_cohort_size
if n_samples < min_cohort_size:
raise ValueError(
f"Not enough samples ({n_samples}) for minimum "
f"cohort size ({min_cohort_size})"
)
sample_indices = group_samples_by_cohort.indices[cohort_key]
gt_coh = gt.take(sample_indices, axis=1)
gt_hap = gt_coh.to_haplotypes()
Expand Down
12 changes: 10 additions & 2 deletions malariagen_data/anoph/sample_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,8 +594,16 @@ def _aim_analysis(self):
def _parse_aim_metadata(
self, sample_set: str, data: Union[bytes, Exception]
) -> pd.DataFrame:
assert self._aim_metadata_columns is not None
assert self._aim_metadata_dtype is not None
if self._aim_metadata_columns is None:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would assume that this is another case where the absence of AIMs should be caught earlier, so if the code gets to this point, the user should probably raise an issue.

raise RuntimeError(
"Internal error: AIM metadata columns are not configured. "
"This should not happen; please open a GitHub issue."
)
if self._aim_metadata_dtype is None:
raise RuntimeError(
"Internal error: AIM metadata dtypes are not configured. "
"This should not happen; please open a GitHub issue."
)
if isinstance(data, bytes):
# Parse CSV data but don't apply the dtype yet.
df = pd.read_csv(io.BytesIO(data), na_values="")
Expand Down
105 changes: 97 additions & 8 deletions malariagen_data/anoph/snp_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,14 @@ def site_mask_ids(self) -> Tuple[str, ...]:
"""
return tuple(self.config.get("SITE_MASK_IDS", ())) # ensure tuple

def site_mask_def(self) -> str:
    """Return the default site mask identifier for this data resource.

    Returns
    -------
    str
        The value of `self._default_site_mask`.

    Raises
    ------
    RuntimeError
        If `self._default_site_mask` is None, in which case the caller
        has to name a site mask explicitly.
    """
    # Read the attribute once so the check and the return agree.
    default_site_mask = self._default_site_mask
    if default_site_mask is None:
        raise RuntimeError(
            "No default site mask configured. Please specify the 'site_mask' parameter explicitly."
        )
    return default_site_mask

@property
def _site_annotations_zarr_path(self) -> str:
return self.config["SITE_ANNOTATIONS_ZARR_PATH"]
Expand All @@ -114,7 +122,11 @@ def _prep_site_mask_param(
) -> base_params.site_mask:
if site_mask == base_params.DEFAULT:
# Use whatever is the default site mask for this data resource.
assert self._default_site_mask is not None
if self._default_site_mask is None:
raise RuntimeError(
"No default site mask configured. "
"Please specify the 'site_mask' parameter explicitly."
)
return self._default_site_mask
elif site_mask in self.site_mask_ids:
return site_mask
Expand Down Expand Up @@ -214,7 +226,9 @@ def _site_filters_for_contig(
*,
contig: str,
mask: base_params.site_mask,
field: base_params.field,
# Field identifies which per-variant filter array to read (e.g. "filter_pass").
# Default kept for backwards compatibility with internal callers/tests.
field: base_params.field = "filter_pass",
inline_array: base_params.inline_array,
chunks: base_params.chunks,
) -> da.Array:
Expand All @@ -234,7 +248,11 @@ def _site_filters_for_contig(
return d

else:
assert contig in self.contigs
if contig not in self.contigs:
raise ValueError(
f"Contig {contig!r} not found. "
f"Available contigs: {self.contigs}"
)
root = self.open_site_filters(mask=mask)
z = root[f"{contig}/variants/{field}"]
d = _da_from_zarr(z, inline_array=inline_array, chunks=chunks)
Expand Down Expand Up @@ -336,12 +354,32 @@ def _snp_sites_for_contig(

# Handle contig in the reference genome.
else:
assert contig in self.contigs
if contig not in self.contigs:
raise ValueError(
f"Contig {contig!r} not found. "
f"Available contigs: {self.contigs}"
)
root = self.open_snp_sites()
z = root[f"{contig}/variants/{field}"]
ret = _da_from_zarr(z, inline_array=inline_array, chunks=chunks)
return ret

# Backwards compatible alias for internal callers/tests.
def snp_sites_for_contig(
    self,
    *,
    contig: base_params.contig,
    field: base_params.field,
    inline_array: base_params.inline_array,
    chunks: base_params.chunks,
) -> da.Array:
    """Forward to `_snp_sites_for_contig()` under a non-underscore name.

    All parameters are keyword-only and are handed on unchanged; the
    return value is whatever `_snp_sites_for_contig()` returns for them.
    No validation happens here, so any error raised comes from the
    underlying helper.
    """
    return self._snp_sites_for_contig(
        contig=contig,
        field=field,
        inline_array=inline_array,
        chunks=chunks,
    )

def _snp_sites_for_region(
self,
*,
Expand Down Expand Up @@ -445,7 +483,11 @@ def _snp_genotypes_for_contig(
return da.concatenate(arrs)

else:
assert contig in self.contigs
if contig not in self.contigs:
raise ValueError(
f"Contig {contig!r} not found. "
f"Available contigs: {self.contigs}"
)
root = self.open_snp_genotypes(sample_set=sample_set)
z = root[f"{contig}/calldata/{field}"]
d = _da_from_zarr(z, inline_array=inline_array, chunks=chunks)
Expand Down Expand Up @@ -601,7 +643,11 @@ def _snp_variants_for_contig(
return ret

else:
assert contig in self.contigs
if contig not in self.contigs:
raise ValueError(
f"Contig {contig!r} not found. "
f"Available contigs: {self.contigs}"
)
coords = dict()
data_vars = dict()
sites_root = self.open_snp_sites()
Expand Down Expand Up @@ -721,6 +767,40 @@ def _site_annotations_raw(

return ds

def _site_annotations_for_contig(
    self,
    *,
    contig,
    inline_array: base_params.inline_array,
    chunks: base_params.chunks,
) -> xr.Dataset:
    """Load raw site annotations for one contig, resolving virtual contigs.

    Backwards compatible internal helper.

    If `contig` is a key of `self.virtual_contigs` (when that attribute
    exists), the raw annotations of each member contig are loaded via
    `_site_annotations_raw()` and concatenated along `DIM_VARIANT`.
    Otherwise `contig` must be one of `self.contigs`, and its raw
    annotations are returned directly.

    Raises
    ------
    ValueError
        If `contig` is neither a virtual contig nor in `self.contigs`.
        The message lists the available contigs.
    """
    # Resolve the mapping once; resources without virtual contigs are
    # treated as having an empty mapping.
    virtual_contigs = getattr(self, "virtual_contigs", {})

    if contig in virtual_contigs:
        ds_parts = [
            self._site_annotations_raw(
                contig=member,
                inline_array=inline_array,
                chunks=chunks,
            )
            for member in virtual_contigs[contig]
        ]
        return _simple_xarray_concat(ds_parts, dim=DIM_VARIANT)

    if contig not in self.contigs:
        raise ValueError(
            f"Contig {contig!r} not found. Available contigs: {self.contigs}"
        )

    return self._site_annotations_raw(
        contig=contig, inline_array=inline_array, chunks=chunks
    )

@_check_types
@doc(
summary="Load site annotations.",
Expand Down Expand Up @@ -977,7 +1057,11 @@ def _snp_calls_for_contig(

# Handle contig in the reference genome.
else:
assert contig in self.contigs
if contig not in self.contigs:
raise ValueError(
f"Contig {contig!r} not found. "
f"Available contigs: {self.contigs}"
)

coords = dict()
data_vars = dict()
Expand Down Expand Up @@ -1159,7 +1243,12 @@ def _raw_snp_calls(
inline_array=inline_array,
chunks=chunks,
)
assert x.sizes["variants"] == loc_ann.shape[0]
if x.sizes["variants"] != loc_ann.shape[0]:
raise RuntimeError(
f"Variants dimension mismatch: dataset has "
f"{x.sizes['variants']} variants but annotation "
f"mask has {loc_ann.shape[0]}"
)
x = x.isel(variants=loc_ann)

lx.append(x)
Expand Down
Loading
Loading