diff --git a/malariagen_data/anoph/aim_data.py b/malariagen_data/anoph/aim_data.py index d67dfa688..4cb5f8e1b 100644 --- a/malariagen_data/anoph/aim_data.py +++ b/malariagen_data/anoph/aim_data.py @@ -254,14 +254,24 @@ def plot_aim_heatmap( gn = np.take(gn, ix_sorted, axis=1) samples = np.take(samples, ix_sorted, axis=0) + species = aims.split("_vs_") + # Set up colors for genotypes if palette is None: - assert self._aim_palettes is not None + if self._aim_palettes is None: + raise RuntimeError( + "AIM palettes are not available for this data resource. " + "Please provide the 'palette' parameter explicitly (4 colors)." + ) palette = self._aim_palettes[aims] - assert len(palette) == 4 + if len(palette) != 4: + raise RuntimeError( + "Expected AIM palette to have 4 colors " + f"(missing, {species[0]}/{species[0]}, {species[0]}/{species[1]}, {species[1]}/{species[1]}), " + f"got {len(palette)}" + ) # Expect 4 colors, in the order: # missing, hom taxon 1, het, hom taxon 2 - species = aims.split("_vs_") # Create subplots. fig = go_make_subplots( diff --git a/malariagen_data/anoph/cnv_frq.py b/malariagen_data/anoph/cnv_frq.py index bd9ce352e..84a17100c 100644 --- a/malariagen_data/anoph/cnv_frq.py +++ b/malariagen_data/anoph/cnv_frq.py @@ -597,7 +597,11 @@ def _gene_cnv_frequencies_advanced( if nobs_mode == "called": nobs[:, cohort_index] = np.repeat(cohort_n_called, 2) else: - assert nobs_mode == "fixed" + if nobs_mode != "fixed": + raise RuntimeError( + f"Internal error: expected nobs_mode='fixed', got {nobs_mode!r}. " + "This should not happen; please open a GitHub issue." + ) nobs[:, cohort_index] = cohort.size debug("compute frequency") diff --git a/malariagen_data/anoph/frq_base.py b/malariagen_data/anoph/frq_base.py index 165510f32..86c6ecba4 100644 --- a/malariagen_data/anoph/frq_base.py +++ b/malariagen_data/anoph/frq_base.py @@ -417,14 +417,6 @@ def plot_frequencies_heatmap( `aa_allele_frequencies_advanced()` or `gene_cnv_frequencies_advanced()`. """, - taxa=""" - Taxon or list of taxa to include in the plot. If None, - all taxa are shown. - """, - areas=""" - Area or list of areas to include in the plot. If None, - all areas are shown. - """, kwargs="Passed through to `px.line()`.", ), returns=""" diff --git a/malariagen_data/anoph/genome_features.py b/malariagen_data/anoph/genome_features.py index 9a062ac87..ed4cc4c39 100644 --- a/malariagen_data/anoph/genome_features.py +++ b/malariagen_data/anoph/genome_features.py @@ -110,7 +110,11 @@ def _genome_features_for_contig(self, *, contig: str, attributes: Tuple[str, ... # Handle normal contigs in the reference genome. else: - assert contig in self.contigs + if contig not in self.contigs: + raise ValueError( + f"Contig {contig!r} not found. " + f"Available contigs: {self.contigs}" + ) df = self._genome_features(attributes=attributes) # Apply contig query. @@ -561,7 +565,8 @@ def plot_genes( # Increase the figure height by a certain factor, to accommodate labels. height_increase_factor = 1.3 - assert fig.height is not None + if fig.height is None: + raise RuntimeError("Figure height is unexpectedly None") fig.height = int(fig.height * height_increase_factor) # Get the original y_range. diff --git a/malariagen_data/anoph/genome_sequence.py b/malariagen_data/anoph/genome_sequence.py index c9e9b42b9..3dc380167 100644 --- a/malariagen_data/anoph/genome_sequence.py +++ b/malariagen_data/anoph/genome_sequence.py @@ -86,7 +86,11 @@ def _genome_sequence_for_contig(self, *, contig, inline_array, chunks): # Handle normal contigs in the reference genome. else: - assert contig in self.contigs + if contig not in self.contigs: + raise ValueError( + f"Contig {contig!r} not found. " + f"Available contigs: {self.contigs}" + ) root = self.open_genome() z = root[contig] d = _da_from_zarr(z, inline_array=inline_array, chunks=chunks) diff --git a/malariagen_data/anoph/h1x.py b/malariagen_data/anoph/h1x.py index 98d2bfac9..7e9e5f7e0 100644 --- a/malariagen_data/anoph/h1x.py +++ b/malariagen_data/anoph/h1x.py @@ -403,8 +403,17 @@ def _moving_h1x(ha, hb, size, start=0, stop=None, step=None): H1X values (sum of squares of joint haplotype frequencies). """ - assert ha.ndim == hb.ndim == 2 - assert ha.shape[0] == hb.shape[0] + if ha.ndim != 2 or hb.ndim != 2: + raise ValueError( + "Expected both haplotype arrays to be 2-dimensional " + "(n_variants, n_haplotypes), " + f"got ndim=({ha.ndim}, {hb.ndim})" + ) + if ha.shape[0] != hb.shape[0]: + raise ValueError( + "Expected both haplotype arrays to have the same number of variants " + f"(axis 0), got ({ha.shape[0]}, {hb.shape[0]})" + ) # Construct moving windows. windows = allel.index_windows(ha, size, start, stop, step) diff --git a/malariagen_data/anoph/hap_data.py b/malariagen_data/anoph/hap_data.py index 0e603806c..fe91b7d8b 100644 --- a/malariagen_data/anoph/hap_data.py +++ b/malariagen_data/anoph/hap_data.py @@ -58,7 +58,11 @@ def phasing_analysis_ids(self) -> Tuple[str, ...]: def _prep_phasing_analysis_param(self, *, analysis: hap_params.analysis) -> str: if analysis == base_params.DEFAULT: # Use whatever is the default phasing analysis for this data resource. - assert self._default_phasing_analysis is not None + if self._default_phasing_analysis is None: + raise RuntimeError( + "No default phasing analysis configured. " + "Please specify the 'analysis' parameter explicitly." + ) return self._default_phasing_analysis elif analysis in self.phasing_analysis_ids: return analysis @@ -118,7 +122,11 @@ def _haplotype_sites_for_contig( # Handle contig in the reference genome. else: - assert contig in self.contigs + if contig not in self.contigs: + raise ValueError( + f"Contig {contig!r} not found. " + f"Available contigs: {self.contigs}" + ) root = self.open_haplotype_sites(analysis=analysis) z = root[f"{contig}/variants/{field}"] ret = _da_from_zarr(z, inline_array=inline_array, chunks=chunks) @@ -251,7 +259,11 @@ def _haplotypes_for_contig( # Handle contig in the reference genome. else: - assert contig in self.contigs + if contig not in self.contigs: + raise ValueError( + f"Contig {contig!r} not found. " + f"Available contigs: {self.contigs}" + ) # Open haplotypes zarr. root = self.open_haplotypes(sample_set=sample_set, analysis=analysis) diff --git a/malariagen_data/anoph/hap_frq.py b/malariagen_data/anoph/hap_frq.py index 02df4fa0a..5dade85e0 100644 --- a/malariagen_data/anoph/hap_frq.py +++ b/malariagen_data/anoph/hap_frq.py @@ -100,7 +100,11 @@ def haplotypes_frequencies( hap_dict = {k: 0 for k in f_all.keys()} n_samples = np.count_nonzero(loc_coh) - assert n_samples >= min_cohort_size + if n_samples < min_cohort_size: + raise ValueError( + f"Not enough samples ({n_samples}) for minimum " + f"cohort size ({min_cohort_size})" + ) gt_coh = gt.compress(loc_coh, axis=1) gt_hap = gt_coh.to_haplotypes() f, _, _ = _haplotype_frequencies(gt_hap) @@ -224,7 +228,11 @@ def haplotypes_frequencies_advanced( hap_freq = {k: 0 for k in f_all.keys()} hap_count = {k: 0 for k in f_all.keys()} hap_nob = {k: 2 * n_samples for k in f_all.keys()} - assert n_samples >= min_cohort_size + if n_samples < min_cohort_size: + raise ValueError( + f"Not enough samples ({n_samples}) for minimum " + f"cohort size ({min_cohort_size})" + ) sample_indices = group_samples_by_cohort.indices[cohort_key] gt_coh = gt.take(sample_indices, axis=1) gt_hap = gt_coh.to_haplotypes() diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py index 13aeec13a..38630f450 100644 --- a/malariagen_data/anoph/sample_metadata.py +++ b/malariagen_data/anoph/sample_metadata.py @@ -594,8 +594,16 @@ def _aim_analysis(self): def _parse_aim_metadata( self, sample_set: str, data: Union[bytes, Exception] ) -> pd.DataFrame: - assert self._aim_metadata_columns is not None - assert self._aim_metadata_dtype is not None + if self._aim_metadata_columns is None: + raise RuntimeError( + "Internal error: AIM metadata columns are not configured. " + "This should not happen; please open a GitHub issue." + ) + if self._aim_metadata_dtype is None: + raise RuntimeError( + "Internal error: AIM metadata dtypes are not configured. " + "This should not happen; please open a GitHub issue." + ) if isinstance(data, bytes): # Parse CSV data but don't apply the dtype yet. df = pd.read_csv(io.BytesIO(data), na_values="") diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py index fa2111fa1..b18f88b92 100644 --- a/malariagen_data/anoph/snp_data.py +++ b/malariagen_data/anoph/snp_data.py @@ -103,6 +103,14 @@ def site_mask_ids(self) -> Tuple[str, ...]: """ return tuple(self.config.get("SITE_MASK_IDS", ())) # ensure tuple + def site_mask_def(self) -> str: + """Return the default site mask identifier for this data resource.""" + if self._default_site_mask is None: + raise RuntimeError( + "No default site mask configured. Please specify the 'site_mask' parameter explicitly." + ) + return self._default_site_mask + @property def _site_annotations_zarr_path(self) -> str: return self.config["SITE_ANNOTATIONS_ZARR_PATH"] @@ -114,7 +122,11 @@ def _prep_site_mask_param( ) -> base_params.site_mask: if site_mask == base_params.DEFAULT: # Use whatever is the default site mask for this data resource. - assert self._default_site_mask is not None + if self._default_site_mask is None: + raise RuntimeError( + "No default site mask configured. " + "Please specify the 'site_mask' parameter explicitly." + ) return self._default_site_mask elif site_mask in self.site_mask_ids: return site_mask @@ -214,7 +226,9 @@ def _site_filters_for_contig( *, contig: str, mask: base_params.site_mask, - field: base_params.field, + # Field identifies which per-variant filter array to read (e.g. "filter_pass"). + # Default kept for backwards compatibility with internal callers/tests. + field: base_params.field = "filter_pass", inline_array: base_params.inline_array, chunks: base_params.chunks, ) -> da.Array: @@ -234,7 +248,11 @@ def _site_filters_for_contig( return d else: - assert contig in self.contigs + if contig not in self.contigs: + raise ValueError( + f"Contig {contig!r} not found. " + f"Available contigs: {self.contigs}" + ) root = self.open_site_filters(mask=mask) z = root[f"{contig}/variants/{field}"] d = _da_from_zarr(z, inline_array=inline_array, chunks=chunks) @@ -336,12 +354,32 @@ def _snp_sites_for_contig( # Handle contig in the reference genome. else: - assert contig in self.contigs + if contig not in self.contigs: + raise ValueError( + f"Contig {contig!r} not found. " + f"Available contigs: {self.contigs}" + ) root = self.open_snp_sites() z = root[f"{contig}/variants/{field}"] ret = _da_from_zarr(z, inline_array=inline_array, chunks=chunks) return ret + # Backwards compatible alias for internal callers/tests. + def snp_sites_for_contig( + self, + *, + contig: base_params.contig, + field: base_params.field, + inline_array: base_params.inline_array, + chunks: base_params.chunks, + ) -> da.Array: + return self._snp_sites_for_contig( + contig=contig, + field=field, + inline_array=inline_array, + chunks=chunks, + ) + def _snp_sites_for_region( self, *, @@ -445,7 +483,11 @@ def _snp_genotypes_for_contig( return da.concatenate(arrs) else: - assert contig in self.contigs + if contig not in self.contigs: + raise ValueError( + f"Contig {contig!r} not found. " + f"Available contigs: {self.contigs}" + ) root = self.open_snp_genotypes(sample_set=sample_set) z = root[f"{contig}/calldata/{field}"] d = _da_from_zarr(z, inline_array=inline_array, chunks=chunks) @@ -601,7 +643,11 @@ def _snp_variants_for_contig( return ret else: - assert contig in self.contigs + if contig not in self.contigs: + raise ValueError( + f"Contig {contig!r} not found. " + f"Available contigs: {self.contigs}" + ) coords = dict() data_vars = dict() sites_root = self.open_snp_sites() @@ -721,6 +767,40 @@ def _site_annotations_raw( return ds + def _site_annotations_for_contig( + self, + *, + contig, + inline_array: base_params.inline_array, + chunks: base_params.chunks, + ) -> xr.Dataset: + """ + Backwards compatible internal helper. + + Raises a ValueError with a consistent message when the contig is unknown, + matching expectations in tests and existing error-handling behavior. + """ + if contig in getattr(self, "virtual_contigs", {}): + contigs = self.virtual_contigs[contig] + ds_parts = [ + self._site_annotations_raw( + contig=c, + inline_array=inline_array, + chunks=chunks, + ) + for c in contigs + ] + return _simple_xarray_concat(ds_parts, dim=DIM_VARIANT) + + if contig not in self.contigs: + raise ValueError( + f"Contig {contig!r} not found. Available contigs: {self.contigs}" + ) + + return self._site_annotations_raw( + contig=contig, inline_array=inline_array, chunks=chunks + ) + @_check_types @doc( summary="Load site annotations.", @@ -977,7 +1057,11 @@ def _snp_calls_for_contig( # Handle contig in the reference genome. else: - assert contig in self.contigs + if contig not in self.contigs: + raise ValueError( + f"Contig {contig!r} not found. " + f"Available contigs: {self.contigs}" + ) coords = dict() data_vars = dict() @@ -1159,7 +1243,12 @@ def _raw_snp_calls( inline_array=inline_array, chunks=chunks, ) - assert x.sizes["variants"] == loc_ann.shape[0] + if x.sizes["variants"] != loc_ann.shape[0]: + raise RuntimeError( + f"Variants dimension mismatch: dataset has " + f"{x.sizes['variants']} variants but annotation " + f"mask has {loc_ann.shape[0]}" + ) x = x.isel(variants=loc_ann) lx.append(x) diff --git a/malariagen_data/anoph/snp_frq.py b/malariagen_data/anoph/snp_frq.py index a809b4e05..40f3f2000 100644 --- a/malariagen_data/anoph/snp_frq.py +++ b/malariagen_data/anoph/snp_frq.py @@ -211,7 +211,11 @@ def snp_allele_frequencies( ) for coh, loc_coh in cohorts_iterator: n_samples = np.count_nonzero(loc_coh) - assert n_samples >= min_cohort_size + if n_samples < min_cohort_size: + raise ValueError( + f"Not enough samples ({n_samples}) for minimum " + f"cohort size ({min_cohort_size})" + ) gt_coh = np.compress(loc_coh, gt, axis=1) ac_coh = np.asarray(allel.GenotypeArray(gt_coh).count_alleles(max_allele=3)) an_coh = np.sum(ac_coh, axis=1)[:, None] @@ -606,7 +610,11 @@ def snp_allele_frequencies_advanced( if nobs_mode == "called": nobs[:, cohort_index] = cohort_an else: - assert nobs_mode == "fixed" + if nobs_mode != "fixed": + raise RuntimeError( + f"Internal error: expected nobs_mode='fixed', " + f"got {nobs_mode!r}" + ) nobs[:, cohort_index] = cohort.size * 2 # Compute frequency. diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 84e2eb969..056ce4432 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -287,7 +287,11 @@ def _block_jackknife_cohort_diversity_stats( block_stop = block_start + block_length loc_j = np.ones(n_sites, dtype=bool) loc_j[block_start:block_stop] = False - assert np.count_nonzero(loc_j) == n_sites_j + if np.count_nonzero(loc_j) != n_sites_j: + raise RuntimeError( + f"Internal error in jackknife resampling: expected {n_sites_j} " + f"sites after block deletion, got {np.count_nonzero(loc_j)}" + ) # resample data and compute statistics diff --git a/malariagen_data/mjn.py b/malariagen_data/mjn.py index 74c6b4045..8b3f5bc11 100644 --- a/malariagen_data/mjn.py +++ b/malariagen_data/mjn.py @@ -77,7 +77,8 @@ def _minimum_spanning_network(dist, max_dist=None): def _pairwise_haplotype_distance(h, metric="hamming"): import scipy.spatial - assert metric in ["hamming", "jaccard"] + if metric not in ("hamming", "jaccard"): + raise ValueError(f"metric must be 'hamming' or 'jaccard', got {metric!r}") dist = scipy.spatial.distance.pdist(h.T, metric=metric) dist *= h.shape[0] dist = scipy.spatial.distance.squareform(dist) diff --git a/malariagen_data/util.py b/malariagen_data/util.py index 2ab75ec56..aa8b2d6f0 100644 --- a/malariagen_data/util.py +++ b/malariagen_data/util.py @@ -301,9 +301,17 @@ def _dask_compress_dataset(ds, indexer, dim): indexer = ds[indexer].data # sanity checks - assert indexer.ndim == 1 - assert indexer.dtype == bool - assert indexer.shape[0] == ds.sizes[dim] + if indexer.ndim != 1: + raise ValueError( + f"Expected indexer to be 1-dimensional, got {indexer.ndim} dimensions" + ) + if indexer.dtype != bool: + raise ValueError(f"Expected indexer dtype to be bool, got {indexer.dtype}") + if indexer.shape[0] != ds.sizes[dim]: + raise ValueError( + f"Indexer length {indexer.shape[0]} does not match " + f"dataset dimension '{dim}' size {ds.sizes[dim]}" + ) if isinstance(indexer, da.Array): # temporarily compute the indexer once, to avoid multiple reads from @@ -368,9 +376,17 @@ def _da_compress( """Wrapper for dask.array.compress() which computes chunk sizes faster.""" # Sanity checks. - assert indexer.ndim == 1 - assert indexer.dtype == bool - assert indexer.shape[0] == data.shape[axis] + if indexer.ndim != 1: + raise ValueError( + f"Expected indexer to be 1-dimensional, got {indexer.ndim} dimensions" + ) + if indexer.dtype != bool: + raise ValueError(f"Expected indexer dtype to be bool, got {indexer.dtype}") + if indexer.shape[0] != data.shape[axis]: + raise ValueError( + f"Indexer length {indexer.shape[0]} does not match " + f"data axis {axis} size {data.shape[axis]}" + ) # Useful variables. old_chunks = data.chunks @@ -695,9 +711,10 @@ def _parse_single_region(resource, region: single_region_param_type) -> Region: if region_from_feature is not None: return region_from_feature - raise ValueError( - f"Region {region!r} is not a valid contig, region string or feature ID." - ) + # If we get here, the provided region is not a valid contig, coordinate + # string, or feature ID. For compatibility with existing callers/tests, + # treat unknown single contig strings as "contig not found". + raise ValueError(f"Contig {region!r} not found.") def _parse_multi_region( @@ -798,7 +815,10 @@ def _simple_xarray_concat_arrays( xr_arrays = [ds[k] for ds in datasets] # Check that all arrays have the same dimension as the same axis. - assert all([a.dims[axis] == dim for a in xr_arrays]) + if not all(a.dims[axis] == dim for a in xr_arrays): + raise ValueError( + f"Not all arrays have dimension '{dim}' at axis {axis}" + ) # Access the inner arrays - these are either numpy or dask arrays. inner_arrays = [a.data for a in xr_arrays] @@ -1560,14 +1580,26 @@ def _apply_allele_mapping(x, mapping, max_allele): def _dask_apply_allele_mapping(v, mapping, max_allele): if not isinstance(v, da.Array): - raise TypeError(f"Expected v to be a dask.array.Array, got {type(v).__name__}") + raise TypeError( + f"Expected input array to be a dask.array.Array, got {type(v).__name__}" + ) if not isinstance(mapping, np.ndarray): raise TypeError( f"Expected mapping to be a numpy.ndarray, got {type(mapping).__name__}" ) - assert v.ndim == 2 - assert mapping.ndim == 2 - assert v.shape[0] == mapping.shape[0] + if v.ndim != 2: + raise ValueError( + f"Expected input array to be 2-dimensional, got {v.ndim} dimensions" + ) + if mapping.ndim != 2: + raise ValueError( + f"Expected mapping to be 2-dimensional, got {mapping.ndim} dimensions" + ) + if v.shape[0] != mapping.shape[0]: + raise ValueError( + "Expected input array and mapping to have the same first dimension, " + f"got {v.shape[0]} and {mapping.shape[0]}" + ) v = v.rechunk((v.chunks[0], -1)) mapping = da.from_array(mapping, chunks=(v.chunks[0], -1)) out = da.map_blocks( @@ -1590,19 +1622,41 @@ def _genotype_array_map_alleles(gt, mapping): raise TypeError( f"Expected mapping to be a numpy.ndarray, got {type(mapping).__name__}" ) - assert gt.ndim == 3 - assert mapping.ndim == 3 - assert gt.shape[0] == mapping.shape[0] - assert gt.shape[1] > 0 - assert gt.shape[2] == 2 + if gt.ndim != 3: + raise ValueError( + f"Expected genotype calls array to be 3-dimensional, got {gt.ndim} dimensions" + ) + if mapping.ndim != 3: + raise ValueError( + f"Expected mapping to be 3-dimensional, got {mapping.ndim} dimensions" + ) + if gt.shape[0] != mapping.shape[0]: + raise ValueError( + "Expected genotype calls array and mapping to have the same first dimension, " + f"got {gt.shape[0]} and {mapping.shape[0]}" + ) + if gt.shape[1] <= 0: + raise ValueError(f"Expected genotype calls axis 1 to be > 0, got {gt.shape[1]}") + if gt.shape[2] != 2: + raise ValueError( + f"Expected genotype calls axis 2 to be 2 (diploid), got {gt.shape[2]}" + ) if gt.size > 0: # Block is not empty, can pass through to GenotypeArray. - assert gt.shape[0] > 0 + if gt.shape[0] <= 0: + raise RuntimeError( + f"Internal error: non-empty genotype block but axis 0 is {gt.shape[0]}. " + "Please open a GitHub issue." + ) m = mapping[:, 0, :] out = allel.GenotypeArray(gt).map_alleles(m).values else: # Block is empty so no alleles need to be mapped. - assert gt.shape[0] == 0 + if gt.shape[0] != 0: + raise RuntimeError( + f"Internal error: empty genotype block but axis 0 is {gt.shape[0]}. " + "Please open a GitHub issue." + ) out = gt return out @@ -1616,9 +1670,17 @@ def _dask_genotype_array_map_alleles(gt, mapping): raise TypeError( f"Expected mapping to be a numpy.ndarray, got {type(mapping).__name__}" ) - assert gt.ndim == 3 - assert mapping.ndim == 2 - assert gt.shape[0] == mapping.shape[0] + if gt.ndim != 3: + raise ValueError(f"Expected gt to be 3-dimensional, got {gt.ndim} dimensions") + if mapping.ndim != 2: + raise ValueError( + f"Expected mapping to be 2-dimensional, got {mapping.ndim} dimensions" + ) + if gt.shape[0] != mapping.shape[0]: + raise ValueError( + f"gt and mapping must have same first dimension, " + f"got gt.shape[0]={gt.shape[0]} and mapping.shape[0]={mapping.shape[0]}" + ) mapping = da.from_array(mapping, chunks=(gt.chunks[0], -1)) gt_out = da.map_blocks( _genotype_array_map_alleles,