From 8ec1880d12583d3b9dee056c16538e822a038ee8 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Tue, 3 Feb 2026 12:22:32 +0300 Subject: [PATCH 01/32] obs and obsm more detail validation --- src/cap_upload_validator/errors.py | 33 +++-- src/cap_upload_validator/upload_validator.py | 133 ++++++++++++++----- 2 files changed, 122 insertions(+), 44 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index b766819..75e274b 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -78,21 +78,30 @@ class AnnDataFileMissingCountMatrix(CapException): class AnnDataMissingEmbeddings(CapException): name = "AnnDataMissingEmbeddings" - message = \ - """ - The embedding is missing or is incorrectly named: embeddings must be a [n_cells x 2] - numpy array saved with the prefix X_, for example: X_tsne, X_pca or X_umap. - """ + + def __init__(self, details: str = ""): + base_message = ( + "The embedding is missing or incorrectly formatted. " + "Embeddings must be stored in `.obsm` as [n_cells x 2] datasets " + "with names starting with 'X_' (e.g. X_umap, X_tsne)." + ) + if details: + self.message = f"{base_message}\nDetails:\n{details}" + else: + self.message = base_message class AnnDataMissingObsColumns(CapException): name = "AnnDataMissingObsColumns" - message = \ - """ - Required obs column(s) missing: file must contain - 'assay', 'disease', 'organism' and 'tissue' fields or - corresponding '_ontology_term_id' fields with valid values. - """ + + def __init__(self, details: str = ""): + base_message = ( + "Required obs metadata is missing. File must contain " + "'assay', 'disease', 'organism', 'tissue' " + "or corresponding '_ontology_term_id' fields." + ) + self.message = f"{base_message}\nDetails: {details}" if details else base_message + class AnnDataNoneInGeneralMetadata(CapException): name = "AnnDataNoneInGeneralMetadata" @@ -103,6 +112,7 @@ class AnnDataNoneInGeneralMetadata(CapException): corresponding '_ontology_term_id' fields with valid values. """ + class AnnDataNonStandardVarError(CapException): name = "AnnDataNonStandardVarError" message = \ @@ -114,6 +124,7 @@ class AnnDataNonStandardVarError(CapException): support@celltype.info and we will work to accommodate your request. """ + class CSCMatrixInX(CapException): name = "CSCMatrixInX" diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index b425158..4d42786 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -135,59 +135,126 @@ def _check_is_positive_integers(self, cap_adata: CapAnnData) -> bool: return True def _check_obsm(self, cap_adata: CapAnnData) -> None: + """ + Validate presence and correctness of embeddings in .obsm. + Appends AnnDataMissingEmbeddings if invalid. + """ logger.debug("Begin checking obsm") - if self._has_embeddings(cap_adata) is False: - self._multi_exception.append(AnnDataMissingEmbeddings()) - logger.debug("Finished checking obsm!") - def _has_embeddings(self, cap_adata: CapAnnData) -> bool: if cap_adata.obsm is None: - logger.debug("Obsm is not found in anndata!") - return False - + self._multi_exception.append( + AnnDataMissingEmbeddings("Obsm is not found in anndata!") + ) + return + n_cells = cap_adata.shape[0] + obsm_keys = list(cap_adata.obsm_keys()) + + if not obsm_keys: + self._multi_exception.append( + AnnDataMissingEmbeddings("Obsm exists but contains no keys.") + ) + return - for field in cap_adata.obsm_keys(): - if field.startswith(EMBEDDING_PREFIX): - entity = cap_adata.obsm[field] - if isinstance(entity, Dataset) and entity.shape == (n_cells, 2): - # looking for dense matrix of N x 2 shape - return True + embedding_keys = [k for k in obsm_keys if k.startswith(EMBEDDING_PREFIX)] - logger.debug(f"Embeddings not found in obsm_keys = {cap_adata.obsm_keys()}!") - return False + if not embedding_keys: + self._multi_exception.append( + AnnDataMissingEmbeddings( + f"Obsm keys found: {obsm_keys}, " + f"but none start with required prefix '{EMBEDDING_PREFIX}'." + ) + ) + return + + errors = [] + + for key in embedding_keys: + entity = cap_adata.obsm[key] + + if not isinstance(entity, Dataset): + errors.append( + f"{key}: expected h5py.Dataset, found {type(entity).__name__}" + ) + continue + + if entity.shape != (n_cells, 2): + errors.append( + f"{key}: invalid shape {entity.shape}, expected ({n_cells}, 2)" + ) + continue + + # Found at least one valid embedding, so success case + return + + # No valid embeddings found + self._multi_exception.append( + AnnDataMissingEmbeddings( + "Embedding candidates found but invalid:\n" + "\n".join(errors) + ) + ) + + logger.debug("Finished checking obsm!") def _check_obs(self, cap_adata: CapAnnData) -> None: logger.debug("Start checking obs") - obs_keys = cap_adata.obs_keys() + + obs_keys = list(cap_adata.obs_keys()) logger.debug(f"Checking obs_columns = {obs_keys} for required {GENERAL_METADATA}!") - + + # If obs missing entirely if cap_adata.obs is None or not obs_keys: logger.warning(".obs is missing!") - self._multi_exception.append(AnnDataMissingObsColumns()) + self._multi_exception.append( + AnnDataMissingObsColumns(".obs is missing completely.") + ) return + missing_columns = [] + empty_columns = [] + for col in GENERAL_METADATA: ont_id_col = f"{col}_ontology_term_id" + col_in_obs = col in obs_keys ont_id_col_in_obs = ont_id_col in obs_keys + # Column missing entirely if not (col_in_obs or ont_id_col_in_obs): - logger.debug(f"Column {col} is missing in .obs!") - self._multi_exception.append(AnnDataMissingObsColumns(f"{col} column is missing in .obs!")) - return - else: - if col_in_obs: - if not self._check_df_col_for_none(cap_adata.obs[col]): - logger.debug(f"Column {col} contains None or empty values in .obs!") - self._multi_exception.append(AnnDataNoneInGeneralMetadata(f"{col} column contains None or empty values in .obs!")) - return - if ont_id_col_in_obs: - cap_adata.read_obs([ont_id_col]) - if not self._check_df_col_for_none(cap_adata.obs[ont_id_col]): - logger.debug(f"Column {ont_id_col} contains None or empty values in .obs!") - self._multi_exception.append(AnnDataNoneInGeneralMetadata(f"{ont_id_col} column contains None or empty values in .obs!")) - return + missing_columns.append(col) + continue + + # Validate regular column values + if col_in_obs: + if not self._check_df_col_for_none(cap_adata.obs[col]): + empty_columns.append(col) + + # Validate ontology column values + if ont_id_col_in_obs: + if ont_id_col not in cap_adata.obs.columns: + cap_adata.read_obs(columns=[ont_id_col]) + + if not self._check_df_col_for_none(cap_adata.obs[ont_id_col]): + empty_columns.append(ont_id_col) + + # Report missing columns + if missing_columns: + self._multi_exception.append( + AnnDataMissingObsColumns( + "Missing required obs columns: " + + ", ".join(missing_columns) + ) + ) + + # Report empty/None columns + if empty_columns: + self._multi_exception.append( + AnnDataNoneInGeneralMetadata( + "Required obs columns contain empty/None values: " + + ", ".join(empty_columns) + ) + ) + logger.debug("Finished checking obs!") @staticmethod From 2fc0d111d2c341776d749bd926411ac308bca4e1 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Tue, 10 Feb 2026 09:45:33 +0300 Subject: [PATCH 02/32] var, bad anndata, X check --- src/cap_upload_validator/errors.py | 63 ++++++-- src/cap_upload_validator/upload_validator.py | 142 +++++++++---------- 2 files changed, 121 insertions(+), 84 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 75e274b..53a3c6f 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -71,9 +71,14 @@ def to_list(self) -> List[CapException]: return self.ex_list -class AnnDataFileMissingCountMatrix(CapException): - name = "AnnDataFileMissingCountMatrix" - message = "DataFile Incorrect format: raw data matrix is missing in .raw.X or .X." +class AnnDataMissingCountMatrix(CapException): + name = "AnnDataMissingCountMatrix" + message = "Count matrix is missing in both `.X` and `.raw.X`." + + +class AnnDataInvalidCountMatrix(CapException): + name = "AnnDataInvalidCountMatrix" + message = "Count matrix contains invalid values." class AnnDataMissingEmbeddings(CapException): @@ -113,16 +118,48 @@ class AnnDataNoneInGeneralMetadata(CapException): """ -class AnnDataNonStandardVarError(CapException): - name = "AnnDataNonStandardVarError" - message = \ - """ - File does not contain valid ENSEMBL terms in var. - We currently support Homo sapiens and Mus musculus. - In the case of multiple species in the dataset, orthologous Homo sapiens genes are required. - If there are other species you wish to upload to CAP, please contact - support@celltype.info and we will work to accommodate your request. - """ +class AnnDataVarError(CapException): + name = "AnnDataVarError" + + +class AnnDataMissingVarIndex(AnnDataVarError): + name = "AnnDataMissingVarIndex" + message = "The `.var.index` is missing or empty." + + +class AnnDataNumericVarIndex(AnnDataVarError): + name = "AnnDataNumericVarIndex" + message = "The `.var.index` contains numeric values instead of gene identifiers." + + +class AnnDataVarNotSubsetOfRawVar(AnnDataVarError): + name = "AnnDataVarNotSubsetOfRawVar" + message = "`var.index` must be a subset of `raw.var.index`." + + +class AnnDataUnsupportedOrganism(AnnDataVarError): + name = "AnnDataUnsupportedOrganism" + message = ( + "The organism in the dataset is not supported. " + "Currently supported: Homo sapiens and Mus musculus." + ) + + +class AnnDataMixedSpeciesGenes(AnnDataVarError): + name = "AnnDataMixedSpeciesGenes" + message = ( + "Multiple organisms detected. " + "Gene identifiers must be orthologous Homo sapiens ENSEMBL genes." + ) + + +class AnnDataGenesNotInReference(AnnDataVarError): + name = "AnnDataGenesNotInReference" + + def __init__(self, n_missing: int): + self.message = ( + f"{n_missing} gene identifiers were not found in the reference gene map." + ) class CSCMatrixInX(CapException): diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index 4d42786..429e67b 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -17,9 +17,15 @@ from .errors import ( CapMultiException, AnnDataFileMissingCountMatrix, + AnnDataInvalidCountMatrix, AnnDataMissingEmbeddings, AnnDataMissingObsColumns, - AnnDataNonStandardVarError, + AnnDataMissingVarIndex, + AnnDataNumericVarIndex, + AnnDataVarNotSubsetOfRawVar, + AnnDataUnsupportedOrganism, + AnnDataGenesNotInReference, + AnnDataDuplicateGenes, BadAnnDataFile, AnnDataNoneInGeneralMetadata, CSCMatrixInX, @@ -61,7 +67,9 @@ def validate(self, report_success: bool = True) -> None: logger.debug("Begin anndata file validation...") if not str(self._adata_path).endswith(".h5ad"): - raise BadAnnDataFile + raise BadAnnDataFile( + details=f"Expected '.h5ad' file, but got: {self._adata_path}" + ) with read_h5ad(self._adata_path, edit=False) as cap_adata: cap_adata.read_obs(columns=GENERAL_METADATA) # TODO: read all columns? @@ -87,7 +95,9 @@ def find_missing_genes(self) -> Optional[pd.DataFrame]: logger.debug("Begin finding missing genes...") if not str(self._adata_path).endswith(".h5ad"): - raise BadAnnDataFile + raise BadAnnDataFile( + details=f"Expected '.h5ad' file, but got: {self._adata_path}" + ) missing_genes = None with read_h5ad(self._adata_path, edit=False) as cap_adata: @@ -104,11 +114,18 @@ def find_missing_genes(self) -> Optional[pd.DataFrame]: def _check_X(self, cap_adata: CapAnnData) -> None: - logger.debug("Begin checking X") X = cap_adata.raw.X if cap_adata.raw is not None else cap_adata.X - if X is None or not self._check_is_positive_integers(cap_adata): + + if X is None: self._multi_exception.append(AnnDataFileMissingCountMatrix()) - logger.debug("Finished checking X!") + return + + if not self._check_is_positive_integers(cap_adata): + self._multi_exception.append( + AnnDataInvalidCountMatrix( + details="Values must be non-negative integers." + ) + ) @staticmethod def has_only_integers(arr: np.ndarray) -> bool: @@ -264,87 +281,70 @@ def _check_df_col_for_none(series: pd.Series) -> bool: def _check_var_index(self, cap_adata: CapAnnData) -> Optional[pd.Series]: logger.debug("Start checking var index...") + index = cap_adata.var.index + + if index is None or index.empty: + self._multi_exception.append(AnnDataMissingVarIndex()) + return + + if pd.api.types.is_any_real_numeric_dtype(index): + self._multi_exception.append(AnnDataNumericVarIndex()) + return + clean_index = self._remove_gene_version(index) self._ensembl_ids = clean_index if not clean_index.is_unique: - logger.debug("There are non unique gene ids in .var.index!") - self._multi_exception.append(AnnDataNonStandardVarError()) - return + self._multi_exception.append( + AnnDataDuplicateGenes( + details="Duplicate gene IDs found after removing version suffixes." + ) + ) + return - # Check if the var.index is a subset of raw.var.index if cap_adata.raw is not None and cap_adata.raw.var is not None: - logger.debug("As of raw exists, checking that var.index is a subset of raw.var.index!") if not index.isin(cap_adata.raw.var.index).all(): - self._multi_exception.append(AnnDataNonStandardVarError()) + self._multi_exception.append(AnnDataVarNotSubsetOfRawVar()) return - # Check the number of organisms in the dataset - known_organisms = [HomoSapiens, MusMusculus] # Only Human and Mouse supported this moment - known_organisms_values = {ko.name for ko in known_organisms} - obs_keys = cap_adata.obs_keys() - if ORGANISM_COLUMN in obs_keys: - dataset_organisms = cap_adata.obs[ORGANISM_COLUMN].unique().tolist() - if "" in dataset_organisms: - dataset_organisms.remove("") - dataset_organisms = list(map(str_to_organism, dataset_organisms)) - elif ORGANISM_ONT_ID_COLUMN in obs_keys: - if ORGANISM_ONT_ID_COLUMN not in cap_adata.obs.columns: - cap_adata.read_obs(columns=[ORGANISM_ONT_ID_COLUMN]) - org_ont_ids = cap_adata.obs[ORGANISM_ONT_ID_COLUMN].unique().tolist() - if "" in org_ont_ids: - org_ont_ids.remove("") - dataset_organisms = list(map(ontology_id_to_organism, org_ont_ids)) - else: - dataset_organisms = [] - logger.debug(f"Organism(s) in dataset = {dataset_organisms}, known organisms = {known_organisms_values}") - - missing_genes_mask = None - # Check ENSEMBL ids for supported organism + dataset_organisms = self._get_dataset_organisms(cap_adata) + + if len(dataset_organisms) == 0: + logger.debug("No organism info found; skipping gene validation.") + return + if len(dataset_organisms) == 1: organism = dataset_organisms[0] self._organism = organism - if organism.name in known_organisms_values: - logger.debug("There is the only known organisms in dataset, so we must check for Unsemble IDs in var.index!") - missing_genes_mask = self._validate_gene_ids(ens_ids=clean_index, organism=organism) - else: - logger.debug("Unknown organisms in dataset found, index var validation skipped!") - elif len(dataset_organisms) > 1: - logger.debug("There are multiple organisms in dataset") - self._organism = MultiSpecies - missing_genes_mask = self._validate_gene_ids( - ens_ids=clean_index, - organism=self._organism, - ) - logger.debug("Finished checking var index!") - return missing_genes_mask + + if organism.name not in {HomoSapiens.name, MusMusculus.name}: + self._multi_exception.append(AnnDataUnsupportedOrganism()) + return + + return self._validate_gene_ids(clean_index, organism) + + # multi-species + self._organism = MultiSpecies + return self._validate_gene_ids(clean_index, MultiSpecies) def _validate_gene_ids( - self, - ens_ids: pd.Series, - organism: str, - ) -> Optional[pd.Series]: - """ - The method finds missing genes from gene map for given organism. - Return None if all genes are valid. - Else return pd.Series of boolean mask of missing genes. - """ - if ens_ids.empty or pd.api.types.is_any_real_numeric_dtype(ens_ids): - # Gene names are missed - logger.debug("Gene names are missed!") - self._multi_exception.append(AnnDataNonStandardVarError()) - return - - # Check genes with gene maps + self, + ens_ids: pd.Index, + organism: Organism, + ) -> Optional[pd.Series]: + df = GeneMap.data_frame(organisms=organism) - missing_genes_mask = ~ens_ids.isin(df['ENSEMBL_gene']) - if missing_genes_mask.any(): - # Gene names are non standard - logger.debug("Gene names are not standard!") - self._multi_exception.append(AnnDataNonStandardVarError()) - return missing_genes_mask - + missing_mask = ~ens_ids.isin(df["ENSEMBL_gene"]) + + if missing_mask.any(): + self._multi_exception.append( + AnnDataGenesNotInReference(n_missing=missing_mask.sum()) + ) + return missing_mask + + return None + @staticmethod def _remove_gene_version(ensemble_ids: pd.Index) -> pd.Index: """ From c6375a37f4ac7cca66d044867e6e0f066e059122 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Tue, 10 Feb 2026 09:53:40 +0300 Subject: [PATCH 03/32] fix --- src/cap_upload_validator/errors.py | 8 --- src/cap_upload_validator/upload_validator.py | 67 +++++++++++++------- 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 53a3c6f..0507cd3 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -137,14 +137,6 @@ class AnnDataVarNotSubsetOfRawVar(AnnDataVarError): message = "`var.index` must be a subset of `raw.var.index`." -class AnnDataUnsupportedOrganism(AnnDataVarError): - name = "AnnDataUnsupportedOrganism" - message = ( - "The organism in the dataset is not supported. " - "Currently supported: Homo sapiens and Mus musculus." - ) - - class AnnDataMixedSpeciesGenes(AnnDataVarError): name = "AnnDataMixedSpeciesGenes" message = ( diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index 429e67b..ef607bd 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -23,9 +23,7 @@ AnnDataMissingVarIndex, AnnDataNumericVarIndex, AnnDataVarNotSubsetOfRawVar, - AnnDataUnsupportedOrganism, AnnDataGenesNotInReference, - AnnDataDuplicateGenes, BadAnnDataFile, AnnDataNoneInGeneralMetadata, CSCMatrixInX, @@ -296,51 +294,76 @@ def _check_var_index(self, cap_adata: CapAnnData) -> Optional[pd.Series]: self._ensembl_ids = clean_index if not clean_index.is_unique: - self._multi_exception.append( - AnnDataDuplicateGenes( - details="Duplicate gene IDs found after removing version suffixes." - ) - ) + self._multi_exception.append(AnnDataGenesNotInReference(n_missing=0)) return + # var and raw.var if cap_adata.raw is not None and cap_adata.raw.var is not None: if not index.isin(cap_adata.raw.var.index).all(): self._multi_exception.append(AnnDataVarNotSubsetOfRawVar()) return - dataset_organisms = self._get_dataset_organisms(cap_adata) + # organism detection + known_organisms = {HomoSapiens.name, MusMusculus.name} + obs_keys = cap_adata.obs_keys() - if len(dataset_organisms) == 0: - logger.debug("No organism info found; skipping gene validation.") - return + if ORGANISM_COLUMN in obs_keys: + dataset_organisms = cap_adata.obs[ORGANISM_COLUMN].unique().tolist() + dataset_organisms = [o for o in dataset_organisms if o] + dataset_organisms = list(map(str_to_organism, dataset_organisms)) + + elif ORGANISM_ONT_ID_COLUMN in obs_keys: + if ORGANISM_ONT_ID_COLUMN not in cap_adata.obs.columns: + cap_adata.read_obs(columns=[ORGANISM_ONT_ID_COLUMN]) + org_ids = cap_adata.obs[ORGANISM_ONT_ID_COLUMN].unique().tolist() + org_ids = [o for o in org_ids if o] + dataset_organisms = list(map(ontology_id_to_organism, org_ids)) + + else: + dataset_organisms = [] + + logger.debug(f"Organism(s) in dataset = {dataset_organisms}") + + # validate organism if len(dataset_organisms) == 1: organism = dataset_organisms[0] self._organism = organism - if organism.name not in {HomoSapiens.name, MusMusculus.name}: - self._multi_exception.append(AnnDataUnsupportedOrganism()) - return + if organism.name in known_organisms: + return self._validate_gene_ids(clean_index, organism) - return self._validate_gene_ids(clean_index, organism) + # unknown organism => skip validation + logger.debug("Unknown organism found; skipping gene validation.") + return + + if len(dataset_organisms) > 1: + self._organism = MultiSpecies + return self._validate_gene_ids(clean_index, MultiSpecies) + + logger.debug("Finished checking var index!") + return - # multi-species - self._organism = MultiSpecies - return self._validate_gene_ids(clean_index, MultiSpecies) - def _validate_gene_ids( self, ens_ids: pd.Index, organism: Organism, ) -> Optional[pd.Series]: + if ens_ids.empty or pd.api.types.is_any_real_numeric_dtype(ens_ids): + self._multi_exception.append(AnnDataNumericVarIndex()) + return + df = GeneMap.data_frame(organisms=organism) missing_mask = ~ens_ids.isin(df["ENSEMBL_gene"]) if missing_mask.any(): - self._multi_exception.append( - AnnDataGenesNotInReference(n_missing=missing_mask.sum()) - ) + if organism is MultiSpecies: + self._multi_exception.append(AnnDataMixedSpeciesGenes()) + else: + self._multi_exception.append( + AnnDataGenesNotInReference(n_missing=missing_mask.sum()) + ) return missing_mask return None From b5d1fc68c1125422d7dfd61464ca842e5eb6c234 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Tue, 10 Feb 2026 09:55:40 +0300 Subject: [PATCH 04/32] Update upload_validator.py --- src/cap_upload_validator/upload_validator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index ef607bd..ae08734 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -22,6 +22,7 @@ AnnDataMissingObsColumns, AnnDataMissingVarIndex, AnnDataNumericVarIndex, + AnnDataMixedSpeciesGenes, AnnDataVarNotSubsetOfRawVar, AnnDataGenesNotInReference, BadAnnDataFile, From c0e93e074ea7c62a69af1d24f2c68d53cf723891 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Tue, 10 Feb 2026 09:58:36 +0300 Subject: [PATCH 05/32] Update upload_validator.py --- src/cap_upload_validator/upload_validator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index ae08734..01292aa 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -16,7 +16,7 @@ ) from .errors import ( CapMultiException, - AnnDataFileMissingCountMatrix, + AnnDataMissingCountMatrix, AnnDataInvalidCountMatrix, AnnDataMissingEmbeddings, AnnDataMissingObsColumns, @@ -116,7 +116,7 @@ def _check_X(self, cap_adata: CapAnnData) -> None: X = cap_adata.raw.X if cap_adata.raw is not None else cap_adata.X if X is None: - self._multi_exception.append(AnnDataFileMissingCountMatrix()) + self._multi_exception.append(AnnDataMissingCountMatrix()) return if not self._check_is_positive_integers(cap_adata): From 1b18b5f00110b07706422ba1a63cfb024d9b1184 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Tue, 10 Feb 2026 13:32:26 +0300 Subject: [PATCH 06/32] test fixes --- src/cap_upload_validator/errors.py | 4 ++-- src/cap_upload_validator/upload_validator.py | 4 +--- test/test_upload_validator.py | 6 +++--- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 0507cd3..cca7b45 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -73,12 +73,12 @@ def to_list(self) -> List[CapException]: class AnnDataMissingCountMatrix(CapException): name = "AnnDataMissingCountMatrix" - message = "Count matrix is missing in both `.X` and `.raw.X`." + message = "Matrix is missing in both `.X` and `.raw.X`." class AnnDataInvalidCountMatrix(CapException): name = "AnnDataInvalidCountMatrix" - message = "Count matrix contains invalid values." + message = "Matrix values must be non-negative integers." class AnnDataMissingEmbeddings(CapException): diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index 01292aa..c6f97de 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -121,9 +121,7 @@ def _check_X(self, cap_adata: CapAnnData) -> None: if not self._check_is_positive_integers(cap_adata): self._multi_exception.append( - AnnDataInvalidCountMatrix( - details="Values must be non-negative integers." - ) + AnnDataInvalidCountMatrix() ) @staticmethod diff --git a/test/test_upload_validator.py b/test/test_upload_validator.py index 3d9c104..57467fc 100644 --- a/test/test_upload_validator.py +++ b/test/test_upload_validator.py @@ -29,7 +29,7 @@ from cap_upload_validator.errors import ( AnnDataMissingEmbeddings, AnnDataMissingObsColumns, - AnnDataNonStandardVarError, + AnnDataVarError, CapMultiException, AnnDataNoneInGeneralMetadata, CSCMatrixInX, @@ -154,7 +154,7 @@ def check_var_index(): adata.write_h5ad(filename=file_path) try: check_var_index() - except AnnDataNonStandardVarError: + except AnnDataVarError: pass except Exception as e: assert False, f"Unpredicted error: {e}" @@ -174,7 +174,7 @@ def check_var_index(): adata.write_h5ad(filename=file_path) try: check_var_index() - except AnnDataNonStandardVarError: + except AnnDataVarError: pass except Exception as e: assert False, f"Unpredicted error: {e}" From dc8d33e7b7277bfff500177848adc0392fcf1c3b Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Tue, 10 Feb 2026 13:45:20 +0300 Subject: [PATCH 07/32] minor revert --- src/cap_upload_validator/errors.py | 14 +++++++------- test/test_upload_validator.py | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index cca7b45..0044e45 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -118,26 +118,26 @@ class AnnDataNoneInGeneralMetadata(CapException): """ -class AnnDataVarError(CapException): - name = "AnnDataVarError" +class AnnDataNonStandardVarError(CapException): + name = "AnnDataNonStandardVarError" -class AnnDataMissingVarIndex(AnnDataVarError): +class AnnDataMissingVarIndex(AnnDataNonStandardVarError): name = "AnnDataMissingVarIndex" message = "The `.var.index` is missing or empty." -class AnnDataNumericVarIndex(AnnDataVarError): +class AnnDataNumericVarIndex(AnnDataNonStandardVarError): name = "AnnDataNumericVarIndex" message = "The `.var.index` contains numeric values instead of gene identifiers." -class AnnDataVarNotSubsetOfRawVar(AnnDataVarError): +class AnnDataVarNotSubsetOfRawVar(AnnDataNonStandardVarError): name = "AnnDataVarNotSubsetOfRawVar" message = "`var.index` must be a subset of `raw.var.index`." -class AnnDataMixedSpeciesGenes(AnnDataVarError): +class AnnDataMixedSpeciesGenes(AnnDataNonStandardVarError): name = "AnnDataMixedSpeciesGenes" message = ( "Multiple organisms detected. " @@ -145,7 +145,7 @@ class AnnDataMixedSpeciesGenes(AnnDataVarError): ) -class AnnDataGenesNotInReference(AnnDataVarError): +class AnnDataGenesNotInReference(AnnDataNonStandardVarError): name = "AnnDataGenesNotInReference" def __init__(self, n_missing: int): diff --git a/test/test_upload_validator.py b/test/test_upload_validator.py index 57467fc..3d9c104 100644 --- a/test/test_upload_validator.py +++ b/test/test_upload_validator.py @@ -29,7 +29,7 @@ from cap_upload_validator.errors import ( AnnDataMissingEmbeddings, AnnDataMissingObsColumns, - AnnDataVarError, + AnnDataNonStandardVarError, CapMultiException, AnnDataNoneInGeneralMetadata, CSCMatrixInX, @@ -154,7 +154,7 @@ def check_var_index(): adata.write_h5ad(filename=file_path) try: check_var_index() - except AnnDataVarError: + except AnnDataNonStandardVarError: pass except Exception as e: assert False, f"Unpredicted error: {e}" @@ -174,7 +174,7 @@ def check_var_index(): adata.write_h5ad(filename=file_path) try: check_var_index() - except AnnDataVarError: + except AnnDataNonStandardVarError: pass except Exception as e: assert False, f"Unpredicted error: {e}" From 3d42164fc824317e4f2f4175d7eea5d1df0148b0 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Tue, 10 Feb 2026 14:48:37 +0300 Subject: [PATCH 08/32] Update upload_validator.py --- src/cap_upload_validator/upload_validator.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index c6f97de..1efe742 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -348,7 +348,11 @@ def _validate_gene_ids( ens_ids: pd.Index, organism: Organism, ) -> Optional[pd.Series]: - + """ + The method finds missing genes from gene map for given organism. + Return None if all genes are valid. + Else return pd.Series of boolean mask of missing genes. + """ if ens_ids.empty or pd.api.types.is_any_real_numeric_dtype(ens_ids): self._multi_exception.append(AnnDataNumericVarIndex()) return From 70dea6679e981caf32cd34ce599ac9f7e58efff8 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Tue, 10 Feb 2026 15:05:50 +0300 Subject: [PATCH 09/32] Update upload_validator.py --- src/cap_upload_validator/upload_validator.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index 1efe742..e05d0b2 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -353,7 +353,13 @@ def _validate_gene_ids( Return None if all genes are valid. Else return pd.Series of boolean mask of missing genes. """ - if ens_ids.empty or pd.api.types.is_any_real_numeric_dtype(ens_ids): + if ens_ids.empty: + self._multi_exception.append(AnnDataMissingVarIndex()) + logger.debug("Gene names are missed!") + return + + if pd.api.types.is_any_real_numeric_dtype(ens_ids): + logger.debug("Gene names are numeric!") self._multi_exception.append(AnnDataNumericVarIndex()) return From 146c769a16f07a5320073d17df6bd9584ed2c0e5 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Tue, 10 Feb 2026 15:44:38 +0300 Subject: [PATCH 10/32] Update upload_validator.py --- src/cap_upload_validator/upload_validator.py | 68 +++++++++++--------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index e05d0b2..657334b 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -156,8 +156,10 @@ def _check_obsm(self, cap_adata: CapAnnData) -> None: logger.debug("Begin checking obsm") if cap_adata.obsm is None: + reason = "Obsm is not found in anndata!" + logger.debug(reason) self._multi_exception.append( - AnnDataMissingEmbeddings("Obsm is not found in anndata!") + AnnDataMissingEmbeddings(reason) ) return @@ -165,19 +167,20 @@ def _check_obsm(self, cap_adata: CapAnnData) -> None: obsm_keys = list(cap_adata.obsm_keys()) if not obsm_keys: + reason = "Obsm exists but contains no keys." + logger.debug(reason) self._multi_exception.append( - AnnDataMissingEmbeddings("Obsm exists but contains no keys.") + AnnDataMissingEmbeddings(reason) ) return embedding_keys = [k for k in obsm_keys if k.startswith(EMBEDDING_PREFIX)] if not embedding_keys: + reason = f"Obsm keys found: {obsm_keys}, but none start with required prefix '{EMBEDDING_PREFIX}'." + logger.debug(reason) self._multi_exception.append( - AnnDataMissingEmbeddings( - f"Obsm keys found: {obsm_keys}, " - f"but none start with required prefix '{EMBEDDING_PREFIX}'." - ) + AnnDataMissingEmbeddings(reason) ) return @@ -202,10 +205,10 @@ def _check_obsm(self, cap_adata: CapAnnData) -> None: return # No valid embeddings found + reason = "Embedding candidates found but invalid:\n" + "\n".join(errors) + logger.debug(reason) self._multi_exception.append( - AnnDataMissingEmbeddings( - "Embedding candidates found but invalid:\n" + "\n".join(errors) - ) + AnnDataMissingEmbeddings(reason) ) logger.debug("Finished checking obsm!") @@ -218,9 +221,10 @@ def _check_obs(self, cap_adata: CapAnnData) -> None: # If obs missing entirely if cap_adata.obs is None or not obs_keys: - logger.warning(".obs is missing!") + reason = ".obs is missing completely." + logger.debug(reason) self._multi_exception.append( - AnnDataMissingObsColumns(".obs is missing completely.") + AnnDataMissingObsColumns(reason) ) return @@ -253,20 +257,18 @@ def _check_obs(self, cap_adata: CapAnnData) -> None: # Report missing columns if missing_columns: + reason = "Missing required obs columns: " + ", ".join(missing_columns) + logger.debug(reason) self._multi_exception.append( - AnnDataMissingObsColumns( - "Missing required obs columns: " - + ", ".join(missing_columns) - ) + AnnDataMissingObsColumns(reason) ) # Report empty/None columns if empty_columns: + reason = "Required obs columns contain empty/None values: " + ", ".join(empty_columns) + logger.debug(reason) self._multi_exception.append( - AnnDataNoneInGeneralMetadata( - "Required obs columns contain empty/None values: " - + ", ".join(empty_columns) - ) + AnnDataNoneInGeneralMetadata(reason) ) logger.debug("Finished checking obs!") @@ -296,14 +298,15 @@ def _check_var_index(self, cap_adata: CapAnnData) -> Optional[pd.Series]: self._multi_exception.append(AnnDataGenesNotInReference(n_missing=0)) return - # var and raw.var + # Check if the var.index is a subset of raw.var.index if cap_adata.raw is not None and cap_adata.raw.var is not None: + logger.debug("As of raw exists, checking that var.index is a subset of raw.var.index!") if not index.isin(cap_adata.raw.var.index).all(): self._multi_exception.append(AnnDataVarNotSubsetOfRawVar()) return - # organism detection - known_organisms = {HomoSapiens.name, MusMusculus.name} + # Check the number of organisms in the dataset + known_organisms = {HomoSapiens.name, MusMusculus.name} # Only Human and Mouse supported this moment obs_keys = cap_adata.obs_keys() if ORGANISM_COLUMN in obs_keys: @@ -322,26 +325,27 @@ def _check_var_index(self, cap_adata: CapAnnData) -> Optional[pd.Series]: else: dataset_organisms = [] - logger.debug(f"Organism(s) in dataset = {dataset_organisms}") + logger.debug(f"Organism(s) in dataset = {dataset_organisms}, known organisms = {known_organisms}") - # validate organism + missing_genes_mask = None + # Check ENSEMBL ids for supported organism if len(dataset_organisms) == 1: organism = dataset_organisms[0] self._organism = organism if organism.name in known_organisms: - return self._validate_gene_ids(clean_index, organism) + logger.debug("There is the only known organisms in dataset, so we must check for Unsemble IDs in var.index!") + missing_genes_mask = self._validate_gene_ids(clean_index, organism) - # unknown organism => skip validation - logger.debug("Unknown organism found; skipping gene validation.") + logger.debug("Unknown organisms in dataset found, index var validation skipped!") return - - if len(dataset_organisms) > 1: + elif len(dataset_organisms) > 1: + logger.debug("There are multiple organisms in dataset") self._organism = MultiSpecies - return self._validate_gene_ids(clean_index, MultiSpecies) + missing_genes_mask = self._validate_gene_ids(clean_index, MultiSpecies) logger.debug("Finished checking var index!") - return + return missing_genes_mask def _validate_gene_ids( self, @@ -368,8 +372,10 @@ def _validate_gene_ids( if missing_mask.any(): if organism is MultiSpecies: + logger.debug("Gene names are from mixes species!") self._multi_exception.append(AnnDataMixedSpeciesGenes()) else: + logger.debug("Gene names are not standard!") self._multi_exception.append( AnnDataGenesNotInReference(n_missing=missing_mask.sum()) ) From 0b20f494c464f56b22d4109368a031c3c9d7ea2f Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Tue, 10 Feb 2026 15:49:17 +0300 Subject: [PATCH 11/32] Update upload_validator.py --- src/cap_upload_validator/upload_validator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index 657334b..c621fb2 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -113,6 +113,7 @@ def find_missing_genes(self) -> Optional[pd.DataFrame]: def _check_X(self, cap_adata: CapAnnData) -> None: + logger.debug("Begin checking X") X = cap_adata.raw.X if cap_adata.raw is not None else cap_adata.X if X is None: @@ -124,6 +125,8 @@ def _check_X(self, cap_adata: CapAnnData) -> None: AnnDataInvalidCountMatrix() ) + logger.debug("Finish checking X") + @staticmethod def has_only_integers(arr: np.ndarray) -> bool: return np.all((arr - arr.astype(int)) == 0) From ae59d1761da1ada3cbbc044b02bcc5768ba8f874 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Tue, 10 Feb 2026 15:55:34 +0300 Subject: [PATCH 12/32] fixes --- src/cap_upload_validator/errors.py | 11 +++++++++-- src/cap_upload_validator/upload_validator.py | 6 ++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 0044e45..48a5579 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -144,13 +144,20 @@ class AnnDataMixedSpeciesGenes(AnnDataNonStandardVarError): "Gene identifiers must be orthologous Homo sapiens ENSEMBL genes." ) +class AnnDataGeneIndexIsNotUnique(AnnDataNonStandardVarError): + name = "AnnDataGeneIndexIsNotUnique" + + def __init__(self): + self.message = ( + "Gene identifiers must be unique." + ) class AnnDataGenesNotInReference(AnnDataNonStandardVarError): name = "AnnDataGenesNotInReference" - def __init__(self, n_missing: int): + def __init__(self): self.message = ( - f"{n_missing} gene identifiers were not found in the reference gene map." + f"Gene identifiers were not found in the reference gene map." ) diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index c621fb2..d8fd3e1 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -24,6 +24,7 @@ AnnDataNumericVarIndex, AnnDataMixedSpeciesGenes, AnnDataVarNotSubsetOfRawVar, + AnnDataGeneIndexIsNotUnique, AnnDataGenesNotInReference, BadAnnDataFile, AnnDataNoneInGeneralMetadata, @@ -298,7 +299,8 @@ def _check_var_index(self, cap_adata: CapAnnData) -> Optional[pd.Series]: self._ensembl_ids = clean_index if not clean_index.is_unique: - self._multi_exception.append(AnnDataGenesNotInReference(n_missing=0)) + logger.debug("There are non unique gene ids in .var.index!") + self._multi_exception.append(AnnDataGeneIndexIsNotUnique()) return # Check if the var.index is a subset of raw.var.index @@ -380,7 +382,7 @@ def _validate_gene_ids( else: logger.debug("Gene names are not standard!") self._multi_exception.append( - AnnDataGenesNotInReference(n_missing=missing_mask.sum()) + AnnDataGenesNotInReference() ) return missing_mask From 62b26a7de34b25cd56d5e0b47629c32dae0623ba Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Tue, 10 Feb 2026 15:57:43 +0300 Subject: [PATCH 13/32] simplify --- src/cap_upload_validator/errors.py | 8 +------- src/cap_upload_validator/upload_validator.py | 13 ++++--------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 48a5579..d04cbde 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -137,13 +137,6 @@ class AnnDataVarNotSubsetOfRawVar(AnnDataNonStandardVarError): message = "`var.index` must be a subset of `raw.var.index`." -class AnnDataMixedSpeciesGenes(AnnDataNonStandardVarError): - name = "AnnDataMixedSpeciesGenes" - message = ( - "Multiple organisms detected. " - "Gene identifiers must be orthologous Homo sapiens ENSEMBL genes." - ) - class AnnDataGeneIndexIsNotUnique(AnnDataNonStandardVarError): name = "AnnDataGeneIndexIsNotUnique" @@ -152,6 +145,7 @@ def __init__(self): "Gene identifiers must be unique." ) + class AnnDataGenesNotInReference(AnnDataNonStandardVarError): name = "AnnDataGenesNotInReference" diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index d8fd3e1..f1f651f 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -22,7 +22,6 @@ AnnDataMissingObsColumns, AnnDataMissingVarIndex, AnnDataNumericVarIndex, - AnnDataMixedSpeciesGenes, AnnDataVarNotSubsetOfRawVar, AnnDataGeneIndexIsNotUnique, AnnDataGenesNotInReference, @@ -376,14 +375,10 @@ def _validate_gene_ids( missing_mask = ~ens_ids.isin(df["ENSEMBL_gene"]) if missing_mask.any(): - if organism is MultiSpecies: - logger.debug("Gene names are from mixes species!") - self._multi_exception.append(AnnDataMixedSpeciesGenes()) - else: - logger.debug("Gene names are not standard!") - self._multi_exception.append( - AnnDataGenesNotInReference() - ) + logger.debug("Gene names are not standard!") + self._multi_exception.append( + AnnDataGenesNotInReference() + ) return missing_mask return None From e0350aa62cd04c120070b5aa809111e911b1c2f9 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Tue, 10 Feb 2026 16:00:21 +0300 Subject: [PATCH 14/32] code style fix --- src/cap_upload_validator/upload_validator.py | 44 +++++--------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index f1f651f..36761e2 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -121,9 +121,7 @@ def _check_X(self, cap_adata: CapAnnData) -> None: return if not self._check_is_positive_integers(cap_adata): - self._multi_exception.append( - AnnDataInvalidCountMatrix() - ) + self._multi_exception.append(AnnDataInvalidCountMatrix()) logger.debug("Finish checking X") @@ -161,9 +159,7 @@ def _check_obsm(self, cap_adata: CapAnnData) -> None: if cap_adata.obsm is None: reason = "Obsm is not found in anndata!" logger.debug(reason) - self._multi_exception.append( - AnnDataMissingEmbeddings(reason) - ) + self._multi_exception.append(AnnDataMissingEmbeddings(reason)) return n_cells = cap_adata.shape[0] @@ -172,9 +168,7 @@ def _check_obsm(self, cap_adata: CapAnnData) -> None: if not obsm_keys: reason = "Obsm exists but contains no keys." logger.debug(reason) - self._multi_exception.append( - AnnDataMissingEmbeddings(reason) - ) + self._multi_exception.append(AnnDataMissingEmbeddings(reason)) return embedding_keys = [k for k in obsm_keys if k.startswith(EMBEDDING_PREFIX)] @@ -182,9 +176,7 @@ def _check_obsm(self, cap_adata: CapAnnData) -> None: if not embedding_keys: reason = f"Obsm keys found: {obsm_keys}, but none start with required prefix '{EMBEDDING_PREFIX}'." logger.debug(reason) - self._multi_exception.append( - AnnDataMissingEmbeddings(reason) - ) + self._multi_exception.append(AnnDataMissingEmbeddings(reason)) return errors = [] @@ -193,15 +185,11 @@ def _check_obsm(self, cap_adata: CapAnnData) -> None: entity = cap_adata.obsm[key] if not isinstance(entity, Dataset): - errors.append( - f"{key}: expected h5py.Dataset, found {type(entity).__name__}" - ) + errors.append(f"{key}: expected h5py.Dataset, found {type(entity).__name__}") continue if entity.shape != (n_cells, 2): - errors.append( - f"{key}: invalid shape {entity.shape}, expected ({n_cells}, 2)" - ) + errors.append(f"{key}: invalid shape {entity.shape}, expected ({n_cells}, 2)") continue # Found at least one valid embedding, so success case @@ -210,9 +198,7 @@ def _check_obsm(self, cap_adata: CapAnnData) -> None: # No valid embeddings found reason = "Embedding candidates found but invalid:\n" + "\n".join(errors) logger.debug(reason) - self._multi_exception.append( - AnnDataMissingEmbeddings(reason) - ) + self._multi_exception.append(AnnDataMissingEmbeddings(reason)) logger.debug("Finished checking obsm!") @@ -226,9 +212,7 @@ def _check_obs(self, cap_adata: CapAnnData) -> None: if cap_adata.obs is None or not obs_keys: reason = ".obs is missing completely." logger.debug(reason) - self._multi_exception.append( - AnnDataMissingObsColumns(reason) - ) + self._multi_exception.append(AnnDataMissingObsColumns(reason)) return missing_columns = [] @@ -262,17 +246,13 @@ def _check_obs(self, cap_adata: CapAnnData) -> None: if missing_columns: reason = "Missing required obs columns: " + ", ".join(missing_columns) logger.debug(reason) - self._multi_exception.append( - AnnDataMissingObsColumns(reason) - ) + self._multi_exception.append(AnnDataMissingObsColumns(reason)) # Report empty/None columns if empty_columns: reason = "Required obs columns contain empty/None values: " + ", ".join(empty_columns) logger.debug(reason) - self._multi_exception.append( - AnnDataNoneInGeneralMetadata(reason) - ) + self._multi_exception.append(AnnDataNoneInGeneralMetadata(reason)) logger.debug("Finished checking obs!") @@ -376,9 +356,7 @@ def _validate_gene_ids( if missing_mask.any(): logger.debug("Gene names are not standard!") - self._multi_exception.append( - AnnDataGenesNotInReference() - ) + self._multi_exception.append(AnnDataGenesNotInReference()) return missing_mask return None From c3bc35cfb5a0e370ee26a0fea209eb0edbc96c44 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 11:13:18 +0300 Subject: [PATCH 15/32] bug fix --- src/cap_upload_validator/upload_validator.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index 36761e2..33c1282 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -318,11 +318,10 @@ def _check_var_index(self, cap_adata: CapAnnData) -> Optional[pd.Series]: self._organism = organism if organism.name in known_organisms: - logger.debug("There is the only known organisms in dataset, so we must check for Unsemble IDs in var.index!") + logger.debug("Single known organism found, validating gene IDs.") missing_genes_mask = self._validate_gene_ids(clean_index, organism) - - logger.debug("Unknown organisms in dataset found, index var validation skipped!") - return + else: + logger.debug("Unknown organism found, skipping gene validation.") elif len(dataset_organisms) > 1: logger.debug("There are multiple organisms in dataset") self._organism = MultiSpecies From 97f3cf9f6d2a278048be6471604e0e829f7cb94e Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 11:26:50 +0300 Subject: [PATCH 16/32] Update errors.py --- src/cap_upload_validator/errors.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index d04cbde..069280a 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -141,18 +141,14 @@ class AnnDataGeneIndexIsNotUnique(AnnDataNonStandardVarError): name = "AnnDataGeneIndexIsNotUnique" def __init__(self): - self.message = ( - "Gene identifiers must be unique." - ) + self.message = "Gene identifiers must be unique." class AnnDataGenesNotInReference(AnnDataNonStandardVarError): name = "AnnDataGenesNotInReference" def __init__(self): - self.message = ( - f"Gene identifiers were not found in the reference gene map." - ) + self.message = "Gene identifiers were not found in the reference gene map." class CSCMatrixInX(CapException): From 138c9a17443f0fbba552ccfdffbc643ae150c739 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 11:37:38 +0300 Subject: [PATCH 17/32] Update upload_validator.py --- src/cap_upload_validator/upload_validator.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index 33c1282..c5a7ebb 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -150,10 +150,6 @@ def _check_is_positive_integers(self, cap_adata: CapAnnData) -> bool: return True def _check_obsm(self, cap_adata: CapAnnData) -> None: - """ - Validate presence and correctness of embeddings in .obsm. - Appends AnnDataMissingEmbeddings if invalid. - """ logger.debug("Begin checking obsm") if cap_adata.obsm is None: From 88885832a9b25b96964e21b10ce6fb94fe27ad2b Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 14:39:55 +0300 Subject: [PATCH 18/32] Update src/cap_upload_validator/errors.py Co-authored-by: Roman Mukhin <59999203+rm1113@users.noreply.github.com> --- src/cap_upload_validator/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 069280a..682688f 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -78,7 +78,7 @@ class AnnDataMissingCountMatrix(CapException): class AnnDataInvalidCountMatrix(CapException): name = "AnnDataInvalidCountMatrix" - message = "Matrix values must be non-negative integers." + message = "Raw counts matrix values must be non-negative integers." class AnnDataMissingEmbeddings(CapException): From 810b580adf3c1d6197e6ba17566d6c1d8b49bd58 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 14:40:33 +0300 Subject: [PATCH 19/32] Update src/cap_upload_validator/errors.py Co-authored-by: Roman Mukhin <59999203+rm1113@users.noreply.github.com> --- src/cap_upload_validator/errors.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 682688f..d6e304a 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -139,9 +139,7 @@ class AnnDataVarNotSubsetOfRawVar(AnnDataNonStandardVarError): class AnnDataGeneIndexIsNotUnique(AnnDataNonStandardVarError): name = "AnnDataGeneIndexIsNotUnique" - - def __init__(self): - self.message = "Gene identifiers must be unique." + message = "`var.index` must not contain duplicates." class AnnDataGenesNotInReference(AnnDataNonStandardVarError): From 32fb5cdf1490a869859409a52a67224807b8b40d Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 14:46:01 +0300 Subject: [PATCH 20/32] AnnDataMissingObs --- src/cap_upload_validator/errors.py | 5 +++++ src/cap_upload_validator/upload_validator.py | 7 +++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index d6e304a..6dfa5cb 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -96,6 +96,11 @@ def __init__(self, details: str = ""): self.message = base_message +class AnnDataMissingObs(CapException): + name = "AnnDataMissingObs" + message = "The 'obs' is missing." + + class AnnDataMissingObsColumns(CapException): name = "AnnDataMissingObsColumns" diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index c5a7ebb..395e00c 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -19,6 +19,7 @@ AnnDataMissingCountMatrix, AnnDataInvalidCountMatrix, AnnDataMissingEmbeddings, + AnnDataMissingObs, AnnDataMissingObsColumns, AnnDataMissingVarIndex, AnnDataNumericVarIndex, @@ -204,11 +205,9 @@ def _check_obs(self, cap_adata: CapAnnData) -> None: obs_keys = list(cap_adata.obs_keys()) logger.debug(f"Checking obs_columns = {obs_keys} for required {GENERAL_METADATA}!") - # If obs missing entirely if cap_adata.obs is None or not obs_keys: - reason = ".obs is missing completely." - logger.debug(reason) - self._multi_exception.append(AnnDataMissingObsColumns(reason)) + logger.debug(".obs is missing completely.") + self._multi_exception.append(AnnDataMissingObs()) return missing_columns = [] From c6f67c2c29f4555b86363ea48a35f165a5019795 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 14:51:52 +0300 Subject: [PATCH 21/32] Fixes --- src/cap_upload_validator/errors.py | 16 +++++----------- src/cap_upload_validator/upload_validator.py | 20 ++++++++------------ test/test_upload_validator.py | 6 ++---- 3 files changed, 15 insertions(+), 27 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 6dfa5cb..4aab540 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -83,17 +83,11 @@ class AnnDataInvalidCountMatrix(CapException): class AnnDataMissingEmbeddings(CapException): name = "AnnDataMissingEmbeddings" - - def __init__(self, details: str = ""): - base_message = ( - "The embedding is missing or incorrectly formatted. " - "Embeddings must be stored in `.obsm` as [n_cells x 2] datasets " - "with names starting with 'X_' (e.g. X_umap, X_tsne)." - ) - if details: - self.message = f"{base_message}\nDetails:\n{details}" - else: - self.message = base_message + message = ( + "The embedding is missing or incorrectly formatted. " + "Embeddings must be stored in `.obsm` as [n_cells x 2] datasets " + "with names starting with 'X_' (e.g. X_umap, X_tsne)." + ) class AnnDataMissingObs(CapException): diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index 395e00c..09d31b1 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -154,26 +154,23 @@ def _check_obsm(self, cap_adata: CapAnnData) -> None: logger.debug("Begin checking obsm") if cap_adata.obsm is None: - reason = "Obsm is not found in anndata!" - logger.debug(reason) - self._multi_exception.append(AnnDataMissingEmbeddings(reason)) + logger.debug("Obsm is not found in anndata!") + self._multi_exception.append(AnnDataMissingEmbeddings()) return n_cells = cap_adata.shape[0] obsm_keys = list(cap_adata.obsm_keys()) if not obsm_keys: - reason = "Obsm exists but contains no keys." - logger.debug(reason) - self._multi_exception.append(AnnDataMissingEmbeddings(reason)) + logger.debug("Obsm exists but contains no keys.") + self._multi_exception.append(AnnDataMissingEmbeddings()) return embedding_keys = [k for k in obsm_keys if k.startswith(EMBEDDING_PREFIX)] if not embedding_keys: - reason = f"Obsm keys found: {obsm_keys}, but none start with required prefix '{EMBEDDING_PREFIX}'." - logger.debug(reason) - self._multi_exception.append(AnnDataMissingEmbeddings(reason)) + logger.debug(f"Obsm keys found: {obsm_keys}, but none start with required prefix '{EMBEDDING_PREFIX}'.") + self._multi_exception.append(AnnDataMissingEmbeddings()) return errors = [] @@ -193,9 +190,8 @@ def _check_obsm(self, cap_adata: CapAnnData) -> None: return # No valid embeddings found - reason = "Embedding candidates found but invalid:\n" + "\n".join(errors) - logger.debug(reason) - self._multi_exception.append(AnnDataMissingEmbeddings(reason)) + logger.debug("Embedding candidates found but invalid:\n" + "\n".join(errors)) + self._multi_exception.append(AnnDataMissingEmbeddings()) logger.debug("Finished checking obsm!") diff --git a/test/test_upload_validator.py b/test/test_upload_validator.py index 3d9c104..32603ca 100644 --- a/test/test_upload_validator.py +++ b/test/test_upload_validator.py @@ -22,12 +22,10 @@ from cap_upload_validator.gene_mapping import ( GeneMap, HomoSapiens, - MusMusculus, - MultiSpecies, - UnsupportedOrganism, ) from cap_upload_validator.errors import ( AnnDataMissingEmbeddings, + AnnDataMissingObs, AnnDataMissingObsColumns, AnnDataNonStandardVarError, CapMultiException, @@ -108,7 +106,7 @@ def check_obs(ca, correct_expected: bool): v._check_obs(ca) if not correct_expected: assert False, "Must not be correct obs!" - except AnnDataMissingObsColumns: + except (AnnDataMissingObsColumns, AnnDataMissingObs): assert not correct_expected, "Unexpected result" check_obs(cap_adata, True) From a7d4676822acb28395b684bc67147c5369966958 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 15:00:09 +0300 Subject: [PATCH 22/32] AnnDataUnsupportedGenes --- src/cap_upload_validator/errors.py | 21 ++++++++++++++------ src/cap_upload_validator/upload_validator.py | 7 ++++--- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 4aab540..6ba462f 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -99,12 +99,12 @@ class AnnDataMissingObsColumns(CapException): name = "AnnDataMissingObsColumns" def __init__(self, details: str = ""): - base_message = ( + msg = ( "Required obs metadata is missing. File must contain " "'assay', 'disease', 'organism', 'tissue' " "or corresponding '_ontology_term_id' fields." ) - self.message = f"{base_message}\nDetails: {details}" if details else base_message + self.message = f"{msg}\nDetails: {details}" if details else msg class AnnDataNoneInGeneralMetadata(CapException): @@ -141,11 +141,20 @@ class AnnDataGeneIndexIsNotUnique(AnnDataNonStandardVarError): message = "`var.index` must not contain duplicates." -class AnnDataGenesNotInReference(AnnDataNonStandardVarError): - name = "AnnDataGenesNotInReference" +class AnnDataUnsupportedGenes(AnnDataNonStandardVarError): + name = "AnnDataUnsupportedGenes" - def __init__(self): - self.message = "Gene identifiers were not found in the reference gene map." + def __init__(self, missing_genes_count: int = None): + msg = ( + "File does not contain valid ENSEMBL terms in var.\n" + "We currently support Homo sapiens and Mus musculus.\n" + "In the case of multiple species in the dataset, orthologous Homo sapiens genes are required.\n" + "If there are other species you wish to upload to CAP, please contact " + "support@celltype.info and we will work to accommodate your request." + ) + if missing_genes_count is not None: + msg += f"\nNumber of unsupported genes found: {missing_genes_count}" + self.message = msg class CSCMatrixInX(CapException): diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index 09d31b1..aa4f059 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -25,7 +25,7 @@ AnnDataNumericVarIndex, AnnDataVarNotSubsetOfRawVar, AnnDataGeneIndexIsNotUnique, - AnnDataGenesNotInReference, + AnnDataUnsupportedGenes, BadAnnDataFile, AnnDataNoneInGeneralMetadata, CSCMatrixInX, @@ -345,8 +345,9 @@ def _validate_gene_ids( missing_mask = ~ens_ids.isin(df["ENSEMBL_gene"]) if missing_mask.any(): - logger.debug("Gene names are not standard!") - self._multi_exception.append(AnnDataGenesNotInReference()) + missing_genes_count = missing_mask.sum() + logger.debug(f"{missing_genes_count} gene(s) are not standard!") + self._multi_exception.append(AnnDataUnsupportedGenes(missing_genes_count=missing_genes_count)) return missing_mask return None From 4240c8304f9a15fbc0f65b04cc101557d017a99d Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 15:02:05 +0300 Subject: [PATCH 23/32] Update upload_validator.py --- src/cap_upload_validator/upload_validator.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index aa4f059..8966de0 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -67,9 +67,7 @@ def validate(self, report_success: bool = True) -> None: logger.debug("Begin anndata file validation...") if not str(self._adata_path).endswith(".h5ad"): - raise BadAnnDataFile( - details=f"Expected '.h5ad' file, but got: {self._adata_path}" - ) + raise BadAnnDataFile() with read_h5ad(self._adata_path, edit=False) as cap_adata: cap_adata.read_obs(columns=GENERAL_METADATA) # TODO: read all columns? @@ -95,9 +93,7 @@ def find_missing_genes(self) -> Optional[pd.DataFrame]: logger.debug("Begin finding missing genes...") if not str(self._adata_path).endswith(".h5ad"): - raise BadAnnDataFile( - details=f"Expected '.h5ad' file, but got: {self._adata_path}" - ) + raise BadAnnDataFile() missing_genes = None with read_h5ad(self._adata_path, edit=False) as cap_adata: From e1710868aae2dff1dde9acd20672d72e61aff1f7 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 15:19:56 +0300 Subject: [PATCH 24/32] fix missing_columns for obs usage --- src/cap_upload_validator/errors.py | 8 ++++++-- src/cap_upload_validator/upload_validator.py | 5 ++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 6ba462f..4cced0f 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -98,13 +98,17 @@ class AnnDataMissingObs(CapException): class AnnDataMissingObsColumns(CapException): name = "AnnDataMissingObsColumns" - def __init__(self, details: str = ""): + def __init__(self, missing_columns: list[str] = None): msg = ( "Required obs metadata is missing. File must contain " "'assay', 'disease', 'organism', 'tissue' " "or corresponding '_ontology_term_id' fields." ) - self.message = f"{msg}\nDetails: {details}" if details else msg + if missing_columns: + cols_str = ", ".join(missing_columns) + self.message = f"{msg}\nMissing columns: {cols_str}" + else: + self.message = msg class AnnDataNoneInGeneralMetadata(CapException): diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index 8966de0..90bfcbd 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -231,9 +231,8 @@ def _check_obs(self, cap_adata: CapAnnData) -> None: # Report missing columns if missing_columns: - reason = "Missing required obs columns: " + ", ".join(missing_columns) - logger.debug(reason) - self._multi_exception.append(AnnDataMissingObsColumns(reason)) + logger.debug("Missing required obs columns: " + ", ".join(missing_columns)) + self._multi_exception.append(AnnDataMissingObsColumns(missing_columns=missing_columns)) # Report empty/None columns if empty_columns: From 7b0b2a31abaf079c2c71dfc32aee6b94bb7be0f5 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 15:23:34 +0300 Subject: [PATCH 25/32] Fix empty_columns usage --- src/cap_upload_validator/errors.py | 18 ++++++++++++------ src/cap_upload_validator/upload_validator.py | 5 ++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 4cced0f..c6e60ea 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -113,12 +113,18 @@ def __init__(self, missing_columns: list[str] = None): class AnnDataNoneInGeneralMetadata(CapException): name = "AnnDataNoneInGeneralMetadata" - message = \ - """ - Required obs column(s) contain empty or None values: file must contain - 'assay', 'disease', 'organism' and 'tissue' fields or - corresponding '_ontology_term_id' fields with valid values. - """ + + def __init__(self, empty_columns: list[str] = None): + msg = ( + "Required obs column(s) contain empty or None values. " + "File must contain 'assay', 'disease', 'organism', and 'tissue' fields " + "or corresponding '_ontology_term_id' fields with valid values." + ) + if empty_columns: + cols_str = ", ".join(empty_columns) + self.message = f"{msg}\nColumns with missing values: {cols_str}" + else: + self.message = msg class AnnDataNonStandardVarError(CapException): diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index 90bfcbd..dcc3dda 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -236,9 +236,8 @@ def _check_obs(self, cap_adata: CapAnnData) -> None: # Report empty/None columns if empty_columns: - reason = "Required obs columns contain empty/None values: " + ", ".join(empty_columns) - logger.debug(reason) - self._multi_exception.append(AnnDataNoneInGeneralMetadata(reason)) + logger.debug("Required obs columns contain empty/None values: " + ", ".join(empty_columns)) + self._multi_exception.append(AnnDataNoneInGeneralMetadata(empty_columns=empty_columns)) logger.debug("Finished checking obs!") From f5139d81c8be847b2d8cadf1a2c93472ef576f8c Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 15:33:57 +0300 Subject: [PATCH 26/32] Revert obsm changes --- src/cap_upload_validator/upload_validator.py | 50 ++++++-------------- 1 file changed, 14 insertions(+), 36 deletions(-) diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index dcc3dda..f45cd4b 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -148,48 +148,26 @@ def _check_is_positive_integers(self, cap_adata: CapAnnData) -> bool: def _check_obsm(self, cap_adata: CapAnnData) -> None: logger.debug("Begin checking obsm") + if self._has_embeddings(cap_adata) is False: + self._multi_exception.append(AnnDataMissingEmbeddings()) + logger.debug("Finished checking obsm!") + def _has_embeddings(self, cap_adata: CapAnnData) -> bool: if cap_adata.obsm is None: logger.debug("Obsm is not found in anndata!") - self._multi_exception.append(AnnDataMissingEmbeddings()) - return - + return False + n_cells = cap_adata.shape[0] - obsm_keys = list(cap_adata.obsm_keys()) - - if not obsm_keys: - logger.debug("Obsm exists but contains no keys.") - self._multi_exception.append(AnnDataMissingEmbeddings()) - return - embedding_keys = [k for k in obsm_keys if k.startswith(EMBEDDING_PREFIX)] + for field in cap_adata.obsm_keys(): + if field.startswith(EMBEDDING_PREFIX): + entity = cap_adata.obsm[field] + if isinstance(entity, Dataset) and entity.shape == (n_cells, 2): + # looking for dense matrix of N x 2 shape + return True - if not embedding_keys: - logger.debug(f"Obsm keys found: {obsm_keys}, but none start with required prefix '{EMBEDDING_PREFIX}'.") - self._multi_exception.append(AnnDataMissingEmbeddings()) - return - - errors = [] - - for key in embedding_keys: - entity = cap_adata.obsm[key] - - if not isinstance(entity, Dataset): - errors.append(f"{key}: expected h5py.Dataset, found {type(entity).__name__}") - continue - - if entity.shape != (n_cells, 2): - errors.append(f"{key}: invalid shape {entity.shape}, expected ({n_cells}, 2)") - continue - - # Found at least one valid embedding, so success case - return - - # No valid embeddings found - logger.debug("Embedding candidates found but invalid:\n" + "\n".join(errors)) - self._multi_exception.append(AnnDataMissingEmbeddings()) - - logger.debug("Finished checking obsm!") + logger.debug(f"Embeddings not found in obsm_keys = {cap_adata.obsm_keys()}!") + return False def _check_obs(self, cap_adata: CapAnnData) -> None: logger.debug("Start checking obs") From 6e6285009dbf19b6338c277959eafa4efa4c7068 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 15:46:18 +0300 Subject: [PATCH 27/32] none and empty values in obs columns separation --- src/cap_upload_validator/errors.py | 22 +++++---- src/cap_upload_validator/upload_validator.py | 49 +++++++++++++++----- 2 files changed, 51 insertions(+), 20 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index c6e60ea..2c3218b 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -114,17 +114,23 @@ def __init__(self, missing_columns: list[str] = None): class AnnDataNoneInGeneralMetadata(CapException): name = "AnnDataNoneInGeneralMetadata" - def __init__(self, empty_columns: list[str] = None): + def __init__( + self, + none_columns: list[str] | None = None, + empty_columns: list[str] | None = None, + ): msg = ( - "Required obs column(s) contain empty or None values. " - "File must contain 'assay', 'disease', 'organism', and 'tissue' fields " - "or corresponding '_ontology_term_id' fields with valid values." + "Required obs metadata contains invalid values.\n" + "All required fields must be filled with valid values." ) + + if none_columns: + msg += "\nColumns with None / NaN values: " + ", ".join(sorted(none_columns)) + if empty_columns: - cols_str = ", ".join(empty_columns) - self.message = f"{msg}\nColumns with missing values: {cols_str}" - else: - self.message = msg + msg += "\nColumns with empty values: " + ", ".join(sorted(empty_columns)) + + self.message = msg class AnnDataNonStandardVarError(CapException): diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index f45cd4b..a5d5547 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -180,8 +180,16 @@ def _check_obs(self, cap_adata: CapAnnData) -> None: self._multi_exception.append(AnnDataMissingObs()) return - missing_columns = [] - empty_columns = [] + missing_columns: list[str] = [] + none_columns: set[str] = set() + empty_columns: set[str] = set() + + def _check_column(series: pd.Series, name: str): + has_none, has_empty = self._classify_missing(series) + if has_none: + none_columns.add(name) + if has_empty: + empty_columns.add(name) for col in GENERAL_METADATA: ont_id_col = f"{col}_ontology_term_id" @@ -196,16 +204,14 @@ def _check_obs(self, cap_adata: CapAnnData) -> None: # Validate regular column values if col_in_obs: - if not self._check_df_col_for_none(cap_adata.obs[col]): - empty_columns.append(col) + _check_column(cap_adata.obs[col], col) # Validate ontology column values if ont_id_col_in_obs: if ont_id_col not in cap_adata.obs.columns: cap_adata.read_obs(columns=[ont_id_col]) - if not self._check_df_col_for_none(cap_adata.obs[ont_id_col]): - empty_columns.append(ont_id_col) + _check_column(cap_adata.obs[ont_id_col], ont_id_col) # Report missing columns if missing_columns: @@ -213,16 +219,35 @@ def _check_obs(self, cap_adata: CapAnnData) -> None: self._multi_exception.append(AnnDataMissingObsColumns(missing_columns=missing_columns)) # Report empty/None columns - if empty_columns: - logger.debug("Required obs columns contain empty/None values: " + ", ".join(empty_columns)) - self._multi_exception.append(AnnDataNoneInGeneralMetadata(empty_columns=empty_columns)) + if none_columns or empty_columns: + logger.debug( + "Required obs columns contain empty: %s or None: %s values.", + ", ".join(sorted(empty_columns)) if empty_columns else "—", + ", ".join(sorted(none_columns)) if none_columns else "—", + ) + self._multi_exception.append( + AnnDataNoneInGeneralMetadata( + none_columns=list(none_columns), + empty_columns=list(empty_columns), + ) + ) logger.debug("Finished checking obs!") @staticmethod - def _check_df_col_for_none(series: pd.Series) -> bool: - series = series.replace(r'^\s*$', np.nan, regex=True) - return pd.notna(series).all() + def _classify_missing(series: pd.Series) -> tuple[bool, bool]: + """ + Returns: + has_none -> True if None / NaN values are present + has_empty -> True if empty or whitespace-only strings are present + """ + has_none = series.isna().any() + + # Only check empties on non-null values + non_null = series.dropna() + has_empty = non_null.astype(str).str.match(r"^\s*$").any() + + return has_none, has_empty def _check_var_index(self, cap_adata: CapAnnData) -> Optional[pd.Series]: logger.debug("Start checking var index...") From d68bd2bace2f2a7931fc984b365179b7b5ff1456 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 15:47:58 +0300 Subject: [PATCH 28/32] minor refactoring --- src/cap_upload_validator/errors.py | 4 ++-- src/cap_upload_validator/upload_validator.py | 4 ++-- test/test_upload_validator.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 2c3218b..7972276 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -111,8 +111,8 @@ def __init__(self, missing_columns: list[str] = None): self.message = msg -class AnnDataNoneInGeneralMetadata(CapException): - name = "AnnDataNoneInGeneralMetadata" +class AnnDataEmptyOrNoneInGeneralMetadata(CapException): + name = "AnnDataEmptyOrNoneInGeneralMetadata" def __init__( self, diff --git a/src/cap_upload_validator/upload_validator.py b/src/cap_upload_validator/upload_validator.py index a5d5547..3f03c8a 100644 --- a/src/cap_upload_validator/upload_validator.py +++ b/src/cap_upload_validator/upload_validator.py @@ -27,7 +27,7 @@ AnnDataGeneIndexIsNotUnique, AnnDataUnsupportedGenes, BadAnnDataFile, - AnnDataNoneInGeneralMetadata, + AnnDataEmptyOrNoneInGeneralMetadata, CSCMatrixInX, ) from typing import Optional @@ -226,7 +226,7 @@ def _check_column(series: pd.Series, name: str): ", ".join(sorted(none_columns)) if none_columns else "—", ) self._multi_exception.append( - AnnDataNoneInGeneralMetadata( + AnnDataEmptyOrNoneInGeneralMetadata( none_columns=list(none_columns), empty_columns=list(empty_columns), ) diff --git a/test/test_upload_validator.py b/test/test_upload_validator.py index 32603ca..b0ae059 100644 --- a/test/test_upload_validator.py +++ b/test/test_upload_validator.py @@ -29,7 +29,7 @@ AnnDataMissingObsColumns, AnnDataNonStandardVarError, CapMultiException, - AnnDataNoneInGeneralMetadata, + AnnDataEmptyOrNoneInGeneralMetadata, CSCMatrixInX, ) @@ -258,7 +258,7 @@ def test_ontology_id_instead_general_metadata(names_provided, with_none): v._multi_exception.raise_on_append = True if with_none: - context = pytest.raises(AnnDataNoneInGeneralMetadata) + context = pytest.raises(AnnDataEmptyOrNoneInGeneralMetadata) else: context = nullcontext() From ca7b0e66608dfec2215e05af8ba0a958f6e4a576 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 11 Feb 2026 15:49:28 +0300 Subject: [PATCH 29/32] Update errors.py --- src/cap_upload_validator/errors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 7972276..6c5bab6 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional class CapException(BaseException): @@ -116,8 +116,8 @@ class AnnDataEmptyOrNoneInGeneralMetadata(CapException): def __init__( self, - none_columns: list[str] | None = None, - empty_columns: list[str] | None = None, + none_columns: Optional[List[str]] = None, + empty_columns: Optional[List[str]] = None, ): msg = ( "Required obs metadata contains invalid values.\n" From b99d6eede4affb4b3a3374bc50e274319bc2f2e1 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 18 Feb 2026 10:58:38 +0300 Subject: [PATCH 30/32] Minor english fixes in errors --- src/cap_upload_validator/errors.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 6c5bab6..66f3394 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -3,7 +3,7 @@ class CapException(BaseException): name = "Unknown" - message = "Useless CAP exception" + message = "Generic CAP exception" def __str__(self) -> str: return f"{self.name}: {self.message}" @@ -11,7 +11,7 @@ def __str__(self) -> str: def __eq__(self, other): if isinstance(other, CapException): return (self.name, self.message) == (other.name, other.message) - raise TypeError(f"The __eq__ operation doesn't defined for CapException and {type(other)}!") + raise TypeError(f"The __eq__ operation is not defined for CapException and {type(other)}!") def __hash__(self): return hash((self.name, self.message)) @@ -19,7 +19,7 @@ def __hash__(self): class BadAnnDataFile(CapException): name = 'Unknown' - message = 'File format is not supported!' + message = 'The file format is not supported!' class CapMultiException(CapException): @@ -84,15 +84,15 @@ class AnnDataInvalidCountMatrix(CapException): class AnnDataMissingEmbeddings(CapException): name = "AnnDataMissingEmbeddings" message = ( - "The embedding is missing or incorrectly formatted. " - "Embeddings must be stored in `.obsm` as [n_cells x 2] datasets " - "with names starting with 'X_' (e.g. X_umap, X_tsne)." + "Embeddings are missing or incorrectly formatted. " + "They must be stored in `.obsm` as [n_cells × 2] datasets " + "with names starting with 'X_' (e.g., X_umap, X_tsne)." ) class AnnDataMissingObs(CapException): name = "AnnDataMissingObs" - message = "The 'obs' is missing." + message = "The `.obs` is missing." class AnnDataMissingObsColumns(CapException): @@ -100,7 +100,7 @@ class AnnDataMissingObsColumns(CapException): def __init__(self, missing_columns: list[str] = None): msg = ( - "Required obs metadata is missing. File must contain " + "Required `.obs` metadata is missing. The file must contain " "'assay', 'disease', 'organism', 'tissue' " "or corresponding '_ontology_term_id' fields." ) @@ -120,7 +120,7 @@ def __init__( empty_columns: Optional[List[str]] = None, ): msg = ( - "Required obs metadata contains invalid values.\n" + "Required `.obs` metadata metadata contains missing or invalid values.\n" "All required fields must be filled with valid values." ) @@ -163,7 +163,7 @@ class AnnDataUnsupportedGenes(AnnDataNonStandardVarError): def __init__(self, missing_genes_count: int = None): msg = ( "File does not contain valid ENSEMBL terms in var.\n" - "We currently support Homo sapiens and Mus musculus.\n" + "We currently support only Homo sapiens and Mus musculus.\n" "In the case of multiple species in the dataset, orthologous Homo sapiens genes are required.\n" "If there are other species you wish to upload to CAP, please contact " "support@celltype.info and we will work to accommodate your request." @@ -184,6 +184,6 @@ def __init__(self, locations: list[str]): self.locations = locations loc_str = " and ".join(locations) self.message = ( - f"The CSC matrix is found in {loc_str}. " - "Gene expression matrix must be stored in CSR or dense format!" + f"A CSC matrix is found in {loc_str}. " + "The gene expression matrix must be stored in CSR or dense format!" ) From 09d21a930619871f783ddf7a7d42d6e2b2328f41 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 18 Feb 2026 11:44:26 +0300 Subject: [PATCH 31/32] Update errors.py --- src/cap_upload_validator/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 66f3394..012c634 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -120,7 +120,7 @@ def __init__( empty_columns: Optional[List[str]] = None, ): msg = ( - "Required `.obs` metadata metadata contains missing or invalid values.\n" + "Required `.obs` metadata contains missing or invalid values.\n" "All required fields must be filled with valid values." ) From ff89a9e16991ec4aa599fbbd402fba33526ac7b1 Mon Sep 17 00:00:00 2001 From: Andrey Isaev Date: Wed, 18 Feb 2026 11:54:32 +0300 Subject: [PATCH 32/32] Update errors.py --- src/cap_upload_validator/errors.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/cap_upload_validator/errors.py b/src/cap_upload_validator/errors.py index 012c634..544c574 100644 --- a/src/cap_upload_validator/errors.py +++ b/src/cap_upload_validator/errors.py @@ -18,7 +18,7 @@ def __hash__(self): class BadAnnDataFile(CapException): - name = 'Unknown' + name = 'BadAnnDataFile' message = 'The file format is not supported!' @@ -57,8 +57,11 @@ def append(self, other: CapException) -> None: def __str__(self) -> str: own_str = super().__str__() res_list = [own_str] + self.ex_list - res_message = "\n".join(map(str, res_list)) - res_message += "\nFor details visit: \n\thttps://github.com/cellannotation/cap-validator/wiki/Validation-Errors" + res_message = "\n\n".join(map(str, res_list)) + res_message += ( + "\n\nFor details visit:\n" + "\thttps://github.com/cellannotation/cap-validator/wiki/Validation-Errors" + ) return res_message def have_errors(self) -> bool: