From 2a8e673a00ec3debf660b619c1509633c6729c60 Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 17:41:54 +0000 Subject: [PATCH 01/62] Tutorial template --- docs/TUTORIAL.md | 382 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 382 insertions(+) create mode 100644 docs/TUTORIAL.md diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md new file mode 100644 index 0000000..a19a416 --- /dev/null +++ b/docs/TUTORIAL.md @@ -0,0 +1,382 @@ +# Tutorial: Using h5ad CLI with csvkit + +This tutorial demonstrates how to combine `h5ad` CLI with `csvkit` to explore, analyze, and subset large `.h5ad` files efficiently without loading them into memory. + +## Introduction + +### h5ad CLI +A command-line tool for working with AnnData (`.h5ad`) files. It streams data directly from disk, making it perfect for exploring huge single-cell datasets without memory constraints. + +**Key features:** +- `info` - Inspect file structure and dimensions +- `table` - Export metadata to CSV +- `subset` - Filter files by cell/gene names + +### csvkit +A suite of command-line tools for working with CSV files. Think of it as `awk`, `sed`, and `grep` but specifically designed for CSV data. + +**Key tools we'll use:** +- `csvcut` - Select specific columns +- `csvsql` - Execute SQL queries on CSV files +- `csvgrep` - Filter rows by pattern +- `csvlook` - Pretty-print CSV in terminal + +**Installation:** +```bash +pip install csvkit +``` + +## 1. Inspect File Structure with `info` + +First, let's see what's in our `.h5ad` file: + +```bash +h5ad info dataset.h5ad +``` + +**Example output:** +``` +File: dataset.h5ad +Dimensions: 50000 obs × 20000 var + +Top-level groups: + obs/ + - cell_type + - sample_id + - donor_id + - tissue + - n_genes + var/ + - gene_name + - highly_variable + X (sparse matrix) + layers/ + obsm/ + uns/ +``` + +This shows us that we have 50,000 cells with metadata including cell types, samples, and donor information. + +## 2. 
Export Metadata with `table` + +### 2.1 Basic Metadata Export + +Export all cell metadata (observations) to CSV: + +```bash +h5ad table dataset.h5ad --axis obs --output cell_metadata.csv +``` + +Export just specific columns: + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,sample_id,donor_id --output cells.csv +``` + +Preview the first few rows in a nice table format: + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,sample_id,donor_id --head 10 | csvlook +``` + +**Example output:** +``` +| obs_names | cell_type | sample_id | donor_id | +| ------------------- | ------------ | --------- | -------- | +| AAACCTGAGAAACCAT-1 | T cell | sample_1 | donor_A | +| AAACCTGAGACAGACC-1 | B cell | sample_1 | donor_A | +| AAACCTGAGGCATGGT-1 | NK cell | sample_2 | donor_B | +| AAACCTGCAAGCCGCT-1 | T cell | sample_2 | donor_B | +| AAACCTGCACATTAGC-1 | Monocyte | sample_1 | donor_A | +``` + +### 2.2 Calculate Statistics with `csvsql` + +Now let's analyze the metadata using SQL queries! 
+ +**Count cells per cell type:** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type | \ + csvsql --query "SELECT cell_type, COUNT(*) as n_cells FROM stdin GROUP BY cell_type ORDER BY n_cells DESC" | \ + csvlook +``` + +**Example output:** +``` +| cell_type | n_cells | +| ------------ | ------- | +| T cell | 15234 | +| Monocyte | 12456 | +| B cell | 8932 | +| NK cell | 5621 | +| DC | 3456 | +| Macrophage | 2301 | +``` + +**Count cells per cell type and sample:** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,sample_id | \ + csvsql --query "SELECT cell_type, sample_id, COUNT(*) as n_cells + FROM stdin + GROUP BY cell_type, sample_id + ORDER BY cell_type, sample_id" | \ + csvlook +``` + +**Example output:** +``` +| cell_type | sample_id | n_cells | +| ------------ | --------- | ------- | +| B cell | sample_1 | 4521 | +| B cell | sample_2 | 4411 | +| Monocyte | sample_1 | 6234 | +| Monocyte | sample_2 | 6222 | +| T cell | sample_1 | 7645 | +| T cell | sample_2 | 7589 | +``` + +**Calculate average gene count per cell type:** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,n_genes | \ + csvsql --query "SELECT cell_type, + AVG(n_genes) as avg_genes, + MIN(n_genes) as min_genes, + MAX(n_genes) as max_genes + FROM stdin + GROUP BY cell_type + ORDER BY avg_genes DESC" | \ + csvlook +``` + +**Find samples with low cell counts:** + +```bash +h5ad table dataset.h5ad --axis obs --columns sample_id | \ + csvsql --query "SELECT sample_id, COUNT(*) as n_cells + FROM stdin + GROUP BY sample_id + HAVING COUNT(*) < 1000 + ORDER BY n_cells" | \ + csvlook +``` + +## 3. Filter and Subset Data + +### 3.1 Extract Cell Names for a Specific Cell Type + +Let's say we want to create a subset containing only T cells. 
+ +**Step 1: Export metadata and filter for T cells** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type --output cell_metadata.csv +``` + +**Step 2: Use csvgrep to find T cells and extract their names** + +```bash +csvgrep -c cell_type -m "T cell" cell_metadata.csv | \ + csvcut -c obs_names | \ + tail -n +2 > tcell_names.txt +``` + +This creates a file `tcell_names.txt` with one cell barcode per line. + +**Alternative: Use csvsql for more complex filters** + +Get T cells from a specific donor: + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,donor_id --output cell_metadata.csv + +csvsql --query "SELECT obs_names + FROM cell_metadata + WHERE cell_type = 'T cell' + AND donor_id = 'donor_A'" \ + cell_metadata.csv | \ + tail -n +2 > tcell_donor_A.txt +``` + +Get cells with high gene counts (>2000 genes): + +```bash +h5ad table dataset.h5ad --axis obs --columns n_genes --output cell_metadata.csv + +csvsql --query "SELECT obs_names + FROM cell_metadata + WHERE n_genes > 2000" \ + cell_metadata.csv | \ + tail -n +2 > high_quality_cells.txt +``` + +### 3.2 Create the Subset + +Now use the filtered cell list to create a new `.h5ad` file: + +```bash +h5ad subset dataset.h5ad tcells_only.h5ad --obs tcell_names.txt +``` + +**Verify the subset:** + +```bash +h5ad info tcells_only.h5ad +``` + +**Check the cell type distribution:** + +```bash +h5ad table tcells_only.h5ad --axis obs --columns cell_type | \ + csvsql --query "SELECT cell_type, COUNT(*) as n_cells FROM stdin GROUP BY cell_type" | \ + csvlook +``` + +### 3.3 Advanced: Subset by Both Cells and Genes + +Let's create a subset with specific cell types and a curated gene list. 
+ +**Step 1: Filter cells (multiple cell types)** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type --output cell_metadata.csv + +csvsql --query "SELECT obs_names + FROM cell_metadata + WHERE cell_type IN ('T cell', 'NK cell', 'B cell')" \ + cell_metadata.csv | \ + tail -n +2 > lymphocytes.txt +``` + +**Step 2: Create a gene list** + +You might have a predefined list or extract genes from the file: + +```bash +# Export all genes +h5ad table dataset.h5ad --axis var --columns gene_name --output genes.csv + +# Filter for specific genes (e.g., markers) +echo "CD3D +CD3E +CD4 +CD8A +CD8B +CD19 +CD20 +NCAM1" > marker_genes.txt +``` + +**Step 3: Create the subset** + +```bash +h5ad subset dataset.h5ad lymphocytes_markers.h5ad \ + --obs lymphocytes.txt \ + --var marker_genes.txt +``` + +**Verify:** + +```bash +h5ad info lymphocytes_markers.h5ad +``` + +## 4. Complete Example Workflow + +Here's a complete workflow combining everything: + +```bash +# 1. Inspect the file +h5ad info large_dataset.h5ad + +# 2. Export and analyze metadata +h5ad table large_dataset.h5ad --axis obs \ + --columns cell_type,sample_id,donor_id,n_genes \ + --output all_metadata.csv + +# 3. Generate statistics +echo "Cell type distribution:" +csvsql --query "SELECT cell_type, COUNT(*) as n_cells + FROM all_metadata + GROUP BY cell_type + ORDER BY n_cells DESC" \ + all_metadata.csv | csvlook + +echo "Sample distribution:" +csvsql --query "SELECT sample_id, donor_id, COUNT(*) as n_cells + FROM all_metadata + GROUP BY sample_id, donor_id" \ + all_metadata.csv | csvlook + +# 4. Filter for high-quality T cells from a specific donor +csvsql --query "SELECT obs_names + FROM all_metadata + WHERE cell_type = 'T cell' + AND donor_id = 'donor_A' + AND n_genes > 1500" \ + all_metadata.csv | \ + tail -n +2 > selected_cells.txt + +echo "Selected $(wc -l < selected_cells.txt) cells" + +# 5. Create subset +h5ad subset large_dataset.h5ad tcells_subset.h5ad --obs selected_cells.txt + +# 6. 
Verify result +h5ad info tcells_subset.h5ad +h5ad table tcells_subset.h5ad --axis obs --columns cell_type,donor_id | \ + csvsql --query "SELECT cell_type, donor_id, COUNT(*) as n_cells FROM stdin GROUP BY cell_type, donor_id" | \ + csvlook +``` + +## Tips and Best Practices + +1. **Use `--head` for quick previews** before exporting large files: + ```bash + h5ad table data.h5ad --axis obs --head 100 | csvlook + ``` + +2. **Pipe directly to csvkit** to avoid creating intermediate files: + ```bash + h5ad table data.h5ad --axis obs --columns cell_type | csvsql --query "..." + ``` + +3. **Check cell counts** before subsetting: + ```bash + wc -l selected_cells.txt # Should be > 0! + ``` + +4. **Use csvstat** for quick summary statistics: + ```bash + h5ad table data.h5ad --axis obs --columns n_genes,n_counts | csvstat + ``` + +5. **Combine with standard Unix tools**: + ```bash + # Get unique cell types + h5ad table data.h5ad --axis obs --columns cell_type | tail -n +2 | sort -u + + # Count samples + h5ad table data.h5ad --axis obs --columns sample_id | tail -n +2 | sort | uniq -c + ``` + +## Conclusion + +By combining `h5ad` CLI with `csvkit`, you can: +- ✅ Explore huge datasets without loading them into memory +- ✅ Perform complex queries and aggregations on metadata +- ✅ Create filtered subsets based on sophisticated criteria +- ✅ Work entirely on the command line without Python/R + +This workflow is especially powerful for: +- Initial data exploration +- Quality control analysis +- Creating test datasets +- Preparing data for downstream analysis +- Batch processing multiple files + +For more information: +- h5ad CLI: [README.md](../README.md) +- csvkit documentation: https://csvkit.readthedocs.io/ From 0aa445308f42b78adf6c4c8fa79dcbf7aab03f2d Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 19:34:18 +0000 Subject: [PATCH 02/62] Extended info command --- src/h5ad/cli.py | 32 ++++++-- src/h5ad/commands/info.py | 164 +++++++++++++++++++++++++++++++++++--- 
src/h5ad/info.py | 139 +++++++++++++++++++++++++++++++- 3 files changed, 317 insertions(+), 18 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index bb4749d..48ecd53 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -9,13 +9,15 @@ import h5py import numpy as np -from h5ad.commands import show_info, export_table, subset_h5ad app = typer.Typer( - help="Streaming CLI for huge .h5ad files (info, table, subset)." + help="Streaming CLI for huge .h5ad files (info, table, subset, export)." ) console = Console(stderr=True) +export_app = typer.Typer(help="Export objects from an .h5ad file to common formats.") +app.add_typer(export_app, name="export") + @app.command() def info( @@ -24,14 +26,32 @@ def info( help="Path to the .h5ad file", exists=True, readable=True, - ) + ), + obj: Optional[str] = typer.Option( + None, + "--object", + "-o", + help="Object path to inspect (e.g., 'obsm/X_pca', 'X', 'uns')", + ), + types: bool = typer.Option( + False, + "--types", + "-t", + help="Show detailed type information for all entries", + ), ) -> None: """ Show high-level information about the .h5ad file. - Args: - file (Path): Path to the .h5ad file + + Use --types to see type information for each entry. + Use --object to inspect a specific object in detail. 
+ + Examples: + h5ad info data.h5ad + h5ad info --types data.h5ad + h5ad info --object obsm/X_pca data.h5ad """ - show_info(file, console) + show_info(file, console, show_types=types, obj_path=obj) @app.command() diff --git a/src/h5ad/commands/info.py b/src/h5ad/commands/info.py index 95c3c72..29c94ba 100644 --- a/src/h5ad/commands/info.py +++ b/src/h5ad/commands/info.py @@ -1,32 +1,174 @@ from pathlib import Path +from typing import Optional, Union import h5py import rich from rich.console import Console -from h5ad.info import axis_len +from rich.tree import Tree +from h5ad.info import axis_len, get_entry_type, format_type_info +# Preferred display order for top-level keys +KEY_ORDER = ["X", "obs", "var", "obsm", "varm", "layers", "obsp", "varp", "uns"] -def show_info(file: Path, console: Console) -> None: + +def _sort_keys(keys: list) -> list: + """Sort keys according to KEY_ORDER, with unknown keys at the end.""" + order_map = {k: i for i, k in enumerate(KEY_ORDER)} + return sorted(keys, key=lambda k: (order_map.get(k, len(KEY_ORDER)), k)) + + +def show_info( + file: Path, + console: Console, + show_types: bool = False, + obj_path: Optional[str] = None, +) -> None: """ Show high-level information about the .h5ad file. 
Args: file (Path): Path to the .h5ad file console (Console): Rich console for output + show_types (bool): Show detailed type information for each entry + obj_path (Optional[str]): Specific object path to inspect (e.g., 'obsm/X_pca') """ with h5py.File(file, "r") as f: + # If a specific path is requested, show detailed info for that object + if obj_path: + _show_object_info(f, obj_path, console) + return + # Get n_obs and n_var n_obs = axis_len(f, "obs") n_var = axis_len(f, "var") rich.print( f"[bold cyan]An object with n_obs × n_var: {n_obs if n_obs is not None else '?'} × {n_var if n_var is not None else '?'}[/]" ) - # List top-level keys and their sub-keys - for key, obj in sorted(f.items(), key=lambda x: len(x[0])): - # Only process Groups, skip Datasets like X - if isinstance(obj, h5py.Group): - sub_keys = [k for k in obj.keys() if k != "_index"] - if sub_keys and key != "X": - rich.print( - f"\t[bold yellow]{key}:[/]\t" - + ", ".join(f"[bright_white]{sub}[/]" for sub in sub_keys) + + if show_types: + _show_types_tree(f, console) + else: + # List top-level keys and their sub-keys (original behavior) + for key in _sort_keys(list(f.keys())): + obj = f[key] + # Only process Groups, skip Datasets like X + if isinstance(obj, h5py.Group): + sub_keys = [k for k in obj.keys() if k != "_index"] + if sub_keys and key != "X": + rich.print( + f"\t[bold yellow]{key}:[/]\t" + + ", ".join(f"[bright_white]{sub}[/]" for sub in sub_keys) + ) + + +def _show_types_tree(f: h5py.File, console: Console) -> None: + """Show a tree view with type information for all entries. 
+ + Recursion depth by group: + - obs/var: top level only (no children) + - X: top level only + - obsm/obsp/varm/varp/layers: 1 level (show matrices) + - uns: 2 levels deep + """ + tree = Tree(f"[bold]{f.filename}[/]") + + # Define max depth for each top-level group + max_depth_map = { + "obs": 0, + "var": 0, + "X": 0, + "obsm": 1, + "obsp": 1, + "varm": 1, + "varp": 1, + "layers": 1, + "uns": 2, + } + + def add_node( + parent_tree: Tree, + name: str, + obj: Union[h5py.Group, h5py.Dataset], + current_depth: int, + max_depth: int, + ) -> None: + info = get_entry_type(obj) + type_str = format_type_info(info) + + if isinstance(obj, h5py.Dataset): + shape_str = f"[dim]{obj.shape}[/]" if obj.shape else "" + node_text = f"[bright_white]{name}[/] {shape_str} {type_str}" + parent_tree.add(node_text) + else: + # Group + node_text = f"[bold yellow]{name}/[/] {type_str}" + subtree = parent_tree.add(node_text) + + # Recurse only if within allowed depth + if current_depth < max_depth: + for child_name in sorted(obj.keys()): + if child_name == "_index": + continue + child_obj = obj[child_name] + add_node( + subtree, child_name, child_obj, current_depth + 1, max_depth ) + + # Add top-level items in preferred order + for key in _sort_keys(list(f.keys())): + obj = f[key] + # Skip empty groups + if isinstance(obj, h5py.Group): + children = [k for k in obj.keys() if k != "_index"] + if not children: + continue + max_depth = max_depth_map.get(key, 1) # default to 1 level for unknown groups + add_node(tree, key, obj, current_depth=0, max_depth=max_depth) + + console.print(tree) + + +def _show_object_info(f: h5py.File, obj_path: str, console: Console) -> None: + """Show detailed info for a specific object path.""" + # Normalize path + obj_path = obj_path.strip().lstrip("/") + + if obj_path not in f: + console.print(f"[bold red]Error:[/] '{obj_path}' not found in the file.") + return + + obj = f[obj_path] + info = get_entry_type(obj) + + console.print(f"\n[bold cyan]Path:[/] {obj_path}") 
+ console.print(f"[bold cyan]Type:[/] {info['type']}") + + if info["encoding"]: + console.print(f"[bold cyan]Encoding:[/] {info['encoding']}") + + if info["shape"]: + console.print(f"[bold cyan]Shape:[/] {info['shape']}") + + if info["dtype"]: + console.print(f"[bold cyan]Dtype:[/] {info['dtype']}") + + console.print(f"[bold cyan]Details:[/] {info['details']}") + + # Show attributes if any + if obj.attrs: + console.print(f"\n[bold cyan]Attributes:[/]") + for k, v in obj.attrs.items(): + v_str = v.decode("utf-8") if isinstance(v, bytes) else str(v) + if len(v_str) > 80: + v_str = v_str[:77] + "..." + console.print(f" [dim]{k}:[/] {v_str}") + + # If it's a group, show children + if isinstance(obj, h5py.Group): + children = [k for k in obj.keys() if k != "_index"] + if children: + console.print(f"\n[bold cyan]Children:[/]") + for child_name in sorted(children): + child_obj = obj[child_name] + child_info = get_entry_type(child_obj) + type_str = format_type_info(child_info) + console.print(f" [bright_white]{child_name}[/] {type_str}") diff --git a/src/h5ad/info.py b/src/h5ad/info.py index 3535303..56cd90a 100644 --- a/src/h5ad/info.py +++ b/src/h5ad/info.py @@ -1,5 +1,142 @@ -from typing import Optional, Tuple +from typing import Optional, Tuple, Dict, Any, Union import h5py +import numpy as np + + +def get_entry_type(obj: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: + """ + Determine the type/format of an HDF5 object for export guidance. 
+ + Returns a dict with: + - type: str (e.g., 'dataframe', 'sparse-matrix', 'dense-matrix', 'dict', 'image', 'array', 'scalar') + - export_as: str (suggested export format: csv, mtx, npy, json, image) + - encoding: str (h5ad encoding-type if present) + - shape: tuple or None + - dtype: str or None + - details: str (human-readable description) + """ + result: Dict[str, Any] = { + "type": "unknown", + "export_as": None, + "encoding": None, + "shape": None, + "dtype": None, + "details": "", + } + + # Get encoding-type attribute if present + enc = obj.attrs.get("encoding-type", b"") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + result["encoding"] = enc if enc else None + + if isinstance(obj, h5py.Dataset): + result["shape"] = obj.shape + result["dtype"] = str(obj.dtype) + + # Scalar + if obj.shape == (): + result["type"] = "scalar" + result["export_as"] = "json" + result["details"] = f"Scalar value ({obj.dtype})" + return result + + # Check if it looks like an image (2D or 3D with small last dim) + if obj.ndim in (2, 3): + if obj.ndim == 2 or (obj.ndim == 3 and obj.shape[2] in (1, 3, 4)): + # Could be an image if dtype is numeric and reasonable size + if np.issubdtype(obj.dtype, np.number) or obj.dtype == np.bool_: + if obj.shape[0] <= 10000 and obj.shape[1] <= 10000: + result["type"] = "image" + result["export_as"] = "image" + result["details"] = ( + f"Image-like array {obj.shape} ({obj.dtype})" + ) + return result + + # 1D or 2D numeric array -> dense matrix / array + if obj.ndim == 1: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"1D array [{obj.shape[0]}] ({obj.dtype})" + elif obj.ndim == 2: + result["type"] = "dense-matrix" + result["export_as"] = "npy" + result["details"] = ( + f"Dense matrix {obj.shape[0]}×{obj.shape[1]} ({obj.dtype})" + ) + else: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"ND array {obj.shape} ({obj.dtype})" + + return result + + # It's a Group + if 
isinstance(obj, h5py.Group): + # Check for sparse matrix (CSR/CSC) + if enc in ("csr_matrix", "csc_matrix"): + shape = obj.attrs.get("shape", None) + shape_str = f"{shape[0]}×{shape[1]}" if shape is not None else "?" + result["type"] = "sparse-matrix" + result["export_as"] = "mtx" + result["details"] = ( + f"Sparse {enc.replace('_matrix', '').upper()} matrix {shape_str}" + ) + return result + + # Check for categorical + if enc == "categorical": + codes = obj.get("codes") + cats = obj.get("categories") + n_codes = codes.shape[0] if codes is not None else "?" + n_cats = cats.shape[0] if cats is not None else "?" + result["type"] = "categorical" + result["export_as"] = "csv" + result["details"] = f"Categorical [{n_codes} values, {n_cats} categories]" + return result + + # Check for dataframe (obs/var style with _index) + if "_index" in obj.attrs or "obs_names" in obj or "var_names" in obj: + n_cols = len([k for k in obj.keys() if k != "_index"]) + result["type"] = "dataframe" + result["export_as"] = "csv" + result["details"] = f"DataFrame with {n_cols} columns" + return result + + # Check for array-like groups (nullable integer, string array, etc.) 
+ if enc in ("nullable-integer", "string-array"): + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"Encoded array ({enc})" + return result + + # Generic dict/group + n_keys = len(list(obj.keys())) + result["type"] = "dict" + result["export_as"] = "json" + result["details"] = f"Group with {n_keys} keys" + return result + + return result + + +def format_type_info(info: Dict[str, Any]) -> str: + """Format type info as a colored string for display.""" + type_colors = { + "dataframe": "green", + "sparse-matrix": "magenta", + "dense-matrix": "blue", + "array": "blue", + "dict": "yellow", + "image": "cyan", + "categorical": "green", + "scalar": "white", + "unknown": "red", + } + + color = type_colors.get(info["type"], "white") + return f"[{color}]<{info['type']}>[/]" def axis_len(file: h5py.File, axis: str) -> Optional[int]: From 7c736034b7dc58cbd3c2740ccabd935bea379c9b Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 19:52:10 +0000 Subject: [PATCH 03/62] Add export command to CLI for exporting HDF5 objects in various formats --- src/h5ad/cli.py | 81 ++++++- src/h5ad/commands/__init__.py | 1 + src/h5ad/commands/export.py | 439 ++++++++++++++++++++++++++++++++++ src/h5ad/info.py | 30 +-- 4 files changed, 534 insertions(+), 17 deletions(-) create mode 100644 src/h5ad/commands/export.py diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 48ecd53..a0fd811 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -9,15 +9,18 @@ import h5py import numpy as np +from h5ad.commands import ( + show_info, + export_table, + subset_h5ad, + export_object, +) app = typer.Typer( help="Streaming CLI for huge .h5ad files (info, table, subset, export)." 
) console = Console(stderr=True) -export_app = typer.Typer(help="Export objects from an .h5ad file to common formats.") -app.add_typer(export_app, name="export") - @app.command() def info( @@ -158,5 +161,77 @@ def subset( raise typer.Exit(code=1) +@app.command("export") +def export_cmd( + file: Path = typer.Argument( + ..., help="Path to the .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument( + ..., help="Object path to export (e.g., 'obs', 'X', 'obsm/X_pca', 'uns')" + ), + out: Path = typer.Argument( + ..., + help="Output file path. Extension determines format: .csv, .npy, .mtx, .json, .png/.jpg/.tiff", + ), + columns: Optional[str] = typer.Option( + None, + "--columns", + "-c", + help="Comma separated column names (for dataframe/CSV export only)", + ), + chunk_rows: int = typer.Option( + 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" + ), + head: Optional[int] = typer.Option( + None, "--head", "-n", help="Output only the first n rows (for CSV export)" + ), + max_elements: int = typer.Option( + 1_000_000, + "--max-elements", + help="Maximum array elements for JSON export", + ), + include_attrs: bool = typer.Option( + False, "--include-attrs", help="Include HDF5 attributes in JSON export" + ), +) -> None: + """ + Export an object from the h5ad file to a common format. + + The output format is auto-detected from the file extension: + - .csv : DataFrames (obs, var) + - .npy : Dense arrays/matrices (obsm/X_pca, varm/PCs, etc.) + - .mtx : Sparse matrices (X if sparse) + - .json : Dictionaries/scalars (uns, uns/colors, etc.) + - .png/.jpg/.tiff : Image-like arrays + + The object type is auto-detected and validated against the extension. 
+ + Examples: + h5ad export data.h5ad obs obs.csv + h5ad export data.h5ad obsm/X_pca pca.npy + h5ad export data.h5ad X matrix.mtx + h5ad export data.h5ad uns metadata.json + """ + col_list: Optional[List[str]] = None + if columns: + col_list = [col.strip() for col in columns.split(",") if col.strip()] + + try: + export_object( + file=file, + obj=obj, + out=out, + columns=col_list, + chunk_rows=chunk_rows, + head=head, + max_elements=max_elements, + include_attrs=include_attrs, + console=console, + ) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + def main(argv: Optional[Sequence[str]] = None) -> None: app(standalone_mode=True) diff --git a/src/h5ad/commands/__init__.py b/src/h5ad/commands/__init__.py index e681fea..4c5fa9a 100644 --- a/src/h5ad/commands/__init__.py +++ b/src/h5ad/commands/__init__.py @@ -1,3 +1,4 @@ from h5ad.commands.info import show_info from h5ad.commands.table import export_table from h5ad.commands.subset import subset_h5ad +from h5ad.commands.export import export_object diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py new file mode 100644 index 0000000..8d237c9 --- /dev/null +++ b/src/h5ad/commands/export.py @@ -0,0 +1,439 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Dict, List, Optional, Union, cast + +import h5py +import numpy as np +from rich.console import Console + +from h5ad.commands.table import export_table +from h5ad.read import decode_str_array +from h5ad.info import get_entry_type + + +H5Obj = Union[h5py.Group, h5py.Dataset] + +# Map object types to valid output extensions +TYPE_EXTENSIONS = { + "dataframe": {".csv"}, + "sparse-matrix": {".mtx"}, + "dense-matrix": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}, + "array": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}, + "dict": {".json"}, + "scalar": {".json"}, + "categorical": {".csv"}, +} + +# Image extensions for validation 
+IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tif", ".tiff"} + +# Known exportable types +EXPORTABLE_TYPES = set(TYPE_EXTENSIONS.keys()) + + +def _norm_path(p: str) -> str: + p = p.strip() + if not p: + raise ValueError("Object path must be non-empty.") + return p.lstrip("/") + + +def _get_encoding_type(group: h5py.Group) -> str: + enc = group.attrs.get("encoding-type", "") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + return str(enc) + + +def _resolve(file: h5py.File, obj: str) -> H5Obj: + obj = _norm_path(obj) + if obj not in file: + raise KeyError(f"'{obj}' not found in the file.") + return cast(H5Obj, file[obj]) + + +def _check_json_exportable(h5obj: H5Obj, max_elements: int, path: str = "") -> None: + """ + Recursively check if a group/dataset can be exported to JSON. + Raises ValueError if it contains non-exportable structures. + """ + if isinstance(h5obj, h5py.Dataset): + if h5obj.shape == (): + return # scalar is fine + n = int(np.prod(h5obj.shape)) if h5obj.shape else 0 + if n > max_elements: + raise ValueError( + f"Cannot export to JSON: '{path or h5obj.name}' has {n} elements " + f"(max {max_elements}). Use --max-elements to increase limit." + ) + return + + # It's a Group - check encoding + enc = _get_encoding_type(h5obj) + if enc in ("csr_matrix", "csc_matrix"): + raise ValueError( + f"Cannot export to JSON: '{path or h5obj.name}' is a sparse matrix. " + f"Export it as .mtx instead." 
+ ) + + # Check children recursively + for key in h5obj.keys(): + child = h5obj[key] + child_path = f"{path}/{key}" if path else key + if isinstance(child, (h5py.Group, h5py.Dataset)): + _check_json_exportable( + cast(H5Obj, child), max_elements=max_elements, path=child_path + ) + + +def export_object( + file: Path, + obj: str, + out: Path, + columns: Optional[List[str]], + chunk_rows: int, + head: Optional[int], + max_elements: int, + include_attrs: bool, + console: Console, +) -> None: + """ + Export an HDF5 object to an appropriate format based on its type. + + Auto-detects the object type and validates the output file extension. + """ + obj = _norm_path(obj) + out_ext = out.suffix.lower() + + with h5py.File(file, "r") as f: + h5obj = _resolve(f, obj) + info = get_entry_type(h5obj) + obj_type = info["type"] + + # Check if type is exportable + if obj_type not in EXPORTABLE_TYPES: + raise ValueError( + f"Cannot export object of type '{obj_type}'. " + f"Exportable types: {', '.join(sorted(EXPORTABLE_TYPES))}." + ) + + # Check if extension matches the type + valid_exts = TYPE_EXTENSIONS.get(obj_type, set()) + if out_ext not in valid_exts: + ext_list = ", ".join(sorted(valid_exts)) + raise ValueError( + f"Output extension '{out_ext}' does not match object type '{obj_type}'. " + f"Expected: {ext_list}." + ) + + # Dispatch to appropriate export function + if obj_type == "dataframe": + # For dataframe, obj must be obs or var + if obj not in ("obs", "var"): + raise ValueError( + f"CSV export for dataframes currently supports only 'obs' or 'var', " + f"not '{obj}'." + ) + export_table( + file=file, + axis=obj, + columns=columns, + out=out, + chunk_rows=chunk_rows, + head=head, + console=console, + ) + + elif obj_type == "categorical": + # Categorical is also exported via table if it's a column in obs/var + raise ValueError( + f"Categorical objects should be exported as part of 'obs' or 'var' table. 
" + f"Use: h5ad export obs " + ) + + elif obj_type in ("dense-matrix", "array"): + if out_ext in IMAGE_EXTENSIONS: + # User wants image output - validate dimensions + _export_image(file=file, obj=obj, out=out, console=console) + else: + _export_npy( + file=file, obj=obj, out=out, chunk_rows=chunk_rows, console=console + ) + + elif obj_type == "sparse-matrix": + _export_mtx(file=file, obj=obj, out=out, console=console) + + elif obj_type in ("dict", "scalar"): + _export_json( + file=file, + obj=obj, + out=out, + max_elements=max_elements, + include_attrs=include_attrs, + console=console, + ) + + +def _export_npy( + file: Path, + obj: str, + out: Path, + chunk_rows: int, + console: Console, +) -> None: + """Export a dense HDF5 dataset to NumPy .npy without loading it all at once.""" + with h5py.File(file, "r") as f: + h5obj = _resolve(f, obj) + if isinstance(h5obj, h5py.Group): + raise ValueError("Target is a group; cannot export as .npy.") + + ds = h5obj + out.parent.mkdir(parents=True, exist_ok=True) + mm = np.lib.format.open_memmap(out, mode="w+", dtype=ds.dtype, shape=ds.shape) + try: + if ds.shape == (): + mm[...] = ds[()] + console.print(f"[green]Wrote[/] {out}") + return + + if ds.ndim == 1: + n = int(ds.shape[0]) + step = max(1, int(chunk_rows)) + with console.status( + f"[magenta]Exporting {obj} to {out}...[/]" + ) as status: + for start in range(0, n, step): + end = min(start + step, n) + status.update( + f"[magenta]Exporting {obj}: {start}-{end} of {n}...[/]" + ) + mm[start:end] = ds[start:end] + console.print(f"[green]Wrote[/] {out}") + return + + n0 = int(ds.shape[0]) + step0 = max(1, int(chunk_rows)) + with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: + for start in range(0, n0, step0): + end = min(start + step0, n0) + status.update( + f"[magenta]Exporting {obj}: {start}-{end} of {n0}...[/]" + ) + mm[start:end, ...] = ds[start:end, ...] 
+ console.print(f"[green]Wrote[/] {out}") + finally: + del mm + + +def _export_mtx(file: Path, obj: str, out: Path, console: Console) -> None: + """Export a CSR/CSC matrix group (AnnData encoding) to Matrix Market (.mtx).""" + with h5py.File(file, "r") as f: + h5obj = _resolve(f, obj) + if not isinstance(h5obj, h5py.Group): + raise ValueError( + "MTX export requires a CSR/CSC matrix group (not a dataset)." + ) + + enc = _get_encoding_type(h5obj) + if enc not in ("csr_matrix", "csc_matrix"): + raise ValueError( + f"Target group encoding-type is {enc!r}; expected 'csr_matrix' or 'csc_matrix'." + ) + + data = h5obj.get("data") + indices = h5obj.get("indices") + indptr = h5obj.get("indptr") + if ( + not isinstance(data, h5py.Dataset) + or not isinstance(indices, h5py.Dataset) + or not isinstance(indptr, h5py.Dataset) + ): + raise RuntimeError( + "Sparse matrix group must contain datasets: data, indices, indptr" + ) + + shape = h5obj.attrs.get("shape", None) + if shape is None: + raise RuntimeError( + "Sparse matrix group is missing required 'shape' attribute." 
+ ) + n_rows, n_cols = (int(shape[0]), int(shape[1])) + + field = "real" if np.issubdtype(data.dtype, np.floating) else "integer" + + out.parent.mkdir(parents=True, exist_ok=True) + + indptr_arr = np.asarray(indptr[...], dtype=np.int64) + nnz_ptr = int(indptr_arr[-1]) if indptr_arr.size else 0 + nnz_data = int(data.shape[0]) + nnz_idx = int(indices.shape[0]) + nnz = min(nnz_ptr, nnz_data, nnz_idx) + + with open(out, "w", encoding="utf-8", newline="\n") as fh: + fh.write(f"%%MatrixMarket matrix coordinate {field} general\n") + fh.write("% generated by h5ad-cli\n") + fh.write(f"{n_rows} {n_cols} {nnz}\n") + + major = n_rows if enc == "csr_matrix" else n_cols + with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: + for major_i in range(major): + start = min(int(indptr_arr[major_i]), nnz) + end = min(int(indptr_arr[major_i + 1]), nnz) + if end <= start: + continue + status.update( + f"[magenta]Exporting {obj}: block {major_i+1}/{major}...[/]" + ) + idx = np.asarray(indices[start:end], dtype=np.int64) + vals = np.asarray(data[start:end]) + m = min(len(idx), len(vals)) + if m == 0: + continue + idx = idx[:m] + vals = vals[:m] + for k in range(m): + if enc == "csr_matrix": + r = major_i + 1 + c = int(idx[k]) + 1 + else: + r = int(idx[k]) + 1 + c = major_i + 1 + v = vals[k] + if isinstance(v, np.generic): + v = v.item() + fh.write(f"{r} {c} {v}\n") + console.print(f"[green]Wrote[/] {out}") + + +def _export_json( + file: Path, + obj: str, + out: Path, + max_elements: int, + include_attrs: bool, + console: Console, +) -> None: + """Export an HDF5 group/dataset to JSON (best-effort, with size limits).""" + with h5py.File(file, "r") as f: + h5obj = _resolve(f, obj) + + # Check if exportable before attempting + _check_json_exportable(h5obj, max_elements=max_elements) + + payload = _to_jsonable( + h5obj, max_elements=max_elements, include_attrs=include_attrs + ) + out.parent.mkdir(parents=True, exist_ok=True) + with open(out, "w", encoding="utf-8") as fh: + 
json.dump(payload, fh, indent=2, ensure_ascii=False, sort_keys=True) + console.print(f"[green]Wrote[/] {out}") + + +def _attrs_to_jsonable( + attrs: h5py.AttributeManager, max_elements: int +) -> Dict[str, Any]: + out: Dict[str, Any] = {} + for k in attrs.keys(): + v = attrs.get(k) + out[str(k)] = _pyify(v, max_elements=max_elements) + return out + + +def _pyify(value: Any, max_elements: int) -> Any: + if isinstance(value, bytes): + try: + return value.decode("utf-8") + except Exception: + return value.decode("utf-8", errors="replace") + if isinstance(value, np.generic): + return value.item() + if isinstance(value, np.ndarray): + if value.size > max_elements: + raise ValueError( + f"Refusing to convert array of size {value.size} (> {max_elements}) to JSON." + ) + if np.issubdtype(value.dtype, np.bytes_) or value.dtype.kind == "O": + value = decode_str_array(value) + return value.tolist() + return value + + +def _dataset_to_jsonable(ds: h5py.Dataset, max_elements: int) -> Any: + if ds.shape == (): + v = ds[()] + return _pyify(v, max_elements=max_elements) + n = int(np.prod(ds.shape)) if ds.shape else 0 + if n > max_elements: + raise ValueError( + f"Refusing to convert dataset {ds.name!r} with {n} elements (> {max_elements}) to JSON." 
+ ) + arr = np.asarray(ds[...]) + return _pyify(arr, max_elements=max_elements) + + +def _to_jsonable(h5obj: H5Obj, max_elements: int, include_attrs: bool) -> Any: + if isinstance(h5obj, h5py.Dataset): + return _dataset_to_jsonable(h5obj, max_elements=max_elements) + + # Group + d: Dict[str, Any] = {} + if include_attrs and len(h5obj.attrs): + d["__attrs__"] = _attrs_to_jsonable(h5obj.attrs, max_elements=max_elements) + + for key in h5obj.keys(): + child = h5obj[key] + if isinstance(child, (h5py.Group, h5py.Dataset)): + d[str(key)] = _to_jsonable( + cast(H5Obj, child), + max_elements=max_elements, + include_attrs=include_attrs, + ) + return d + + +def _export_image(file: Path, obj: str, out: Path, console: Console) -> None: + """Export an image-like dataset (H,W) or (H,W,C) to PNG/JPG/TIFF.""" + try: + from PIL import Image # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "Pillow is required for image export. Install with: pip install h5ad[images]" + ) from e + + with h5py.File(file, "r") as f: + h5obj = _resolve(f, obj) + if not isinstance(h5obj, h5py.Dataset): + raise ValueError("Image export requires a dataset.") + arr = np.asarray(h5obj[...]) + + if arr.ndim not in (2, 3): + raise ValueError(f"Expected 2D or 3D image array; got shape {arr.shape}.") + if arr.ndim == 3 and arr.shape[2] not in (1, 3, 4): + raise ValueError( + f"Expected last dimension (channels) to be 1, 3, or 4; got {arr.shape}." 
+ ) + + # Convert to uint8 for common image formats + if np.issubdtype(arr.dtype, np.floating): + amax = float(np.nanmax(arr)) if arr.size else 0.0 + if amax <= 1.0: + arr = np.clip(arr, 0.0, 1.0) * 255.0 + else: + arr = np.clip(arr, 0.0, 255.0) + arr = arr.astype(np.uint8) + elif np.issubdtype(arr.dtype, np.integer): + arr = np.clip(arr, 0, 255).astype(np.uint8) + elif arr.dtype == np.bool_: + arr = arr.astype(np.uint8) * 255 + else: + raise ValueError(f"Unsupported image dtype: {arr.dtype}") + + if arr.ndim == 3 and arr.shape[2] == 1: + arr = arr[:, :, 0] + + img = Image.fromarray(arr) + out.parent.mkdir(parents=True, exist_ok=True) + img.save(out) + console.print(f"[green]Wrote[/] {out}") diff --git a/src/h5ad/info.py b/src/h5ad/info.py index 56cd90a..94022a0 100644 --- a/src/h5ad/info.py +++ b/src/h5ad/info.py @@ -41,30 +41,33 @@ def get_entry_type(obj: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: result["details"] = f"Scalar value ({obj.dtype})" return result - # Check if it looks like an image (2D or 3D with small last dim) - if obj.ndim in (2, 3): - if obj.ndim == 2 or (obj.ndim == 3 and obj.shape[2] in (1, 3, 4)): - # Could be an image if dtype is numeric and reasonable size - if np.issubdtype(obj.dtype, np.number) or obj.dtype == np.bool_: - if obj.shape[0] <= 10000 and obj.shape[1] <= 10000: - result["type"] = "image" - result["export_as"] = "image" - result["details"] = ( - f"Image-like array {obj.shape} ({obj.dtype})" - ) - return result - # 1D or 2D numeric array -> dense matrix / array if obj.ndim == 1: result["type"] = "array" result["export_as"] = "npy" result["details"] = f"1D array [{obj.shape[0]}] ({obj.dtype})" elif obj.ndim == 2: + # Check if it looks like an image (2D with reasonable image dimensions) + # Minimum 16x16, maximum 10000x10000, numeric dtype + if ( + obj.shape[0] >= 16 + and obj.shape[1] >= 16 + and obj.shape[0] <= 10000 + and obj.shape[1] <= 10000 + and (np.issubdtype(obj.dtype, np.number) or obj.dtype == np.bool_) + ): + 
# Could be an image, but default to dense-matrix + # Image export can still be used if user provides image extension + pass result["type"] = "dense-matrix" result["export_as"] = "npy" result["details"] = ( f"Dense matrix {obj.shape[0]}×{obj.shape[1]} ({obj.dtype})" ) + elif obj.ndim == 3: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"3D array {obj.shape} ({obj.dtype})" else: result["type"] = "array" result["export_as"] = "npy" @@ -129,7 +132,6 @@ def format_type_info(info: Dict[str, Any]) -> str: "dense-matrix": "blue", "array": "blue", "dict": "yellow", - "image": "cyan", "categorical": "green", "scalar": "white", "unknown": "red", From 48e7efc694fbea77b69c1075b96aab56961ab866 Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 19:52:18 +0000 Subject: [PATCH 04/62] Add tests for info command and entry type detection --- tests/test_cli.py | 51 ++++++++++++ tests/test_export.py | 172 ++++++++++++++++++++++++++++++++++++++++ tests/test_info_read.py | 81 ++++++++++++++++++- 3 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 tests/test_export.py diff --git a/tests/test_cli.py b/tests/test_cli.py index 1659104..07031d7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -33,6 +33,57 @@ def test_info_function_direct(self, sample_h5ad_file): # Should not raise exception show_info(sample_h5ad_file, console) + def test_info_types_flag(self, sample_h5ad_file): + """Test info command with --types flag.""" + result = runner.invoke(app, ["info", "--types", str(sample_h5ad_file)]) + assert result.exit_code == 0 + # Should show type annotations in angle brackets + # Output may go to stdout or stderr depending on console config + output = result.stdout + (result.stderr or "") + assert "<" in output + assert ">" in output + + def test_info_types_short_flag(self, sample_h5ad_file): + """Test info command with -t short flag.""" + result = runner.invoke(app, ["info", "-t", str(sample_h5ad_file)]) + assert 
result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "<" in output + + def test_info_object_flag(self, sample_h5ad_file): + """Test info command with --object flag.""" + result = runner.invoke(app, ["info", "--object", "X", str(sample_h5ad_file)]) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "Path:" in output + assert "Type:" in output + + def test_info_object_short_flag(self, sample_h5ad_file): + """Test info command with -o short flag.""" + result = runner.invoke(app, ["info", "-o", "obs", str(sample_h5ad_file)]) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "Path:" in output + assert "dataframe" in output + + def test_info_object_nested_path(self, sample_h5ad_file): + """Test info command with nested object path.""" + result = runner.invoke( + app, ["info", "-o", "uns/description", str(sample_h5ad_file)] + ) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "Path:" in output + + def test_info_object_not_found(self, sample_h5ad_file): + """Test info command with non-existent object path.""" + result = runner.invoke( + app, ["info", "-o", "nonexistent", str(sample_h5ad_file)] + ) + assert result.exit_code == 0 # Doesn't exit with error, just shows message + output = result.stdout + (result.stderr or "") + assert "not found" in output + class TestTableCommand: """Tests for table command.""" diff --git a/tests/test_export.py b/tests/test_export.py new file mode 100644 index 0000000..1c2d88f --- /dev/null +++ b/tests/test_export.py @@ -0,0 +1,172 @@ +"""Tests for the export command.""" + +import json +from pathlib import Path + +import h5py +import numpy as np +from typer.testing import CliRunner + +from h5ad.cli import app + + +runner = CliRunner() + + +def _read_mtx(path: Path) -> np.ndarray: + with open(path, "r", encoding="utf-8") as fh: + header = fh.readline() + assert header.startswith("%%MatrixMarket") + line 
= fh.readline() + while line.startswith("%"): + line = fh.readline() + n_rows, n_cols, nnz = map(int, line.split()) + mat = np.zeros((n_rows, n_cols), dtype=np.float32) + for _ in range(nnz): + r, c, v = fh.readline().split() + mat[int(r) - 1, int(c) - 1] = float(v) + return mat + + +class TestExportNpy: + def test_export_npy_dense_X(self, sample_h5ad_file, temp_dir): + out = temp_dir / "X.npy" + result = runner.invoke(app, ["export", str(sample_h5ad_file), "X", str(out)]) + assert result.exit_code == 0 + assert out.exists() + + got = np.load(out) + with h5py.File(sample_h5ad_file, "r") as f: + expected = np.asarray(f["X"][...]) + np.testing.assert_allclose(got, expected) + + +class TestExportMtx: + def test_export_mtx_csr(self, sample_sparse_csr_h5ad, temp_dir): + out = temp_dir / "X_csr.mtx" + result = runner.invoke( + app, ["export", str(sample_sparse_csr_h5ad), "X", str(out)] + ) + assert result.exit_code == 0 + assert out.exists() + + got = _read_mtx(out) + expected = np.array( + [ + [1.0, 0.0, 2.0], + [0.0, 0.0, 0.0], + [3.0, 4.0, 0.0], + [0.0, 5.0, 6.0], + ], + dtype=np.float32, + ) + np.testing.assert_allclose(got, expected) + + def test_export_mtx_csc(self, temp_dir): + # Build a small, consistent CSC matrix group + file_path = temp_dir / "test_csc.h5ad" + with h5py.File(file_path, "w") as f: + X = f.create_group("X") + X.attrs["encoding-type"] = "csc_matrix" + X.attrs["shape"] = (3, 4) + data = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], dtype=np.float32) + indices = np.array([0, 2, 0, 1, 1, 2], dtype=np.int32) + indptr = np.array([0, 2, 2, 4, 6], dtype=np.int32) + X.create_dataset("data", data=data) + X.create_dataset("indices", data=indices) + X.create_dataset("indptr", data=indptr) + + out = temp_dir / "X_csc.mtx" + result = runner.invoke(app, ["export", str(file_path), "X", str(out)]) + assert result.exit_code == 0 + assert out.exists() + + got = _read_mtx(out) + expected = np.array( + [ + [1.0, 0.0, 3.0, 0.0], + [0.0, 0.0, 4.0, 5.0], + [2.0, 0.0, 0.0, 
6.0], + ], + dtype=np.float32, + ) + np.testing.assert_allclose(got, expected) + + +class TestExportJson: + def test_export_json_uns(self, sample_h5ad_file, temp_dir): + out = temp_dir / "uns.json" + result = runner.invoke(app, ["export", str(sample_h5ad_file), "uns", str(out)]) + assert result.exit_code == 0 + assert out.exists() + payload = json.loads(out.read_text(encoding="utf-8")) + assert "description" in payload + assert payload["description"] == ["Test dataset"] + + +class TestExportCsv: + def test_export_csv_obs(self, sample_h5ad_file, temp_dir): + out = temp_dir / "obs.csv" + result = runner.invoke(app, ["export", str(sample_h5ad_file), "obs", str(out)]) + assert result.exit_code == 0 + assert out.exists() + text = out.read_text(encoding="utf-8") + assert "obs_names" in text + + +class TestExportValidation: + def test_wrong_extension_for_type(self, sample_h5ad_file, temp_dir): + """Test that wrong extension is rejected.""" + out = temp_dir / "obs.npy" # obs is a dataframe, should be .csv + result = runner.invoke(app, ["export", str(sample_h5ad_file), "obs", str(out)]) + assert result.exit_code == 1 + assert "does not match" in result.output or "Expected" in result.output + + def test_sparse_matrix_wrong_extension(self, sample_sparse_csr_h5ad, temp_dir): + """Test that sparse matrix rejects .npy extension.""" + out = temp_dir / "X.npy" # sparse matrix should be .mtx + result = runner.invoke( + app, ["export", str(sample_sparse_csr_h5ad), "X", str(out)] + ) + assert result.exit_code == 1 + assert "does not match" in result.output or ".mtx" in result.output + + def test_dense_matrix_wrong_extension(self, sample_h5ad_file, temp_dir): + """Test that dense matrix rejects .csv extension.""" + out = temp_dir / "X.csv" # dense matrix should be .npy + result = runner.invoke(app, ["export", str(sample_h5ad_file), "X", str(out)]) + assert result.exit_code == 1 + assert "does not match" in result.output or ".npy" in result.output + + def 
test_json_wrong_extension(self, sample_h5ad_file, temp_dir): + """Test that dict rejects .npy extension.""" + out = temp_dir / "uns.npy" # uns is dict, should be .json + result = runner.invoke(app, ["export", str(sample_h5ad_file), "uns", str(out)]) + assert result.exit_code == 1 + assert "does not match" in result.output or ".json" in result.output + + def test_nonexistent_object(self, sample_h5ad_file, temp_dir): + """Test that nonexistent object path is rejected.""" + out = temp_dir / "output.csv" + result = runner.invoke( + app, ["export", str(sample_h5ad_file), "nonexistent/path", str(out)] + ) + assert result.exit_code == 1 + assert "not found" in result.output + + def test_unknown_type_rejected(self, temp_dir): + """Test that unknown/complex types are rejected.""" + file_path = temp_dir / "test_unknown.h5ad" + with h5py.File(file_path, "w") as f: + g = f.create_group("obs") + g.create_dataset("obs_names", data=np.array([b"cell1"])) + g.attrs["_index"] = "obs_names" + # Create a group without known encoding + weird = f.create_group("weird_group") + weird.attrs["encoding-type"] = "some_unknown_encoding" + + out = temp_dir / "weird.json" + result = runner.invoke(app, ["export", str(file_path), "weird_group", str(out)]) + # Should succeed as it's detected as dict + # but if it had sparse inside, it would fail + assert result.exit_code == 0 diff --git a/tests/test_info_read.py b/tests/test_info_read.py index 07b9a13..8ad47b4 100644 --- a/tests/test_info_read.py +++ b/tests/test_info_read.py @@ -3,10 +3,89 @@ import pytest import h5py import numpy as np -from h5ad.info import axis_len, get_axis_group +from h5ad.info import axis_len, get_axis_group, get_entry_type, format_type_info from h5ad.read import decode_str_array, read_categorical_column, col_chunk_as_strings +class TestGetEntryType: + """Tests for get_entry_type function.""" + + def test_get_entry_type_dataframe(self, sample_h5ad_file): + """Test type detection for dataframe (obs/var).""" + with 
h5py.File(sample_h5ad_file, "r") as f: + info = get_entry_type(f["obs"]) + assert info["type"] == "dataframe" + assert info["export_as"] == "csv" + + def test_get_entry_type_dense_matrix(self, sample_h5ad_file): + """Test type detection for dense matrix.""" + with h5py.File(sample_h5ad_file, "r") as f: + info = get_entry_type(f["X"]) + assert info["type"] == "dense-matrix" + assert info["export_as"] == "npy" + assert info["shape"] == (5, 4) + + def test_get_entry_type_sparse_matrix(self, sample_sparse_csr_h5ad): + """Test type detection for sparse matrix.""" + with h5py.File(sample_sparse_csr_h5ad, "r") as f: + info = get_entry_type(f["X"]) + assert info["type"] == "sparse-matrix" + assert info["export_as"] == "mtx" + assert info["encoding"] == "csr_matrix" + + def test_get_entry_type_dict(self, sample_h5ad_file): + """Test type detection for dict/group.""" + with h5py.File(sample_h5ad_file, "r") as f: + info = get_entry_type(f["uns"]) + assert info["type"] == "dict" + assert info["export_as"] == "json" + + def test_get_entry_type_1d_array(self, temp_dir): + """Test type detection for 1D array.""" + file_path = temp_dir / "test.h5ad" + with h5py.File(file_path, "w") as f: + f.create_dataset("arr", data=np.array([1, 2, 3, 4, 5])) + with h5py.File(file_path, "r") as f: + info = get_entry_type(f["arr"]) + assert info["type"] == "array" + assert info["export_as"] == "npy" + + def test_get_entry_type_scalar(self, temp_dir): + """Test type detection for scalar.""" + file_path = temp_dir / "test.h5ad" + with h5py.File(file_path, "w") as f: + f.create_dataset("scalar", data=42) + with h5py.File(file_path, "r") as f: + info = get_entry_type(f["scalar"]) + assert info["type"] == "scalar" + assert info["export_as"] == "json" + + +class TestFormatTypeInfo: + """Tests for format_type_info function.""" + + def test_format_type_info_dataframe(self): + """Test formatting dataframe type info.""" + info = {"type": "dataframe", "export_as": "csv"} + result = format_type_info(info) + 
assert "" in result + assert "green" in result + + def test_format_type_info_sparse(self): + """Test formatting sparse matrix type info.""" + info = {"type": "sparse-matrix", "export_as": "mtx"} + result = format_type_info(info) + assert "" in result + assert "magenta" in result + + def test_format_type_info_unknown(self): + """Test formatting unknown type info.""" + info = {"type": "unknown", "export_as": None} + result = format_type_info(info) + assert "" in result + assert "red" in result + + class TestAxisLen: """Tests for axis_len function.""" From cf8bb0840f2c79e4ef247b1c041d5cbf9658f817 Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 19:52:24 +0000 Subject: [PATCH 05/62] Add optional dependency for images support --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index c18faa4..3df76b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,9 @@ dev = [ "pytest>=8.3.4", "pytest-cov>=6.0.0", ] +images = [ + "pillow>=10.0.0", +] [build-system] requires = ["uv_build>=0.8.0,<0.9.0"] From ec482264cfad7441e2df885150347c2c46d00e06 Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 20:10:30 +0000 Subject: [PATCH 06/62] Add import command for importing data into h5ad files --- src/h5ad/cli.py | 82 +++++ src/h5ad/commands/__init__.py | 1 + src/h5ad/commands/import_data.py | 511 +++++++++++++++++++++++++++++++ tests/test_import.py | 444 +++++++++++++++++++++++++++ 4 files changed, 1038 insertions(+) create mode 100644 src/h5ad/commands/import_data.py create mode 100644 tests/test_import.py diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index a0fd811..9ea5d26 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -14,6 +14,7 @@ export_table, subset_h5ad, export_object, + import_object, ) app = typer.Typer( @@ -233,5 +234,86 @@ def export_cmd( raise typer.Exit(code=1) +@app.command("import") +def import_cmd( + file: Path = typer.Argument( + ..., help="Path to the source .h5ad file", 
exists=True, readable=True + ), + obj: str = typer.Argument( + ..., + help="Object path to create/replace (e.g., 'obs', 'X', 'obsm/X_pca', 'uns')", + ), + input_file: Path = typer.Argument( + ..., + help="Input data file. Extension determines format: .csv, .npy, .mtx, .json", + exists=True, + readable=True, + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output .h5ad file path. Required unless --inplace is specified.", + writable=True, + ), + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify the source file directly instead of creating a new file.", + ), + index_column: Optional[str] = typer.Option( + None, + "--index-column", + "-i", + help="Column to use as index when importing CSV into obs/var. Defaults to first column.", + ), +) -> None: + """ + Import data from a file into the h5ad file. + + Creates or replaces an object at the specified path. By default, creates + a new output file. Use --inplace to modify the source file directly. + + The input format is auto-detected from the file extension: + - .csv : DataFrames (obs, var) + - .npy : Dense arrays/matrices (X, obsm/X_pca, varm/PCs, etc.) + - .mtx : Sparse matrices (X, layers/*) + - .json : Dictionaries (uns, uns/metadata, etc.) + + Dimensions are validated against existing obs/var: + - obs: row count must match n_obs + - var: row count must match n_var + - X, layers/*: must match (n_obs, n_var) + - obsm/*, obsp/*: first dimension must match n_obs + - varm/*, varp/*: first dimension must match n_var + + Examples: + h5ad import data.h5ad obs cells.csv -o output.h5ad -i cell_id + h5ad import data.h5ad obsm/X_pca pca.npy -o output.h5ad + h5ad import data.h5ad X matrix.mtx --inplace + h5ad import data.h5ad uns/metadata config.json -o new.h5ad + """ + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. 
" + "Use --output/-o to specify output file, or --inplace to modify source.", + ) + raise typer.Exit(code=1) + + try: + import_object( + file=file, + obj=obj, + input_file=input_file, + output_file=output, + inplace=inplace, + index_column=index_column, + console=console, + ) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + def main(argv: Optional[Sequence[str]] = None) -> None: app(standalone_mode=True) diff --git a/src/h5ad/commands/__init__.py b/src/h5ad/commands/__init__.py index 4c5fa9a..b4d6016 100644 --- a/src/h5ad/commands/__init__.py +++ b/src/h5ad/commands/__init__.py @@ -2,3 +2,4 @@ from h5ad.commands.table import export_table from h5ad.commands.subset import subset_h5ad from h5ad.commands.export import export_object +from h5ad.commands.import_data import import_object diff --git a/src/h5ad/commands/import_data.py b/src/h5ad/commands/import_data.py new file mode 100644 index 0000000..c208a9d --- /dev/null +++ b/src/h5ad/commands/import_data.py @@ -0,0 +1,511 @@ +"""Import command for creating/replacing objects in h5ad files.""" + +from __future__ import annotations + +import csv +import json +import shutil +from pathlib import Path +from typing import Any, List, Optional, Tuple, cast + +import h5py +import numpy as np +from rich.console import Console + + +# Map file extensions to expected input formats +EXTENSION_FORMAT = { + ".csv": "csv", + ".npy": "npy", + ".mtx": "mtx", + ".json": "json", +} + +# Define which object paths expect which dimensions +# obs-axis: first dimension must match n_obs +# var-axis: first dimension must match n_var +# matrix: must match (n_obs, n_var) +OBS_AXIS_PREFIXES = ("obs", "obsm/", "obsp/") +VAR_AXIS_PREFIXES = ("var", "varm/", "varp/") +MATRIX_PREFIXES = ("X", "layers/") + + +def _norm_path(p: str) -> str: + p = p.strip() + if not p: + raise ValueError("Object path must be non-empty.") + return p.lstrip("/") + + +def _get_axis_length(file: h5py.File, axis: str) -> 
Optional[int]: + """Get the length of obs or var axis.""" + if axis not in file: + return None + group = file[axis] + if not isinstance(group, h5py.Group): + return None + index_name = group.attrs.get("_index", None) + if index_name is None: + index_name = "obs_names" if axis == "obs" else "var_names" + if isinstance(index_name, bytes): + index_name = index_name.decode("utf-8") + if index_name not in group: + return None + dataset = group[index_name] + if isinstance(dataset, h5py.Dataset) and dataset.shape: + return int(dataset.shape[0]) + return None + + +def _validate_dimensions( + file: h5py.File, + obj_path: str, + data_shape: tuple, + console: Console, +) -> None: + """Validate that data dimensions match the target path requirements.""" + n_obs = _get_axis_length(file, "obs") + n_var = _get_axis_length(file, "var") + + # Check obs/var replacement (dataframe) + if obj_path == "obs": + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"Row count mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + return + if obj_path == "var": + if n_var is not None and data_shape[0] != n_var: + raise ValueError( + f"Row count mismatch: input has {data_shape[0]} rows, " + f"but var has {n_var} features." + ) + return + + # Check matrix (X, layers/*) + for prefix in MATRIX_PREFIXES: + if ( + obj_path == prefix + or obj_path.startswith(prefix + "/") + or obj_path.startswith(prefix) + ): + if obj_path == "X" or obj_path.startswith("layers/"): + if len(data_shape) < 2: + raise ValueError( + f"Matrix data requires 2D shape, got {len(data_shape)}D." + ) + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + if n_var is not None and data_shape[1] != n_var: + raise ValueError( + f"Second dimension mismatch: input has {data_shape[1]} columns, " + f"but var has {n_var} features." 
+ ) + return + + # Check obs-axis matrices (obsm/*, obsp/*) + for prefix in OBS_AXIS_PREFIXES: + if obj_path.startswith(prefix) and obj_path != "obs": + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + # obsp should be square n_obs x n_obs + if obj_path.startswith("obsp/") and len(data_shape) >= 2: + if data_shape[1] != n_obs: + raise ValueError( + f"obsp matrix must be square (n_obs × n_obs): " + f"got {data_shape[0]}×{data_shape[1]}, expected {n_obs}×{n_obs}." + ) + return + + # Check var-axis matrices (varm/*, varp/*) + for prefix in VAR_AXIS_PREFIXES: + if obj_path.startswith(prefix) and obj_path != "var": + if n_var is not None and data_shape[0] != n_var: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but var has {n_var} features." + ) + # varp should be square n_var x n_var + if obj_path.startswith("varp/") and len(data_shape) >= 2: + if data_shape[1] != n_var: + raise ValueError( + f"varp matrix must be square (n_var × n_var): " + f"got {data_shape[0]}×{data_shape[1]}, expected {n_var}×{n_var}." + ) + return + + # For other paths (like uns/*), no dimension validation + console.print(f"[dim]Note: No dimension validation for path '{obj_path}'[/]") + + +def _read_csv( + input_file: Path, + index_column: Optional[str], +) -> Tuple[List[dict], List[str], List[str], str]: + """ + Read CSV file and return rows, column names, index values, and index column name. + + Returns: + (rows, column_names, index_values, index_column_name) + """ + with open(input_file, "r", encoding="utf-8", newline="") as f: + reader = csv.DictReader(f) + if reader.fieldnames is None: + raise ValueError("CSV file has no header.") + fieldnames = list(reader.fieldnames) + + # Determine index column + if index_column: + if index_column not in fieldnames: + raise ValueError( + f"Index column '{index_column}' not found in CSV. 
" + f"Available columns: {', '.join(fieldnames)}" + ) + idx_col = index_column + else: + idx_col = fieldnames[0] + + # Read all rows + rows = list(reader) + + index_values = [row[idx_col] for row in rows] + data_columns = [c for c in fieldnames if c != idx_col] + + return rows, data_columns, index_values, idx_col + + +def _read_mtx( + input_file: Path, +) -> Tuple[List[Tuple[int, int, float]], Tuple[int, int], int]: + """ + Read Matrix Market file and return sparse matrix data. + + Returns: + (data, indices, indptr, shape, nnz, is_csr) + """ + with open(input_file, "r", encoding="utf-8") as fh: + header = fh.readline() + if not header.startswith("%%MatrixMarket"): + raise ValueError("Invalid MTX file: missing MatrixMarket header.") + + # Parse header for field type + parts = header.lower().split() + field = "real" + for p in parts: + if p in ("real", "integer", "complex", "pattern"): + field = p + break + + # Skip comments + line = fh.readline() + while line.startswith("%"): + line = fh.readline() + + # Read dimensions + dims = line.split() + n_rows, n_cols, nnz = int(dims[0]), int(dims[1]), int(dims[2]) + + # Read entries + entries = [] + for _ in range(nnz): + parts = fh.readline().split() + r, c = int(parts[0]) - 1, int(parts[1]) - 1 + if field == "pattern": + v = 1.0 + else: + v = float(parts[2]) + entries.append((r, c, v)) + + return entries, (n_rows, n_cols), nnz + + +def _create_csr_from_entries( + entries: List[Tuple[int, int, float]], shape: Tuple[int, int] +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Convert coordinate entries to CSR format.""" + n_rows, _ = shape + # Sort by row, then column + entries.sort(key=lambda x: (x[0], x[1])) + + data = np.array([e[2] for e in entries], dtype=np.float32) + indices = np.array([e[1] for e in entries], dtype=np.int32) + + # Build indptr + indptr = np.zeros(n_rows + 1, dtype=np.int32) + for r, _, _ in entries: + indptr[r + 1] += 1 + indptr = np.cumsum(indptr) + + return data, indices, indptr + + +def 
import_object( + file: Path, + obj: str, + input_file: Path, + output_file: Optional[Path], + inplace: bool, + index_column: Optional[str], + console: Console, +) -> None: + """ + Import data from a file into an h5ad object. + + Args: + file: Path to the source h5ad file + obj: Object path to create/replace (e.g., 'obs', 'obsm/X_pca', 'X') + input_file: Input data file (.csv, .npy, .mtx, .json) + output_file: Path to output h5ad file (None if inplace) + inplace: If True, modify the source file directly + index_column: Column to use as index for obs/var CSV import + console: Console for output + """ + # Determine target file + if inplace: + target_file = file + else: + if output_file is None: + raise ValueError("Output file is required unless --inplace is specified.") + # Copy source to output first + shutil.copy2(file, output_file) + target_file = output_file + console.print(f"[dim]Copied {file} → {output_file}[/]") + + obj = _norm_path(obj) + ext = input_file.suffix.lower() + + if ext not in EXTENSION_FORMAT: + raise ValueError( + f"Unsupported input file extension '{ext}'. " + f"Supported: {', '.join(sorted(EXTENSION_FORMAT.keys()))}" + ) + + fmt = EXTENSION_FORMAT[ext] + + # Validate index_column is only used for obs/var CSV + if index_column and (fmt != "csv" or obj not in ("obs", "var")): + raise ValueError( + "--index-column is only valid for CSV import into 'obs' or 'var'." 
+ ) + + if fmt == "csv": + _import_csv(target_file, obj, input_file, index_column, console) + elif fmt == "npy": + _import_npy(target_file, obj, input_file, console) + elif fmt == "mtx": + _import_mtx(target_file, obj, input_file, console) + elif fmt == "json": + _import_json(target_file, obj, input_file, console) + + +def _import_csv( + file: Path, + obj: str, + input_file: Path, + index_column: Optional[str], + console: Console, +) -> None: + """Import CSV data into obs or var.""" + if obj not in ("obs", "var"): + raise ValueError( + f"CSV import is only supported for 'obs' or 'var', not '{obj}'." + ) + + rows, data_columns, index_values, _ = _read_csv(input_file, index_column) + n_rows = len(rows) + + with h5py.File(file, "a") as f: + # Validate dimensions if the file already has obs/var + _validate_dimensions(f, obj, (n_rows,), console) + + # Delete existing group if present + if obj in f: + del f[obj] + + # Create new group + group = f.create_group(obj) + index_name = "obs_names" if obj == "obs" else "var_names" + group.attrs["_index"] = index_name + group.attrs["encoding-type"] = "dataframe" + group.attrs["encoding-version"] = "0.2.0" + group.attrs["column-order"] = np.array(data_columns, dtype="S") + + # Create index dataset + group.create_dataset( + index_name, + data=np.array(index_values, dtype="S"), + ) + + # Create column datasets + for col in data_columns: + values = [row[col] for row in rows] + # Try to infer type + try: + arr = np.array(values, dtype=np.float64) + group.create_dataset(col, data=arr) + except (ValueError, TypeError): + try: + arr = np.array(values, dtype=np.int64) + group.create_dataset(col, data=arr) + except (ValueError, TypeError): + # Fallback to string + arr = np.array(values, dtype="S") + ds = group.create_dataset(col, data=arr) + ds.attrs["encoding-type"] = "string-array" + ds.attrs["encoding-version"] = "0.2.0" + + console.print( + f"[green]Imported[/] {n_rows} rows × {len(data_columns)} columns into '{obj}'" + ) + + +def 
_import_npy( + file: Path, + obj: str, + input_file: Path, + console: Console, +) -> None: + """Import NPY data into a dataset.""" + arr = np.load(input_file) + + with h5py.File(file, "a") as f: + _validate_dimensions(f, obj, arr.shape, console) + + # Handle nested paths + parts = obj.split("/") + parent_path = "/".join(parts[:-1]) + name = parts[-1] + + # Ensure parent groups exist + if parent_path: + if parent_path not in f: + f.create_group(parent_path) + parent = cast(h5py.Group, f[parent_path]) + else: + parent = f + + # Delete existing if present + if name in parent: + del parent[name] + + # Create dataset + parent.create_dataset(name, data=arr) + + shape_str = "×".join(str(d) for d in arr.shape) + console.print(f"[green]Imported[/] {shape_str} array into '{obj}'") + + +def _import_mtx( + file: Path, + obj: str, + input_file: Path, + console: Console, +) -> None: + """Import MTX (Matrix Market) data as CSR sparse matrix.""" + entries, shape, nnz = _read_mtx(input_file) + data, indices, indptr = _create_csr_from_entries(entries, shape) + + with h5py.File(file, "a") as f: + _validate_dimensions(f, obj, shape, console) + + # Handle nested paths + parts = obj.split("/") + parent_path = "/".join(parts[:-1]) + name = parts[-1] + + if parent_path: + if parent_path not in f: + f.create_group(parent_path) + parent = cast(h5py.Group, f[parent_path]) + else: + parent = f + + # Delete existing if present + if name in parent: + del parent[name] + + # Create sparse matrix group + group = parent.create_group(name) + group.attrs["encoding-type"] = "csr_matrix" + group.attrs["encoding-version"] = "0.1.0" + group.attrs["shape"] = np.array(shape, dtype=np.int64) + + group.create_dataset("data", data=data) + group.create_dataset("indices", data=indices) + group.create_dataset("indptr", data=indptr) + + console.print( + f"[green]Imported[/] {shape[0]}×{shape[1]} sparse matrix ({nnz} non-zero) into '{obj}'" + ) + + +def _import_json( + file: Path, + obj: str, + input_file: Path, + 
console: Console, +) -> None: + """Import JSON data into uns or other dict-like groups.""" + with open(input_file, "r", encoding="utf-8") as fh: + payload = json.load(fh) + + with h5py.File(file, "a") as f: + # Handle nested paths + parts = obj.split("/") + parent_path = "/".join(parts[:-1]) + name = parts[-1] + + if parent_path: + if parent_path not in f: + f.create_group(parent_path) + parent = cast(h5py.Group, f[parent_path]) + else: + parent = f + + # Delete existing if present + if name in parent: + del parent[name] + + # Create from JSON + _write_json_to_h5(parent, name, payload) + + console.print(f"[green]Imported[/] JSON data into '{obj}'") + + +def _write_json_to_h5(parent: h5py.Group, name: str, value: Any) -> None: + """Recursively write JSON-like data to HDF5.""" + if isinstance(value, dict): + group = parent.create_group(name) + for k, v in value.items(): + _write_json_to_h5(group, k, v) + elif isinstance(value, list): + # Try to convert to array + try: + arr = np.array(value) + if arr.dtype.kind in ("U", "O"): + arr = np.array(value, dtype="S") + parent.create_dataset(name, data=arr) + except (ValueError, TypeError): + # Fallback: store as JSON string + parent.create_dataset(name, data=json.dumps(value).encode("utf-8")) + elif isinstance(value, str): + parent.create_dataset(name, data=np.array([value], dtype="S")) + elif isinstance(value, bool): + parent.create_dataset(name, data=np.array(value, dtype=bool)) + elif isinstance(value, int): + parent.create_dataset(name, data=np.array(value, dtype=np.int64)) + elif isinstance(value, float): + parent.create_dataset(name, data=np.array(value, dtype=np.float64)) + elif value is None: + # Store None as empty string attribute or special marker + ds = parent.create_dataset(name, data=np.array([], dtype="S")) + ds.attrs["_is_none"] = True + else: + raise ValueError(f"Cannot convert JSON value of type {type(value).__name__}") diff --git a/tests/test_import.py b/tests/test_import.py new file mode 100644 index 
"""Tests for the import command."""

import json
from pathlib import Path

import h5py
import numpy as np
from typer.testing import CliRunner

from h5ad.cli import app


runner = CliRunner()

# Shared CSV payloads: 5 obs rows / 4 var rows, sized to the sample fixture.
OBS_CSV = (
    "cell_id,score,label\n"
    "cell_1,1.5,A\n"
    "cell_2,2.5,B\n"
    "cell_3,3.5,A\n"
    "cell_4,4.5,C\n"
    "cell_5,5.5,B\n"
)
VAR_CSV = (
    "gene_id,mean,std\n"
    "gene_1,0.1,0.01\n"
    "gene_2,0.2,0.02\n"
    "gene_3,0.3,0.03\n"
    "gene_4,0.4,0.04\n"
)


def _invoke(*cli_args):
    """Run the h5ad CLI app with the given argument list."""
    return runner.invoke(app, list(cli_args))


class TestImportCsv:
    def test_import_csv_obs_inplace(self, sample_h5ad_file, temp_dir):
        """CSV import into obs with --inplace adds the new columns in place."""
        csv_path = temp_dir / "new_obs.csv"
        csv_path.write_text(OBS_CSV)

        result = _invoke(
            "import", str(sample_h5ad_file), "obs", str(csv_path),
            "--inplace", "-i", "cell_id",
        )
        assert result.exit_code == 0
        assert "5 rows" in result.output
        assert "2 columns" in result.output

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "obs" in f
            obs = f["obs"]
            assert "score" in obs
            assert "label" in obs

    def test_import_csv_obs_output(self, sample_h5ad_file, temp_dir):
        """CSV import with -o writes a new file and leaves the source intact."""
        csv_path = temp_dir / "new_obs.csv"
        csv_path.write_text(OBS_CSV)
        out_path = temp_dir / "output.h5ad"

        result = _invoke(
            "import", str(sample_h5ad_file), "obs", str(csv_path),
            "-o", str(out_path), "-i", "cell_id",
        )
        assert result.exit_code == 0
        assert out_path.exists()

        # The output file carries the imported column...
        with h5py.File(out_path, "r") as f:
            assert "obs" in f
            assert "score" in f["obs"]

        # ...while the source file is untouched.
        with h5py.File(sample_h5ad_file, "r") as f:
            assert "score" not in f["obs"]

    def test_import_csv_var(self, sample_h5ad_file, temp_dir):
        """CSV import into var succeeds with matching row count."""
        csv_path = temp_dir / "new_var.csv"
        csv_path.write_text(VAR_CSV)

        result = _invoke(
            "import", str(sample_h5ad_file), "var", str(csv_path),
            "--inplace", "-i", "gene_id",
        )
        assert result.exit_code == 0
        assert "4 rows" in result.output

    def test_import_csv_dimension_mismatch(self, sample_h5ad_file, temp_dir):
        """A CSV whose row count differs from n_obs is rejected."""
        csv_path = temp_dir / "wrong_obs.csv"
        csv_path.write_text("cell_id,score\ncell_1,1.0\ncell_2,2.0\n")

        result = _invoke(
            "import", str(sample_h5ad_file), "obs", str(csv_path),
            "--inplace", "-i", "cell_id",
        )
        assert result.exit_code == 1
        assert "mismatch" in result.output.lower()

    def test_import_csv_invalid_index_column(self, sample_h5ad_file, temp_dir):
        """An index column absent from the CSV header is rejected."""
        csv_path = temp_dir / "obs.csv"
        csv_path.write_text("a,b,c\n1,2,3\n")

        result = _invoke(
            "import", str(sample_h5ad_file), "obs", str(csv_path),
            "--inplace", "-i", "nonexistent",
        )
        assert result.exit_code == 1
        assert "not found" in result.output.lower()

    def test_import_csv_not_obs_var(self, sample_h5ad_file, temp_dir):
        """CSV import targets other than obs/var are rejected."""
        csv_path = temp_dir / "data.csv"
        csv_path.write_text("a,b\n1,2\n")

        result = _invoke(
            "import", str(sample_h5ad_file), "uns/data", str(csv_path), "--inplace"
        )
        assert result.exit_code == 1
        assert "only supported for 'obs' or 'var'" in result.output

    def test_import_requires_output_or_inplace(self, sample_h5ad_file, temp_dir):
        """Omitting both --output and --inplace is an error."""
        csv_path = temp_dir / "obs.csv"
        csv_path.write_text("a,b\n1,2\n")

        result = _invoke("import", str(sample_h5ad_file), "obs", str(csv_path))
        assert result.exit_code == 1
        assert "Output file is required" in result.output


class TestImportNpy:
    def test_import_npy_obsm(self, sample_h5ad_file, temp_dir):
        """NPY import into obsm stores the array verbatim."""
        npy_path = temp_dir / "pca.npy"
        embedding = np.random.randn(5, 10).astype(np.float32)
        np.save(npy_path, embedding)

        result = _invoke(
            "import", str(sample_h5ad_file), "obsm/X_pca", str(npy_path), "--inplace"
        )
        assert result.exit_code == 0
        assert "5×10" in result.output

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "obsm/X_pca" in f
            np.testing.assert_allclose(f["obsm/X_pca"][...], embedding)

    def test_import_npy_varm(self, sample_h5ad_file, temp_dir):
        """NPY import into varm succeeds with matching first dimension."""
        npy_path = temp_dir / "pcs.npy"
        np.save(npy_path, np.random.randn(4, 5).astype(np.float32))

        result = _invoke(
            "import", str(sample_h5ad_file), "varm/PCs", str(npy_path), "--inplace"
        )
        assert result.exit_code == 0

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "varm/PCs" in f

    def test_import_npy_X(self, sample_h5ad_file, temp_dir):
        """NPY import can replace X with a dense matrix."""
        npy_path = temp_dir / "X.npy"
        matrix = np.random.randn(5, 4).astype(np.float32)
        np.save(npy_path, matrix)

        result = _invoke(
            "import", str(sample_h5ad_file), "X", str(npy_path), "--inplace"
        )
        assert result.exit_code == 0

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "X" in f
            np.testing.assert_allclose(f["X"][...], matrix)

    def test_import_npy_dimension_mismatch_obsm(self, sample_h5ad_file, temp_dir):
        """An obsm array with the wrong first dimension is rejected."""
        npy_path = temp_dir / "bad_pca.npy"
        np.save(npy_path, np.random.randn(10, 5).astype(np.float32))

        result = _invoke(
            "import", str(sample_h5ad_file), "obsm/X_pca", str(npy_path), "--inplace"
        )
        assert result.exit_code == 1
        assert "mismatch" in result.output.lower()

    def test_import_npy_dimension_mismatch_X(self, sample_h5ad_file, temp_dir):
        """An X array whose shape differs from (n_obs, n_var) is rejected."""
        npy_path = temp_dir / "bad_X.npy"
        np.save(npy_path, np.random.randn(5, 10).astype(np.float32))

        result = _invoke(
            "import", str(sample_h5ad_file), "X", str(npy_path), "--inplace"
        )
        assert result.exit_code == 1
        assert "mismatch" in result.output.lower()


class TestImportMtx:
    def test_import_mtx_X(self, sample_h5ad_file, temp_dir):
        """MTX import into X produces a CSR-encoded group."""
        mtx_path = temp_dir / "X.mtx"
        mtx_path.write_text(
            "%%MatrixMarket matrix coordinate real general\n"
            "% test matrix\n"
            "5 4 5\n"
            "1 1 1.0\n"
            "2 2 2.0\n"
            "3 3 3.0\n"
            "4 4 4.0\n"
            "5 1 5.0\n"
        )

        result = _invoke(
            "import", str(sample_h5ad_file), "X", str(mtx_path), "--inplace"
        )
        assert result.exit_code == 0
        assert "5×4" in result.output
        assert "5 non-zero" in result.output

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "X" in f
            enc = f["X"].attrs.get("encoding-type")
            if isinstance(enc, bytes):
                enc = enc.decode("utf-8")
            assert enc == "csr_matrix"

    def test_import_mtx_layer(self, sample_h5ad_file, temp_dir):
        """MTX import into layers/* succeeds."""
        mtx_path = temp_dir / "layer.mtx"
        mtx_path.write_text(
            "%%MatrixMarket matrix coordinate real general\n"
            "5 4 3\n"
            "1 1 1.0\n"
            "3 2 2.0\n"
            "5 4 3.0\n"
        )

        result = _invoke(
            "import", str(sample_h5ad_file), "layers/counts", str(mtx_path),
            "--inplace",
        )
        assert result.exit_code == 0

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "layers/counts" in f

    def test_import_mtx_dimension_mismatch(self, sample_h5ad_file, temp_dir):
        """An MTX header with mismatching dimensions is rejected."""
        mtx_path = temp_dir / "bad.mtx"
        mtx_path.write_text(
            "%%MatrixMarket matrix coordinate real general\n" "10 4 1\n" "1 1 1.0\n"
        )

        result = _invoke(
            "import", str(sample_h5ad_file), "X", str(mtx_path), "--inplace"
        )
        assert result.exit_code == 1
        assert "mismatch" in result.output.lower()


class TestImportJson:
    def test_import_json_uns(self, sample_h5ad_file, temp_dir):
        """JSON import into uns materializes each key."""
        json_path = temp_dir / "metadata.json"
        payload = {
            "version": "1.0",
            "colors": ["red", "green", "blue"],
            "n_pcs": 50,
        }
        json_path.write_text(json.dumps(payload))

        result = _invoke(
            "import", str(sample_h5ad_file), "uns/metadata", str(json_path),
            "--inplace",
        )
        assert result.exit_code == 0
        assert "JSON data" in result.output

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "uns/metadata" in f
            assert "colors" in f["uns/metadata"]
            assert "n_pcs" in f["uns/metadata"]

    def test_import_json_nested(self, sample_h5ad_file, temp_dir):
        """Nested JSON dicts become nested HDF5 groups."""
        json_path = temp_dir / "config.json"
        payload = {
            "settings": {
                "threshold": 0.5,
                "enabled": True,
            },
            "labels": ["A", "B", "C"],
        }
        json_path.write_text(json.dumps(payload))

        result = _invoke(
            "import", str(sample_h5ad_file), "uns/config", str(json_path),
            "--inplace",
        )
        assert result.exit_code == 0

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "uns/config/settings" in f
            assert "uns/config/labels" in f


class TestImportValidation:
    def test_unsupported_extension(self, sample_h5ad_file, temp_dir):
        """Input files with unknown extensions are rejected."""
        bad_path = temp_dir / "data.xlsx"
        bad_path.write_text("dummy")

        result = _invoke(
            "import", str(sample_h5ad_file), "obs", str(bad_path), "--inplace"
        )
        assert result.exit_code == 1
        assert "Unsupported" in result.output
str(sample_h5ad_file), + "uns/data", + str(npy_file), + "--inplace", + "-i", + "col", + ], + ) + assert result.exit_code == 1 + assert "only valid for CSV" in result.output + + def test_replace_existing_object(self, sample_h5ad_file, temp_dir): + """Test that existing objects can be replaced.""" + with h5py.File(sample_h5ad_file, "r") as f: + original_X = np.array(f["X"][...]) + + npy_file = temp_dir / "new_X.npy" + new_arr = np.ones((5, 4), dtype=np.float32) * 999 + np.save(npy_file, new_arr) + + result = runner.invoke( + app, + ["import", str(sample_h5ad_file), "X", str(npy_file), "--inplace"], + ) + assert result.exit_code == 0 + + with h5py.File(sample_h5ad_file, "r") as f: + np.testing.assert_allclose(f["X"][...], new_arr) + assert not np.allclose(f["X"][...], original_X) From e1db36f748ee70a69323023c4450caa57ef3e76e Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 20:41:06 +0000 Subject: [PATCH 07/62] Refactor CLI commands for exporting and importing dataframes, arrays, and dictionaries - Renamed and updated tests for exporting dataframes, replacing the previous table command. - Introduced new command structure for exporting arrays and dictionaries. - Updated import tests to reflect changes in command structure for dataframes and arrays. - Ensured proper validation for unsupported types and dimension mismatches during import. - Enhanced help command tests for new export and import functionalities. 
--- src/h5ad/cli.py | 491 ++++++++++++++++++++++++++++++------------- tests/test_cli.py | 90 +++++--- tests/test_export.py | 86 ++++---- tests/test_import.py | 215 ++++++++++++++----- 4 files changed, 606 insertions(+), 276 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 9ea5d26..f42768a 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -1,28 +1,28 @@ -import sys -import csv +"""CLI for h5ad files with export and import subcommands.""" + from pathlib import Path -from typing import Optional, Sequence, Tuple, Dict, List +from typing import Optional, Sequence, List -import rich from rich.console import Console import typer -import h5py -import numpy as np - -from h5ad.commands import ( - show_info, - export_table, - subset_h5ad, - export_object, - import_object, -) + +from h5ad.commands import show_info, subset_h5ad app = typer.Typer( - help="Streaming CLI for huge .h5ad files (info, table, subset, export)." + help="Streaming CLI for huge .h5ad files (info, subset, export, import)." 
) console = Console(stderr=True) +# Create sub-apps for export and import +export_app = typer.Typer(help="Export objects from h5ad files.") +import_app = typer.Typer(help="Import objects into h5ad files.") +app.add_typer(export_app, name="export") +app.add_typer(import_app, name="import") + +# ============================================================================ +# INFO command +# ============================================================================ @app.command() def info( file: Path = typer.Argument( @@ -58,67 +58,9 @@ def info( show_info(file, console, show_types=types, obj_path=obj) -@app.command() -def table( - file: Path = typer.Argument( - ..., - help="Path to the .h5ad file", - exists=True, - readable=True, - ), - axis: str = typer.Option("obs", help="Axis to read from ('obs' or 'var')"), - columns: Optional[str] = typer.Option( - None, - "--columns", - "-c", - help="Comma separated column names to include in the output table", - ), - out: Optional[Path] = typer.Option( - None, - "--output", - "-o", - help="Output file path (defaults to stdout)", - writable=True, - ), - chunk_rows: int = typer.Option( - 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" - ), - head: Optional[int] = typer.Option( - None, "--head", "-n", help="Output only the first n rows" - ), -) -> None: - """ - Export a table of the specified axis ('obs' or 'var') to CSV format. - Args: - file (Path): Path to the .h5ad file - axis (str): Axis to read from ('obs' or 'var') - columns (Optional[str]): Comma separated column names to include in the output table - out (Optional[Path]): Output file path (defaults to stdout) - chunk_rows (int): Number of rows to read per chunk - head (Optional[int]): Output only the first n rows - """ - # Validate axis parameter - if axis not in ("obs", "var"): - console.print( - f"[bold red]Error:[/] Invalid axis '{axis}'. 
Must be either 'obs' or 'var'.", - ) - raise typer.Exit(code=1) - - col_list: Optional[List[str]] = None - if columns: - col_list = [col.strip() for col in columns.split(",") if col.strip()] - - export_table( - file=file, - axis=axis, - columns=col_list, - out=out, - chunk_rows=chunk_rows, - head=head, - console=console, - ) - - +# ============================================================================ +# SUBSET command +# ============================================================================ @app.command() def subset( file: Path = typer.Argument(..., help="Input .h5ad", exists=True, readable=True), @@ -162,30 +104,135 @@ def subset( raise typer.Exit(code=1) -@app.command("export") -def export_cmd( +# ============================================================================ +# EXPORT subcommands +# ============================================================================ +@export_app.command("dataframe") +def export_dataframe( file: Path = typer.Argument( ..., help="Path to the .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument( - ..., help="Object path to export (e.g., 'obs', 'X', 'obsm/X_pca', 'uns')" - ), - out: Path = typer.Argument( - ..., - help="Output file path. Extension determines format: .csv, .npy, .mtx, .json, .png/.jpg/.tiff", - ), + obj: str = typer.Argument(..., help="Object path to export ('obs' or 'var')"), + out: Path = typer.Argument(..., help="Output CSV file path"), columns: Optional[str] = typer.Option( None, "--columns", "-c", - help="Comma separated column names (for dataframe/CSV export only)", + help="Comma separated column names to include", ), chunk_rows: int = typer.Option( 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" ), head: Optional[int] = typer.Option( - None, "--head", "-n", help="Output only the first n rows (for CSV export)" + None, "--head", "-n", help="Output only the first n rows" ), +) -> None: + """ + Export a dataframe (obs or var) to CSV. 
+ + Examples: + h5ad export dataframe data.h5ad obs obs.csv + h5ad export dataframe data.h5ad var var.csv --columns gene_id,mean + h5ad export dataframe data.h5ad obs - --head 100 + """ + from h5ad.commands import export_table + + if obj not in ("obs", "var"): + console.print( + f"[bold red]Error:[/] Object must be 'obs' or 'var', not '{obj}'.", + ) + raise typer.Exit(code=1) + + col_list: Optional[List[str]] = None + if columns: + col_list = [col.strip() for col in columns.split(",") if col.strip()] + + try: + export_table( + file=file, + axis=obj, + columns=col_list, + out=out if str(out) != "-" else None, + chunk_rows=chunk_rows, + head=head, + console=console, + ) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@export_app.command("array") +def export_array( + file: Path = typer.Argument( + ..., help="Path to the .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument( + ..., help="Object path to export (e.g., 'obsm/X_pca', 'varm/PCs', 'X')" + ), + out: Path = typer.Argument(..., help="Output .npy file path"), + chunk_rows: int = typer.Option( + 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" + ), +) -> None: + """ + Export a dense array or matrix to NumPy .npy format. 
+ + Examples: + h5ad export array data.h5ad obsm/X_pca pca.npy + h5ad export array data.h5ad X matrix.npy + h5ad export array data.h5ad varm/PCs loadings.npy + """ + from h5ad.commands.export import _export_npy + + try: + _export_npy( + file=file, + obj=obj, + out=out, + chunk_rows=chunk_rows, + console=console, + ) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@export_app.command("sparse") +def export_sparse( + file: Path = typer.Argument( + ..., help="Path to the .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument( + ..., help="Object path to export (e.g., 'X', 'layers/counts')" + ), + out: Path = typer.Argument(..., help="Output .mtx file path"), +) -> None: + """ + Export a sparse matrix (CSR/CSC) to Matrix Market (.mtx) format. + + Examples: + h5ad export sparse data.h5ad X matrix.mtx + h5ad export sparse data.h5ad layers/counts counts.mtx + """ + from h5ad.commands.export import _export_mtx + + try: + _export_mtx(file=file, obj=obj, out=out, console=console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@export_app.command("dict") +def export_dict( + file: Path = typer.Argument( + ..., help="Path to the .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument( + ..., help="Object path to export (e.g., 'uns', 'uns/colors')" + ), + out: Path = typer.Argument(..., help="Output .json file path"), max_elements: int = typer.Option( 1_000_000, "--max-elements", @@ -196,35 +243,19 @@ def export_cmd( ), ) -> None: """ - Export an object from the h5ad file to a common format. - - The output format is auto-detected from the file extension: - - .csv : DataFrames (obs, var) - - .npy : Dense arrays/matrices (obsm/X_pca, varm/PCs, etc.) - - .mtx : Sparse matrices (X if sparse) - - .json : Dictionaries/scalars (uns, uns/colors, etc.) 
- - .png/.jpg/.tiff : Image-like arrays - - The object type is auto-detected and validated against the extension. + Export a dict/group or scalar to JSON format. Examples: - h5ad export data.h5ad obs obs.csv - h5ad export data.h5ad obsm/X_pca pca.npy - h5ad export data.h5ad X matrix.mtx - h5ad export data.h5ad uns metadata.json + h5ad export dict data.h5ad uns metadata.json + h5ad export dict data.h5ad uns/colors colors.json """ - col_list: Optional[List[str]] = None - if columns: - col_list = [col.strip() for col in columns.split(",") if col.strip()] + from h5ad.commands.export import _export_json try: - export_object( + _export_json( file=file, obj=obj, out=out, - columns=col_list, - chunk_rows=chunk_rows, - head=head, max_elements=max_elements, include_attrs=include_attrs, console=console, @@ -234,82 +265,250 @@ def export_cmd( raise typer.Exit(code=1) -@app.command("import") -def import_cmd( +@export_app.command("image") +def export_image( + file: Path = typer.Argument( + ..., help="Path to the .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument(..., help="Object path to export (2D or 3D array)"), + out: Path = typer.Argument(..., help="Output image file (.png, .jpg, .tiff)"), +) -> None: + """ + Export an image-like array to PNG/JPG/TIFF format. + + The array should be 2D (H,W) or 3D (H,W,C) with C in {1,3,4}. 
+ + Examples: + h5ad export image data.h5ad uns/spatial/image tissue.png + """ + from h5ad.commands.export import _export_image + + try: + _export_image(file=file, obj=obj, out=out, console=console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +# ============================================================================ +# IMPORT subcommands +# ============================================================================ +def _get_target_file(file: Path, output: Optional[Path], inplace: bool) -> Path: + """Determine target file and copy if needed.""" + import shutil + + if inplace: + return file + if output is None: + raise ValueError("Output file is required unless --inplace is specified.") + shutil.copy2(file, output) + console.print(f"[dim]Copied {file} → {output}[/]") + return output + + +@import_app.command("dataframe") +def import_dataframe( file: Path = typer.Argument( ..., help="Path to the source .h5ad file", exists=True, readable=True ), obj: str = typer.Argument( - ..., - help="Object path to create/replace (e.g., 'obs', 'X', 'obsm/X_pca', 'uns')", + ..., help="Object path to create/replace ('obs' or 'var')" ), input_file: Path = typer.Argument( - ..., - help="Input data file. Extension determines format: .csv, .npy, .mtx, .json", - exists=True, - readable=True, + ..., help="Input CSV file", exists=True, readable=True ), output: Optional[Path] = typer.Option( None, "--output", "-o", - help="Output .h5ad file path. Required unless --inplace is specified.", + help="Output .h5ad file path. Required unless --inplace.", writable=True, ), inplace: bool = typer.Option( False, "--inplace", - help="Modify the source file directly instead of creating a new file.", + help="Modify source file directly.", ), index_column: Optional[str] = typer.Option( None, "--index-column", "-i", - help="Column to use as index when importing CSV into obs/var. Defaults to first column.", + help="Column to use as index. 
Defaults to first column.", ), ) -> None: """ - Import data from a file into the h5ad file. + Import a CSV file into obs or var. - Creates or replaces an object at the specified path. By default, creates - a new output file. Use --inplace to modify the source file directly. + Examples: + h5ad import dataframe data.h5ad obs cells.csv -o output.h5ad -i cell_id + h5ad import dataframe data.h5ad var genes.csv --inplace -i gene_id + """ + from h5ad.commands.import_data import _import_csv - The input format is auto-detected from the file extension: - - .csv : DataFrames (obs, var) - - .npy : Dense arrays/matrices (X, obsm/X_pca, varm/PCs, etc.) - - .mtx : Sparse matrices (X, layers/*) - - .json : Dictionaries (uns, uns/metadata, etc.) + if obj not in ("obs", "var"): + console.print( + f"[bold red]Error:[/] Object must be 'obs' or 'var', not '{obj}'.", + ) + raise typer.Exit(code=1) - Dimensions are validated against existing obs/var: - - obs: row count must match n_obs - - var: row count must match n_var - - X, layers/*: must match (n_obs, n_var) - - obsm/*, obsp/*: first dimension must match n_obs - - varm/*, varp/*: first dimension must match n_var + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. 
" + "Use --output/-o or --inplace.", + ) + raise typer.Exit(code=1) + + try: + target = _get_target_file(file, output, inplace) + _import_csv(target, obj, input_file, index_column, console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@import_app.command("array") +def import_array( + file: Path = typer.Argument( + ..., help="Path to the source .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument( + ..., help="Object path to create/replace (e.g., 'X', 'obsm/X_pca')" + ), + input_file: Path = typer.Argument( + ..., help="Input .npy file", exists=True, readable=True + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output .h5ad file path. Required unless --inplace.", + writable=True, + ), + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify source file directly.", + ), +) -> None: + """ + Import a NumPy .npy file as a dense array. + + Dimensions are validated against existing obs/var. Examples: - h5ad import data.h5ad obs cells.csv -o output.h5ad -i cell_id - h5ad import data.h5ad obsm/X_pca pca.npy -o output.h5ad - h5ad import data.h5ad X matrix.mtx --inplace - h5ad import data.h5ad uns/metadata config.json -o new.h5ad + h5ad import array data.h5ad obsm/X_pca pca.npy -o output.h5ad + h5ad import array data.h5ad X matrix.npy --inplace """ + from h5ad.commands.import_data import _import_npy + if not inplace and output is None: console.print( "[bold red]Error:[/] Output file is required. 
" - "Use --output/-o to specify output file, or --inplace to modify source.", + "Use --output/-o or --inplace.", ) raise typer.Exit(code=1) try: - import_object( - file=file, - obj=obj, - input_file=input_file, - output_file=output, - inplace=inplace, - index_column=index_column, - console=console, + target = _get_target_file(file, output, inplace) + _import_npy(target, obj, input_file, console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@import_app.command("sparse") +def import_sparse( + file: Path = typer.Argument( + ..., help="Path to the source .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument( + ..., help="Object path to create/replace (e.g., 'X', 'layers/counts')" + ), + input_file: Path = typer.Argument( + ..., help="Input .mtx file", exists=True, readable=True + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output .h5ad file path. Required unless --inplace.", + writable=True, + ), + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify source file directly.", + ), +) -> None: + """ + Import a Matrix Market (.mtx) file as a CSR sparse matrix. + + Dimensions are validated against existing obs/var. + + Examples: + h5ad import sparse data.h5ad X matrix.mtx -o output.h5ad + h5ad import sparse data.h5ad layers/counts counts.mtx --inplace + """ + from h5ad.commands.import_data import _import_mtx + + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. 
" + "Use --output/-o or --inplace.", ) + raise typer.Exit(code=1) + + try: + target = _get_target_file(file, output, inplace) + _import_mtx(target, obj, input_file, console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@import_app.command("dict") +def import_dict( + file: Path = typer.Argument( + ..., help="Path to the source .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument( + ..., help="Object path to create/replace (e.g., 'uns', 'uns/metadata')" + ), + input_file: Path = typer.Argument( + ..., help="Input .json file", exists=True, readable=True + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output .h5ad file path. Required unless --inplace.", + writable=True, + ), + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify source file directly.", + ), +) -> None: + """ + Import a JSON file into uns or other dict-like groups. + + Examples: + h5ad import dict data.h5ad uns/metadata config.json -o output.h5ad + h5ad import dict data.h5ad uns settings.json --inplace + """ + from h5ad.commands.import_data import _import_json + + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. 
" + "Use --output/-o or --inplace.", + ) + raise typer.Exit(code=1) + + try: + target = _get_target_file(file, output, inplace) + _import_json(target, obj, input_file, console) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) diff --git a/tests/test_cli.py b/tests/test_cli.py index 07031d7..7b327e5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -85,15 +85,21 @@ def test_info_object_not_found(self, sample_h5ad_file): assert "not found" in output -class TestTableCommand: - """Tests for table command.""" +class TestExportDataframeCommand: + """Tests for export dataframe command (replaces table command).""" - def test_table_command_obs(self, sample_h5ad_file, temp_dir): - """Test table command for obs axis.""" + def test_export_dataframe_obs(self, sample_h5ad_file, temp_dir): + """Test export dataframe for obs axis.""" output = temp_dir / "obs_table.csv" result = runner.invoke( app, - ["table", str(sample_h5ad_file), "--axis", "obs", "--output", str(output)], + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + str(output), + ], ) assert result.exit_code == 0 assert output.exists() @@ -105,12 +111,18 @@ def test_table_command_obs(self, sample_h5ad_file, temp_dir): assert len(rows) == 6 # header + 5 rows assert "obs_names" in rows[0] - def test_table_command_var(self, sample_h5ad_file, temp_dir): - """Test table command for var axis.""" + def test_export_dataframe_var(self, sample_h5ad_file, temp_dir): + """Test export dataframe for var axis.""" output = temp_dir / "var_table.csv" result = runner.invoke( app, - ["table", str(sample_h5ad_file), "--axis", "var", "--output", str(output)], + [ + "export", + "dataframe", + str(sample_h5ad_file), + "var", + str(output), + ], ) assert result.exit_code == 0 assert output.exists() @@ -120,20 +132,19 @@ def test_table_command_var(self, sample_h5ad_file, temp_dir): rows = list(reader) assert len(rows) == 5 # header + 4 rows - def 
test_table_command_columns_filter(self, sample_h5ad_file, temp_dir): - """Test table command with column filter.""" + def test_export_dataframe_columns_filter(self, sample_h5ad_file, temp_dir): + """Test export dataframe with column filter.""" output = temp_dir / "table.csv" result = runner.invoke( app, [ - "table", + "export", + "dataframe", str(sample_h5ad_file), - "--axis", "obs", + str(output), "--columns", "obs_names,cell_type", - "--output", - str(output), ], ) assert result.exit_code == 0 @@ -146,20 +157,19 @@ def test_table_command_columns_filter(self, sample_h5ad_file, temp_dir): assert "cell_type" in header assert "n_counts" not in header - def test_table_command_head(self, sample_h5ad_file, temp_dir): - """Test table command with head limit.""" + def test_export_dataframe_head(self, sample_h5ad_file, temp_dir): + """Test export dataframe with head limit.""" output = temp_dir / "table.csv" result = runner.invoke( app, [ - "table", + "export", + "dataframe", str(sample_h5ad_file), - "--axis", "obs", + str(output), "--head", "2", - "--output", - str(output), ], ) assert result.exit_code == 0 @@ -169,15 +179,23 @@ def test_table_command_head(self, sample_h5ad_file, temp_dir): rows = list(reader) assert len(rows) == 3 # header + 2 rows - def test_table_command_invalid_axis(self, sample_h5ad_file): - """Test table command with invalid axis.""" + def test_export_dataframe_invalid_axis(self, sample_h5ad_file, temp_dir): + """Test export dataframe with invalid axis.""" + output = temp_dir / "table.csv" result = runner.invoke( - app, ["table", str(sample_h5ad_file), "--axis", "invalid"] + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "invalid", + str(output), + ], ) assert result.exit_code == 1 # Check both stdout and stderr since Console uses stderr=True - output = result.stdout + result.stderr - assert "Invalid axis" in output + output_text = result.stdout + result.stderr + assert "obs" in output_text or "var" in output_text def 
test_export_table_function(self, sample_h5ad_file, temp_dir): """Test export_table function directly.""" @@ -317,11 +335,25 @@ def test_info_help(self): assert result.exit_code == 0 assert "Show high-level information" in result.stdout - def test_table_help(self): - """Test table command help.""" - result = runner.invoke(app, ["table", "--help"]) + def test_export_help(self): + """Test export command help.""" + result = runner.invoke(app, ["export", "--help"]) + assert result.exit_code == 0 + assert "dataframe" in result.stdout + assert "array" in result.stdout + + def test_export_dataframe_help(self): + """Test export dataframe command help.""" + result = runner.invoke(app, ["export", "dataframe", "--help"]) + assert result.exit_code == 0 + assert "Export a dataframe" in result.stdout + + def test_import_help(self): + """Test import command help.""" + result = runner.invoke(app, ["import", "--help"]) assert result.exit_code == 0 - assert "Export a table" in result.stdout + assert "dataframe" in result.stdout + assert "array" in result.stdout def test_subset_help(self): """Test subset command help.""" diff --git a/tests/test_export.py b/tests/test_export.py index 1c2d88f..8ab14cd 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -28,10 +28,12 @@ def _read_mtx(path: Path) -> np.ndarray: return mat -class TestExportNpy: - def test_export_npy_dense_X(self, sample_h5ad_file, temp_dir): +class TestExportArray: + def test_export_array_dense_X(self, sample_h5ad_file, temp_dir): out = temp_dir / "X.npy" - result = runner.invoke(app, ["export", str(sample_h5ad_file), "X", str(out)]) + result = runner.invoke( + app, ["export", "array", str(sample_h5ad_file), "X", str(out)] + ) assert result.exit_code == 0 assert out.exists() @@ -41,11 +43,11 @@ def test_export_npy_dense_X(self, sample_h5ad_file, temp_dir): np.testing.assert_allclose(got, expected) -class TestExportMtx: - def test_export_mtx_csr(self, sample_sparse_csr_h5ad, temp_dir): +class TestExportSparse: + 
def test_export_sparse_csr(self, sample_sparse_csr_h5ad, temp_dir): out = temp_dir / "X_csr.mtx" result = runner.invoke( - app, ["export", str(sample_sparse_csr_h5ad), "X", str(out)] + app, ["export", "sparse", str(sample_sparse_csr_h5ad), "X", str(out)] ) assert result.exit_code == 0 assert out.exists() @@ -62,7 +64,7 @@ def test_export_mtx_csr(self, sample_sparse_csr_h5ad, temp_dir): ) np.testing.assert_allclose(got, expected) - def test_export_mtx_csc(self, temp_dir): + def test_export_sparse_csc(self, temp_dir): # Build a small, consistent CSC matrix group file_path = temp_dir / "test_csc.h5ad" with h5py.File(file_path, "w") as f: @@ -77,7 +79,7 @@ def test_export_mtx_csc(self, temp_dir): X.create_dataset("indptr", data=indptr) out = temp_dir / "X_csc.mtx" - result = runner.invoke(app, ["export", str(file_path), "X", str(out)]) + result = runner.invoke(app, ["export", "sparse", str(file_path), "X", str(out)]) assert result.exit_code == 0 assert out.exists() @@ -93,10 +95,12 @@ def test_export_mtx_csc(self, temp_dir): np.testing.assert_allclose(got, expected) -class TestExportJson: - def test_export_json_uns(self, sample_h5ad_file, temp_dir): +class TestExportDict: + def test_export_dict_uns(self, sample_h5ad_file, temp_dir): out = temp_dir / "uns.json" - result = runner.invoke(app, ["export", str(sample_h5ad_file), "uns", str(out)]) + result = runner.invoke( + app, ["export", "dict", str(sample_h5ad_file), "uns", str(out)] + ) assert result.exit_code == 0 assert out.exists() payload = json.loads(out.read_text(encoding="utf-8")) @@ -104,10 +108,12 @@ def test_export_json_uns(self, sample_h5ad_file, temp_dir): assert payload["description"] == ["Test dataset"] -class TestExportCsv: - def test_export_csv_obs(self, sample_h5ad_file, temp_dir): +class TestExportDataframe: + def test_export_dataframe_obs(self, sample_h5ad_file, temp_dir): out = temp_dir / "obs.csv" - result = runner.invoke(app, ["export", str(sample_h5ad_file), "obs", str(out)]) + result = 
runner.invoke( + app, ["export", "dataframe", str(sample_h5ad_file), "obs", str(out)] + ) assert result.exit_code == 0 assert out.exists() text = out.read_text(encoding="utf-8") @@ -115,47 +121,36 @@ def test_export_csv_obs(self, sample_h5ad_file, temp_dir): class TestExportValidation: - def test_wrong_extension_for_type(self, sample_h5ad_file, temp_dir): - """Test that wrong extension is rejected.""" - out = temp_dir / "obs.npy" # obs is a dataframe, should be .csv - result = runner.invoke(app, ["export", str(sample_h5ad_file), "obs", str(out)]) - assert result.exit_code == 1 - assert "does not match" in result.output or "Expected" in result.output - - def test_sparse_matrix_wrong_extension(self, sample_sparse_csr_h5ad, temp_dir): - """Test that sparse matrix rejects .npy extension.""" - out = temp_dir / "X.npy" # sparse matrix should be .mtx + def test_wrong_type_for_dataframe(self, sample_h5ad_file, temp_dir): + """Test that wrong object type is rejected for dataframe export.""" + out = temp_dir / "X.csv" result = runner.invoke( - app, ["export", str(sample_sparse_csr_h5ad), "X", str(out)] + app, ["export", "dataframe", str(sample_h5ad_file), "X", str(out)] ) assert result.exit_code == 1 - assert "does not match" in result.output or ".mtx" in result.output + assert "obs" in result.output or "var" in result.output - def test_dense_matrix_wrong_extension(self, sample_h5ad_file, temp_dir): - """Test that dense matrix rejects .csv extension.""" - out = temp_dir / "X.csv" # dense matrix should be .npy - result = runner.invoke(app, ["export", str(sample_h5ad_file), "X", str(out)]) - assert result.exit_code == 1 - assert "does not match" in result.output or ".npy" in result.output - - def test_json_wrong_extension(self, sample_h5ad_file, temp_dir): - """Test that dict rejects .npy extension.""" - out = temp_dir / "uns.npy" # uns is dict, should be .json - result = runner.invoke(app, ["export", str(sample_h5ad_file), "uns", str(out)]) + def 
test_sparse_matrix_array_export(self, sample_sparse_csr_h5ad, temp_dir): + """Test that sparse matrix requires sparse export.""" + out = temp_dir / "X.npy" + result = runner.invoke( + app, ["export", "array", str(sample_sparse_csr_h5ad), "X", str(out)] + ) + # Should fail because X is sparse, not dense assert result.exit_code == 1 - assert "does not match" in result.output or ".json" in result.output def test_nonexistent_object(self, sample_h5ad_file, temp_dir): """Test that nonexistent object path is rejected.""" - out = temp_dir / "output.csv" + out = temp_dir / "output.npy" result = runner.invoke( - app, ["export", str(sample_h5ad_file), "nonexistent/path", str(out)] + app, + ["export", "array", str(sample_h5ad_file), "nonexistent/path", str(out)], ) assert result.exit_code == 1 - assert "not found" in result.output + assert "not found" in result.output.lower() or "error" in result.output.lower() - def test_unknown_type_rejected(self, temp_dir): - """Test that unknown/complex types are rejected.""" + def test_export_dict_unknown_type(self, temp_dir): + """Test that unknown/complex types can be exported as dict.""" file_path = temp_dir / "test_unknown.h5ad" with h5py.File(file_path, "w") as f: g = f.create_group("obs") @@ -166,7 +161,8 @@ def test_unknown_type_rejected(self, temp_dir): weird.attrs["encoding-type"] = "some_unknown_encoding" out = temp_dir / "weird.json" - result = runner.invoke(app, ["export", str(file_path), "weird_group", str(out)]) + result = runner.invoke( + app, ["export", "dict", str(file_path), "weird_group", str(out)] + ) # Should succeed as it's detected as dict - # but if it had sparse inside, it would fail assert result.exit_code == 0 diff --git a/tests/test_import.py b/tests/test_import.py index 895a488..736d4e2 100644 --- a/tests/test_import.py +++ b/tests/test_import.py @@ -13,8 +13,8 @@ runner = CliRunner() -class TestImportCsv: - def test_import_csv_obs_inplace(self, sample_h5ad_file, temp_dir): +class TestImportDataframe: + def 
test_import_dataframe_obs_inplace(self, sample_h5ad_file, temp_dir): """Test importing CSV into obs with --inplace.""" csv_file = temp_dir / "new_obs.csv" csv_file.write_text( @@ -30,6 +30,7 @@ def test_import_csv_obs_inplace(self, sample_h5ad_file, temp_dir): app, [ "import", + "dataframe", str(sample_h5ad_file), "obs", str(csv_file), @@ -48,7 +49,7 @@ def test_import_csv_obs_inplace(self, sample_h5ad_file, temp_dir): assert "score" in obs assert "label" in obs - def test_import_csv_obs_output(self, sample_h5ad_file, temp_dir): + def test_import_dataframe_obs_output(self, sample_h5ad_file, temp_dir): """Test importing CSV into obs with output file.""" csv_file = temp_dir / "new_obs.csv" csv_file.write_text( @@ -65,6 +66,7 @@ def test_import_csv_obs_output(self, sample_h5ad_file, temp_dir): app, [ "import", + "dataframe", str(sample_h5ad_file), "obs", str(csv_file), @@ -88,7 +90,7 @@ def test_import_csv_obs_output(self, sample_h5ad_file, temp_dir): obs = f["obs"] assert "score" not in obs - def test_import_csv_var(self, sample_h5ad_file, temp_dir): + def test_import_dataframe_var(self, sample_h5ad_file, temp_dir): """Test importing CSV into var.""" csv_file = temp_dir / "new_var.csv" csv_file.write_text( @@ -103,6 +105,7 @@ def test_import_csv_var(self, sample_h5ad_file, temp_dir): app, [ "import", + "dataframe", str(sample_h5ad_file), "var", str(csv_file), @@ -114,7 +117,7 @@ def test_import_csv_var(self, sample_h5ad_file, temp_dir): assert result.exit_code == 0 assert "4 rows" in result.output - def test_import_csv_dimension_mismatch(self, sample_h5ad_file, temp_dir): + def test_import_dataframe_dimension_mismatch(self, sample_h5ad_file, temp_dir): """Test that dimension mismatch is rejected.""" csv_file = temp_dir / "wrong_obs.csv" csv_file.write_text("cell_id,score\ncell_1,1.0\ncell_2,2.0\n") @@ -123,6 +126,7 @@ def test_import_csv_dimension_mismatch(self, sample_h5ad_file, temp_dir): app, [ "import", + "dataframe", str(sample_h5ad_file), "obs", str(csv_file), 
@@ -134,7 +138,7 @@ def test_import_csv_dimension_mismatch(self, sample_h5ad_file, temp_dir): assert result.exit_code == 1 assert "mismatch" in result.output.lower() - def test_import_csv_invalid_index_column(self, sample_h5ad_file, temp_dir): + def test_import_dataframe_invalid_index_column(self, sample_h5ad_file, temp_dir): """Test that invalid index column is rejected.""" csv_file = temp_dir / "obs.csv" csv_file.write_text("a,b,c\n1,2,3\n") @@ -143,6 +147,7 @@ def test_import_csv_invalid_index_column(self, sample_h5ad_file, temp_dir): app, [ "import", + "dataframe", str(sample_h5ad_file), "obs", str(csv_file), @@ -154,33 +159,48 @@ def test_import_csv_invalid_index_column(self, sample_h5ad_file, temp_dir): assert result.exit_code == 1 assert "not found" in result.output.lower() - def test_import_csv_not_obs_var(self, sample_h5ad_file, temp_dir): - """Test that CSV import is only allowed for obs/var.""" + def test_import_dataframe_not_obs_var(self, sample_h5ad_file, temp_dir): + """Test that dataframe import is only allowed for obs/var.""" csv_file = temp_dir / "data.csv" csv_file.write_text("a,b\n1,2\n") result = runner.invoke( app, - ["import", str(sample_h5ad_file), "uns/data", str(csv_file), "--inplace"], + [ + "import", + "dataframe", + str(sample_h5ad_file), + "uns/data", + str(csv_file), + "--inplace", + ], ) assert result.exit_code == 1 - assert "only supported for 'obs' or 'var'" in result.output + assert "obs" in result.output or "var" in result.output - def test_import_requires_output_or_inplace(self, sample_h5ad_file, temp_dir): + def test_import_dataframe_requires_output_or_inplace( + self, sample_h5ad_file, temp_dir + ): """Test that either --output or --inplace is required.""" csv_file = temp_dir / "obs.csv" csv_file.write_text("a,b\n1,2\n") result = runner.invoke( app, - ["import", str(sample_h5ad_file), "obs", str(csv_file)], + [ + "import", + "dataframe", + str(sample_h5ad_file), + "obs", + str(csv_file), + ], ) assert result.exit_code == 1 
assert "Output file is required" in result.output -class TestImportNpy: - def test_import_npy_obsm(self, sample_h5ad_file, temp_dir): +class TestImportArray: + def test_import_array_obsm(self, sample_h5ad_file, temp_dir): """Test importing NPY into obsm.""" npy_file = temp_dir / "pca.npy" arr = np.random.randn(5, 10).astype(np.float32) @@ -188,7 +208,14 @@ def test_import_npy_obsm(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "obsm/X_pca", str(npy_file), "--inplace"], + [ + "import", + "array", + str(sample_h5ad_file), + "obsm/X_pca", + str(npy_file), + "--inplace", + ], ) assert result.exit_code == 0 assert "5×10" in result.output @@ -197,7 +224,7 @@ def test_import_npy_obsm(self, sample_h5ad_file, temp_dir): assert "obsm/X_pca" in f np.testing.assert_allclose(f["obsm/X_pca"][...], arr) - def test_import_npy_varm(self, sample_h5ad_file, temp_dir): + def test_import_array_varm(self, sample_h5ad_file, temp_dir): """Test importing NPY into varm.""" npy_file = temp_dir / "pcs.npy" arr = np.random.randn(4, 5).astype(np.float32) @@ -205,14 +232,21 @@ def test_import_npy_varm(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "varm/PCs", str(npy_file), "--inplace"], + [ + "import", + "array", + str(sample_h5ad_file), + "varm/PCs", + str(npy_file), + "--inplace", + ], ) assert result.exit_code == 0 with h5py.File(sample_h5ad_file, "r") as f: assert "varm/PCs" in f - def test_import_npy_X(self, sample_h5ad_file, temp_dir): + def test_import_array_X(self, sample_h5ad_file, temp_dir): """Test importing NPY into X.""" npy_file = temp_dir / "X.npy" arr = np.random.randn(5, 4).astype(np.float32) @@ -220,7 +254,14 @@ def test_import_npy_X(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "X", str(npy_file), "--inplace"], + [ + "import", + "array", + str(sample_h5ad_file), + "X", + str(npy_file), + "--inplace", + ], ) assert 
result.exit_code == 0 @@ -228,7 +269,7 @@ def test_import_npy_X(self, sample_h5ad_file, temp_dir): assert "X" in f np.testing.assert_allclose(f["X"][...], arr) - def test_import_npy_dimension_mismatch_obsm(self, sample_h5ad_file, temp_dir): + def test_import_array_dimension_mismatch_obsm(self, sample_h5ad_file, temp_dir): """Test that obsm dimension mismatch is rejected.""" npy_file = temp_dir / "bad_pca.npy" arr = np.random.randn(10, 5).astype(np.float32) @@ -236,12 +277,19 @@ def test_import_npy_dimension_mismatch_obsm(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "obsm/X_pca", str(npy_file), "--inplace"], + [ + "import", + "array", + str(sample_h5ad_file), + "obsm/X_pca", + str(npy_file), + "--inplace", + ], ) assert result.exit_code == 1 assert "mismatch" in result.output.lower() - def test_import_npy_dimension_mismatch_X(self, sample_h5ad_file, temp_dir): + def test_import_array_dimension_mismatch_X(self, sample_h5ad_file, temp_dir): """Test that X dimension mismatch is rejected.""" npy_file = temp_dir / "bad_X.npy" arr = np.random.randn(5, 10).astype(np.float32) @@ -249,14 +297,39 @@ def test_import_npy_dimension_mismatch_X(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "X", str(npy_file), "--inplace"], + [ + "import", + "array", + str(sample_h5ad_file), + "X", + str(npy_file), + "--inplace", + ], ) assert result.exit_code == 1 assert "mismatch" in result.output.lower() + def test_import_array_requires_output_or_inplace(self, sample_h5ad_file, temp_dir): + """Test that either --output or --inplace is required.""" + npy_file = temp_dir / "data.npy" + np.save(npy_file, np.array([1, 2, 3])) -class TestImportMtx: - def test_import_mtx_X(self, sample_h5ad_file, temp_dir): + result = runner.invoke( + app, + [ + "import", + "array", + str(sample_h5ad_file), + "obsm/X_pca", + str(npy_file), + ], + ) + assert result.exit_code == 1 + assert "Output file is 
required" in result.output + + +class TestImportSparse: + def test_import_sparse_X(self, sample_h5ad_file, temp_dir): """Test importing MTX into X.""" mtx_file = temp_dir / "X.mtx" mtx_file.write_text( @@ -272,7 +345,14 @@ def test_import_mtx_X(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "X", str(mtx_file), "--inplace"], + [ + "import", + "sparse", + str(sample_h5ad_file), + "X", + str(mtx_file), + "--inplace", + ], ) assert result.exit_code == 0 assert "5×4" in result.output @@ -286,7 +366,7 @@ def test_import_mtx_X(self, sample_h5ad_file, temp_dir): enc = enc.decode("utf-8") assert enc == "csr_matrix" - def test_import_mtx_layer(self, sample_h5ad_file, temp_dir): + def test_import_sparse_layer(self, sample_h5ad_file, temp_dir): """Test importing MTX into layers.""" mtx_file = temp_dir / "layer.mtx" mtx_file.write_text( @@ -301,6 +381,7 @@ def test_import_mtx_layer(self, sample_h5ad_file, temp_dir): app, [ "import", + "sparse", str(sample_h5ad_file), "layers/counts", str(mtx_file), @@ -312,7 +393,7 @@ def test_import_mtx_layer(self, sample_h5ad_file, temp_dir): with h5py.File(sample_h5ad_file, "r") as f: assert "layers/counts" in f - def test_import_mtx_dimension_mismatch(self, sample_h5ad_file, temp_dir): + def test_import_sparse_dimension_mismatch(self, sample_h5ad_file, temp_dir): """Test that MTX dimension mismatch is rejected.""" mtx_file = temp_dir / "bad.mtx" mtx_file.write_text( @@ -321,14 +402,41 @@ def test_import_mtx_dimension_mismatch(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "X", str(mtx_file), "--inplace"], + [ + "import", + "sparse", + str(sample_h5ad_file), + "X", + str(mtx_file), + "--inplace", + ], ) assert result.exit_code == 1 assert "mismatch" in result.output.lower() + def test_import_sparse_requires_output_or_inplace(self, sample_h5ad_file, temp_dir): + """Test that either --output or --inplace is required.""" + mtx_file = 
temp_dir / "data.mtx" + mtx_file.write_text( + "%%MatrixMarket matrix coordinate real general\n" "5 4 1\n" "1 1 1.0\n" + ) + + result = runner.invoke( + app, + [ + "import", + "sparse", + str(sample_h5ad_file), + "X", + str(mtx_file), + ], + ) + assert result.exit_code == 1 + assert "Output file is required" in result.output + -class TestImportJson: - def test_import_json_uns(self, sample_h5ad_file, temp_dir): +class TestImportDict: + def test_import_dict_uns(self, sample_h5ad_file, temp_dir): """Test importing JSON into uns.""" json_file = temp_dir / "metadata.json" json_file.write_text( @@ -345,6 +453,7 @@ def test_import_json_uns(self, sample_h5ad_file, temp_dir): app, [ "import", + "dict", str(sample_h5ad_file), "uns/metadata", str(json_file), @@ -359,7 +468,7 @@ def test_import_json_uns(self, sample_h5ad_file, temp_dir): assert "colors" in f["uns/metadata"] assert "n_pcs" in f["uns/metadata"] - def test_import_json_nested(self, sample_h5ad_file, temp_dir): + def test_import_dict_nested(self, sample_h5ad_file, temp_dir): """Test importing nested JSON.""" json_file = temp_dir / "config.json" json_file.write_text( @@ -378,6 +487,7 @@ def test_import_json_nested(self, sample_h5ad_file, temp_dir): app, [ "import", + "dict", str(sample_h5ad_file), "uns/config", str(json_file), @@ -390,40 +500,26 @@ def test_import_json_nested(self, sample_h5ad_file, temp_dir): assert "uns/config/settings" in f assert "uns/config/labels" in f - -class TestImportValidation: - def test_unsupported_extension(self, sample_h5ad_file, temp_dir): - """Test that unsupported extensions are rejected.""" - bad_file = temp_dir / "data.xlsx" - bad_file.write_text("dummy") - - result = runner.invoke( - app, - ["import", str(sample_h5ad_file), "obs", str(bad_file), "--inplace"], - ) - assert result.exit_code == 1 - assert "Unsupported" in result.output - - def test_index_column_only_for_csv_obs_var(self, sample_h5ad_file, temp_dir): - """Test that --index-column is only valid for CSV obs/var.""" - 
npy_file = temp_dir / "data.npy" - np.save(npy_file, np.array([1, 2, 3])) + def test_import_dict_requires_output_or_inplace(self, sample_h5ad_file, temp_dir): + """Test that either --output or --inplace is required.""" + json_file = temp_dir / "data.json" + json_file.write_text('{"key": "value"}') result = runner.invoke( app, [ "import", + "dict", str(sample_h5ad_file), "uns/data", - str(npy_file), - "--inplace", - "-i", - "col", + str(json_file), ], ) assert result.exit_code == 1 - assert "only valid for CSV" in result.output + assert "Output file is required" in result.output + +class TestImportValidation: def test_replace_existing_object(self, sample_h5ad_file, temp_dir): """Test that existing objects can be replaced.""" with h5py.File(sample_h5ad_file, "r") as f: @@ -435,7 +531,14 @@ def test_replace_existing_object(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "X", str(npy_file), "--inplace"], + [ + "import", + "array", + str(sample_h5ad_file), + "X", + str(npy_file), + "--inplace", + ], ) assert result.exit_code == 0 From ea404a8644f79168d50f317901f713f4585d6083 Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 20:43:23 +0000 Subject: [PATCH 08/62] Remove export_table function from CLI commands --- src/h5ad/commands/table.py | 90 -------------------------------------- 1 file changed, 90 deletions(-) delete mode 100644 src/h5ad/commands/table.py diff --git a/src/h5ad/commands/table.py b/src/h5ad/commands/table.py deleted file mode 100644 index 16b7686..0000000 --- a/src/h5ad/commands/table.py +++ /dev/null @@ -1,90 +0,0 @@ -import sys -import csv -from pathlib import Path -from typing import List, Optional, Dict - -import h5py -import numpy as np -from rich.console import Console -from h5ad.info import get_axis_group -from h5ad.read import col_chunk_as_strings - - -def export_table( - file: Path, - axis: str, - columns: Optional[List[str]], - out: Optional[Path], - chunk_rows: int, - head: 
Optional[int], - console: Console, -) -> None: - """ - Export a table of the specified axis to CSV format. - Args: - file (Path): Path to the .h5ad file - axis (str): Axis to read from ('obs' or 'var') - columns (Optional[List[str]]): List of column names to include in the output table - out (Optional[Path]): Output file path (defaults to stdout) - chunk_rows (int): Number of rows to read per chunk - head (Optional[int]): Output only the first n rows - """ - with h5py.File(file, "r") as f: - group, n_rows, index_name = get_axis_group(f, axis) - - # Determine columns to read - if columns: - col_names = list(columns) - else: - col_names = [k for k in group.keys() if k != "_index" and k != index_name] - # Add index name if not already present - if index_name and index_name not in col_names: - col_names.insert(0, index_name) - - if isinstance(index_name, bytes): - index_name = index_name.decode("utf-8") - - if index_name not in col_names: - col_names.insert(0, index_name) - else: - col_names = [index_name] + [c for c in col_names if c != index_name] - - # Limit rows if head option is specified - if head is not None and head > 0: - n_rows = min(n_rows, head) - - # Open writer - if out is None or str(out) == "-": - out_fh = sys.stdout - else: - out_fh = open(out, "w", newline="", encoding="utf-8") - writer = csv.writer(out_fh) - - # Write data in chunks - try: - writer.writerow(col_names) - cat_cache: Dict[int, np.ndarray] = {} - with console.status( - f"[magenta]Exporting {axis} table...[/] to {'stdout' if out_fh is sys.stdout else out}" - ) as status: - for start in range(0, n_rows, chunk_rows): - end = min(start + chunk_rows, n_rows) - status.update( - f"[magenta]Exporting rows {start}-{end} of {n_rows}...[/]" - ) - cols_data: List[List[str]] = [] - # Read each column for the current chunk - for col in col_names: - cols_data.append( - col_chunk_as_strings(group, col, start, end, cat_cache) - ) - # Write rows - for row_idx in range(end - start): - row = [ - 
cols_data[col_idx][row_idx] - for col_idx in range(len(col_names)) - ] - writer.writerow(row) - finally: - if out_fh is not sys.stdout: - out_fh.close() From f87cc77816fa9b9c97e1d902f12e94b372d25872 Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 20:46:26 +0000 Subject: [PATCH 09/62] Refactor export_table function and update imports in CLI commands --- src/h5ad/commands/__init__.py | 3 +- src/h5ad/commands/export.py | 96 +++++++++++++++++++++++++++++++++-- tests/test_cli.py | 2 +- 3 files changed, 95 insertions(+), 6 deletions(-) diff --git a/src/h5ad/commands/__init__.py b/src/h5ad/commands/__init__.py index b4d6016..7b60c31 100644 --- a/src/h5ad/commands/__init__.py +++ b/src/h5ad/commands/__init__.py @@ -1,5 +1,4 @@ from h5ad.commands.info import show_info -from h5ad.commands.table import export_table from h5ad.commands.subset import subset_h5ad -from h5ad.commands.export import export_object +from h5ad.commands.export import export_object, export_table from h5ad.commands.import_data import import_object diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index 8d237c9..cf0c64b 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -1,6 +1,8 @@ from __future__ import annotations +import csv import json +import sys from pathlib import Path from typing import Any, Dict, List, Optional, Union, cast @@ -8,13 +10,101 @@ import numpy as np from rich.console import Console -from h5ad.commands.table import export_table -from h5ad.read import decode_str_array -from h5ad.info import get_entry_type +from h5ad.read import col_chunk_as_strings, decode_str_array +from h5ad.info import get_axis_group, get_entry_type H5Obj = Union[h5py.Group, h5py.Dataset] + +# ============================================================================ +# DATAFRAME EXPORT (CSV) +# ============================================================================ +def export_table( + file: Path, + axis: str, + columns: Optional[List[str]], + out: 
Optional[Path], + chunk_rows: int, + head: Optional[int], + console: Console, +) -> None: + """ + Export a dataframe (obs or var) to CSV format. + + Args: + file: Path to the .h5ad file + axis: Axis to read from ('obs' or 'var') + columns: List of column names to include in the output table + out: Output file path (defaults to stdout if None) + chunk_rows: Number of rows to read per chunk + head: Output only the first n rows + console: Rich console for status output + """ + with h5py.File(file, "r") as f: + group, n_rows, index_name = get_axis_group(f, axis) + + # Determine columns to read + if columns: + col_names = list(columns) + else: + col_names = [k for k in group.keys() if k != "_index" and k != index_name] + # Add index name if not already present + if index_name and index_name not in col_names: + col_names.insert(0, index_name) + + if isinstance(index_name, bytes): + index_name = index_name.decode("utf-8") + + if index_name not in col_names: + col_names.insert(0, index_name) + else: + col_names = [index_name] + [c for c in col_names if c != index_name] + + # Limit rows if head option is specified + if head is not None and head > 0: + n_rows = min(n_rows, head) + + # Open writer + if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out_fh = open(out, "w", newline="", encoding="utf-8") + writer = csv.writer(out_fh) + + # Write data in chunks + try: + writer.writerow(col_names) + cat_cache: Dict[int, np.ndarray] = {} + with console.status( + f"[magenta]Exporting {axis} table...[/] to {'stdout' if out_fh is sys.stdout else out}" + ) as status: + for start in range(0, n_rows, chunk_rows): + end = min(start + chunk_rows, n_rows) + status.update( + f"[magenta]Exporting rows {start}-{end} of {n_rows}...[/]" + ) + cols_data: List[List[str]] = [] + # Read each column for the current chunk + for col in col_names: + cols_data.append( + col_chunk_as_strings(group, col, start, end, cat_cache) + ) + # Write rows + for row_idx in range(end - start): + row = 
[ + cols_data[col_idx][row_idx] + for col_idx in range(len(col_names)) + ] + writer.writerow(row) + finally: + if out_fh is not sys.stdout: + out_fh.close() + + +# ============================================================================ +# TYPE DETECTION AND VALIDATION +# ============================================================================ # Map object types to valid output extensions TYPE_EXTENSIONS = { "dataframe": {".csv"}, diff --git a/tests/test_cli.py b/tests/test_cli.py index 7b327e5..4105546 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,7 +6,7 @@ from typer.testing import CliRunner from h5ad.cli import app from h5ad.commands.info import show_info -from h5ad.commands.table import export_table +from h5ad.commands.export import export_table from rich.console import Console From ddf6f2185736aec57bb952fcc633fbf624966b42 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 13:07:44 +0000 Subject: [PATCH 10/62] Rename 'object' option to 'entry' in info command and add depth option for recursion control --- src/h5ad/cli.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index f42768a..856224b 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -31,11 +31,11 @@ def info( exists=True, readable=True, ), - obj: Optional[str] = typer.Option( + entry: Optional[str] = typer.Option( None, - "--object", - "-o", - help="Object path to inspect (e.g., 'obsm/X_pca', 'X', 'uns')", + "--entry", + "-e", + help="Entry path to inspect (e.g., 'obsm/X_pca', 'X', 'uns')", ), types: bool = typer.Option( False, @@ -43,19 +43,25 @@ def info( "-t", help="Show detailed type information for all entries", ), + depth: int = typer.Option( + None, + "--depth", + "-d", + help="Maximum recursion depth for type display (only with --types)", + ), ) -> None: """ Show high-level information about the .h5ad file. Use --types to see type information for each entry. 
- Use --object to inspect a specific object in detail. + Use --entry to inspect a specific entry in detail. Examples: h5ad info data.h5ad h5ad info --types data.h5ad - h5ad info --object obsm/X_pca data.h5ad + h5ad info --entry obsm/X_pca data.h5ad """ - show_info(file, console, show_types=types, obj_path=obj) + show_info(file, console, show_types=types, depth=depth, entry_path=entry) # ============================================================================ From 99b6335445c900c621bdb493dc5a59f1c89dc041 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 13:07:55 +0000 Subject: [PATCH 11/62] Refactor show_info function to replace obj_path with entry_path and add depth parameter for recursion control --- src/h5ad/commands/info.py | 46 ++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/src/h5ad/commands/info.py b/src/h5ad/commands/info.py index 29c94ba..e1d2bbe 100644 --- a/src/h5ad/commands/info.py +++ b/src/h5ad/commands/info.py @@ -21,7 +21,8 @@ def show_info( file: Path, console: Console, show_types: bool = False, - obj_path: Optional[str] = None, + depth: Optional[int] = None, + entry_path: Optional[str] = None, ) -> None: """ Show high-level information about the .h5ad file. 
@@ -29,12 +30,13 @@ def show_info( file (Path): Path to the .h5ad file console (Console): Rich console for output show_types (bool): Show detailed type information for each entry - obj_path (Optional[str]): Specific object path to inspect (e.g., 'obsm/X_pca') + depth (Optional[int]): Maximum recursion depth for type display (only with show_types=True) + entry_path (Optional[str]): Specific entry path to inspect (e.g., 'obsm/X_pca') """ with h5py.File(file, "r") as f: # If a specific path is requested, show detailed info for that object - if obj_path: - _show_object_info(f, obj_path, console) + if entry_path: + _show_object_info(f, entry_path, console) return # Get n_obs and n_var @@ -45,7 +47,7 @@ def show_info( ) if show_types: - _show_types_tree(f, console) + _show_types_tree(f, console, depth=depth) else: # List top-level keys and their sub-keys (original behavior) for key in _sort_keys(list(f.keys())): @@ -60,7 +62,9 @@ def show_info( ) -def _show_types_tree(f: h5py.File, console: Console) -> None: +def _show_types_tree( + f: h5py.File, console: Console, depth: Optional[int] = None +) -> None: """Show a tree view with type information for all entries. 
Recursion depth by group: @@ -121,25 +125,27 @@ def add_node( children = [k for k in obj.keys() if k != "_index"] if not children: continue - max_depth = max_depth_map.get(key, 1) # default to 1 level for unknown groups + max_depth = ( + depth if depth is not None else max_depth_map.get(key, 1) + ) # default to 1 level for unknown groups add_node(tree, key, obj, current_depth=0, max_depth=max_depth) console.print(tree) -def _show_object_info(f: h5py.File, obj_path: str, console: Console) -> None: +def _show_object_info(f: h5py.File, entry_path: str, console: Console) -> None: """Show detailed info for a specific object path.""" # Normalize path - obj_path = obj_path.strip().lstrip("/") + entry_path = entry_path.strip().lstrip("/") - if obj_path not in f: - console.print(f"[bold red]Error:[/] '{obj_path}' not found in the file.") + if entry_path not in f: + console.print(f"[bold red]Error:[/] '{entry_path}' not found in the file.") return - obj = f[obj_path] - info = get_entry_type(obj) + entry = f[entry_path] + info = get_entry_type(entry) - console.print(f"\n[bold cyan]Path:[/] {obj_path}") + console.print(f"\n[bold cyan]Path:[/] {entry_path}") console.print(f"[bold cyan]Type:[/] {info['type']}") if info["encoding"]: @@ -154,21 +160,21 @@ def _show_object_info(f: h5py.File, obj_path: str, console: Console) -> None: console.print(f"[bold cyan]Details:[/] {info['details']}") # Show attributes if any - if obj.attrs: + if entry.attrs: console.print(f"\n[bold cyan]Attributes:[/]") - for k, v in obj.attrs.items(): + for k, v in entry.attrs.items(): v_str = v.decode("utf-8") if isinstance(v, bytes) else str(v) if len(v_str) > 80: v_str = v_str[:77] + "..." 
console.print(f" [dim]{k}:[/] {v_str}") # If it's a group, show children - if isinstance(obj, h5py.Group): - children = [k for k in obj.keys() if k != "_index"] + if isinstance(entry, h5py.Group): + children = [k for k in entry.keys() if k != "_index"] if children: console.print(f"\n[bold cyan]Children:[/]") for child_name in sorted(children): - child_obj = obj[child_name] - child_info = get_entry_type(child_obj) + child_entry = entry[child_name] + child_info = get_entry_type(child_entry) type_str = format_type_info(child_info) console.print(f" [bright_white]{child_name}[/] {type_str}") From 99a9834e4a85dc765ca84e1d93445ff082db0f07 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 13:08:11 +0000 Subject: [PATCH 12/62] Refactor get_entry_type function to replace 'obj' with 'entry' and improve encoding handling; update axis_len function to raise exceptions for error cases --- src/h5ad/info.py | 106 ++++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 51 deletions(-) diff --git a/src/h5ad/info.py b/src/h5ad/info.py index 94022a0..25cdc31 100644 --- a/src/h5ad/info.py +++ b/src/h5ad/info.py @@ -3,7 +3,7 @@ import numpy as np -def get_entry_type(obj: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: +def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: """ Determine the type/format of an HDF5 object for export guidance. 
@@ -25,61 +25,49 @@ def get_entry_type(obj: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: } # Get encoding-type attribute if present - enc = obj.attrs.get("encoding-type", b"") + enc = entry.attrs.get("encoding-type", b"") if isinstance(enc, bytes): enc = enc.decode("utf-8") result["encoding"] = enc if enc else None - if isinstance(obj, h5py.Dataset): - result["shape"] = obj.shape - result["dtype"] = str(obj.dtype) + # Infer the type for Dataset entry + if isinstance(entry, h5py.Dataset): + result["shape"] = entry.shape + result["dtype"] = str(entry.dtype) # Scalar - if obj.shape == (): + if entry.shape == (): result["type"] = "scalar" result["export_as"] = "json" - result["details"] = f"Scalar value ({obj.dtype})" + result["details"] = f"Scalar value ({entry.dtype})" return result # 1D or 2D numeric array -> dense matrix / array - if obj.ndim == 1: + if entry.ndim == 1: result["type"] = "array" result["export_as"] = "npy" - result["details"] = f"1D array [{obj.shape[0]}] ({obj.dtype})" - elif obj.ndim == 2: - # Check if it looks like an image (2D with reasonable image dimensions) - # Minimum 16x16, maximum 10000x10000, numeric dtype - if ( - obj.shape[0] >= 16 - and obj.shape[1] >= 16 - and obj.shape[0] <= 10000 - and obj.shape[1] <= 10000 - and (np.issubdtype(obj.dtype, np.number) or obj.dtype == np.bool_) - ): - # Could be an image, but default to dense-matrix - # Image export can still be used if user provides image extension - pass + result["details"] = f"1D array [{entry.shape[0]}] ({entry.dtype})" + elif entry.ndim == 2: result["type"] = "dense-matrix" result["export_as"] = "npy" result["details"] = ( - f"Dense matrix {obj.shape[0]}×{obj.shape[1]} ({obj.dtype})" + f"Dense matrix {entry.shape[0]}×{entry.shape[1]} ({entry.dtype})" ) - elif obj.ndim == 3: + elif entry.ndim == 3: result["type"] = "array" result["export_as"] = "npy" - result["details"] = f"3D array {obj.shape} ({obj.dtype})" + result["details"] = f"3D array {entry.shape} ({entry.dtype})" 
else: result["type"] = "array" result["export_as"] = "npy" - result["details"] = f"ND array {obj.shape} ({obj.dtype})" - + result["details"] = f"ND array {entry.shape} ({entry.dtype})" return result # It's a Group - if isinstance(obj, h5py.Group): + if isinstance(entry, h5py.Group): # Check for sparse matrix (CSR/CSC) if enc in ("csr_matrix", "csc_matrix"): - shape = obj.attrs.get("shape", None) + shape = entry.attrs.get("shape", None) shape_str = f"{shape[0]}×{shape[1]}" if shape is not None else "?" result["type"] = "sparse-matrix" result["export_as"] = "mtx" @@ -90,8 +78,8 @@ def get_entry_type(obj: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: # Check for categorical if enc == "categorical": - codes = obj.get("codes") - cats = obj.get("categories") + codes = entry.get("codes") + cats = entry.get("categories") n_codes = codes.shape[0] if codes is not None else "?" n_cats = cats.shape[0] if cats is not None else "?" result["type"] = "categorical" @@ -100,8 +88,8 @@ def get_entry_type(obj: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: return result # Check for dataframe (obs/var style with _index) - if "_index" in obj.attrs or "obs_names" in obj or "var_names" in obj: - n_cols = len([k for k in obj.keys() if k != "_index"]) + if "_index" in entry.attrs or "obs_names" in entry or "var_names" in entry: + n_cols = len([k for k in entry.keys() if k != "_index"]) result["type"] = "dataframe" result["export_as"] = "csv" result["details"] = f"DataFrame with {n_cols} columns" @@ -115,7 +103,7 @@ def get_entry_type(obj: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: return result # Generic dict/group - n_keys = len(list(obj.keys())) + n_keys = len(list(entry.keys())) result["type"] = "dict" result["export_as"] = "json" result["details"] = f"Group with {n_keys} keys" @@ -141,24 +129,31 @@ def format_type_info(info: Dict[str, Any]) -> str: return f"[{color}]<{info['type']}>[/]" -def axis_len(file: h5py.File, axis: str) -> Optional[int]: +def axis_len(file: 
h5py.File, axis: str) -> int: """ Get the length of the specified axis ('obs' or 'var') in the h5ad file. + Args: file (h5py.File): Opened h5ad file object axis (str): Axis name ('obs' or 'var') Returns: - Optional[int]: Length of the axis, or None if not found + int: Length of the axis + + Raises: + ValueError: If axis is not 'obs' or 'var' + KeyError: If axis or index dataset not found in file + TypeError: If axis is not a group or index is not a dataset + RuntimeError: If axis length cannot be determined """ # Check if the specified axis exists in the file if axis not in file: - return None + raise KeyError(f"'{axis}' not found in the file.") # Get the group corresponding to the axis group = file[axis] if not isinstance(group, h5py.Group): - return None + raise TypeError(f"'{axis}' is not a group.") # Determine the index name for the axis index_name = group.attrs.get("_index", None) @@ -168,49 +163,58 @@ def axis_len(file: h5py.File, axis: str) -> Optional[int]: elif axis == "var": index_name = "var_names" else: - return None + raise ValueError(f"Invalid axis '{axis}'. Must be 'obs' or 'var'.") + # Decode bytes to string if necessary if isinstance(index_name, bytes): index_name = index_name.decode("utf-8") + # Check if the index dataset exists if index_name not in group: - return None + raise KeyError(f"Index dataset '{index_name}' not found in '{axis}' group.") # Return the length of the index dataset dataset = group[index_name] if not isinstance(dataset, h5py.Dataset): - return None + raise TypeError(f"Index '{index_name}' in '{axis}' is not a dataset.") if dataset.shape: return int(dataset.shape[0]) - return None + raise RuntimeError( + f"Cannot determine length of '{axis}': index dataset has no shape." + ) def get_axis_group(file: h5py.File, axis: str) -> Tuple[h5py.Group, int, str]: """ Get the axis group, its length, and index name. 
+ Args: file (h5py.File): Opened h5ad file object axis (str): Axis name ('obs' or 'var') Returns: - Tuple[h5py.Group, int, str]: Axis group, its length, and index + Tuple[h5py.Group, int, str]: Axis group, its length, and index name + + Raises: + ValueError: If axis is not 'obs' or 'var' + KeyError: If axis or index dataset not found in file + TypeError: If axis is not a group or index is not a dataset + RuntimeError: If axis length cannot be determined """ if axis not in ("obs", "var"): raise ValueError("axis must be 'obs' or 'var'.") - if axis not in file: - raise KeyError(f"'{axis}' not found in the file.") - - group = file[axis] - if not isinstance(group, h5py.Group): - raise TypeError(f"'{axis}' is not a group.") + # axis_len will validate existence and get length (raises exceptions if issues) n = axis_len(file, axis) - if n is None: - raise RuntimeError(f"Could not determine length of axis '{axis}'.") + # Get the group (already validated by axis_len) + group = file[axis] + + # Get the index name index_name = group.attrs.get("_index", None) if index_name is None: index_name = "obs_names" if axis == "obs" else "var_names" if isinstance(index_name, bytes): index_name = index_name.decode("utf-8") + return group, n, index_name From 82de2689eff9d70ba9ce098710c895308fffd7d3 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 13:52:54 +0000 Subject: [PATCH 13/62] Refactor info command tests to replace 'object' with 'entry' flag and update related assertions --- tests/test_cli.py | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 4105546..203830f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -50,35 +50,51 @@ def test_info_types_short_flag(self, sample_h5ad_file): output = result.stdout + (result.stderr or "") assert "<" in output - def test_info_object_flag(self, sample_h5ad_file): - """Test info command with --object flag.""" - result = 
runner.invoke(app, ["info", "--object", "X", str(sample_h5ad_file)]) + def test_info_depth_flag(self, sample_h5ad_file): + """Test info command with --depth flag.""" + result = runner.invoke( + app, ["info", "--types", "--depth", "1", str(sample_h5ad_file)] + ) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "<" in output + + def test_info_depth_short_flag(self, sample_h5ad_file): + """Test info command with -d short flag.""" + result = runner.invoke(app, ["info", "-t", "-d", "2", str(sample_h5ad_file)]) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "<" in output + + def test_info_entry_flag(self, sample_h5ad_file): + """Test info command with --entry flag.""" + result = runner.invoke(app, ["info", "--entry", "X", str(sample_h5ad_file)]) assert result.exit_code == 0 output = result.stdout + (result.stderr or "") assert "Path:" in output assert "Type:" in output - def test_info_object_short_flag(self, sample_h5ad_file): - """Test info command with -o short flag.""" - result = runner.invoke(app, ["info", "-o", "obs", str(sample_h5ad_file)]) + def test_info_entry_short_flag(self, sample_h5ad_file): + """Test info command with -e short flag.""" + result = runner.invoke(app, ["info", "-e", "obs", str(sample_h5ad_file)]) assert result.exit_code == 0 output = result.stdout + (result.stderr or "") assert "Path:" in output assert "dataframe" in output - def test_info_object_nested_path(self, sample_h5ad_file): + def test_info_entry_nested_path(self, sample_h5ad_file): """Test info command with nested object path.""" result = runner.invoke( - app, ["info", "-o", "uns/description", str(sample_h5ad_file)] + app, ["info", "-e", "uns/description", str(sample_h5ad_file)] ) assert result.exit_code == 0 output = result.stdout + (result.stderr or "") assert "Path:" in output - def test_info_object_not_found(self, sample_h5ad_file): + def test_info_entry_not_found(self, sample_h5ad_file): """Test info 
command with non-existent object path.""" result = runner.invoke( - app, ["info", "-o", "nonexistent", str(sample_h5ad_file)] + app, ["info", "-e", "nonexistent", str(sample_h5ad_file)] ) assert result.exit_code == 0 # Doesn't exit with error, just shows message output = result.stdout + (result.stderr or "") From aaeddb2bf65691636acbd1623cb40f08bb64b9f0 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 14:05:26 +0000 Subject: [PATCH 14/62] Enhance axis_len tests to validate error handling for non-existent axes, non-group types, and missing index datasets --- tests/test_info_read.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/tests/test_info_read.py b/tests/test_info_read.py index 8ad47b4..5b69fe3 100644 --- a/tests/test_info_read.py +++ b/tests/test_info_read.py @@ -102,10 +102,28 @@ def test_axis_len_var(self, sample_h5ad_file): assert length == 4 def test_axis_len_nonexistent(self, sample_h5ad_file): - """Test getting length of non-existent axis.""" + """Test getting length of non-existent axis raises KeyError.""" with h5py.File(sample_h5ad_file, "r") as f: - length = axis_len(f, "nonexistent") - assert length is None + with pytest.raises(KeyError, match="'nonexistent' not found"): + axis_len(f, "nonexistent") + + def test_axis_len_not_a_group(self, temp_dir): + """Test that axis_len raises TypeError when axis is not a group.""" + file_path = temp_dir / "test.h5ad" + with h5py.File(file_path, "w") as f: + f.create_dataset("obs", data=np.array([1, 2, 3])) + with h5py.File(file_path, "r") as f: + with pytest.raises(TypeError, match="'obs' is not a group"): + axis_len(f, "obs") + + def test_axis_len_missing_index(self, temp_dir): + """Test that axis_len raises KeyError when index dataset is missing.""" + file_path = temp_dir / "test.h5ad" + with h5py.File(file_path, "w") as f: + f.create_group("obs") + with h5py.File(file_path, "r") as f: + with pytest.raises(KeyError, match="Index dataset 'obs_names' not 
found"): + axis_len(f, "obs") class TestGetAxisGroup: From a7d23e23de67d3106006d01b8ad1052157ef951a Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 17:11:36 +0000 Subject: [PATCH 15/62] Added element specs for .h5ad files --- docs/h5ad_elements_spec.md | 274 +++++++++++++++++++++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100644 docs/h5ad_elements_spec.md diff --git a/docs/h5ad_elements_spec.md b/docs/h5ad_elements_spec.md new file mode 100644 index 0000000..acb491d --- /dev/null +++ b/docs/h5ad_elements_spec.md @@ -0,0 +1,274 @@ +# AnnData on-disk element specifications — HDF5 (`.h5ad`) + +This document describes how *elements* are encoded inside an AnnData **HDF5** container (`.h5ad`). +It is intended to be GitHub-renderable Markdown (no Sphinx/MyST directives). + +> **Scope** +> +> - “Modern” encoding metadata (`encoding-type`, `encoding-version`) is the convention used by **anndata ≥ 0.8**. +> - “Legacy” conventions (notably DataFrame categorical handling) are described for **anndata 0.7.x** files, which are still commonly encountered. 
+ +## Table of contents + +- [Encoding metadata](#encoding-metadata) +- [AnnData group](#anndata-group) +- [Dense arrays](#dense-arrays) +- [Sparse arrays (CSR/CSC)](#sparse-arrays-csrcsc) +- [DataFrames](#dataframes) + - [DataFrame v0.2.0](#dataframe-v020) + - [DataFrame v0.1.0 (legacy: anndata 0.7.x)](#dataframe-v010-legacy-anndata-07x) + - [Legacy categorical columns (Series-level)](#legacy-categorical-columns-series-level) +- [Mappings / dict](#mappings--dict) +- [Scalars](#scalars) +- [Categorical arrays](#categorical-arrays) +- [String arrays](#string-arrays) +- [Nullable arrays](#nullable-arrays) + - [Missing value semantics](#missing-value-semantics) +- [Awkward arrays (experimental)](#awkward-arrays-experimental) +- [Sources](#sources) + +## Encoding metadata + +**Modern convention (anndata ≥ 0.8):** + +- Any element (HDF5 *group* or *dataset*) that participates in the element-dispatch system: + - **MUST** have attribute `encoding-type` (string) + - **MUST** have attribute `encoding-version` (string, parseable as a version) + +Readers should dispatch first on `encoding-type`, then on `encoding-version`. + +**Legacy convention (anndata ≤ 0.7.x):** + +- Many objects do *not* have `encoding-type`/`encoding-version`. +- Some elements (e.g. CSR/CSC sparse matrices, legacy DataFrames) *do* use `encoding-type`/`encoding-version`. +- Readers typically infer element kinds from: + - known AnnData keys (`X`, `obs`, `var`, …), + - group structure, and/or + - legacy attributes (e.g. the `categories` attribute on categorical columns). 
+ +## AnnData group + +### `encoding-type: anndata`, `encoding-version: 0.1.0` + +An `AnnData` object **MUST** be stored as an HDF5 **group** with attributes: + +- `encoding-type: "anndata"` +- `encoding-version: "0.1.0"` + +Required members: + +- `obs` — a [DataFrame](#dataframes) +- `var` — a [DataFrame](#dataframes) + +Optional members (if present, they must satisfy these constraints): + +- `X` — dense array or sparse array; shape `(n_obs, n_var)` +- `layers` — mapping; values dense or sparse arrays; each shape `(n_obs, n_var)` +- `obsm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_obs` +- `varm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_var` +- `obsp` — mapping; values dense or sparse arrays; first two dims `n_obs` +- `varp` — mapping; values dense or sparse arrays; first two dims `n_var` +- `uns` — mapping/dict-like container (recursive) + +## Dense arrays + +### `encoding-type: array`, `encoding-version: 0.2.0` + +- A dense array **MUST** be an HDF5 **dataset**. +- The dataset **MUST** have attributes: + - `encoding-type: "array"` + - `encoding-version: "0.2.0"` + +> **Legacy note** +> +> In anndata 0.7.x, dense arrays were typically stored as plain datasets *without* `encoding-type`/`encoding-version`. + +## Sparse arrays (CSR/CSC) + +### `encoding-type: csr_matrix|csc_matrix`, `encoding-version: 0.1.0` + +A sparse matrix **MUST** be stored as an HDF5 **group**. + +- Group attributes: + - `encoding-type: "csr_matrix"` **or** `"csc_matrix"` + - `encoding-version: "0.1.0"` + - `shape`: integer array of length 2 (matrix shape) +- Group members (datasets): + - `data` + - `indices` + - `indptr` + +The exact CSR/CSC semantics follow SciPy’s conventions. + +## DataFrames + +DataFrames are stored column-wise: each column is stored as a dataset (or group, if the column itself is an encoded element). 
+ + +### DataFrame v0.2.0 + +#### `encoding-type: dataframe`, `encoding-version: 0.2.0` + +A dataframe **MUST** be stored as an HDF5 **group**. + +- Group attributes: + - `_index`: string — the key of the dataset to be used as the row index + - `column-order`: array of strings — original column order + - `encoding-type: "dataframe"` + - `encoding-version: "0.2.0"` +- Group members: + - the index dataset (named by `_index`) + - one member per column +- All column entries **MUST** have the same length in their first dimension. +- Columns **SHOULD** share chunking along the first dimension. + +Columns are independently encoded: +- simple numeric/bool columns are commonly `encoding-type: array` +- categorical columns are commonly `encoding-type: categorical` + + +### DataFrame v0.1.0 (legacy: anndata 0.7.x) + +#### `encoding-type: dataframe`, `encoding-version: 0.1.0` + +A legacy dataframe is stored as an HDF5 **group** where: + +- Group attributes include: + - `_index` + - `column-order` + - `encoding-type: "dataframe"` + - `encoding-version: "0.1.0"` +- Each column is a dataset. +- Categorical columns are stored as **integer code datasets**, and their category labels are stored in a reserved subgroup named `__categories`. + +**Reserved subgroup:** + +- `__categories/` stores the array of category labels for column ``. + + +### Legacy categorical columns (Series-level) + +In v0.1.0 DataFrames, a categorical column dataset (e.g. `obs/cell_type`) can be identified by the presence of an attribute: + +- `categories`: an **HDF5 object reference** pointing to the corresponding `__categories/` dataset. + +## Mappings / dict + +### `encoding-type: dict`, `encoding-version: 0.1.0` + +- A mapping **MUST** be stored as an HDF5 **group**. +- Group attributes: + - `encoding-type: "dict"` + - `encoding-version: "0.1.0"` +- Each entry in the group is another element (recursively). 
+ +> **Legacy note** +> +> In anndata 0.7.x, groups used as mappings often had **no special attributes**. + +## Scalars + +### `encoding-version: 0.2.0` + +Scalars are stored as **0-dimensional datasets**. + +- Numeric scalars: + - `encoding-type: "numeric-scalar"` + - `encoding-version: "0.2.0"` + - value is numeric (including boolean, ints, floats, complex) +- String scalars: + - `encoding-type: "string"` + - `encoding-version: "0.2.0"` + - **HDF5 requirement:** variable-length UTF-8 string dtype + +> **Legacy note** +> +> In anndata 0.7.x, scalar strings were commonly stored as `|O` datasets without `encoding-type`/`encoding-version`. + +## Categorical arrays + +### `encoding-type: categorical`, `encoding-version: 0.2.0` + +Categorical arrays are stored as an HDF5 **group** with members: + +- `codes`: integer dataset + - values are zero-based indices into `categories` + - signed integer arrays **MAY** use `-1` to denote missing values +- `categories`: array of labels + +Group attributes: + +- `encoding-type: "categorical"` +- `encoding-version: "0.2.0"` +- `ordered`: boolean (whether the categories are ordered) + +## String arrays + +### `encoding-type: string-array`, `encoding-version: 0.2.0` + +- String arrays **MUST** be stored as HDF5 datasets. +- Dataset attributes: + - `encoding-type: "string-array"` + - `encoding-version: "0.2.0"` +- **HDF5 requirement:** variable-length UTF-8 string dtype + +## Nullable arrays + +These encodings support Pandas nullable integer/boolean/string arrays by storing a `values` array plus a boolean `mask` array. + +### `encoding-type: nullable-integer`, `encoding-version: 0.1.0` + +- Stored as an HDF5 group with datasets: + - `values` (integer) + - `mask` (boolean) + +### `encoding-type: nullable-boolean`, `encoding-version: 0.1.0` + +- Stored as an HDF5 group with datasets: + - `values` (boolean) + - `mask` (boolean) +- `values` and `mask` **MUST** have the same shape. 
+ +### `encoding-type: nullable-string-array`, `encoding-version: 0.1.0` + +- Stored as an HDF5 group with datasets: + - `values` (string array) + - `mask` (boolean) +- Group attributes: + - `encoding-type: "nullable-string-array"` + - `encoding-version: "0.1.0"` + - optional `na-value`: `"NA"` or `"NaN"` (default `"NA"`) + + +#### Missing value semantics + +For elements supporting a `na-value` attribute: + +- `"NA"`: comparisons propagate missingness (e.g. `"x" == NA` → `NA`) +- `"NaN"`: comparisons yield boolean results (e.g. `"x" == NaN` → `false`) + +Readers should preserve semantics when the runtime model supports it. + +## Awkward arrays (experimental) + +### `encoding-type: awkward-array`, `encoding-version: 0.1.0` + +Ragged arrays are stored by decomposing an Awkward Array into constituent buffers (via `ak.to_buffers`), then storing those buffers as datasets within a group. + +Group attributes: + +- `encoding-type: "awkward-array"` +- `encoding-version: "0.1.0"` +- `form`: string — serialized Awkward “form” +- `length`: integer — logical length + +Group members: datasets for the buffers (often named like `nodeX-*`). + +> **Experimental** +> +> This encoding is considered experimental in the anndata 0.9.x series and later. + +## Sources + +- AnnData “on-disk format” prose docs (modern, ≥0.8): https://anndata.readthedocs.io/en/stable/fileformat-prose.html +- AnnData 0.7.8 “on-disk format” prose docs (legacy): https://dokk.org/documentation/anndata/0.7.8/fileformat-prose/ From 1bda8d88ccdb55db04baf728729f3cf03de804e3 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 17:12:04 +0000 Subject: [PATCH 16/62] Refactor info and export_dataframe commands to use arguments instead of options for entry paths; add error handling for show_info function. 
--- src/h5ad/cli.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 856224b..a09761a 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -31,10 +31,8 @@ def info( exists=True, readable=True, ), - entry: Optional[str] = typer.Option( + entry: Optional[str] = typer.Argument( None, - "--entry", - "-e", help="Entry path to inspect (e.g., 'obsm/X_pca', 'X', 'uns')", ), types: bool = typer.Option( @@ -59,9 +57,13 @@ def info( Examples: h5ad info data.h5ad h5ad info --types data.h5ad - h5ad info --entry obsm/X_pca data.h5ad + h5ad info obsm/X_pca data.h5ad """ - show_info(file, console, show_types=types, depth=depth, entry_path=entry) + try: + show_info(file, console, show_types=types, depth=depth, entry_path=entry) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) # ============================================================================ @@ -118,8 +120,10 @@ def export_dataframe( file: Path = typer.Argument( ..., help="Path to the .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument(..., help="Object path to export ('obs' or 'var')"), - out: Path = typer.Argument(..., help="Output CSV file path"), + entry: str = typer.Argument(..., help="Entry path to export ('obs' or 'var')"), + output: Path = typer.Option( + None, "--output", "-o", writable=True, help="Output CSV file path" + ), columns: Optional[str] = typer.Option( None, "--columns", @@ -137,15 +141,15 @@ def export_dataframe( Export a dataframe (obs or var) to CSV. 
Examples: - h5ad export dataframe data.h5ad obs obs.csv - h5ad export dataframe data.h5ad var var.csv --columns gene_id,mean - h5ad export dataframe data.h5ad obs - --head 100 + h5ad export dataframe data.h5ad obs --output obs.csv + h5ad export dataframe data.h5ad var --output var.csv --columns gene_id,mean + h5ad export dataframe data.h5ad obs --head 100 """ from h5ad.commands import export_table - if obj not in ("obs", "var"): + if entry not in ("obs", "var"): console.print( - f"[bold red]Error:[/] Object must be 'obs' or 'var', not '{obj}'.", + f"[bold red]Error:[/] Dataframe export is only supported for 'obs' or 'var' at this point, not '{entry}'.", ) raise typer.Exit(code=1) @@ -156,9 +160,9 @@ def export_dataframe( try: export_table( file=file, - axis=obj, + axis=entry, columns=col_list, - out=out if str(out) != "-" else None, + out=output, chunk_rows=chunk_rows, head=head, console=console, From 4c6b18577c27bccd241095a0da519acbc05c606a Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 17:12:17 +0000 Subject: [PATCH 17/62] Enhance get_entry_type function to support legacy categorical and dataframe formats; improve version detection and details for various entry types. --- src/h5ad/info.py | 84 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 75 insertions(+), 9 deletions(-) diff --git a/src/h5ad/info.py b/src/h5ad/info.py index 25cdc31..7abb3e9 100644 --- a/src/h5ad/info.py +++ b/src/h5ad/info.py @@ -7,6 +7,10 @@ def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: """ Determine the type/format of an HDF5 object for export guidance. 
+ Supports both: + - v0.2.0 (modern): Objects with encoding-type/encoding-version attributes + - v0.1.0 (legacy): Objects without encoding attributes, inferred from structure + Returns a dict with: - type: str (e.g., 'dataframe', 'sparse-matrix', 'dense-matrix', 'dict', 'image', 'array', 'scalar') - export_as: str (suggested export format: csv, mtx, npy, json, image) @@ -14,6 +18,7 @@ def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: - shape: tuple or None - dtype: str or None - details: str (human-readable description) + - version: str ('0.2.0', '0.1.0', or None for unknown) """ result: Dict[str, Any] = { "type": "unknown", @@ -22,6 +27,7 @@ def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: "shape": None, "dtype": None, "details": "", + "version": None, } # Get encoding-type attribute if present @@ -30,11 +36,34 @@ def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: enc = enc.decode("utf-8") result["encoding"] = enc if enc else None + # Get encoding-version if present + enc_ver = entry.attrs.get("encoding-version", b"") + if isinstance(enc_ver, bytes): + enc_ver = enc_ver.decode("utf-8") + result["version"] = enc_ver if enc_ver else None + # Infer the type for Dataset entry if isinstance(entry, h5py.Dataset): result["shape"] = entry.shape result["dtype"] = str(entry.dtype) + # Check for legacy categorical (v0.1.0): dataset with 'categories' attribute + if "categories" in entry.attrs: + result["type"] = "categorical" + result["export_as"] = "csv" + result["version"] = result["version"] or "0.1.0" + # Try to get category count from referenced dataset + try: + cats_ref = entry.attrs["categories"] + cats_ds = entry.file[cats_ref] + n_cats = cats_ds.shape[0] + except Exception: + n_cats = "?" 
+ result["details"] = ( + f"Legacy categorical [{entry.shape[0]} values, {n_cats} categories]" + ) + return result + # Scalar if entry.shape == (): result["type"] = "scalar" @@ -65,7 +94,7 @@ def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: # It's a Group if isinstance(entry, h5py.Group): - # Check for sparse matrix (CSR/CSC) + # Check for sparse matrix (CSR/CSC) - same in both versions if enc in ("csr_matrix", "csc_matrix"): shape = entry.attrs.get("shape", None) shape_str = f"{shape[0]}×{shape[1]}" if shape is not None else "?" @@ -76,7 +105,7 @@ def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: ) return result - # Check for categorical + # Check for v0.2.0 categorical (Group with codes/categories) if enc == "categorical": codes = entry.get("codes") cats = entry.get("categories") @@ -87,22 +116,59 @@ def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: result["details"] = f"Categorical [{n_codes} values, {n_cats} categories]" return result - # Check for dataframe (obs/var style with _index) - if "_index" in entry.attrs or "obs_names" in entry or "var_names" in entry: - n_cols = len([k for k in entry.keys() if k != "_index"]) + # Check for dataframe (obs/var style) + # v0.2.0: has encoding-type="dataframe" + # v0.1.0: has _index attribute or obs_names/var_names dataset + if ( + enc == "dataframe" + or "_index" in entry.attrs + or "obs_names" in entry + or "var_names" in entry + ): + # Detect version + if enc == "dataframe": + df_version = result["version"] or "0.2.0" + else: + df_version = "0.1.0" # No encoding-type, legacy format + result["version"] = df_version + + # Check for __categories subgroup (v0.1.0 legacy) + has_legacy_cats = "__categories" in entry + n_cols = len( + [k for k in entry.keys() if k not in ("_index", "__categories")] + ) + result["type"] = "dataframe" result["export_as"] = "csv" - result["details"] = f"DataFrame with {n_cols} columns" + if has_legacy_cats: + 
result["details"] = f"DataFrame with {n_cols} columns (legacy v0.1.0)" + else: + result["details"] = f"DataFrame with {n_cols} columns" return result - # Check for array-like groups (nullable integer, string array, etc.) - if enc in ("nullable-integer", "string-array"): + # Check for nullable arrays (v0.2.0) + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): result["type"] = "array" result["export_as"] = "npy" result["details"] = f"Encoded array ({enc})" return result - # Generic dict/group + # Check for string-array encoding + if enc == "string-array": + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = "Encoded string array" + return result + + # Check for awkward-array (experimental) + if enc == "awkward-array": + length = entry.attrs.get("length", "?") + result["type"] = "awkward-array" + result["export_as"] = "json" + result["details"] = f"Awkward array (length={length})" + return result + + # Generic dict/group (v0.2.0 has encoding-type="dict", v0.1.0 has no attributes) n_keys = len(list(entry.keys())) result["type"] = "dict" result["export_as"] = "json" From d46981fe8b88eb0f0bccee454ffa90f799b965b9 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 17:12:28 +0000 Subject: [PATCH 18/62] Refactor read_categorical_column and col_chunk_as_strings to support both modern and legacy formats; enhance error handling and caching for categorical data retrieval. 
--- src/h5ad/read.py | 123 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 100 insertions(+), 23 deletions(-) diff --git a/src/h5ad/read.py b/src/h5ad/read.py index 5abec06..36f2e58 100644 --- a/src/h5ad/read.py +++ b/src/h5ad/read.py @@ -20,29 +20,75 @@ def decode_str_array(array: np.ndarray) -> np.ndarray: def read_categorical_column( - col_group: h5py.Group, start: int, end: int, cache: Dict[int, np.ndarray] + col: h5py.Group | h5py.Dataset, + start: int, + end: int, + cache: Dict[int, np.ndarray], + parent_group: h5py.Group | None = None, ) -> List[str]: """ Decode an AnnData 'categorical' column for a slice [start:end]. + + Supports both: + - v0.2.0 (modern): Group with 'codes' and 'categories' datasets + - v0.1.0 (legacy): Dataset with 'categories' attribute referencing __categories/ + Args: - col_group (h5py.Group): Column group containing 'categories' and 'codes' - start (int): Start index of the slice - end (int): End index of the slice - cache (Dict[int, np.ndarray]): Cache for decoded categories + col: Column group (v0.2.0) or dataset (v0.1.0) + start: Start index of the slice + end: End index of the slice + cache: Cache for decoded categories + parent_group: Parent obs/var group (needed for v0.1.0 to resolve __categories) + Returns: List[str]: Decoded categorical values for the specified slice """ - key = id(col_group) - if key not in cache: - cats = col_group["categories"][...] - cats = decode_str_array(cats) - cache[key] = np.asarray(cats, dtype=str) - cats = cache[key] + key = id(col) + + # v0.2.0 format: Group with 'codes' and 'categories' datasets + if isinstance(col, h5py.Group): + if key not in cache: + cats = col["categories"][...] 
+ cats = decode_str_array(cats) + cache[key] = np.asarray(cats, dtype=str) + cats = cache[key] + + codes_ds = col["codes"] + codes = codes_ds[start:end] + codes = np.asarray(codes, dtype=np.int64) + return [cats[c] if 0 <= c < len(cats) else "" for c in codes] + + # v0.1.0 format: Dataset with 'categories' attribute (object reference) + if isinstance(col, h5py.Dataset): + if key not in cache: + cats_ref = col.attrs.get("categories", None) + if cats_ref is not None: + # Dereference the HDF5 object reference + cats_ds = col.file[cats_ref] + cats = cats_ds[...] + elif parent_group is not None and "__categories" in parent_group: + # Fallback: look for __categories subgroup + col_name = col.name.split("/")[-1] + cats_grp = parent_group["__categories"] + if col_name in cats_grp: + cats = cats_grp[col_name][...] + else: + raise RuntimeError( + f"Cannot find categories for legacy column {col.name}" + ) + else: + raise RuntimeError( + f"Cannot find categories for legacy column {col.name}" + ) + cats = decode_str_array(cats) + cache[key] = np.asarray(cats, dtype=str) + cats = cache[key] - codes_ds = col_group["codes"] - codes = codes_ds[start:end] - codes = np.asarray(codes, dtype=np.int64) - return [cats[c] if 0 <= c < len(cats) else "" for c in codes] + codes = col[start:end] + codes = np.asarray(codes, dtype=np.int64) + return [cats[c] if 0 <= c < len(cats) else "" for c in codes] + + raise RuntimeError(f"Unsupported categorical column type: {type(col)}") def col_chunk_as_strings( @@ -54,29 +100,60 @@ def col_chunk_as_strings( ) -> List[str]: """ Read a column from an obs/var group as strings. 
+ + Supports both: + - v0.2.0 (modern): Columns with encoding-type attribute + - v0.1.0 (legacy): Categorical columns with 'categories' attribute referencing __categories + Args: group (h5py.Group): The obs/var group col_name (str): Name of the column to read start (int): Start index of the slice end (int): End index of the slice cat_cache (Dict[int, np.ndarray]): Cache for decoded categorical columns + Returns: List[str]: Column values as strings for the specified slice """ - if col_name in group and isinstance(group[col_name], h5py.Dataset): - dataset = group[col_name] - chunk = dataset[start:end] + if col_name not in group: + raise RuntimeError(f"Column {col_name!r} not found in group {group.name}") + + col = group[col_name] + + # Case 1: Dataset (could be plain array or legacy categorical) + if isinstance(col, h5py.Dataset): + # Check for v0.1.0 legacy categorical (has 'categories' attribute) + if "categories" in col.attrs: + return read_categorical_column(col, start, end, cat_cache, group) + + # Plain dataset (numeric, string, etc.) + chunk = col[start:end] if chunk.ndim != 1: chunk = chunk.reshape(-1) chunk = decode_str_array(np.asarray(chunk)) return chunk.tolist() - if col_name in group and isinstance(group[col_name], h5py.Group): - col_group = group[col_name] - enc = col_group.attrs.get("encoding-type", b"") + # Case 2: Group (v0.2.0 encoded types like categorical, nullable, etc.) 
+ if isinstance(col, h5py.Group): + enc = col.attrs.get("encoding-type", b"") if isinstance(enc, bytes): enc = enc.decode("utf-8") + if enc == "categorical": - return read_categorical_column(col_group, start, end, cat_cache) + return read_categorical_column(col, start, end, cat_cache) + + # Handle nullable arrays (nullable-integer, nullable-boolean, nullable-string-array) + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): + values = col["values"][start:end] + mask = col["mask"][start:end] + values = decode_str_array(np.asarray(values)) + # Apply mask: masked values become empty string + return ["" if m else str(v) for v, m in zip(values, mask)] + + raise RuntimeError( + f"Unsupported group encoding {enc!r} for column {col_name!r}" + ) - raise RuntimeError(f"Unsupported column {col_name!r} in group {group.name}") + raise RuntimeError( + f"Unsupported column type for {col_name!r} in group {group.name}" + ) From 1cebbbf315abf948d10c351f29848072e7bec2cf Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 17:12:39 +0000 Subject: [PATCH 19/62] Add support for legacy v0.1.0 h5ad files; implement tests for legacy categorical and dataframe formats --- tests/conftest.py | 50 ++++++++++++++++++++++++++++++++++++ tests/test_cli.py | 25 +++++++++--------- tests/test_export.py | 6 +++-- tests/test_info_read.py | 56 ++++++++++++++++++++++++++++++++++++++--- 4 files changed, 120 insertions(+), 17 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index bff9605..e3b710f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -172,3 +172,53 @@ def sample_categorical_h5ad(temp_dir): f.create_dataset("X", data=X) return file_path + + +@pytest.fixture +def sample_legacy_v010_h5ad(temp_dir): + """Create a sample h5ad file with legacy v0.1.0 categorical columns. 
+ + In v0.1.0, categorical columns are stored as: + - Integer code datasets with a 'categories' attribute (HDF5 object reference) + - Categories stored in __categories/ subgroup + """ + file_path = temp_dir / "test_legacy_v010.h5ad" + + with h5py.File(file_path, "w") as f: + # Create obs with legacy categorical column + obs = f.create_group("obs") + obs.attrs["_index"] = "obs_names" + obs.attrs["encoding-type"] = "dataframe" + obs.attrs["encoding-version"] = "0.1.0" + obs_names = ["cell_1", "cell_2", "cell_3", "cell_4"] + obs.create_dataset("obs_names", data=np.array(obs_names, dtype="S")) + + # Create __categories subgroup (v0.1.0 convention) + categories_group = obs.create_group("__categories") + cell_type_cats = np.array(["TypeA", "TypeB", "TypeC"], dtype="S") + cats_ds = categories_group.create_dataset("cell_type", data=cell_type_cats) + + # Create categorical column as integer codes with reference to categories + codes = np.array([0, 1, 0, 2], dtype=np.int8) + cell_type_ds = obs.create_dataset("cell_type", data=codes) + # Store HDF5 object reference to categories + cell_type_ds.attrs["categories"] = cats_ds.ref + + # Add a regular non-categorical column + obs.create_dataset( + "n_counts", data=np.array([100, 200, 150, 300], dtype=np.int32) + ) + + # Create var + var = f.create_group("var") + var.attrs["_index"] = "var_names" + var.attrs["encoding-type"] = "dataframe" + var.attrs["encoding-version"] = "0.1.0" + var_names = ["gene_1", "gene_2"] + var.create_dataset("var_names", data=np.array(var_names, dtype="S")) + + # Create X matrix (no encoding-type for legacy) + X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]], dtype=np.float32) + f.create_dataset("X", data=X) + + return file_path diff --git a/tests/test_cli.py b/tests/test_cli.py index 203830f..7f06a4f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -66,17 +66,17 @@ def test_info_depth_short_flag(self, sample_h5ad_file): output = result.stdout + (result.stderr or "") assert "<" in 
output - def test_info_entry_flag(self, sample_h5ad_file): - """Test info command with --entry flag.""" - result = runner.invoke(app, ["info", "--entry", "X", str(sample_h5ad_file)]) + def test_info_entry_positional(self, sample_h5ad_file): + """Test info command with entry as positional argument.""" + result = runner.invoke(app, ["info", str(sample_h5ad_file), "X"]) assert result.exit_code == 0 output = result.stdout + (result.stderr or "") assert "Path:" in output assert "Type:" in output - def test_info_entry_short_flag(self, sample_h5ad_file): - """Test info command with -e short flag.""" - result = runner.invoke(app, ["info", "-e", "obs", str(sample_h5ad_file)]) + def test_info_entry_obs(self, sample_h5ad_file): + """Test info command with obs entry.""" + result = runner.invoke(app, ["info", str(sample_h5ad_file), "obs"]) assert result.exit_code == 0 output = result.stdout + (result.stderr or "") assert "Path:" in output @@ -84,18 +84,14 @@ def test_info_entry_short_flag(self, sample_h5ad_file): def test_info_entry_nested_path(self, sample_h5ad_file): """Test info command with nested object path.""" - result = runner.invoke( - app, ["info", "-e", "uns/description", str(sample_h5ad_file)] - ) + result = runner.invoke(app, ["info", str(sample_h5ad_file), "uns/description"]) assert result.exit_code == 0 output = result.stdout + (result.stderr or "") assert "Path:" in output def test_info_entry_not_found(self, sample_h5ad_file): """Test info command with non-existent object path.""" - result = runner.invoke( - app, ["info", "-e", "nonexistent", str(sample_h5ad_file)] - ) + result = runner.invoke(app, ["info", str(sample_h5ad_file), "nonexistent"]) assert result.exit_code == 0 # Doesn't exit with error, just shows message output = result.stdout + (result.stderr or "") assert "not found" in output @@ -114,6 +110,7 @@ def test_export_dataframe_obs(self, sample_h5ad_file, temp_dir): "dataframe", str(sample_h5ad_file), "obs", + "--output", str(output), ], ) @@ -137,6 
+134,7 @@ def test_export_dataframe_var(self, sample_h5ad_file, temp_dir): "dataframe", str(sample_h5ad_file), "var", + "--output", str(output), ], ) @@ -158,6 +156,7 @@ def test_export_dataframe_columns_filter(self, sample_h5ad_file, temp_dir): "dataframe", str(sample_h5ad_file), "obs", + "--output", str(output), "--columns", "obs_names,cell_type", @@ -183,6 +182,7 @@ def test_export_dataframe_head(self, sample_h5ad_file, temp_dir): "dataframe", str(sample_h5ad_file), "obs", + "--output", str(output), "--head", "2", @@ -205,6 +205,7 @@ def test_export_dataframe_invalid_axis(self, sample_h5ad_file, temp_dir): "dataframe", str(sample_h5ad_file), "invalid", + "--output", str(output), ], ) diff --git a/tests/test_export.py b/tests/test_export.py index 8ab14cd..730ce95 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -112,7 +112,8 @@ class TestExportDataframe: def test_export_dataframe_obs(self, sample_h5ad_file, temp_dir): out = temp_dir / "obs.csv" result = runner.invoke( - app, ["export", "dataframe", str(sample_h5ad_file), "obs", str(out)] + app, + ["export", "dataframe", str(sample_h5ad_file), "obs", "--output", str(out)], ) assert result.exit_code == 0 assert out.exists() @@ -125,7 +126,8 @@ def test_wrong_type_for_dataframe(self, sample_h5ad_file, temp_dir): """Test that wrong object type is rejected for dataframe export.""" out = temp_dir / "X.csv" result = runner.invoke( - app, ["export", "dataframe", str(sample_h5ad_file), "X", str(out)] + app, + ["export", "dataframe", str(sample_h5ad_file), "X", "--output", str(out)], ) assert result.exit_code == 1 assert "obs" in result.output or "var" in result.output diff --git a/tests/test_info_read.py b/tests/test_info_read.py index 5b69fe3..e708fac 100644 --- a/tests/test_info_read.py +++ b/tests/test_info_read.py @@ -245,9 +245,59 @@ def test_col_chunk_categorical(self, sample_categorical_h5ad): result = col_chunk_as_strings(f["obs"], "cell_type", 0, 4, cache) assert result == ["TypeA", "TypeB", 
"TypeA", "TypeC"] - def test_col_chunk_unsupported(self, sample_h5ad_file): - """Test reading unsupported column.""" + def test_col_chunk_not_found(self, sample_h5ad_file): + """Test reading non-existent column.""" with h5py.File(sample_h5ad_file, "r") as f: cache = {} - with pytest.raises(RuntimeError, match="Unsupported column"): + with pytest.raises(RuntimeError, match="not found in group"): col_chunk_as_strings(f["obs"], "nonexistent", 0, 5, cache) + + +class TestLegacyV010Support: + """Tests for legacy v0.1.0 format support.""" + + def test_get_entry_type_legacy_categorical(self, sample_legacy_v010_h5ad): + """Test type detection for legacy categorical column (v0.1.0).""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + info = get_entry_type(f["obs"]["cell_type"]) + assert info["type"] == "categorical" + assert info["version"] == "0.1.0" + assert "Legacy" in info["details"] + + def test_get_entry_type_legacy_dataframe(self, sample_legacy_v010_h5ad): + """Test type detection for legacy dataframe (v0.1.0).""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + info = get_entry_type(f["obs"]) + assert info["type"] == "dataframe" + assert info["version"] == "0.1.0" + assert "legacy" in info["details"].lower() + + def test_read_legacy_categorical_column(self, sample_legacy_v010_h5ad): + """Test reading legacy categorical column.""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + cache = {} + result = read_categorical_column( + f["obs"]["cell_type"], 0, 4, cache, f["obs"] + ) + assert result == ["TypeA", "TypeB", "TypeA", "TypeC"] + + def test_col_chunk_legacy_categorical(self, sample_legacy_v010_h5ad): + """Test col_chunk_as_strings with legacy categorical column.""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + cache = {} + result = col_chunk_as_strings(f["obs"], "cell_type", 0, 4, cache) + assert result == ["TypeA", "TypeB", "TypeA", "TypeC"] + + def test_col_chunk_legacy_numeric(self, sample_legacy_v010_h5ad): + """Test 
col_chunk_as_strings with legacy numeric column.""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + cache = {} + result = col_chunk_as_strings(f["obs"], "n_counts", 0, 4, cache) + assert result == ["100", "200", "150", "300"] + + def test_legacy_categorical_slice(self, sample_legacy_v010_h5ad): + """Test reading slice of legacy categorical column.""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + cache = {} + result = col_chunk_as_strings(f["obs"], "cell_type", 1, 3, cache) + assert result == ["TypeB", "TypeA"] From b5ea89827669c3679362ac17e982715c001a95a2 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 18:19:30 +0000 Subject: [PATCH 20/62] Update show_info and _show_types_tree functions to exclude '__categories' from key processing; enhance child key filtering. --- src/h5ad/commands/info.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/h5ad/commands/info.py b/src/h5ad/commands/info.py index e1d2bbe..11bd11d 100644 --- a/src/h5ad/commands/info.py +++ b/src/h5ad/commands/info.py @@ -54,7 +54,9 @@ def show_info( obj = f[key] # Only process Groups, skip Datasets like X if isinstance(obj, h5py.Group): - sub_keys = [k for k in obj.keys() if k != "_index"] + sub_keys = [ + k for k in obj.keys() if k not in ("_index", "__categories") + ] if sub_keys and key != "X": rich.print( f"\t[bold yellow]{key}:[/]\t" @@ -110,7 +112,7 @@ def add_node( # Recurse only if within allowed depth if current_depth < max_depth: for child_name in sorted(obj.keys()): - if child_name == "_index": + if child_name in ("_index", "__categories"): continue child_obj = obj[child_name] add_node( @@ -122,7 +124,7 @@ def add_node( obj = f[key] # Skip empty groups if isinstance(obj, h5py.Group): - children = [k for k in obj.keys() if k != "_index"] + children = [k for k in obj.keys() if k not in ("_index", "__categories")] if not children: continue max_depth = ( @@ -170,7 +172,7 @@ def _show_object_info(f: h5py.File, entry_path: str, console: 
Console) -> None: # If it's a group, show children if isinstance(entry, h5py.Group): - children = [k for k in entry.keys() if k != "_index"] + children = [k for k in entry.keys() if k not in ("_index", "__categories")] if children: console.print(f"\n[bold cyan]Children:[/]") for child_name in sorted(children): From e0c727a39c63d4aa7554e1876c01cbee23c7f0cd Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 18:19:36 +0000 Subject: [PATCH 21/62] Enhance export_table function to support both modern and legacy dataframe formats; exclude reserved keys from column list and improve status reporting during export. --- src/h5ad/commands/export.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index cf0c64b..1f60458 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -3,6 +3,7 @@ import csv import json import sys +from contextlib import nullcontext from pathlib import Path from typing import Any, Dict, List, Optional, Union, cast @@ -40,15 +41,23 @@ def export_table( chunk_rows: Number of rows to read per chunk head: Output only the first n rows console: Rich console for status output + + Supports both v0.2.0 (modern) and v0.1.0 (legacy) dataframe formats. 
""" with h5py.File(file, "r") as f: group, n_rows, index_name = get_axis_group(f, axis) + # Reserved keys to exclude from column list + # __categories is used in v0.1.0 for storing categorical labels + reserved_keys = {"_index", "__categories"} + # Determine columns to read if columns: col_names = list(columns) else: - col_names = [k for k in group.keys() if k != "_index" and k != index_name] + col_names = [ + k for k in group.keys() if k not in reserved_keys and k != index_name + ] # Add index name if not already present if index_name and index_name not in col_names: col_names.insert(0, index_name) @@ -76,14 +85,22 @@ def export_table( try: writer.writerow(col_names) cat_cache: Dict[int, np.ndarray] = {} - with console.status( - f"[magenta]Exporting {axis} table...[/] to {'stdout' if out_fh is sys.stdout else out}" - ) as status: + + # Use status spinner only when writing to file (not stdout) + use_status = out_fh is not sys.stdout + status_ctx = ( + console.status(f"[magenta]Exporting {axis} table to {out}...[/]") + if use_status + else nullcontext() + ) + + with status_ctx as status: for start in range(0, n_rows, chunk_rows): end = min(start + chunk_rows, n_rows) - status.update( - f"[magenta]Exporting rows {start}-{end} of {n_rows}...[/]" - ) + if use_status and status: + status.update( + f"[magenta]Exporting rows {start}-{end} of {n_rows}...[/]" + ) cols_data: List[List[str]] = [] # Read each column for the current chunk for col in col_names: From f14868f54e8bf94a05d56216d90e71b3ddf2da13 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 18:19:46 +0000 Subject: [PATCH 22/62] Enhance console initialization in CLI to ensure Rich output is visible in non-TTY environments. 
--- src/h5ad/cli.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index a09761a..772fcee 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -11,7 +11,9 @@ app = typer.Typer( help="Streaming CLI for huge .h5ad files (info, subset, export, import)." ) -console = Console(stderr=True) +# Use stderr for status/progress to keep stdout clean for data output +# force_terminal=True ensures Rich output is visible even in non-TTY environments +console = Console(stderr=True, force_terminal=True) # Create sub-apps for export and import export_app = typer.Typer(help="Export objects from h5ad files.") From 9929e5bbe882a2e971bb9b93eefae317bfd982fb Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 18:21:53 +0000 Subject: [PATCH 23/62] Add tests for export_dataframe command with various options and flags --- tests/test_cli.py | 121 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 7f06a4f..2b3bd90 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -195,6 +195,127 @@ def test_export_dataframe_head(self, sample_h5ad_file, temp_dir): rows = list(reader) assert len(rows) == 3 # header + 2 rows + def test_export_dataframe_head_short_flag(self, sample_h5ad_file, temp_dir): + """Test export dataframe with -n short flag.""" + output = temp_dir / "table.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + "--output", + str(output), + "-n", + "3", + ], + ) + assert result.exit_code == 0 + + with open(output, "r") as f: + reader = csv.reader(f) + rows = list(reader) + assert len(rows) == 4 # header + 3 rows + + def test_export_dataframe_stdout(self, sample_h5ad_file): + """Test export dataframe to stdout (no --output).""" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + "--head", + "2", + ], + ) + assert result.exit_code == 0 + # 
Output should go to stdout + assert "obs_names" in result.stdout + assert "cell_" in result.stdout + + def test_export_dataframe_columns_short_flag(self, sample_h5ad_file, temp_dir): + """Test export dataframe with -c short flag for columns.""" + output = temp_dir / "table.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + "-o", + str(output), + "-c", + "obs_names", + ], + ) + assert result.exit_code == 0 + + with open(output, "r") as f: + reader = csv.reader(f) + rows = list(reader) + header = rows[0] + assert len(header) == 1 + assert "obs_names" in header + + def test_export_dataframe_chunk_rows(self, sample_h5ad_file, temp_dir): + """Test export dataframe with custom chunk size.""" + output = temp_dir / "table.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + "--output", + str(output), + "--chunk-rows", + "2", + ], + ) + assert result.exit_code == 0 + assert output.exists() + + with open(output, "r") as f: + reader = csv.reader(f) + rows = list(reader) + assert len(rows) == 6 # header + 5 rows + + def test_export_dataframe_combined_options(self, sample_h5ad_file, temp_dir): + """Test export dataframe with multiple options combined.""" + output = temp_dir / "table.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + "-o", + str(output), + "-c", + "obs_names,cell_type", + "-n", + "3", + "-r", + "1", + ], + ) + assert result.exit_code == 0 + + with open(output, "r") as f: + reader = csv.reader(f) + rows = list(reader) + assert len(rows) == 4 # header + 3 rows + header = rows[0] + assert "obs_names" in header + assert "cell_type" in header + assert "n_counts" not in header + def test_export_dataframe_invalid_axis(self, sample_h5ad_file, temp_dir): """Test export dataframe with invalid axis.""" output = temp_dir / "table.csv" From bf672893e903b899ce4c0301a11e92d511d67457 Mon Sep 17 00:00:00 2001 From: Aljes 
Date: Fri, 23 Jan 2026 19:24:08 +0000 Subject: [PATCH 24/62] Update import statements in __init__.py to include additional export functions --- src/h5ad/commands/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/h5ad/commands/__init__.py b/src/h5ad/commands/__init__.py index 7b60c31..70d960f 100644 --- a/src/h5ad/commands/__init__.py +++ b/src/h5ad/commands/__init__.py @@ -1,4 +1,4 @@ from h5ad.commands.info import show_info from h5ad.commands.subset import subset_h5ad -from h5ad.commands.export import export_object, export_table +from h5ad.commands.export import export_table, export_image, export_json, export_mtx, export_npy from h5ad.commands.import_data import import_object From 295548d0a23271536238dca4f0adc770f47e8f5e Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 19:24:19 +0000 Subject: [PATCH 25/62] Add Zarr element specifications documentation --- docs/zarr_elements_spec.md | 276 +++++++++++++++++++++++++++++++++++++ 1 file changed, 276 insertions(+) create mode 100644 docs/zarr_elements_spec.md diff --git a/docs/zarr_elements_spec.md b/docs/zarr_elements_spec.md new file mode 100644 index 0000000..ce309e6 --- /dev/null +++ b/docs/zarr_elements_spec.md @@ -0,0 +1,276 @@ +# AnnData on-disk element specifications — Zarr (`.zarr`) + +This document describes how *elements* are encoded inside an AnnData **Zarr** container (`.zarr`). +It is intended to be GitHub-renderable Markdown (no Sphinx/MyST directives). + +> **Scope** +> +> - “Modern” encoding metadata (`encoding-type`, `encoding-version`) is the convention used by **anndata ≥ 0.8**. +> - “Legacy” conventions (notably DataFrame categorical handling) are described for **anndata 0.7.x** files, which are still commonly encountered. 
+ +## Table of contents + +- [Encoding metadata](#encoding-metadata) +- [AnnData group](#anndata-group) +- [Dense arrays](#dense-arrays) +- [Sparse arrays (CSR/CSC)](#sparse-arrays-csrcsc) +- [DataFrames](#dataframes) + - [DataFrame v0.2.0](#dataframe-v020) + - [DataFrame v0.1.0 (legacy: anndata 0.7.x)](#dataframe-v010-legacy-anndata-07x) + - [Legacy categorical columns (Series-level)](#legacy-categorical-columns-series-level) +- [Mappings / dict](#mappings--dict) +- [Scalars](#scalars) +- [Categorical arrays](#categorical-arrays) +- [String arrays](#string-arrays) +- [Nullable arrays](#nullable-arrays) + - [Missing value semantics](#missing-value-semantics) +- [Awkward arrays (experimental)](#awkward-arrays-experimental) +- [Sources](#sources) + +## Encoding metadata + +**Modern convention (anndata ≥ 0.8):** + +- Any element (Zarr *group* or *array*) that participates in the element-dispatch system: + - **MUST** have attribute `encoding-type` (string) + - **MUST** have attribute `encoding-version` (string, parseable as a version) + +Readers should dispatch first on `encoding-type`, then on `encoding-version`. + +**Legacy convention (anndata ≤ 0.7.x):** + +- Many objects do *not* have `encoding-type`/`encoding-version`. +- Some elements (e.g. CSR/CSC sparse matrices, legacy DataFrames) *do* use `encoding-type`/`encoding-version`. +- Readers typically infer element kinds from: + - known AnnData keys (`X`, `obs`, `var`, …), + - group structure, and/or + - legacy attributes (e.g. the `categories` attribute on categorical columns). 
+ +## AnnData group + +### `encoding-type: anndata`, `encoding-version: 0.1.0` + +An `AnnData` object **MUST** be stored as a Zarr **group** with attributes: + +- `encoding-type: "anndata"` +- `encoding-version: "0.1.0"` + +Required members: + +- `obs` — a [DataFrame](#dataframes) +- `var` — a [DataFrame](#dataframes) + +Optional members (if present, they must satisfy these constraints): + +- `X` — dense array or sparse array; shape `(n_obs, n_var)` +- `layers` — mapping; values dense or sparse arrays; each shape `(n_obs, n_var)` +- `obsm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_obs` +- `varm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_var` +- `obsp` — mapping; values dense or sparse arrays; first two dims `n_obs` +- `varp` — mapping; values dense or sparse arrays; first two dims `n_var` +- `uns` — mapping/dict-like container (recursive) + +## Dense arrays + +### `encoding-type: array`, `encoding-version: 0.2.0` + +- A dense array **MUST** be stored as a Zarr **array**. +- The array **MUST** have attributes: + - `encoding-type: "array"` + - `encoding-version: "0.2.0"` + +> **Legacy note** +> +> In anndata 0.7.x, dense arrays were typically stored as plain Zarr arrays *without* `encoding-type`/`encoding-version`. + +## Sparse arrays (CSR/CSC) + +### `encoding-type: csr_matrix|csc_matrix`, `encoding-version: 0.1.0` + +A sparse matrix **MUST** be stored as a Zarr **group**. + +- Group attributes: + - `encoding-type: "csr_matrix"` **or** `"csc_matrix"` + - `encoding-version: "0.1.0"` + - `shape`: integer array of length 2 (matrix shape) +- Group members (arrays): + - `data` + - `indices` + - `indptr` + +The exact CSR/CSC semantics follow SciPy’s conventions. + +## DataFrames + +DataFrames are stored column-wise: each column is stored as a Zarr array (or group, if the column itself is an encoded element). 
+ + +### DataFrame v0.2.0 + +#### `encoding-type: dataframe`, `encoding-version: 0.2.0` + +A dataframe **MUST** be stored as a Zarr **group**. + +- Group attributes: + - `_index`: string — the key of the array to be used as the row index + - `column-order`: array of strings — original column order + - `encoding-type: "dataframe"` + - `encoding-version: "0.2.0"` +- Group members: + - the index array (named by `_index`) + - one member per column +- All column entries **MUST** have the same length in their first dimension. +- Columns **SHOULD** share chunking along the first dimension. + +Columns are independently encoded: +- simple numeric/bool columns are commonly `encoding-type: array` +- categorical columns are commonly `encoding-type: categorical` + + +### DataFrame v0.1.0 (legacy: anndata 0.7.x) + +#### `encoding-type: dataframe`, `encoding-version: 0.1.0` + +A legacy dataframe is stored as a Zarr **group** where: + +- Group attributes include: + - `_index` + - `column-order` + - `encoding-type: "dataframe"` + - `encoding-version: "0.1.0"` +- Each column is an array. +- Categorical columns are stored as **integer code arrays**, and their category labels are stored in a reserved subgroup named `__categories`. + +**Reserved subgroup:** + +- `__categories/` stores the array of category labels for column ``. + + +### Legacy categorical columns (Series-level) + +In v0.1.0 DataFrames, a categorical column array (e.g. `obs/cell_type`) can be identified by the presence of an attribute: + +- `categories`: an **absolute path string** to the corresponding `__categories/` array. + +(This differs from HDF5, which can store an object reference.) + +## Mappings / dict + +### `encoding-type: dict`, `encoding-version: 0.1.0` + +- A mapping **MUST** be stored as a Zarr **group**. +- Group attributes: + - `encoding-type: "dict"` + - `encoding-version: "0.1.0"` +- Each entry in the group is another element (recursively). 
+
+> **Legacy note**
+>
+> In anndata 0.7.x, groups used as mappings often had **no special attributes**.
+
+## Scalars
+
+### `encoding-version: 0.2.0`
+
+Scalars are stored as **0-dimensional Zarr arrays**.
+
+- Numeric scalars:
+  - `encoding-type: "numeric-scalar"`
+  - `encoding-version: "0.2.0"`
+  - value is numeric (including boolean, ints, floats, complex)
+- String scalars:
+  - `encoding-type: "string"`
+  - `encoding-version: "0.2.0"`
+  - **Zarr requirement:** fixed-length unicode dtype (e.g. `<U8`)
+
+> **Legacy note**
+>
+> In anndata 0.7.x, scalar strings were commonly stored without `encoding-type`/`encoding-version`.
+
+## Categorical arrays
+
+### `encoding-type: categorical`, `encoding-version: 0.2.0`
+
+Categorical arrays are stored as a Zarr **group** with members:
+
+- `codes`: integer array
+  - values are zero-based indices into `categories`
+  - signed integer arrays **MAY** use `-1` to denote missing values
+- `categories`: array of labels
+
+Group attributes:
+
+- `encoding-type: "categorical"`
+- `encoding-version: "0.2.0"`
+- `ordered`: boolean (whether the categories are ordered)
+
+## String arrays
+
+### `encoding-type: string-array`, `encoding-version: 0.2.0`
+
+- String arrays **MUST** be stored as Zarr arrays.
+- Array attributes:
+  - `encoding-type: "string-array"`
+  - `encoding-version: "0.2.0"`
+- **Zarr requirement:** the array **MUST** be stored using `numcodecs.VLenUTF8` for variable-length UTF-8 strings.
+
+## Nullable arrays
+
+These encodings support Pandas nullable integer/boolean/string arrays by storing a `values` array plus a boolean `mask` array.
+
+### `encoding-type: nullable-integer`, `encoding-version: 0.1.0`
+
+- Stored as a Zarr group with arrays:
+  - `values` (integer)
+  - `mask` (boolean)
+
+### `encoding-type: nullable-boolean`, `encoding-version: 0.1.0`
+
+- Stored as a Zarr group with arrays:
+  - `values` (boolean)
+  - `mask` (boolean)
+- `values` and `mask` **MUST** have the same shape.
+ +### `encoding-type: nullable-string-array`, `encoding-version: 0.1.0` + +- Stored as a Zarr group with arrays: + - `values` (string array) + - `mask` (boolean) +- Group attributes: + - `encoding-type: "nullable-string-array"` + - `encoding-version: "0.1.0"` + - optional `na-value`: `"NA"` or `"NaN"` (default `"NA"`) + + +#### Missing value semantics + +For elements supporting a `na-value` attribute: + +- `"NA"`: comparisons propagate missingness (e.g. `"x" == NA` → `NA`) +- `"NaN"`: comparisons yield boolean results (e.g. `"x" == NaN` → `false`) + +Readers should preserve semantics when the runtime model supports it. + +## Awkward arrays (experimental) + +### `encoding-type: awkward-array`, `encoding-version: 0.1.0` + +Ragged arrays are stored by decomposing an Awkward Array into constituent buffers (via `ak.to_buffers`), then storing those buffers as Zarr arrays within a group. + +Group attributes: + +- `encoding-type: "awkward-array"` +- `encoding-version: "0.1.0"` +- `form`: string — serialized Awkward “form” +- `length`: integer — logical length + +Group members: arrays for the buffers (often named like `nodeX-*`). + +> **Experimental** +> +> This encoding is considered experimental in the anndata 0.9.x series and later. + +## Sources + +- AnnData “on-disk format” prose docs (modern, ≥0.8): https://anndata.readthedocs.io/en/stable/fileformat-prose.html +- AnnData 0.7.8 “on-disk format” prose docs (legacy): https://dokk.org/documentation/anndata/0.7.8/fileformat-prose/ From 6a454f0b558e0295f5e52bb8f222295d4bd10456 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 19:24:31 +0000 Subject: [PATCH 26/62] Enhance export functionality to support awkward-array type and improve export_npy and export_mtx methods for better handling of datasets and chunk processing. 
--- src/h5ad/commands/export.py | 239 +++++++++++++++++------------------- 1 file changed, 112 insertions(+), 127 deletions(-) diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index 1f60458..dbd4277 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -131,6 +131,7 @@ def export_table( "dict": {".json"}, "scalar": {".json"}, "categorical": {".csv"}, + "awkward-array": {".json"}, } # Image extensions for validation @@ -195,108 +196,44 @@ def _check_json_exportable(h5obj: H5Obj, max_elements: int, path: str = "") -> N ) -def export_object( +def export_npy( file: Path, obj: str, out: Path, - columns: Optional[List[str]], - chunk_rows: int, - head: Optional[int], - max_elements: int, - include_attrs: bool, + chunk_elements: int, console: Console, ) -> None: """ - Export an HDF5 object to an appropriate format based on its type. + Export a dense HDF5 dataset to NumPy .npy without loading it all at once. - Auto-detects the object type and validates the output file extension. + Supports both: + - v0.2.0 (modern): Datasets with encoding-type="array" + - v0.1.0 (legacy): Plain datasets without encoding attributes + - Encoded groups: nullable-integer, nullable-boolean, string-array (extracts values) """ - obj = _norm_path(obj) - out_ext = out.suffix.lower() - with h5py.File(file, "r") as f: h5obj = _resolve(f, obj) - info = get_entry_type(h5obj) - obj_type = info["type"] - - # Check if type is exportable - if obj_type not in EXPORTABLE_TYPES: - raise ValueError( - f"Cannot export object of type '{obj_type}'. " - f"Exportable types: {', '.join(sorted(EXPORTABLE_TYPES))}." - ) - - # Check if extension matches the type - valid_exts = TYPE_EXTENSIONS.get(obj_type, set()) - if out_ext not in valid_exts: - ext_list = ", ".join(sorted(valid_exts)) - raise ValueError( - f"Output extension '{out_ext}' does not match object type '{obj_type}'. " - f"Expected: {ext_list}." 
- ) - - # Dispatch to appropriate export function - if obj_type == "dataframe": - # For dataframe, obj must be obs or var - if obj not in ("obs", "var"): - raise ValueError( - f"CSV export for dataframes currently supports only 'obs' or 'var', " - f"not '{obj}'." - ) - export_table( - file=file, - axis=obj, - columns=columns, - out=out, - chunk_rows=chunk_rows, - head=head, - console=console, - ) - elif obj_type == "categorical": - # Categorical is also exported via table if it's a column in obs/var - raise ValueError( - f"Categorical objects should be exported as part of 'obs' or 'var' table. " - f"Use: h5ad export obs " - ) - - elif obj_type in ("dense-matrix", "array"): - if out_ext in IMAGE_EXTENSIONS: - # User wants image output - validate dimensions - _export_image(file=file, obj=obj, out=out, console=console) - else: - _export_npy( - file=file, obj=obj, out=out, chunk_rows=chunk_rows, console=console - ) - - elif obj_type == "sparse-matrix": - _export_mtx(file=file, obj=obj, out=out, console=console) - - elif obj_type in ("dict", "scalar"): - _export_json( - file=file, - obj=obj, - out=out, - max_elements=max_elements, - include_attrs=include_attrs, - console=console, - ) - - -def _export_npy( - file: Path, - obj: str, - out: Path, - chunk_rows: int, - console: Console, -) -> None: - """Export a dense HDF5 dataset to NumPy .npy without loading it all at once.""" - with h5py.File(file, "r") as f: - h5obj = _resolve(f, obj) + # Handle encoded groups that contain array data if isinstance(h5obj, h5py.Group): - raise ValueError("Target is a group; cannot export as .npy.") + enc = _get_encoding_type(h5obj) + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): + # Extract values from nullable array group + if "values" not in h5obj: + raise ValueError( + f"Encoded group '{obj}' is missing 'values' dataset." 
+ ) + ds = h5obj["values"] + has_mask = "mask" in h5obj + console.print(f"[dim]Exporting nullable array values from '{obj}'[/]") + else: + raise ValueError( + f"Target '{obj}' is a group with encoding '{enc}'; cannot export as .npy directly." + ) + else: + ds = h5obj + has_mask = False - ds = h5obj out.parent.mkdir(parents=True, exist_ok=True) mm = np.lib.format.open_memmap(out, mode="w+", dtype=ds.dtype, shape=ds.shape) try: @@ -307,7 +244,7 @@ def _export_npy( if ds.ndim == 1: n = int(ds.shape[0]) - step = max(1, int(chunk_rows)) + step = max(1, int(chunk_elements)) with console.status( f"[magenta]Exporting {obj} to {out}...[/]" ) as status: @@ -321,7 +258,9 @@ def _export_npy( return n0 = int(ds.shape[0]) - step0 = max(1, int(chunk_rows)) + row_elems = int(np.prod(ds.shape[1:])) if ds.ndim > 1 else 1 + # Convert element budget into a row count; fallback to 1 row if rows are larger. + step0 = max(1, int(chunk_elements) // max(1, row_elems)) with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: for start in range(0, n0, step0): end = min(start + step0, n0) @@ -334,8 +273,19 @@ def _export_npy( del mm -def _export_mtx(file: Path, obj: str, out: Path, console: Console) -> None: - """Export a CSR/CSC matrix group (AnnData encoding) to Matrix Market (.mtx).""" +def export_mtx( + file: Path, + obj: str, + out: Optional[Path], + head: Optional[int], + chunk_elements: int, + console: Console, +) -> None: + """Export a CSR/CSC matrix group (AnnData encoding) to Matrix Market (.mtx). + + If out is None or "-", writes to stdout. The head parameter limits output lines. + chunk_elements controls how many nonzero elements are processed per slice. 
+ """ with h5py.File(file, "r") as f: h5obj = _resolve(f, obj) if not isinstance(h5obj, h5py.Group): @@ -370,51 +320,86 @@ def _export_mtx(file: Path, obj: str, out: Path, console: Console) -> None: field = "real" if np.issubdtype(data.dtype, np.floating) else "integer" - out.parent.mkdir(parents=True, exist_ok=True) - + # Load sparse index pointers (1 per major axis row/col); used to slice data/indices. indptr_arr = np.asarray(indptr[...], dtype=np.int64) nnz_ptr = int(indptr_arr[-1]) if indptr_arr.size else 0 nnz_data = int(data.shape[0]) nnz_idx = int(indices.shape[0]) - nnz = min(nnz_ptr, nnz_data, nnz_idx) + nnz_limit = min(nnz_ptr, nnz_data, nnz_idx) + nnz = nnz_limit + elem_step = max(1, int(chunk_elements)) + if head is not None and head > 0: + nnz = min(nnz_limit, head) - with open(out, "w", encoding="utf-8", newline="\n") as fh: - fh.write(f"%%MatrixMarket matrix coordinate {field} general\n") - fh.write("% generated by h5ad-cli\n") - fh.write(f"{n_rows} {n_cols} {nnz}\n") + # Write to stdout when out is None or "-", otherwise open a file on disk. + if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out.parent.mkdir(parents=True, exist_ok=True) + out_fh = open(out, "w", encoding="utf-8", newline="\n") + + use_status = out_fh is not sys.stdout + status_ctx = ( + console.status(f"[magenta]Exporting {obj} to {out}...[/]") + if use_status + else nullcontext() + ) + try: + # Matrix Market header: type, generator line, then shape and nnz. 
+ out_fh.write(f"%%MatrixMarket matrix coordinate {field} general\n") + out_fh.write("% generated by h5ad-cli\n") + out_fh.write(f"{n_rows} {n_cols} {nnz}\n") major = n_rows if enc == "csr_matrix" else n_cols - with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: + max_lines = head if head is not None and head > 0 else None + written = 0 + with status_ctx as status: for major_i in range(major): - start = min(int(indptr_arr[major_i]), nnz) - end = min(int(indptr_arr[major_i + 1]), nnz) + start = min(int(indptr_arr[major_i]), nnz_limit) + end = min(int(indptr_arr[major_i + 1]), nnz_limit) if end <= start: continue - status.update( - f"[magenta]Exporting {obj}: block {major_i+1}/{major}...[/]" - ) - idx = np.asarray(indices[start:end], dtype=np.int64) - vals = np.asarray(data[start:end]) - m = min(len(idx), len(vals)) - if m == 0: - continue - idx = idx[:m] - vals = vals[:m] - for k in range(m): - if enc == "csr_matrix": - r = major_i + 1 - c = int(idx[k]) + 1 - else: - r = int(idx[k]) + 1 - c = major_i + 1 - v = vals[k] - if isinstance(v, np.generic): - v = v.item() - fh.write(f"{r} {c} {v}\n") - console.print(f"[green]Wrote[/] {out}") + if use_status and status: + status.update( + f"[magenta]Exporting {obj}: block {major_i+1}/{major}...[/]" + ) + # Slice the sparse column/row segment for this major index in element chunks. + for chunk_start in range(start, end, elem_step): + chunk_end = min(chunk_start + elem_step, end) + idx = np.asarray(indices[chunk_start:chunk_end], dtype=np.int64) + vals = np.asarray(data[chunk_start:chunk_end]) + m = min(len(idx), len(vals)) + if m == 0: + continue + idx = idx[:m] + vals = vals[:m] + for k in range(m): + if max_lines is not None and written >= max_lines: + break + if enc == "csr_matrix": + r = major_i + 1 + c = int(idx[k]) + 1 + else: + r = int(idx[k]) + 1 + c = major_i + 1 + v = vals[k] + if isinstance(v, np.generic): + v = v.item() + # Matrix Market uses 1-based indices. 
+ out_fh.write(f"{r} {c} {v}\n") + written += 1 + if max_lines is not None and written >= max_lines: + break + if max_lines is not None and written >= max_lines: + break + finally: + if out_fh is not sys.stdout: + out_fh.close() + if out_fh is not sys.stdout: + console.print(f"[green]Wrote[/] {out}") -def _export_json( +def export_json( file: Path, obj: str, out: Path, @@ -500,7 +485,7 @@ def _to_jsonable(h5obj: H5Obj, max_elements: int, include_attrs: bool) -> Any: return d -def _export_image(file: Path, obj: str, out: Path, console: Console) -> None: +def export_image(file: Path, obj: str, out: Path, console: Console) -> None: """Export an image-like dataset (H,W) or (H,W,C) to PNG/JPG/TIFF.""" try: from PIL import Image # type: ignore From 4b5d1bd158666c6e2a2fbfbff1d91f751e051b0d Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 19:24:37 +0000 Subject: [PATCH 27/62] Refactor CLI export commands to improve parameter naming and enhance functionality for exporting various data types --- src/h5ad/cli.py | 99 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 34 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 772fcee..343560c 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -6,7 +6,15 @@ from rich.console import Console import typer -from h5ad.commands import show_info, subset_h5ad +from h5ad.commands import ( + show_info, + subset_h5ad, + export_mtx, + export_npy, + export_json, + export_image, + export_table, +) app = typer.Typer( help="Streaming CLI for huge .h5ad files (info, subset, export, import)." 
@@ -136,7 +144,7 @@ def export_dataframe( 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" ), head: Optional[int] = typer.Option( - None, "--head", "-n", help="Output only the first n rows" + None, "--head", "-n", help="Output only the first n entries" ), ) -> None: """ @@ -147,7 +155,6 @@ def export_dataframe( h5ad export dataframe data.h5ad var --output var.csv --columns gene_id,mean h5ad export dataframe data.h5ad obs --head 100 """ - from h5ad.commands import export_table if entry not in ("obs", "var"): console.print( @@ -179,12 +186,17 @@ def export_array( file: Path = typer.Argument( ..., help="Path to the .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument( - ..., help="Object path to export (e.g., 'obsm/X_pca', 'varm/PCs', 'X')" + entry: str = typer.Argument( + ..., help="Entry path to export (e.g., 'obsm/X_pca', 'varm/PCs', 'X')" ), - out: Path = typer.Argument(..., help="Output .npy file path"), - chunk_rows: int = typer.Option( - 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" + output: Path = typer.Option( + ..., "--output", "-o", help="Output .npy file path", writable=True + ), + chunk_elements: int = typer.Option( + 10_000, + "--chunk", + "-r", + help="Number of elements to read per chunk", ), ) -> None: """ @@ -195,14 +207,13 @@ def export_array( h5ad export array data.h5ad X matrix.npy h5ad export array data.h5ad varm/PCs loadings.npy """ - from h5ad.commands.export import _export_npy try: - _export_npy( + export_npy( file=file, - obj=obj, - out=out, - chunk_rows=chunk_rows, + obj=entry, + out=output, + chunk_elements=chunk_elements, console=console, ) except Exception as e: @@ -215,10 +226,25 @@ def export_sparse( file: Path = typer.Argument( ..., help="Path to the .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument( - ..., help="Object path to export (e.g., 'X', 'layers/counts')" + entry: str = typer.Argument( + ..., help="Entry path to export (e.g., 'X', 
'layers/counts')" + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + writable=True, + help="Output .mtx file path (defaults to stdout)", + ), + head: Optional[int] = typer.Option( + None, "--head", "-n", help="Output only the first n rows" + ), + chunk_elements: int = typer.Option( + 10_000, + "--chunk", + "-r", + help="Number of nonzero elements to process per chunk", ), - out: Path = typer.Argument(..., help="Output .mtx file path"), ) -> None: """ Export a sparse matrix (CSR/CSC) to Matrix Market (.mtx) format. @@ -226,11 +252,18 @@ def export_sparse( Examples: h5ad export sparse data.h5ad X matrix.mtx h5ad export sparse data.h5ad layers/counts counts.mtx + h5ad export sparse data.h5ad X --head 100 """ - from h5ad.commands.export import _export_mtx try: - _export_mtx(file=file, obj=obj, out=out, console=console) + export_mtx( + file=file, + obj=entry, + out=output, + head=head, + chunk_elements=chunk_elements, + console=console, + ) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) @@ -241,8 +274,8 @@ def export_dict( file: Path = typer.Argument( ..., help="Path to the .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument( - ..., help="Object path to export (e.g., 'uns', 'uns/colors')" + entry: str = typer.Argument( + ..., help="Entry path to export (e.g., 'uns', 'uns/colors')" ), out: Path = typer.Argument(..., help="Output .json file path"), max_elements: int = typer.Option( @@ -261,12 +294,11 @@ def export_dict( h5ad export dict data.h5ad uns metadata.json h5ad export dict data.h5ad uns/colors colors.json """ - from h5ad.commands.export import _export_json try: - _export_json( + export_json( file=file, - obj=obj, + obj=entry, out=out, max_elements=max_elements, include_attrs=include_attrs, @@ -282,7 +314,7 @@ def export_image( file: Path = typer.Argument( ..., help="Path to the .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument(..., help="Object path to 
export (2D or 3D array)"), + entry: str = typer.Argument(..., help="Entry path to export (2D or 3D array)"), out: Path = typer.Argument(..., help="Output image file (.png, .jpg, .tiff)"), ) -> None: """ @@ -293,10 +325,9 @@ def export_image( Examples: h5ad export image data.h5ad uns/spatial/image tissue.png """ - from h5ad.commands.export import _export_image try: - _export_image(file=file, obj=obj, out=out, console=console) + export_image(file=file, obj=entry, out=out, console=console) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) @@ -323,8 +354,8 @@ def import_dataframe( file: Path = typer.Argument( ..., help="Path to the source .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument( - ..., help="Object path to create/replace ('obs' or 'var')" + entry: str = typer.Argument( + ..., help="Entry path to create/replace ('obs' or 'var')" ), input_file: Path = typer.Argument( ..., help="Input CSV file", exists=True, readable=True @@ -357,9 +388,9 @@ def import_dataframe( """ from h5ad.commands.import_data import _import_csv - if obj not in ("obs", "var"): + if entry not in ("obs", "var"): console.print( - f"[bold red]Error:[/] Object must be 'obs' or 'var', not '{obj}'.", + f"[bold red]Error:[/] Entry must be 'obs' or 'var', not '{entry}'.", ) raise typer.Exit(code=1) @@ -372,7 +403,7 @@ def import_dataframe( try: target = _get_target_file(file, output, inplace) - _import_csv(target, obj, input_file, index_column, console) + _import_csv(target, entry, input_file, index_column, console) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) @@ -383,8 +414,8 @@ def import_array( file: Path = typer.Argument( ..., help="Path to the source .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument( - ..., help="Object path to create/replace (e.g., 'X', 'obsm/X_pca')" + entry: str = typer.Argument( + ..., help="Entry path to create/replace (e.g., 'X', 'obsm/X_pca')" ), 
input_file: Path = typer.Argument( ..., help="Input .npy file", exists=True, readable=True @@ -422,7 +453,7 @@ def import_array( try: target = _get_target_file(file, output, inplace) - _import_npy(target, obj, input_file, console) + _import_npy(target, entry, input_file, console) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) From 4d4cfde583c1e37c0867a2256daf0aa632cbceff Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 19:24:47 +0000 Subject: [PATCH 28/62] Add tests for exporting legacy v0.1.0 dataframe and improve output validation in import tests --- tests/test_export.py | 26 ++++++++++++++++++++++++++ tests/test_import.py | 20 ++++++++++++++------ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/tests/test_export.py b/tests/test_export.py index 730ce95..323167e 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -120,6 +120,32 @@ def test_export_dataframe_obs(self, sample_h5ad_file, temp_dir): text = out.read_text(encoding="utf-8") assert "obs_names" in text + def test_export_legacy_v010_dataframe(self, sample_legacy_v010_h5ad, temp_dir): + """Test exporting a legacy v0.1.0 dataframe with categorical columns.""" + out = temp_dir / "obs_legacy.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_legacy_v010_h5ad), + "obs", + "--output", + str(out), + ], + ) + assert result.exit_code == 0 + assert out.exists() + text = out.read_text(encoding="utf-8") + # Should contain index and columns + assert "obs_names" in text + assert "cell_type" in text + # Should NOT contain __categories (reserved subgroup) + assert "__categories" not in text + # Should contain decoded categorical values, not codes + assert "TypeA" in text + assert "TypeB" in text + class TestExportValidation: def test_wrong_type_for_dataframe(self, sample_h5ad_file, temp_dir): diff --git a/tests/test_import.py b/tests/test_import.py index 736d4e2..f49af84 100644 --- a/tests/test_import.py +++ 
b/tests/test_import.py @@ -1,6 +1,7 @@ """Tests for the import command.""" import json +import re from pathlib import Path import h5py @@ -13,6 +14,11 @@ runner = CliRunner() +def strip_ansi(text: str) -> str: + """Strip ANSI escape codes from text.""" + return re.sub(r"\x1b\[[0-9;]*m", "", text) + + class TestImportDataframe: def test_import_dataframe_obs_inplace(self, sample_h5ad_file, temp_dir): """Test importing CSV into obs with --inplace.""" @@ -40,8 +46,9 @@ def test_import_dataframe_obs_inplace(self, sample_h5ad_file, temp_dir): ], ) assert result.exit_code == 0 - assert "5 rows" in result.output - assert "2 columns" in result.output + output = strip_ansi(result.output) + assert "5 rows" in output + assert "2 columns" in output with h5py.File(sample_h5ad_file, "r") as f: assert "obs" in f @@ -115,7 +122,7 @@ def test_import_dataframe_var(self, sample_h5ad_file, temp_dir): ], ) assert result.exit_code == 0 - assert "4 rows" in result.output + assert "4 rows" in strip_ansi(result.output) def test_import_dataframe_dimension_mismatch(self, sample_h5ad_file, temp_dir): """Test that dimension mismatch is rejected.""" @@ -218,7 +225,7 @@ def test_import_array_obsm(self, sample_h5ad_file, temp_dir): ], ) assert result.exit_code == 0 - assert "5×10" in result.output + assert "5×10" in strip_ansi(result.output) with h5py.File(sample_h5ad_file, "r") as f: assert "obsm/X_pca" in f @@ -355,8 +362,9 @@ def test_import_sparse_X(self, sample_h5ad_file, temp_dir): ], ) assert result.exit_code == 0 - assert "5×4" in result.output - assert "5 non-zero" in result.output + output = strip_ansi(result.output) + assert "5×4" in output + assert "5 non-zero" in output with h5py.File(sample_h5ad_file, "r") as f: assert "X" in f From f380cb611d298568acc347fce5cc197792053fad Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 13:12:35 +0000 Subject: [PATCH 29/62] Refactor error handling in subset and read functions to use more specific exceptions --- src/h5ad/commands/subset.py 
| 4 ++-- src/h5ad/info.py | 6 +++--- src/h5ad/read.py | 12 ++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/h5ad/commands/subset.py b/src/h5ad/commands/subset.py index 2e01d9d..ff20d6b 100644 --- a/src/h5ad/commands/subset.py +++ b/src/h5ad/commands/subset.py @@ -457,7 +457,7 @@ def subset_h5ad( ) if obs_names_ds is None: console.print("[bold red]Error:[/] Could not find obs names") - raise RuntimeError("Could not find obs names") + raise KeyError("Could not find obs names") obs_idx, missing_obs = indices_from_name_set(obs_names_ds, obs_keep) if missing_obs: @@ -476,7 +476,7 @@ def subset_h5ad( ) if var_names_ds is None: console.print("[bold red]Error:[/] Could not find var names") - raise RuntimeError("Could not find var names") + raise KeyError("Could not find var names") var_idx, missing_var = indices_from_name_set(var_names_ds, var_keep) if missing_var: diff --git a/src/h5ad/info.py b/src/h5ad/info.py index 7abb3e9..6144b07 100644 --- a/src/h5ad/info.py +++ b/src/h5ad/info.py @@ -210,7 +210,7 @@ def axis_len(file: h5py.File, axis: str) -> int: ValueError: If axis is not 'obs' or 'var' KeyError: If axis or index dataset not found in file TypeError: If axis is not a group or index is not a dataset - RuntimeError: If axis length cannot be determined + ValueError: If axis length cannot be determined """ # Check if the specified axis exists in the file if axis not in file: @@ -245,7 +245,7 @@ def axis_len(file: h5py.File, axis: str) -> int: raise TypeError(f"Index '{index_name}' in '{axis}' is not a dataset.") if dataset.shape: return int(dataset.shape[0]) - raise RuntimeError( + raise ValueError( f"Cannot determine length of '{axis}': index dataset has no shape." 
) @@ -265,7 +265,7 @@ def get_axis_group(file: h5py.File, axis: str) -> Tuple[h5py.Group, int, str]: ValueError: If axis is not 'obs' or 'var' KeyError: If axis or index dataset not found in file TypeError: If axis is not a group or index is not a dataset - RuntimeError: If axis length cannot be determined + ValueError: If axis length cannot be determined """ if axis not in ("obs", "var"): raise ValueError("axis must be 'obs' or 'var'.") diff --git a/src/h5ad/read.py b/src/h5ad/read.py index 36f2e58..78fec0e 100644 --- a/src/h5ad/read.py +++ b/src/h5ad/read.py @@ -73,11 +73,11 @@ def read_categorical_column( if col_name in cats_grp: cats = cats_grp[col_name][...] else: - raise RuntimeError( + raise KeyError( f"Cannot find categories for legacy column {col.name}" ) else: - raise RuntimeError( + raise KeyError( f"Cannot find categories for legacy column {col.name}" ) cats = decode_str_array(cats) @@ -88,7 +88,7 @@ def read_categorical_column( codes = np.asarray(codes, dtype=np.int64) return [cats[c] if 0 <= c < len(cats) else "" for c in codes] - raise RuntimeError(f"Unsupported categorical column type: {type(col)}") + raise TypeError(f"Unsupported categorical column type: {type(col)}") def col_chunk_as_strings( @@ -116,7 +116,7 @@ def col_chunk_as_strings( List[str]: Column values as strings for the specified slice """ if col_name not in group: - raise RuntimeError(f"Column {col_name!r} not found in group {group.name}") + raise KeyError(f"Column {col_name!r} not found in group {group.name}") col = group[col_name] @@ -150,10 +150,10 @@ def col_chunk_as_strings( # Apply mask: masked values become empty string return ["" if m else str(v) for v, m in zip(values, mask)] - raise RuntimeError( + raise ValueError( f"Unsupported group encoding {enc!r} for column {col_name!r}" ) - raise RuntimeError( + raise TypeError( f"Unsupported column type for {col_name!r} in group {group.name}" ) From 95b6553155587465db4437eadc543c664a84d022 Mon Sep 17 00:00:00 2001 From: Aljes Date: 
Mon, 26 Jan 2026 13:48:01 +0000 Subject: [PATCH 30/62] Refactor CLI options for chunk processing: update parameter names and default values for clarity --- src/h5ad/cli.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 343560c..0c6190d 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -98,7 +98,7 @@ def subset( readable=True, ), chunk_rows: int = typer.Option( - 1024, "--chunk-rows", "-r", help="Row chunk size for dense matrices" + 1024, "--chunk", "-C", help="Row chunk size for dense matrices" ), ) -> None: """Subset an h5ad by obs and/or var names.""" @@ -141,7 +141,7 @@ def export_dataframe( help="Comma separated column names to include", ), chunk_rows: int = typer.Option( - 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" + 10_000, "--chunk", "-C", help="Number of rows to read per chunk" ), head: Optional[int] = typer.Option( None, "--head", "-n", help="Output only the first n entries" @@ -193,9 +193,9 @@ def export_array( ..., "--output", "-o", help="Output .npy file path", writable=True ), chunk_elements: int = typer.Option( - 10_000, + 100_000, "--chunk", - "-r", + "-C", help="Number of elements to read per chunk", ), ) -> None: @@ -237,13 +237,13 @@ def export_sparse( help="Output .mtx file path (defaults to stdout)", ), head: Optional[int] = typer.Option( - None, "--head", "-n", help="Output only the first n rows" + None, "--head", "-n", help="Output only the first n entries of mtx file" ), chunk_elements: int = typer.Option( - 10_000, + 1_000, "--chunk", - "-r", - help="Number of nonzero elements to process per chunk", + "-C", + help="Number of rows/columns (depends on compression format) to process per chunk", ), ) -> None: """ @@ -262,6 +262,7 @@ def export_sparse( out=output, head=head, chunk_elements=chunk_elements, + memory_mb=memory_mb, console=console, ) except Exception as e: From 1695f25e37cf2e239479697b65db9c93d2528283 Mon Sep 17 00:00:00 2001 
From: Aljes Date: Mon, 26 Jan 2026 13:55:11 +0000 Subject: [PATCH 31/62] Refactor export_mtx function: update chunk_elements description, improve error handling, and enhance output formatting --- src/h5ad/commands/export.py | 89 ++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 35 deletions(-) diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index dbd4277..4fc94b2 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -284,7 +284,7 @@ def export_mtx( """Export a CSR/CSC matrix group (AnnData encoding) to Matrix Market (.mtx). If out is None or "-", writes to stdout. The head parameter limits output lines. - chunk_elements controls how many nonzero elements are processed per slice. + chunk_elements controls how many rows/columns are processed per slice. """ with h5py.File(file, "r") as f: h5obj = _resolve(f, obj) @@ -307,13 +307,13 @@ def export_mtx( or not isinstance(indices, h5py.Dataset) or not isinstance(indptr, h5py.Dataset) ): - raise RuntimeError( + raise ValueError( "Sparse matrix group must contain datasets: data, indices, indptr" ) shape = h5obj.attrs.get("shape", None) if shape is None: - raise RuntimeError( + raise ValueError( "Sparse matrix group is missing required 'shape' attribute." ) n_rows, n_cols = (int(shape[0]), int(shape[1])) @@ -325,11 +325,19 @@ def export_mtx( nnz_ptr = int(indptr_arr[-1]) if indptr_arr.size else 0 nnz_data = int(data.shape[0]) nnz_idx = int(indices.shape[0]) - nnz_limit = min(nnz_ptr, nnz_data, nnz_idx) - nnz = nnz_limit - elem_step = max(1, int(chunk_elements)) + + # Check consistency of sparse data + if not (nnz_ptr == nnz_data == nnz_idx): + raise ValueError( + f"Sparse matrix data inconsistency: indptr implies {nnz_ptr} nonzeros, " + f"but data has {nnz_data} and indices has {nnz_idx}." 
+ ) + + # Determine number of nonzero entries to write + nnz = nnz_data + major_step = max(1, int(chunk_elements)) if head is not None and head > 0: - nnz = min(nnz_limit, head) + nnz = min(nnz_data, head) # Write to stdout when out is None or "-", otherwise open a file on disk. if out is None or str(out) == "-": @@ -348,46 +356,57 @@ def export_mtx( # Matrix Market header: type, generator line, then shape and nnz. out_fh.write(f"%%MatrixMarket matrix coordinate {field} general\n") out_fh.write("% generated by h5ad-cli\n") + if head is not None and head > 0: + out_fh.write( + f"% output limited to first {nnz}/{nnz_data} nonzero entries\n" + ) out_fh.write(f"{n_rows} {n_cols} {nnz}\n") + # Iterate over major axis (rows for CSR, cols for CSC) major = n_rows if enc == "csr_matrix" else n_cols max_lines = head if head is not None and head > 0 else None written = 0 with status_ctx as status: - for major_i in range(major): - start = min(int(indptr_arr[major_i]), nnz_limit) - end = min(int(indptr_arr[major_i + 1]), nnz_limit) - if end <= start: - continue + for major_start in range(0, major, major_step): + major_end = min(major_start + major_step, major) if use_status and status: status.update( - f"[magenta]Exporting {obj}: block {major_i+1}/{major}...[/]" + f"[magenta]Exporting {obj}: {major_start+1}-{major_end} of {major}...[/]" ) - # Slice the sparse column/row segment for this major index in element chunks. 
- for chunk_start in range(start, end, elem_step): - chunk_end = min(chunk_start + elem_step, end) - idx = np.asarray(indices[chunk_start:chunk_end], dtype=np.int64) - vals = np.asarray(data[chunk_start:chunk_end]) + for major_i in range(major_start, major_end): + start = min(int(indptr_arr[major_i]), nnz_data) + end = min(int(indptr_arr[major_i + 1]), nnz_data) + if end <= start: + continue + idx = np.asarray(indices[start:end], dtype=np.int64) + vals = np.asarray(data[start:end]) m = min(len(idx), len(vals)) if m == 0: - continue + raise ValueError("Sparse matrix chunk has zero length.") + if max_lines is not None: + remaining = max_lines - written + if remaining <= 0: + break + if m > remaining: + m = remaining idx = idx[:m] vals = vals[:m] - for k in range(m): - if max_lines is not None and written >= max_lines: - break - if enc == "csr_matrix": - r = major_i + 1 - c = int(idx[k]) + 1 - else: - r = int(idx[k]) + 1 - c = major_i + 1 - v = vals[k] - if isinstance(v, np.generic): - v = v.item() - # Matrix Market uses 1-based indices. - out_fh.write(f"{r} {c} {v}\n") - written += 1 + idx_list = idx.tolist() + vals_list = vals.tolist() + if enc == "csr_matrix": + r = major_i + 1 + lines = [ + f"{r} {c + 1} {v}\n" + for c, v in zip(idx_list, vals_list) + ] + else: + c = major_i + 1 + lines = [ + f"{r + 1} {c} {v}\n" + for r, v in zip(idx_list, vals_list) + ] + out_fh.write("".join(lines)) + written += m if max_lines is not None and written >= max_lines: break if max_lines is not None and written >= max_lines: @@ -490,7 +509,7 @@ def export_image(file: Path, obj: str, out: Path, console: Console) -> None: try: from PIL import Image # type: ignore except Exception as e: # pragma: no cover - raise RuntimeError( + raise ImportError( "Pillow is required for image export. 
Install with: pip install h5ad[images]" ) from e From 559b40d5c148d7e9363d62fa0a0edd8de01957a9 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 14:59:32 +0000 Subject: [PATCH 32/62] Enhance export functions: add detailed docstrings for export_npy and export_mtx, including argument descriptions and error handling --- src/h5ad/commands/export.py | 146 +++++++++++++++++++++++++----------- 1 file changed, 102 insertions(+), 44 deletions(-) diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index 4fc94b2..2d6a7b3 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -210,6 +210,16 @@ def export_npy( - v0.2.0 (modern): Datasets with encoding-type="array" - v0.1.0 (legacy): Plain datasets without encoding attributes - Encoded groups: nullable-integer, nullable-boolean, string-array (extracts values) + + Args: + file: Path to the .h5ad file + obj: HDF5 path to the dataset or encoded group + out: Output .npy file path + chunk_elements: Number of elements to read per chunk + console: Rich console for status output + + Raises: + ValueError: If the target object is not exportable as .npy """ with h5py.File(file, "r") as f: h5obj = _resolve(f, obj) @@ -279,12 +289,26 @@ def export_mtx( out: Optional[Path], head: Optional[int], chunk_elements: int, + in_memory: bool, console: Console, ) -> None: """Export a CSR/CSC matrix group (AnnData encoding) to Matrix Market (.mtx). If out is None or "-", writes to stdout. The head parameter limits output lines. - chunk_elements controls how many rows/columns are processed per slice. + chunk_elements controls how many rows/columns are processed per slice when + streaming. Use in_memory for small matrices to load everything at once. 
+ + Args: + file: Path to the .h5ad file + obj: HDF5 path to the matrix group + out: Output .mtx file path (or None for stdout) + head: Output only the first n nonzero entries + chunk_elements: Number of rows/columns to process per chunk + in_memory: Load the entire sparse matrix into memory before exporting + console: Rich console for status output + + Raises: + ValueError: If the target object is not a valid CSR/CSC matrix group. """ with h5py.File(file, "r") as f: h5obj = _resolve(f, obj) @@ -362,55 +386,89 @@ def export_mtx( ) out_fh.write(f"{n_rows} {n_cols} {nnz}\n") - # Iterate over major axis (rows for CSR, cols for CSC) - major = n_rows if enc == "csr_matrix" else n_cols - max_lines = head if head is not None and head > 0 else None - written = 0 - with status_ctx as status: - for major_start in range(0, major, major_step): - major_end = min(major_start + major_step, major) + if in_memory: + with status_ctx as status: if use_status and status: status.update( - f"[magenta]Exporting {obj}: {major_start+1}-{major_end} of {major}...[/]" + f"[magenta]Loading entire matrix {obj} into memory...[/]" + ) + data_arr = np.asarray(data[...]) + indices_arr = np.asarray(indices[...], dtype=np.int64) + counts = np.diff(indptr_arr) + if int(counts.sum()) != nnz_data: + raise ValueError( + "Sparse matrix indptr does not match data/indices length." 
) - for major_i in range(major_start, major_end): - start = min(int(indptr_arr[major_i]), nnz_data) - end = min(int(indptr_arr[major_i + 1]), nnz_data) - if end <= start: - continue - idx = np.asarray(indices[start:end], dtype=np.int64) - vals = np.asarray(data[start:end]) - m = min(len(idx), len(vals)) - if m == 0: - raise ValueError("Sparse matrix chunk has zero length.") - if max_lines is not None: - remaining = max_lines - written - if remaining <= 0: + + if enc == "csr_matrix": + major_idx = np.repeat(np.arange(n_rows, dtype=np.int64), counts) + row_idx = major_idx + col_idx = indices_arr + else: + major_idx = np.repeat(np.arange(n_cols, dtype=np.int64), counts) + row_idx = indices_arr + col_idx = major_idx + + if head is not None and head > 0: + row_idx = row_idx[:nnz] + col_idx = col_idx[:nnz] + data_arr = data_arr[:nnz] + + data_fmt = "%.18g" if field == "real" else "%d" + coords = np.column_stack((row_idx + 1, col_idx + 1, data_arr)) + if use_status and status: + status.update(f"[magenta]Saving {nnz} entries to {out}...[/]") + np.savetxt(out_fh, coords, fmt=["%d", "%d", data_fmt], newline="\n") + else: + # Iterate over major axis (rows for CSR, cols for CSC) + major = n_rows if enc == "csr_matrix" else n_cols + max_lines = head if head is not None and head > 0 else None + written = 0 + with status_ctx as status: + for major_start in range(0, major, major_step): + major_end = min(major_start + major_step, major) + if use_status and status: + status.update( + f"[magenta]Exporting {obj}: {major_start+1}-{major_end} of {major}...[/]" + ) + for major_i in range(major_start, major_end): + start = min(int(indptr_arr[major_i]), nnz_data) + end = min(int(indptr_arr[major_i + 1]), nnz_data) + if end <= start: + continue + idx = np.asarray(indices[start:end], dtype=np.int64) + vals = np.asarray(data[start:end]) + m = min(len(idx), len(vals)) + if m == 0: + raise ValueError("Sparse matrix chunk has zero length.") + if max_lines is not None: + remaining = max_lines - 
written + if remaining <= 0: + break + if m > remaining: + m = remaining + idx = idx[:m] + vals = vals[:m] + idx_list = idx.tolist() + vals_list = vals.tolist() + if enc == "csr_matrix": + r = major_i + 1 + lines = [ + f"{r} {c + 1} {v}\n" + for c, v in zip(idx_list, vals_list) + ] + else: + c = major_i + 1 + lines = [ + f"{r + 1} {c} {v}\n" + for r, v in zip(idx_list, vals_list) + ] + out_fh.write("".join(lines)) + written += m + if max_lines is not None and written >= max_lines: break - if m > remaining: - m = remaining - idx = idx[:m] - vals = vals[:m] - idx_list = idx.tolist() - vals_list = vals.tolist() - if enc == "csr_matrix": - r = major_i + 1 - lines = [ - f"{r} {c + 1} {v}\n" - for c, v in zip(idx_list, vals_list) - ] - else: - c = major_i + 1 - lines = [ - f"{r + 1} {c} {v}\n" - for r, v in zip(idx_list, vals_list) - ] - out_fh.write("".join(lines)) - written += m if max_lines is not None and written >= max_lines: break - if max_lines is not None and written >= max_lines: - break finally: if out_fh is not sys.stdout: out_fh.close() From 36e5e8b6be4b6461ecb3fbac10bdb63cb084cf26 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 14:59:38 +0000 Subject: [PATCH 33/62] Add in-memory option to export_sparse command for improved performance --- src/h5ad/cli.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 0c6190d..f53b3af 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -245,6 +245,12 @@ def export_sparse( "-C", help="Number of rows/columns (depends on compression format) to process per chunk", ), + in_memory: bool = typer.Option( + False, + "--in-memory", + "-m", + help="Load the entire sparse matrix into memory before exporting (may be faster for small matrices)", + ), ) -> None: """ Export a sparse matrix (CSR/CSC) to Matrix Market (.mtx) format. 
@@ -262,7 +268,7 @@ def export_sparse( out=output, head=head, chunk_elements=chunk_elements, - memory_mb=memory_mb, + in_memory=in_memory, console=console, ) except Exception as e: From 878421d69430d8315b78b31d20670ded96ccb3e9 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 15:11:45 +0000 Subject: [PATCH 34/62] Refactor export_dict and export_json functions: update output parameter handling and improve file writing logic --- src/h5ad/cli.py | 8 +++++--- src/h5ad/commands/export.py | 18 ++++++++++++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index f53b3af..5a462ac 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -284,9 +284,11 @@ def export_dict( entry: str = typer.Argument( ..., help="Entry path to export (e.g., 'uns', 'uns/colors')" ), - out: Path = typer.Argument(..., help="Output .json file path"), + output: Optional[Path] = typer.Option( + None, "--output", "-o", help="Output .json file path" + ), max_elements: int = typer.Option( - 1_000_000, + 100_000, "--max-elements", help="Maximum array elements for JSON export", ), @@ -306,7 +308,7 @@ def export_dict( export_json( file=file, obj=entry, - out=out, + out=output, max_elements=max_elements, include_attrs=include_attrs, console=console, diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index 2d6a7b3..ae6f73f 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -494,10 +494,20 @@ def export_json( payload = _to_jsonable( h5obj, max_elements=max_elements, include_attrs=include_attrs ) - out.parent.mkdir(parents=True, exist_ok=True) - with open(out, "w", encoding="utf-8") as fh: - json.dump(payload, fh, indent=2, ensure_ascii=False, sort_keys=True) - console.print(f"[green]Wrote[/] {out}") + # Write to stdout when out is None or "-", otherwise open a file on disk. 
+ if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out.parent.mkdir(parents=True, exist_ok=True) + out_fh = open(out, "w", encoding="utf-8") + try: + json.dump(payload, out_fh, indent=2, ensure_ascii=False, sort_keys=True) + out_fh.write("\n") + finally: + if out_fh is not sys.stdout: + out_fh.close() + if out_fh is not sys.stdout: + console.print(f"[green]Wrote[/] {out}") def _attrs_to_jsonable( From 47fc2833ed0f5ed7462f1dda67baf70758d2bebc Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 15:22:29 +0000 Subject: [PATCH 35/62] Refactor export_image function: update output parameter to use Option, enhance docstring with argument descriptions and error handling --- src/h5ad/cli.py | 9 ++++++--- src/h5ad/commands/export.py | 23 +++++++++++++++-------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 5a462ac..d947270 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -12,10 +12,11 @@ export_mtx, export_npy, export_json, - export_image, export_table, ) +from h5ad.commands import export_image as export_image_cmd + app = typer.Typer( help="Streaming CLI for huge .h5ad files (info, subset, export, import)." ) @@ -324,7 +325,9 @@ def export_image( ..., help="Path to the .h5ad file", exists=True, readable=True ), entry: str = typer.Argument(..., help="Entry path to export (2D or 3D array)"), - out: Path = typer.Argument(..., help="Output image file (.png, .jpg, .tiff)"), + output: Optional[Path] = typer.Option( + None, "--output", "-o", help="Output image file (.png, .jpg, .tiff)" + ), ) -> None: """ Export an image-like array to PNG/JPG/TIFF format. 
@@ -336,7 +339,7 @@ def export_image( """ try: - export_image(file=file, obj=entry, out=out, console=console) + export_image_cmd(file=file, obj=entry, out=output, console=console) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index ae6f73f..06c7f0f 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -10,6 +10,7 @@ import h5py import numpy as np from rich.console import Console +from PIL import Image from h5ad.read import col_chunk_as_strings, decode_str_array from h5ad.info import get_axis_group, get_entry_type @@ -573,20 +574,24 @@ def _to_jsonable(h5obj: H5Obj, max_elements: int, include_attrs: bool) -> Any: def export_image(file: Path, obj: str, out: Path, console: Console) -> None: - """Export an image-like dataset (H,W) or (H,W,C) to PNG/JPG/TIFF.""" - try: - from PIL import Image # type: ignore - except Exception as e: # pragma: no cover - raise ImportError( - "Pillow is required for image export. Install with: pip install h5ad[images]" - ) from e - + """ + Export an image-like dataset (H,W) or (H,W,C) to PNG/JPG/TIFF. + Args: + file: Path to the .h5ad file + obj: HDF5 path to the dataset + out: Output image file path + console: Rich console for status output + Raises: + ValueError: If the target object is not a valid image array. 
+ """ + # Load dataset with h5py.File(file, "r") as f: h5obj = _resolve(f, obj) if not isinstance(h5obj, h5py.Dataset): raise ValueError("Image export requires a dataset.") arr = np.asarray(h5obj[...]) + # Validate shape if arr.ndim not in (2, 3): raise ValueError(f"Expected 2D or 3D image array; got shape {arr.shape}.") if arr.ndim == 3 and arr.shape[2] not in (1, 3, 4): @@ -609,9 +614,11 @@ def export_image(file: Path, obj: str, out: Path, console: Console) -> None: else: raise ValueError(f"Unsupported image dtype: {arr.dtype}") + # If single-channel 3D, convert to 2D if arr.ndim == 3 and arr.shape[2] == 1: arr = arr[:, :, 0] + # Save image img = Image.fromarray(arr) out.parent.mkdir(parents=True, exist_ok=True) img.save(out) From 595d81bf1f5f56ead652f3cf599294db7de68175 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:31:44 +0000 Subject: [PATCH 36/62] HUGE REFACTOR: Add format-specific import/export helpers for various data types --- src/h5ad/formats/__init__.py | 1 + src/h5ad/formats/array.py | 99 +++++++++++++ src/h5ad/formats/common.py | 67 +++++++++ src/h5ad/formats/dataframe.py | 169 ++++++++++++++++++++++ src/h5ad/formats/image.py | 47 ++++++ src/h5ad/formats/json_data.py | 155 ++++++++++++++++++++ src/h5ad/formats/sparse.py | 262 ++++++++++++++++++++++++++++++++++ src/h5ad/formats/validate.py | 97 +++++++++++++ 8 files changed, 897 insertions(+) create mode 100644 src/h5ad/formats/__init__.py create mode 100644 src/h5ad/formats/array.py create mode 100644 src/h5ad/formats/common.py create mode 100644 src/h5ad/formats/dataframe.py create mode 100644 src/h5ad/formats/image.py create mode 100644 src/h5ad/formats/json_data.py create mode 100644 src/h5ad/formats/sparse.py create mode 100644 src/h5ad/formats/validate.py diff --git a/src/h5ad/formats/__init__.py b/src/h5ad/formats/__init__.py new file mode 100644 index 0000000..18b9721 --- /dev/null +++ b/src/h5ad/formats/__init__.py @@ -0,0 +1 @@ +"""Format-specific import/export helpers.""" diff 
--git a/src/h5ad/formats/array.py b/src/h5ad/formats/array.py new file mode 100644 index 0000000..1dd21ac --- /dev/null +++ b/src/h5ad/formats/array.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import numpy as np +from rich.console import Console + +from h5ad.formats.common import _get_encoding_type, _resolve +from h5ad.formats.validate import validate_dimensions +from h5ad.storage import create_dataset, is_dataset, is_group +from h5ad.util.path import norm_path + + +def export_npy( + root: Any, + obj: str, + out: Path, + chunk_elements: int, + console: Console, +) -> None: + h5obj = _resolve(root, obj) + + if is_group(h5obj): + enc = _get_encoding_type(h5obj) + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): + if "values" not in h5obj: + raise ValueError(f"Encoded group '{obj}' is missing 'values' dataset.") + ds = h5obj["values"] + console.print(f"[dim]Exporting nullable array values from '{obj}'[/]") + else: + raise ValueError( + f"Target '{obj}' is a group with encoding '{enc}'; cannot export as .npy directly." + ) + elif is_dataset(h5obj): + ds = h5obj + else: + raise ValueError("Target is not an array-like object.") + + out.parent.mkdir(parents=True, exist_ok=True) + mm = np.lib.format.open_memmap(out, mode="w+", dtype=ds.dtype, shape=ds.shape) + try: + if ds.shape == (): + mm[...] 
= ds[()] + console.print(f"[green]Wrote[/] {out}") + return + + if ds.ndim == 1: + n = int(ds.shape[0]) + step = max(1, int(chunk_elements)) + with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: + for start in range(0, n, step): + end = min(start + step, n) + status.update( + f"[magenta]Exporting {obj}: {start}-{end} of {n}...[/]" + ) + mm[start:end] = ds[start:end] + console.print(f"[green]Wrote[/] {out}") + return + + n0 = int(ds.shape[0]) + row_elems = int(np.prod(ds.shape[1:])) if ds.ndim > 1 else 1 + step0 = max(1, int(chunk_elements) // max(1, row_elems)) + with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: + for start in range(0, n0, step0): + end = min(start + step0, n0) + status.update( + f"[magenta]Exporting {obj}: {start}-{end} of {n0}...[/]" + ) + mm[start:end, ...] = ds[start:end, ...] + console.print(f"[green]Wrote[/] {out}") + finally: + del mm + + +def import_npy( + root: Any, + obj: str, + input_file: Path, + console: Console, +) -> None: + obj = norm_path(obj) + arr = np.load(input_file) + + validate_dimensions(root, obj, arr.shape, console) + + parts = obj.split("/") + parent = root + for part in parts[:-1]: + parent = parent[part] if part in parent else parent.create_group(part) + name = parts[-1] + + if name in parent: + del parent[name] + + create_dataset(parent, name, data=arr) + + shape_str = "×".join(str(d) for d in arr.shape) + console.print(f"[green]Imported[/] {shape_str} array into '{obj}'") diff --git a/src/h5ad/formats/common.py b/src/h5ad/formats/common.py new file mode 100644 index 0000000..6282eb5 --- /dev/null +++ b/src/h5ad/formats/common.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from typing import Any, Dict + +import numpy as np + +from h5ad.storage import is_dataset, is_group +from h5ad.util.path import norm_path + + +TYPE_EXTENSIONS = { + "dataframe": {".csv"}, + "sparse-matrix": {".mtx"}, + "dense-matrix": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}, + 
"array": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}, + "dict": {".json"}, + "scalar": {".json"}, + "categorical": {".csv"}, + "awkward-array": {".json"}, +} + +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tif", ".tiff"} + +EXPORTABLE_TYPES = set(TYPE_EXTENSIONS.keys()) + + +def _get_encoding_type(group: Any) -> str: + enc = group.attrs.get("encoding-type", "") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + return str(enc) + + +def _resolve(root: Any, obj: str) -> Any: + obj = norm_path(obj) + if obj not in root: + raise KeyError(f"'{obj}' not found in the file.") + return root[obj] + + +def _check_json_exportable(h5obj: Any, max_elements: int, path: str = "") -> None: + if is_dataset(h5obj): + if h5obj.shape == (): + return + n = int(np.prod(h5obj.shape)) if h5obj.shape else 0 + if n > max_elements: + obj_name = getattr(h5obj, "name", "") + raise ValueError( + f"Cannot export to JSON: '{path or obj_name}' has {n} elements " + f"(max {max_elements}). Use --max-elements to increase limit." + ) + return + + if is_group(h5obj): + enc = _get_encoding_type(h5obj) + if enc in ("csr_matrix", "csc_matrix"): + obj_name = getattr(h5obj, "name", "") + raise ValueError( + f"Cannot export to JSON: '{path or obj_name}' is a sparse matrix. " + "Export it as .mtx instead." 
+ ) + + for key in h5obj.keys(): + child = h5obj[key] + child_path = f"{path}/{key}" if path else key + if is_group(child) or is_dataset(child): + _check_json_exportable(child, max_elements=max_elements, path=child_path) diff --git a/src/h5ad/formats/dataframe.py b/src/h5ad/formats/dataframe.py new file mode 100644 index 0000000..f767c4c --- /dev/null +++ b/src/h5ad/formats/dataframe.py @@ -0,0 +1,169 @@ +from __future__ import annotations + +import csv +import sys +from contextlib import nullcontext +from pathlib import Path +from typing import Any, List, Optional, Tuple + +import numpy as np +from rich.console import Console + +from h5ad.core.info import get_axis_group +from h5ad.core.read import col_chunk_as_strings +from h5ad.formats.validate import validate_dimensions +from h5ad.storage import create_dataset, is_zarr_group + + +def export_dataframe( + root: Any, + axis: str, + columns: Optional[List[str]], + out: Optional[Path], + chunk_rows: int, + head: Optional[int], + console: Console, +) -> None: + group, n_rows, index_name = get_axis_group(root, axis) + + reserved_keys = {"_index", "__categories"} + + if columns: + col_names = list(columns) + else: + col_names = [ + k for k in group.keys() if k not in reserved_keys and k != index_name + ] + if index_name and index_name not in col_names: + col_names.insert(0, index_name) + + if isinstance(index_name, bytes): + index_name = index_name.decode("utf-8") + + if index_name not in col_names: + col_names.insert(0, index_name) + else: + col_names = [index_name] + [c for c in col_names if c != index_name] + + if head is not None and head > 0: + n_rows = min(n_rows, head) + + if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out_fh = open(out, "w", newline="", encoding="utf-8") + writer = csv.writer(out_fh) + + try: + writer.writerow(col_names) + cat_cache = {} + + use_status = out_fh is not sys.stdout + status_ctx = ( + console.status(f"[magenta]Exporting {axis} table to {out}...[/]") + if 
use_status + else nullcontext() + ) + + with status_ctx as status: + for start in range(0, n_rows, chunk_rows): + end = min(start + chunk_rows, n_rows) + if use_status and status: + status.update( + f"[magenta]Exporting rows {start}-{end} of {n_rows}...[/]" + ) + cols_data: List[List[str]] = [] + for col in col_names: + cols_data.append( + col_chunk_as_strings(group, col, start, end, cat_cache) + ) + for row_idx in range(end - start): + row = [ + cols_data[col_idx][row_idx] + for col_idx in range(len(col_names)) + ] + writer.writerow(row) + finally: + if out_fh is not sys.stdout: + out_fh.close() + + +def _read_csv( + input_file: Path, + index_column: Optional[str], +) -> Tuple[List[dict], List[str], List[str], str]: + with open(input_file, "r", encoding="utf-8", newline="") as f: + reader = csv.DictReader(f) + if reader.fieldnames is None: + raise ValueError("CSV file has no header.") + fieldnames = list(reader.fieldnames) + + if index_column: + if index_column not in fieldnames: + raise ValueError( + f"Index column '{index_column}' not found in CSV. " + f"Available columns: {', '.join(fieldnames)}" + ) + idx_col = index_column + else: + idx_col = fieldnames[0] + + rows = list(reader) + + index_values = [row[idx_col] for row in rows] + data_columns = [c for c in fieldnames if c != idx_col] + + return rows, data_columns, index_values, idx_col + + +def import_dataframe( + root: Any, + obj: str, + input_file: Path, + index_column: Optional[str], + console: Console, +) -> None: + if obj not in ("obs", "var"): + raise ValueError( + f"CSV import is only supported for 'obs' or 'var', not '{obj}'." 
+ ) + + rows, data_columns, index_values, _ = _read_csv(input_file, index_column) + n_rows = len(rows) + + validate_dimensions(root, obj, (n_rows,), console) + + if obj in root: + del root[obj] + + group = root.create_group(obj) + index_name = "obs_names" if obj == "obs" else "var_names" + group.attrs["_index"] = index_name + group.attrs["encoding-type"] = "dataframe" + group.attrs["encoding-version"] = "0.2.0" + + if is_zarr_group(group): + group.attrs["column-order"] = list(data_columns) + else: + group.attrs["column-order"] = np.array(data_columns, dtype="S") + + create_dataset(group, index_name, data=np.array(index_values, dtype="S")) + + for col in data_columns: + values = [row[col] for row in rows] + try: + arr = np.array(values, dtype=np.float64) + create_dataset(group, col, data=arr) + except (ValueError, TypeError): + try: + arr = np.array(values, dtype=np.int64) + create_dataset(group, col, data=arr) + except (ValueError, TypeError): + arr = np.array(values, dtype="S") + ds = create_dataset(group, col, data=arr) + ds.attrs["encoding-type"] = "string-array" + ds.attrs["encoding-version"] = "0.2.0" + + console.print( + f"[green]Imported[/] {n_rows} rows × {len(data_columns)} columns into '{obj}'" + ) diff --git a/src/h5ad/formats/image.py b/src/h5ad/formats/image.py new file mode 100644 index 0000000..fe5d2ce --- /dev/null +++ b/src/h5ad/formats/image.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import numpy as np +from PIL import Image +from rich.console import Console + +from h5ad.formats.common import _resolve +from h5ad.storage import is_dataset + + +def export_image(root: Any, obj: str, out: Path, console: Console) -> None: + h5obj = _resolve(root, obj) + if not is_dataset(h5obj): + raise ValueError("Image export requires a dataset.") + arr = np.asarray(h5obj[...]) + + if arr.ndim not in (2, 3): + raise ValueError(f"Expected 2D or 3D image array; got shape {arr.shape}.") + if arr.ndim == 
3 and arr.shape[2] not in (1, 3, 4): + raise ValueError( + f"Expected last dimension (channels) to be 1, 3, or 4; got {arr.shape}." + ) + + if np.issubdtype(arr.dtype, np.floating): + amax = float(np.nanmax(arr)) if arr.size else 0.0 + if amax <= 1.0: + arr = np.clip(arr, 0.0, 1.0) * 255.0 + else: + arr = np.clip(arr, 0.0, 255.0) + arr = arr.astype(np.uint8) + elif np.issubdtype(arr.dtype, np.integer): + arr = np.clip(arr, 0, 255).astype(np.uint8) + elif arr.dtype == np.bool_: + arr = arr.astype(np.uint8) * 255 + else: + raise ValueError(f"Unsupported image dtype: {arr.dtype}") + + if arr.ndim == 3 and arr.shape[2] == 1: + arr = arr[:, :, 0] + + img = Image.fromarray(arr) + out.parent.mkdir(parents=True, exist_ok=True) + img.save(out) + console.print(f"[green]Wrote[/] {out}") diff --git a/src/h5ad/formats/json_data.py b/src/h5ad/formats/json_data.py new file mode 100644 index 0000000..c983677 --- /dev/null +++ b/src/h5ad/formats/json_data.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any, Dict + +import numpy as np +from rich.console import Console + +from h5ad.core.read import decode_str_array +from h5ad.formats.common import _check_json_exportable, _resolve +from h5ad.storage import create_dataset, is_dataset, is_group +from h5ad.util.path import norm_path + + +def export_json( + root: Any, + obj: str, + out: Path | None, + max_elements: int, + include_attrs: bool, + console: Console, +) -> None: + h5obj = _resolve(root, obj) + _check_json_exportable(h5obj, max_elements=max_elements) + + payload = _to_jsonable( + h5obj, max_elements=max_elements, include_attrs=include_attrs + ) + if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out.parent.mkdir(parents=True, exist_ok=True) + out_fh = open(out, "w", encoding="utf-8") + try: + json.dump(payload, out_fh, indent=2, ensure_ascii=False, sort_keys=True) + out_fh.write("\n") + finally: + if out_fh is not sys.stdout: + 
out_fh.close() + if out_fh is not sys.stdout: + console.print(f"[green]Wrote[/] {out}") + + +def _attrs_to_jsonable(attrs: Any, max_elements: int) -> Dict[str, Any]: + out: Dict[str, Any] = {} + for k in attrs.keys(): + v = attrs.get(k) + out[str(k)] = _pyify(v, max_elements=max_elements) + return out + + +def _pyify(value: Any, max_elements: int) -> Any: + if isinstance(value, bytes): + try: + return value.decode("utf-8") + except Exception: + return value.decode("utf-8", errors="replace") + if isinstance(value, np.generic): + return value.item() + if isinstance(value, np.ndarray): + if value.size > max_elements: + raise ValueError( + f"Refusing to convert array of size {value.size} (> {max_elements}) to JSON." + ) + if np.issubdtype(value.dtype, np.bytes_) or value.dtype.kind == "O": + value = decode_str_array(value) + return value.tolist() + return value + + +def _dataset_to_jsonable(ds: Any, max_elements: int) -> Any: + if ds.shape == (): + v = ds[()] + return _pyify(v, max_elements=max_elements) + n = int(np.prod(ds.shape)) if ds.shape else 0 + if n > max_elements: + ds_name = getattr(ds, "name", "") + raise ValueError( + f"Refusing to convert dataset {ds_name!r} with {n} elements (> {max_elements}) to JSON." 
+ ) + arr = np.asarray(ds[...]) + return _pyify(arr, max_elements=max_elements) + + +def _to_jsonable(h5obj: Any, max_elements: int, include_attrs: bool) -> Any: + if is_dataset(h5obj): + return _dataset_to_jsonable(h5obj, max_elements=max_elements) + + d: Dict[str, Any] = {} + if include_attrs and len(h5obj.attrs): + d["__attrs__"] = _attrs_to_jsonable(h5obj.attrs, max_elements=max_elements) + + for key in h5obj.keys(): + child = h5obj[key] + if is_group(child) or is_dataset(child): + d[str(key)] = _to_jsonable( + child, + max_elements=max_elements, + include_attrs=include_attrs, + ) + return d + + +def import_json( + root: Any, + obj: str, + input_file: Path, + console: Console, +) -> None: + obj = norm_path(obj) + with open(input_file, "r", encoding="utf-8") as fh: + payload = json.load(fh) + + parts = obj.split("/") + parent = root + for part in parts[:-1]: + parent = parent[part] if part in parent else parent.create_group(part) + name = parts[-1] + + if name in parent: + del parent[name] + + _write_json_to_group(parent, name, payload) + + console.print(f"[green]Imported[/] JSON data into '{obj}'") + + +def _write_json_to_group(parent: Any, name: str, value: Any) -> None: + if isinstance(value, dict): + group = parent.create_group(name) + for k, v in value.items(): + _write_json_to_group(group, k, v) + elif isinstance(value, list): + try: + arr = np.array(value) + if arr.dtype.kind in ("U", "O"): + arr = np.array(value, dtype="S") + create_dataset(parent, name, data=arr) + except (ValueError, TypeError): + create_dataset(parent, name, data=json.dumps(value).encode("utf-8")) + elif isinstance(value, str): + create_dataset(parent, name, data=np.array([value], dtype="S")) + elif isinstance(value, bool): + create_dataset(parent, name, data=np.array(value, dtype=bool)) + elif isinstance(value, int): + create_dataset(parent, name, data=np.array(value, dtype=np.int64)) + elif isinstance(value, float): + create_dataset(parent, name, data=np.array(value, 
dtype=np.float64)) + elif value is None: + ds = create_dataset(parent, name, data=np.array([], dtype="S")) + ds.attrs["_is_none"] = True + else: + raise ValueError(f"Cannot convert JSON value of type {type(value).__name__}") diff --git a/src/h5ad/formats/sparse.py b/src/h5ad/formats/sparse.py new file mode 100644 index 0000000..4045ce5 --- /dev/null +++ b/src/h5ad/formats/sparse.py @@ -0,0 +1,262 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any, List, Tuple +import sys +from contextlib import nullcontext + +import numpy as np +from rich.console import Console + +from h5ad.formats.common import _get_encoding_type, _resolve +from h5ad.formats.validate import validate_dimensions +from h5ad.storage import create_dataset, is_dataset, is_group, is_zarr_group +from h5ad.util.path import norm_path + + +def _read_mtx( + input_file: Path, +) -> Tuple[List[Tuple[int, int, float]], Tuple[int, int], int]: + with open(input_file, "r", encoding="utf-8") as fh: + header = fh.readline() + if not header.startswith("%%MatrixMarket"): + raise ValueError("Invalid MTX file: missing MatrixMarket header.") + + parts = header.lower().split() + field = "real" + for p in parts: + if p in ("real", "integer", "complex", "pattern"): + field = p + break + + line = fh.readline() + while line.startswith("%"): + line = fh.readline() + + dims = line.split() + n_rows, n_cols, nnz = int(dims[0]), int(dims[1]), int(dims[2]) + + entries = [] + for _ in range(nnz): + parts = fh.readline().split() + r, c = int(parts[0]) - 1, int(parts[1]) - 1 + if field == "pattern": + v = 1.0 + else: + v = float(parts[2]) + entries.append((r, c, v)) + + return entries, (n_rows, n_cols), nnz + + +def _create_csr_from_entries( + entries: List[Tuple[int, int, float]], shape: Tuple[int, int] +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + n_rows, _ = shape + entries.sort(key=lambda x: (x[0], x[1])) + + data = np.array([e[2] for e in entries], dtype=np.float32) + indices = 
np.array([e[1] for e in entries], dtype=np.int32) + + indptr = np.zeros(n_rows + 1, dtype=np.int32) + for r, _, _ in entries: + indptr[r + 1] += 1 + indptr = np.cumsum(indptr) + + return data, indices, indptr + + +def export_mtx( + root: Any, + obj: str, + out: Path | None, + head: int | None, + chunk_elements: int, + in_memory: bool, + console: Console, +) -> None: + h5obj = _resolve(root, obj) + if not is_group(h5obj): + raise ValueError("MTX export requires a CSR/CSC matrix group (not a dataset).") + + enc = _get_encoding_type(h5obj) + if enc not in ("csr_matrix", "csc_matrix"): + raise ValueError( + f"Target group encoding-type is {enc!r}; expected 'csr_matrix' or 'csc_matrix'." + ) + + data = h5obj.get("data") + indices = h5obj.get("indices") + indptr = h5obj.get("indptr") + if not (is_dataset(data) and is_dataset(indices) and is_dataset(indptr)): + raise ValueError( + "Sparse matrix group must contain datasets: data, indices, indptr" + ) + + shape = h5obj.attrs.get("shape", None) + if shape is None: + raise ValueError("Sparse matrix group is missing required 'shape' attribute.") + n_rows, n_cols = (int(shape[0]), int(shape[1])) + + field = "real" if np.issubdtype(data.dtype, np.floating) else "integer" + + indptr_arr = np.asarray(indptr[...], dtype=np.int64) + nnz_ptr = int(indptr_arr[-1]) if indptr_arr.size else 0 + nnz_data = int(data.shape[0]) + nnz_idx = int(indices.shape[0]) + + if not (nnz_ptr == nnz_data == nnz_idx): + raise ValueError( + f"Sparse matrix data inconsistency: indptr implies {nnz_ptr} nonzeros, " + f"but data has {nnz_data} and indices has {nnz_idx}." 
+ ) + + nnz = nnz_data + major_step = max(1, int(chunk_elements)) + if head is not None and head > 0: + nnz = min(nnz_data, head) + + if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out.parent.mkdir(parents=True, exist_ok=True) + out_fh = open(out, "w", encoding="utf-8", newline="\n") + + use_status = out_fh is not sys.stdout + status_ctx = ( + console.status(f"[magenta]Exporting {obj} to {out}...[/]") + if use_status + else nullcontext() + ) + try: + out_fh.write(f"%%MatrixMarket matrix coordinate {field} general\n") + out_fh.write("% generated by h5ad-cli\n") + if head is not None and head > 0: + out_fh.write(f"% output limited to first {nnz}/{nnz_data} nonzero entries\n") + out_fh.write(f"{n_rows} {n_cols} {nnz}\n") + + if in_memory: + with status_ctx as status: + if use_status and status: + status.update( + f"[magenta]Loading entire matrix {obj} into memory...[/]" + ) + data_arr = np.asarray(data[...]) + indices_arr = np.asarray(indices[...], dtype=np.int64) + counts = np.diff(indptr_arr) + if int(counts.sum()) != nnz_data: + raise ValueError( + "Sparse matrix indptr does not match data/indices length." 
+ ) + + if enc == "csr_matrix": + major_idx = np.repeat(np.arange(n_rows, dtype=np.int64), counts) + row_idx = major_idx + col_idx = indices_arr + else: + major_idx = np.repeat(np.arange(n_cols, dtype=np.int64), counts) + row_idx = indices_arr + col_idx = major_idx + + if head is not None and head > 0: + row_idx = row_idx[:nnz] + col_idx = col_idx[:nnz] + data_arr = data_arr[:nnz] + + data_fmt = "%.18g" if field == "real" else "%d" + coords = np.column_stack((row_idx + 1, col_idx + 1, data_arr)) + if use_status and status: + status.update(f"[magenta]Saving {nnz} entries to {out}...[/]") + np.savetxt(out_fh, coords, fmt=["%d", "%d", data_fmt], newline="\n") + else: + major = n_rows if enc == "csr_matrix" else n_cols + max_lines = head if head is not None and head > 0 else None + written = 0 + with status_ctx as status: + for major_start in range(0, major, major_step): + major_end = min(major_start + major_step, major) + if use_status and status: + status.update( + f"[magenta]Exporting {obj}: {major_start+1}-{major_end} of {major}...[/]" + ) + for major_i in range(major_start, major_end): + start = min(int(indptr_arr[major_i]), nnz_data) + end = min(int(indptr_arr[major_i + 1]), nnz_data) + if end <= start: + continue + idx = np.asarray(indices[start:end], dtype=np.int64) + vals = np.asarray(data[start:end]) + m = min(len(idx), len(vals)) + if m == 0: + raise ValueError("Sparse matrix chunk has zero length.") + if max_lines is not None: + remaining = max_lines - written + if remaining <= 0: + break + if m > remaining: + m = remaining + idx = idx[:m] + vals = vals[:m] + idx_list = idx.tolist() + vals_list = vals.tolist() + if enc == "csr_matrix": + r = major_i + 1 + lines = [ + f"{r} {c + 1} {v}\n" + for c, v in zip(idx_list, vals_list) + ] + else: + c = major_i + 1 + lines = [ + f"{r + 1} {c} {v}\n" + for r, v in zip(idx_list, vals_list) + ] + out_fh.write("".join(lines)) + written += m + if max_lines is not None and written >= max_lines: + break + if max_lines is 
not None and written >= max_lines: + break + finally: + if out_fh is not sys.stdout: + out_fh.close() + if out_fh is not sys.stdout: + console.print(f"[green]Wrote[/] {out}") + + +def import_mtx( + root: Any, + obj: str, + input_file: Path, + console: Console, +) -> None: + obj = norm_path(obj) + entries, shape, nnz = _read_mtx(input_file) + data, indices, indptr = _create_csr_from_entries(entries, shape) + + validate_dimensions(root, obj, shape, console) + + parts = obj.split("/") + parent = root + for part in parts[:-1]: + parent = parent[part] if part in parent else parent.create_group(part) + name = parts[-1] + + if name in parent: + del parent[name] + + group = parent.create_group(name) + group.attrs["encoding-type"] = "csr_matrix" + group.attrs["encoding-version"] = "0.1.0" + if is_zarr_group(group): + group.attrs["shape"] = list(shape) + else: + group.attrs["shape"] = np.array(shape, dtype=np.int64) + + create_dataset(group, "data", data=data) + create_dataset(group, "indices", data=indices) + create_dataset(group, "indptr", data=indptr) + + console.print( + f"[green]Imported[/] {shape[0]}×{shape[1]} sparse matrix ({nnz} non-zero) into '{obj}'" + ) diff --git a/src/h5ad/formats/validate.py b/src/h5ad/formats/validate.py new file mode 100644 index 0000000..194192b --- /dev/null +++ b/src/h5ad/formats/validate.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +from typing import Optional, Tuple, Any + +from rich.console import Console + +from h5ad.core.info import axis_len +from h5ad.util.path import norm_path + + +OBS_AXIS_PREFIXES = ("obs", "obsm/", "obsp/") +VAR_AXIS_PREFIXES = ("var", "varm/", "varp/") +MATRIX_PREFIXES = ("X", "layers/") + + +def _get_axis_length(root: Any, axis: str) -> Optional[int]: + try: + return axis_len(root, axis) + except Exception: + return None + + +def validate_dimensions( + root: Any, + obj_path: str, + data_shape: Tuple[int, ...], + console: Console, +) -> None: + obj_path = norm_path(obj_path) + n_obs = 
_get_axis_length(root, "obs") + n_var = _get_axis_length(root, "var") + + if obj_path == "obs": + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"Row count mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + return + if obj_path == "var": + if n_var is not None and data_shape[0] != n_var: + raise ValueError( + f"Row count mismatch: input has {data_shape[0]} rows, " + f"but var has {n_var} features." + ) + return + + for prefix in MATRIX_PREFIXES: + if obj_path == prefix or obj_path.startswith(prefix + "/") or obj_path.startswith(prefix): + if obj_path == "X" or obj_path.startswith("layers/"): + if len(data_shape) < 2: + raise ValueError( + f"Matrix data requires 2D shape, got {len(data_shape)}D." + ) + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + if n_var is not None and data_shape[1] != n_var: + raise ValueError( + f"Second dimension mismatch: input has {data_shape[1]} columns, " + f"but var has {n_var} features." + ) + return + + for prefix in OBS_AXIS_PREFIXES: + if obj_path.startswith(prefix) and obj_path != "obs": + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + if obj_path.startswith("obsp/") and len(data_shape) >= 2: + if data_shape[1] != n_obs: + raise ValueError( + "obsp matrix must be square (n_obs × n_obs): " + f"got {data_shape[0]}×{data_shape[1]}, expected {n_obs}×{n_obs}." + ) + return + + for prefix in VAR_AXIS_PREFIXES: + if obj_path.startswith(prefix) and obj_path != "var": + if n_var is not None and data_shape[0] != n_var: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but var has {n_var} features." 
+ ) + if obj_path.startswith("varp/") and len(data_shape) >= 2: + if data_shape[1] != n_var: + raise ValueError( + "varp matrix must be square (n_var × n_var): " + f"got {data_shape[0]}×{data_shape[1]}, expected {n_var}×{n_var}." + ) + return + + console.print(f"[dim]Note: No dimension validation for path '{obj_path}'[/]") From 6d8903bd5dfa75eb77537f00df9975fe6cf93521 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:31:57 +0000 Subject: [PATCH 37/62] HUGE REFACTOR: Add core functionality for handling .h5ad and .zarr stores with subset operations --- src/h5ad/core/__init__.py | 1 + src/h5ad/core/info.py | 221 ++++++++++++++++++ src/h5ad/core/read.py | 112 +++++++++ src/h5ad/core/subset.py | 464 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 798 insertions(+) create mode 100644 src/h5ad/core/__init__.py create mode 100644 src/h5ad/core/info.py create mode 100644 src/h5ad/core/read.py create mode 100644 src/h5ad/core/subset.py diff --git a/src/h5ad/core/__init__.py b/src/h5ad/core/__init__.py new file mode 100644 index 0000000..9224273 --- /dev/null +++ b/src/h5ad/core/__init__.py @@ -0,0 +1 @@ +"""Core logic shared by CLI commands and format handlers.""" diff --git a/src/h5ad/core/info.py b/src/h5ad/core/info.py new file mode 100644 index 0000000..8db8a14 --- /dev/null +++ b/src/h5ad/core/info.py @@ -0,0 +1,221 @@ +from __future__ import annotations + +from typing import Optional, Tuple, Dict, Any, Union + +import numpy as np + +from h5ad.storage import is_dataset, is_group, is_hdf5_dataset + + +def _decode_attr(value: Any) -> Any: + if isinstance(value, bytes): + return value.decode("utf-8") + return value + + +def get_entry_type(entry: Any) -> Dict[str, Any]: + """ + Determine the type/format of an object for export guidance. 
+ + Supports both: + - v0.2.0 (modern): Objects with encoding-type/encoding-version attributes + - v0.1.0 (legacy): Objects without encoding attributes, inferred from structure + """ + result: Dict[str, Any] = { + "type": "unknown", + "export_as": None, + "encoding": None, + "shape": None, + "dtype": None, + "details": "", + "version": None, + } + + enc = _decode_attr(entry.attrs.get("encoding-type", b"")) + result["encoding"] = enc if enc else None + + enc_ver = _decode_attr(entry.attrs.get("encoding-version", b"")) + result["version"] = enc_ver if enc_ver else None + + if is_dataset(entry): + result["shape"] = entry.shape + result["dtype"] = str(entry.dtype) + + if "categories" in entry.attrs: + result["type"] = "categorical" + result["export_as"] = "csv" + result["version"] = result["version"] or "0.1.0" + n_cats = "?" + if is_hdf5_dataset(entry): + try: + cats_ref = entry.attrs["categories"] + cats_ds = entry.file[cats_ref] + n_cats = cats_ds.shape[0] + except Exception: + n_cats = "?" 
+ result["details"] = ( + f"Legacy categorical [{entry.shape[0]} values, {n_cats} categories]" + ) + return result + + if entry.shape == (): + result["type"] = "scalar" + result["export_as"] = "json" + result["details"] = f"Scalar value ({entry.dtype})" + return result + + if entry.ndim == 1: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"1D array [{entry.shape[0]}] ({entry.dtype})" + elif entry.ndim == 2: + result["type"] = "dense-matrix" + result["export_as"] = "npy" + result["details"] = ( + f"Dense matrix {entry.shape[0]}×{entry.shape[1]} ({entry.dtype})" + ) + elif entry.ndim == 3: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"3D array {entry.shape} ({entry.dtype})" + else: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"ND array {entry.shape} ({entry.dtype})" + return result + + if is_group(entry): + if enc in ("csr_matrix", "csc_matrix"): + shape = entry.attrs.get("shape", None) + shape_str = f"{shape[0]}×{shape[1]}" if shape is not None else "?" + result["type"] = "sparse-matrix" + result["export_as"] = "mtx" + result["details"] = ( + f"Sparse {enc.replace('_matrix', '').upper()} matrix {shape_str}" + ) + return result + + if enc == "categorical": + codes = entry.get("codes") + cats = entry.get("categories") + n_codes = codes.shape[0] if codes is not None else "?" + n_cats = cats.shape[0] if cats is not None else "?" 
+ result["type"] = "categorical" + result["export_as"] = "csv" + result["details"] = f"Categorical [{n_codes} values, {n_cats} categories]" + return result + + if ( + enc == "dataframe" + or "_index" in entry.attrs + or "obs_names" in entry + or "var_names" in entry + ): + if enc == "dataframe": + df_version = result["version"] or "0.2.0" + else: + df_version = "0.1.0" + result["version"] = df_version + + has_legacy_cats = "__categories" in entry + n_cols = len( + [k for k in entry.keys() if k not in ("_index", "__categories")] + ) + + result["type"] = "dataframe" + result["export_as"] = "csv" + if has_legacy_cats: + result["details"] = f"DataFrame with {n_cols} columns (legacy v0.1.0)" + else: + result["details"] = f"DataFrame with {n_cols} columns" + return result + + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"Encoded array ({enc})" + return result + + if enc == "string-array": + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = "Encoded string array" + return result + + if enc == "awkward-array": + length = entry.attrs.get("length", "?") + result["type"] = "awkward-array" + result["export_as"] = "json" + result["details"] = f"Awkward array (length={length})" + return result + + n_keys = len(list(entry.keys())) + result["type"] = "dict" + result["export_as"] = "json" + result["details"] = f"Group with {n_keys} keys" + return result + + return result + + +def format_type_info(info: Dict[str, Any]) -> str: + type_colors = { + "dataframe": "green", + "sparse-matrix": "magenta", + "dense-matrix": "blue", + "array": "blue", + "dict": "yellow", + "categorical": "green", + "scalar": "white", + "unknown": "red", + } + + color = type_colors.get(info["type"], "white") + return f"[{color}]<{info['type']}>[/]" + + +def axis_len(file: Any, axis: str) -> int: + if axis not in file: + raise KeyError(f"'{axis}' not found in the file.") + + 
group = file[axis] + if not is_group(group): + raise TypeError(f"'{axis}' is not a group.") + + index_name = group.attrs.get("_index", None) + if index_name is None: + if axis == "obs": + index_name = "obs_names" + elif axis == "var": + index_name = "var_names" + else: + raise ValueError(f"Invalid axis '{axis}'. Must be 'obs' or 'var'.") + + index_name = _decode_attr(index_name) + + if index_name not in group: + raise KeyError(f"Index dataset '{index_name}' not found in '{axis}' group.") + + dataset = group[index_name] + if not is_dataset(dataset): + raise TypeError(f"Index '{index_name}' in '{axis}' is not a dataset.") + if dataset.shape: + return int(dataset.shape[0]) + raise ValueError( + f"Cannot determine length of '{axis}': index dataset has no shape." + ) + + +def get_axis_group(file: Any, axis: str) -> Tuple[Any, int, str]: + if axis not in ("obs", "var"): + raise ValueError("axis must be 'obs' or 'var'.") + + n = axis_len(file, axis) + group = file[axis] + + index_name = group.attrs.get("_index", None) + if index_name is None: + index_name = "obs_names" if axis == "obs" else "var_names" + index_name = _decode_attr(index_name) + + return group, n, index_name diff --git a/src/h5ad/core/read.py b/src/h5ad/core/read.py new file mode 100644 index 0000000..b81ee1f --- /dev/null +++ b/src/h5ad/core/read.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +from typing import List, Dict, Any + +import h5py +import numpy as np + +from h5ad.storage import is_group, is_dataset, is_hdf5_dataset + + +def decode_str_array(array: np.ndarray) -> np.ndarray: + if np.issubdtype(array.dtype, np.bytes_): + return array.astype("U") + if array.dtype.kind == "O": + return array.astype(str) + return array.astype(str) + + +def read_categorical_column( + col: Any, + start: int, + end: int, + cache: Dict[int, np.ndarray], + parent_group: Any | None = None, +) -> List[str]: + key = id(col) + + if is_group(col): + if key not in cache: + cats = col["categories"][...] 
+ cats = decode_str_array(cats) + cache[key] = np.asarray(cats, dtype=str) + cats = cache[key] + + codes_ds = col["codes"] + codes = codes_ds[start:end] + codes = np.asarray(codes, dtype=np.int64) + return [cats[c] if 0 <= c < len(cats) else "" for c in codes] + + if is_dataset(col): + if key not in cache: + cats_ref = col.attrs.get("categories", None) + if cats_ref is not None and is_hdf5_dataset(col): + cats_ds = col.file[cats_ref] + cats = cats_ds[...] + elif parent_group is not None and "__categories" in parent_group: + col_name = col.name.split("/")[-1] + cats_grp = parent_group["__categories"] + if col_name in cats_grp: + cats = cats_grp[col_name][...] + else: + raise KeyError( + f"Cannot find categories for legacy column {col.name}" + ) + else: + raise KeyError( + f"Cannot find categories for legacy column {col.name}" + ) + cats = decode_str_array(cats) + cache[key] = np.asarray(cats, dtype=str) + cats = cache[key] + + codes = col[start:end] + codes = np.asarray(codes, dtype=np.int64) + return [cats[c] if 0 <= c < len(cats) else "" for c in codes] + + raise TypeError(f"Unsupported categorical column type: {type(col)}") + + +def col_chunk_as_strings( + group: Any, + col_name: str, + start: int, + end: int, + cat_cache: Dict[int, np.ndarray], +) -> List[str]: + if col_name not in group: + raise RuntimeError(f"Column {col_name!r} not found in group {group.name}") + + col = group[col_name] + + if is_dataset(col): + if "categories" in col.attrs: + return read_categorical_column(col, start, end, cat_cache, group) + + chunk = col[start:end] + if chunk.ndim != 1: + chunk = chunk.reshape(-1) + chunk = decode_str_array(np.asarray(chunk)) + return chunk.tolist() + + if is_group(col): + enc = col.attrs.get("encoding-type", b"") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + + if enc == "categorical": + return read_categorical_column(col, start, end, cat_cache) + + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): + values = 
col["values"][start:end] + mask = col["mask"][start:end] + values = decode_str_array(np.asarray(values)) + return ["" if m else str(v) for v, m in zip(values, mask)] + + raise ValueError( + f"Unsupported group encoding {enc!r} for column {col_name!r}" + ) + + raise TypeError( + f"Unsupported column type for {col_name!r} in group {group.name}" + ) diff --git a/src/h5ad/core/subset.py b/src/h5ad/core/subset.py new file mode 100644 index 0000000..ee254cd --- /dev/null +++ b/src/h5ad/core/subset.py @@ -0,0 +1,464 @@ +"""Subset operations for .h5ad and .zarr stores.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Optional, Set, Tuple, List, Dict, Any + +import numpy as np +from rich.console import Console +from rich.progress import ( + Progress, + SpinnerColumn, + TextColumn, + BarColumn, + TaskProgressColumn, + TimeElapsedColumn, +) + +from h5ad.core.read import decode_str_array +from h5ad.storage import ( + create_dataset, + copy_attrs, + copy_tree, + dataset_create_kwargs, + is_dataset, + is_group, + is_zarr_group, + is_zarr_array, + open_store, +) + + +def _target_backend(dst_group: Any) -> str: + return "zarr" if is_zarr_group(dst_group) else "hdf5" + + +def _ensure_group(parent: Any, name: str) -> Any: + return parent[name] if name in parent else parent.create_group(name) + + +def _group_get(parent: Any, key: str) -> Any | None: + return parent[key] if key in parent else None + + +def _decode_attr(value: Any) -> Any: + if isinstance(value, bytes): + return value.decode("utf-8") + return value + + +def _read_name_file(path: Path) -> Set[str]: + names: Set[str] = set() + with open(path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + names.add(line) + return names + + +def indices_from_name_set( + names_ds: Any, + keep: Set[str], + *, + chunk_size: int = 200_000, +) -> Tuple[np.ndarray, Set[str]]: + if names_ds.ndim != 1: + flat_len = int(np.prod(names_ds.shape)) + else: + flat_len = 
names_ds.shape[0] + + remaining = set(keep) + found_indices: List[int] = [] + + for start in range(0, flat_len, chunk_size): + end = min(start + chunk_size, flat_len) + chunk = names_ds[start:end] + chunk = decode_str_array(np.asarray(chunk)).astype(str) + + for i, name in enumerate(chunk): + if name in remaining: + found_indices.append(start + i) + remaining.remove(name) + + if not remaining: + break + + return np.asarray(found_indices, dtype=np.int64), remaining + + +def subset_axis_group( + src: Any, + dst: Any, + indices: Optional[np.ndarray], +) -> None: + copy_attrs(src.attrs, dst.attrs, target_backend=_target_backend(dst)) + target_backend = _target_backend(dst) + + for key in src.keys(): + obj = src[key] + + if is_dataset(obj): + if indices is None: + copy_tree(obj, dst, key) + else: + if is_zarr_array(obj): + if obj.ndim == 1: + data = obj.oindex[indices] + else: + selection = (indices,) + (slice(None),) * (obj.ndim - 1) + data = obj.oindex[selection] + else: + data = obj[indices, ...] + ds = create_dataset( + dst, + key, + data=data, + **dataset_create_kwargs(obj, target_backend=target_backend), + ) + copy_attrs(obj.attrs, ds.attrs, target_backend=target_backend) + elif is_group(obj): + enc = obj.attrs.get("encoding-type", b"") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + + if enc == "categorical": + gdst = dst.create_group(key) + copy_attrs(obj.attrs, gdst.attrs, target_backend=target_backend) + copy_tree(obj["categories"], gdst, "categories") + + codes = obj["codes"] + if indices is None: + copy_tree(codes, gdst, "codes") + else: + codes_sub = codes[indices, ...] 
+ ds = create_dataset( + gdst, + "codes", + data=codes_sub, + **dataset_create_kwargs(codes, target_backend=target_backend), + ) + copy_attrs(codes.attrs, ds.attrs, target_backend=target_backend) + else: + copy_tree(obj, dst, key) + + +def subset_dense_matrix( + src: Any, + dst_parent: Any, + name: str, + obs_idx: Optional[np.ndarray], + var_idx: Optional[np.ndarray], + *, + chunk_rows: int = 1024, +) -> None: + if src.ndim != 2: + copy_tree(src, dst_parent, name) + return + + n_obs, n_var = src.shape + out_obs = len(obs_idx) if obs_idx is not None else n_obs + out_var = len(var_idx) if var_idx is not None else n_var + + target_backend = _target_backend(dst_parent) + kw = dataset_create_kwargs(src, target_backend=target_backend) + chunks = kw.get("chunks") + if isinstance(chunks, (tuple, list)) and len(chunks) >= 2: + kw["chunks"] = (min(int(chunks[0]), out_obs), min(int(chunks[1]), out_var)) + + dst = create_dataset( + dst_parent, + name, + shape=(out_obs, out_var), + dtype=src.dtype, + **kw, + ) + copy_attrs(src.attrs, dst.attrs, target_backend=_target_backend(dst_parent)) + + for out_start in range(0, out_obs, chunk_rows): + out_end = min(out_start + chunk_rows, out_obs) + + if obs_idx is None: + block = src[out_start:out_end, :] + else: + rows = obs_idx[out_start:out_end] + block = src[rows, :] + + if var_idx is not None: + block = block[:, var_idx] + + dst[out_start:out_end, :] = block + + +def subset_sparse_matrix_group( + src: Any, + dst_parent: Any, + name: str, + obs_idx: Optional[np.ndarray], + var_idx: Optional[np.ndarray], +) -> None: + enc = src.attrs.get("encoding-type", b"") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + + if enc not in ("csr_matrix", "csc_matrix"): + raise ValueError(f"Unsupported sparse encoding type: {enc}") + + data = np.asarray(src["data"][...]) + indices = np.asarray(src["indices"][...], dtype=np.int64) + indptr = np.asarray(src["indptr"][...], dtype=np.int64) + shape = src.attrs.get("shape", None) + if shape is 
None: + raise ValueError("Sparse matrix group missing 'shape' attribute.") + n_rows, n_cols = int(shape[0]), int(shape[1]) + + if enc == "csr_matrix": + row_idx = obs_idx if obs_idx is not None else np.arange(n_rows, dtype=np.int64) + col_idx = var_idx if var_idx is not None else np.arange(n_cols, dtype=np.int64) + + new_data = [] + new_indices = [] + new_indptr = [0] + + for r in row_idx: + start = indptr[r] + end = indptr[r + 1] + row_cols = indices[start:end] + row_data = data[start:end] + + if var_idx is not None: + col_mask = np.isin(row_cols, col_idx) + row_cols = row_cols[col_mask] + row_data = row_data[col_mask] + + if var_idx is not None: + col_map = {c: i for i, c in enumerate(col_idx)} + row_cols = np.array([col_map[c] for c in row_cols], dtype=np.int64) + + new_indices.extend(row_cols.tolist()) + new_data.extend(row_data.tolist()) + new_indptr.append(len(new_indices)) + + new_shape = (len(row_idx), len(col_idx)) + else: + row_idx = obs_idx if obs_idx is not None else np.arange(n_rows, dtype=np.int64) + col_idx = var_idx if var_idx is not None else np.arange(n_cols, dtype=np.int64) + + new_data = [] + new_indices = [] + new_indptr = [0] + + for c in col_idx: + start = indptr[c] + end = indptr[c + 1] + col_rows = indices[start:end] + col_data = data[start:end] + + if obs_idx is not None: + row_mask = np.isin(col_rows, row_idx) + col_rows = col_rows[row_mask] + col_data = col_data[row_mask] + + if obs_idx is not None: + row_map = {r: i for i, r in enumerate(row_idx)} + col_rows = np.array([row_map[r] for r in col_rows], dtype=np.int64) + + new_indices.extend(col_rows.tolist()) + new_data.extend(col_data.tolist()) + new_indptr.append(len(new_indices)) + + new_shape = (len(row_idx), len(col_idx)) + + group = dst_parent.create_group(name) + group.attrs["encoding-type"] = enc + group.attrs["encoding-version"] = "0.1.0" + if is_zarr_group(group): + group.attrs["shape"] = list(new_shape) + else: + group.attrs["shape"] = np.array(new_shape, dtype=np.int64) + + 
create_dataset(group, "data", data=np.array(new_data, dtype=data.dtype)) + create_dataset(group, "indices", data=np.array(new_indices, dtype=indices.dtype)) + create_dataset(group, "indptr", data=np.array(new_indptr, dtype=indptr.dtype)) + + +def subset_h5ad( + file: Path, + output: Path, + obs_file: Optional[Path], + var_file: Optional[Path], + *, + chunk_rows: int = 1024, + console: Console, +) -> None: + obs_keep: Optional[Set[str]] = None + if obs_file is not None: + obs_keep = _read_name_file(obs_file) + console.print(f"[cyan]Found {len(obs_keep)} obs names to keep[/]") + + var_keep: Optional[Set[str]] = None + if var_file is not None: + var_keep = _read_name_file(var_file) + console.print(f"[cyan]Found {len(var_keep)} var names to keep[/]") + + if obs_keep is None and var_keep is None: + raise ValueError("At least one of --obs or --var must be provided.") + + with console.status("[magenta]Opening files...[/]"): + with open_store(file, "r") as src_store, open_store(output, "w") as dst_store: + src = src_store.root + dst = dst_store.root + + obs_idx = None + if obs_keep is not None: + console.print("[cyan]Matching obs names...[/]") + obs_group = src["obs"] + obs_index = _decode_attr(obs_group.attrs.get("_index", "obs_names")) + obs_names_ds = _group_get(obs_group, "obs_names") or _group_get( + obs_group, obs_index + ) + if obs_names_ds is None: + raise KeyError("Could not find obs names") + + obs_idx, missing_obs = indices_from_name_set(obs_names_ds, obs_keep) + if missing_obs: + console.print( + f"[yellow]Warning: {len(missing_obs)} obs names not found in file[/]" + ) + console.print( + f"[green]Selected {len(obs_idx)} obs (of {obs_names_ds.shape[0]})[/]" + ) + + var_idx = None + if var_keep is not None: + console.print("[cyan]Matching var names...[/]") + var_group = src["var"] + var_index = _decode_attr(var_group.attrs.get("_index", "var_names")) + var_names_ds = _group_get(var_group, "var_names") or _group_get( + var_group, var_index + ) + if var_names_ds is 
None: + raise KeyError("Could not find var names") + + var_idx, missing_var = indices_from_name_set(var_names_ds, var_keep) + if missing_var: + console.print( + f"[yellow]Warning: {len(missing_var)} var names not found in file[/]" + ) + console.print( + f"[green]Selected {len(var_idx)} var (of {var_names_ds.shape[0]})[/]" + ) + + tasks: List[str] = [] + if "obs" in src: + tasks.append("obs") + if "var" in src: + tasks.append("var") + if "X" in src: + tasks.append("X") + if "layers" in src: + tasks.extend([f"layer:{k}" for k in src["layers"].keys()]) + if "obsm" in src: + tasks.extend([f"obsm:{k}" for k in src["obsm"].keys()]) + if "varm" in src: + tasks.extend([f"varm:{k}" for k in src["varm"].keys()]) + if "obsp" in src: + tasks.extend([f"obsp:{k}" for k in src["obsp"].keys()]) + if "varp" in src: + tasks.extend([f"varp:{k}" for k in src["varp"].keys()]) + if "uns" in src: + tasks.append("uns") + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TimeElapsedColumn(), + console=console, + ) as progress: + task_id = progress.add_task("[cyan]Subsetting...", total=len(tasks)) + + for task in tasks: + if task == "obs": + obs_dst = dst.create_group("obs") + subset_axis_group(src["obs"], obs_dst, obs_idx) + elif task == "var": + var_dst = dst.create_group("var") + subset_axis_group(src["var"], var_dst, var_idx) + elif task == "X": + X = src["X"] + if is_dataset(X): + subset_dense_matrix( + X, dst, "X", obs_idx, var_idx, chunk_rows=chunk_rows + ) + elif is_group(X): + subset_sparse_matrix_group(X, dst, "X", obs_idx, var_idx) + else: + copy_tree(X, dst, "X") + elif task.startswith("layer:"): + key = task.split(":", 1)[1] + layer_src = src["layers"][key] + if is_dataset(layer_src): + layers_dst = _ensure_group(dst, "layers") + subset_dense_matrix( + layer_src, + layers_dst, + key, + obs_idx, + var_idx, + chunk_rows=chunk_rows, + ) + elif is_group(layer_src): + layers_dst = _ensure_group(dst, 
"layers") + subset_sparse_matrix_group( + layer_src, layers_dst, key, obs_idx, var_idx + ) + elif task.startswith("obsm:"): + key = task.split(":", 1)[1] + obsm_dst = _ensure_group(dst, "obsm") + subset_dense_matrix( + src["obsm"][key], + obsm_dst, + key, + obs_idx, + None, + chunk_rows=chunk_rows, + ) + elif task.startswith("varm:"): + key = task.split(":", 1)[1] + varm_dst = _ensure_group(dst, "varm") + subset_dense_matrix( + src["varm"][key], + varm_dst, + key, + var_idx, + None, + chunk_rows=chunk_rows, + ) + elif task.startswith("obsp:"): + key = task.split(":", 1)[1] + obsp_dst = _ensure_group(dst, "obsp") + subset_dense_matrix( + src["obsp"][key], + obsp_dst, + key, + obs_idx, + obs_idx, + chunk_rows=chunk_rows, + ) + elif task.startswith("varp:"): + key = task.split(":", 1)[1] + varp_dst = _ensure_group(dst, "varp") + subset_dense_matrix( + src["varp"][key], + varp_dst, + key, + var_idx, + var_idx, + chunk_rows=chunk_rows, + ) + elif task == "uns": + copy_tree(src["uns"], dst, "uns") + progress.advance(task_id) From 4f6d2e58ebdc309ec857389a75ad80a6bd5eae32 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:32:13 +0000 Subject: [PATCH 38/62] HUGE REFACTOR: Refactor h5ad command modules: update imports and streamline subset operations - Modified `info.py` to utilize `open_store` for file handling and updated type checks to use new utility functions. - Enhanced `_show_types_tree` and `_show_object_info` functions for better clarity and functionality. - Removed redundant code in `subset.py` by consolidating functions and improving the structure for handling dense and sparse matrices. - Updated the `subset_h5ad` function to improve the process of subsetting observations and variables, including better handling of missing names. 
--- src/h5ad/commands/export.py | 630 +++------------------------- src/h5ad/commands/import_data.py | 470 ++------------------- src/h5ad/commands/info.py | 28 +- src/h5ad/commands/subset.py | 699 +------------------------------ 4 files changed, 131 insertions(+), 1696 deletions(-) diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index 06c7f0f..22221a7 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -1,27 +1,19 @@ from __future__ import annotations -import csv -import json -import sys -from contextlib import nullcontext from pathlib import Path -from typing import Any, Dict, List, Optional, Union, cast +from typing import List, Optional -import h5py -import numpy as np from rich.console import Console -from PIL import Image -from h5ad.read import col_chunk_as_strings, decode_str_array -from h5ad.info import get_axis_group, get_entry_type +from h5ad.formats.array import export_npy as export_npy_format +from h5ad.formats.common import EXPORTABLE_TYPES, IMAGE_EXTENSIONS, TYPE_EXTENSIONS +from h5ad.formats.dataframe import export_dataframe +from h5ad.formats.image import export_image as export_image_format +from h5ad.formats.json_data import export_json as export_json_format +from h5ad.formats.sparse import export_mtx as export_mtx_format +from h5ad.storage import open_store -H5Obj = Union[h5py.Group, h5py.Dataset] - - -# ============================================================================ -# DATAFRAME EXPORT (CSV) -# ============================================================================ def export_table( file: Path, axis: str, @@ -31,171 +23,17 @@ def export_table( head: Optional[int], console: Console, ) -> None: - """ - Export a dataframe (obs or var) to CSV format. 
- - Args: - file: Path to the .h5ad file - axis: Axis to read from ('obs' or 'var') - columns: List of column names to include in the output table - out: Output file path (defaults to stdout if None) - chunk_rows: Number of rows to read per chunk - head: Output only the first n rows - console: Rich console for status output - - Supports both v0.2.0 (modern) and v0.1.0 (legacy) dataframe formats. - """ - with h5py.File(file, "r") as f: - group, n_rows, index_name = get_axis_group(f, axis) - - # Reserved keys to exclude from column list - # __categories is used in v0.1.0 for storing categorical labels - reserved_keys = {"_index", "__categories"} - - # Determine columns to read - if columns: - col_names = list(columns) - else: - col_names = [ - k for k in group.keys() if k not in reserved_keys and k != index_name - ] - # Add index name if not already present - if index_name and index_name not in col_names: - col_names.insert(0, index_name) - - if isinstance(index_name, bytes): - index_name = index_name.decode("utf-8") - - if index_name not in col_names: - col_names.insert(0, index_name) - else: - col_names = [index_name] + [c for c in col_names if c != index_name] - - # Limit rows if head option is specified - if head is not None and head > 0: - n_rows = min(n_rows, head) - - # Open writer - if out is None or str(out) == "-": - out_fh = sys.stdout - else: - out_fh = open(out, "w", newline="", encoding="utf-8") - writer = csv.writer(out_fh) - - # Write data in chunks - try: - writer.writerow(col_names) - cat_cache: Dict[int, np.ndarray] = {} - - # Use status spinner only when writing to file (not stdout) - use_status = out_fh is not sys.stdout - status_ctx = ( - console.status(f"[magenta]Exporting {axis} table to {out}...[/]") - if use_status - else nullcontext() - ) - - with status_ctx as status: - for start in range(0, n_rows, chunk_rows): - end = min(start + chunk_rows, n_rows) - if use_status and status: - status.update( - f"[magenta]Exporting rows {start}-{end} of 
{n_rows}...[/]" - ) - cols_data: List[List[str]] = [] - # Read each column for the current chunk - for col in col_names: - cols_data.append( - col_chunk_as_strings(group, col, start, end, cat_cache) - ) - # Write rows - for row_idx in range(end - start): - row = [ - cols_data[col_idx][row_idx] - for col_idx in range(len(col_names)) - ] - writer.writerow(row) - finally: - if out_fh is not sys.stdout: - out_fh.close() - - -# ============================================================================ -# TYPE DETECTION AND VALIDATION -# ============================================================================ -# Map object types to valid output extensions -TYPE_EXTENSIONS = { - "dataframe": {".csv"}, - "sparse-matrix": {".mtx"}, - "dense-matrix": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}, - "array": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}, - "dict": {".json"}, - "scalar": {".json"}, - "categorical": {".csv"}, - "awkward-array": {".json"}, -} - -# Image extensions for validation -IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tif", ".tiff"} - -# Known exportable types -EXPORTABLE_TYPES = set(TYPE_EXTENSIONS.keys()) - - -def _norm_path(p: str) -> str: - p = p.strip() - if not p: - raise ValueError("Object path must be non-empty.") - return p.lstrip("/") - - -def _get_encoding_type(group: h5py.Group) -> str: - enc = group.attrs.get("encoding-type", "") - if isinstance(enc, bytes): - enc = enc.decode("utf-8") - return str(enc) - - -def _resolve(file: h5py.File, obj: str) -> H5Obj: - obj = _norm_path(obj) - if obj not in file: - raise KeyError(f"'{obj}' not found in the file.") - return cast(H5Obj, file[obj]) - - -def _check_json_exportable(h5obj: H5Obj, max_elements: int, path: str = "") -> None: - """ - Recursively check if a group/dataset can be exported to JSON. - Raises ValueError if it contains non-exportable structures. 
- """ - if isinstance(h5obj, h5py.Dataset): - if h5obj.shape == (): - return # scalar is fine - n = int(np.prod(h5obj.shape)) if h5obj.shape else 0 - if n > max_elements: - raise ValueError( - f"Cannot export to JSON: '{path or h5obj.name}' has {n} elements " - f"(max {max_elements}). Use --max-elements to increase limit." - ) - return - - # It's a Group - check encoding - enc = _get_encoding_type(h5obj) - if enc in ("csr_matrix", "csc_matrix"): - raise ValueError( - f"Cannot export to JSON: '{path or h5obj.name}' is a sparse matrix. " - f"Export it as .mtx instead." + with open_store(file, "r") as store: + export_dataframe( + store.root, + axis=axis, + columns=columns, + out=out, + chunk_rows=chunk_rows, + head=head, + console=console, ) - # Check children recursively - for key in h5obj.keys(): - child = h5obj[key] - child_path = f"{path}/{key}" if path else key - if isinstance(child, (h5py.Group, h5py.Dataset)): - _check_json_exportable( - cast(H5Obj, child), max_elements=max_elements, path=child_path - ) - def export_npy( file: Path, @@ -204,84 +42,14 @@ def export_npy( chunk_elements: int, console: Console, ) -> None: - """ - Export a dense HDF5 dataset to NumPy .npy without loading it all at once. 
- - Supports both: - - v0.2.0 (modern): Datasets with encoding-type="array" - - v0.1.0 (legacy): Plain datasets without encoding attributes - - Encoded groups: nullable-integer, nullable-boolean, string-array (extracts values) - - Args: - file: Path to the .h5ad file - obj: HDF5 path to the dataset or encoded group - out: Output .npy file path - chunk_elements: Number of elements to read per chunk - console: Rich console for status output - - Raises: - ValueError: If the target object is not exportable as .npy - """ - with h5py.File(file, "r") as f: - h5obj = _resolve(f, obj) - - # Handle encoded groups that contain array data - if isinstance(h5obj, h5py.Group): - enc = _get_encoding_type(h5obj) - if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): - # Extract values from nullable array group - if "values" not in h5obj: - raise ValueError( - f"Encoded group '{obj}' is missing 'values' dataset." - ) - ds = h5obj["values"] - has_mask = "mask" in h5obj - console.print(f"[dim]Exporting nullable array values from '{obj}'[/]") - else: - raise ValueError( - f"Target '{obj}' is a group with encoding '{enc}'; cannot export as .npy directly." - ) - else: - ds = h5obj - has_mask = False - - out.parent.mkdir(parents=True, exist_ok=True) - mm = np.lib.format.open_memmap(out, mode="w+", dtype=ds.dtype, shape=ds.shape) - try: - if ds.shape == (): - mm[...] 
= ds[()] - console.print(f"[green]Wrote[/] {out}") - return - - if ds.ndim == 1: - n = int(ds.shape[0]) - step = max(1, int(chunk_elements)) - with console.status( - f"[magenta]Exporting {obj} to {out}...[/]" - ) as status: - for start in range(0, n, step): - end = min(start + step, n) - status.update( - f"[magenta]Exporting {obj}: {start}-{end} of {n}...[/]" - ) - mm[start:end] = ds[start:end] - console.print(f"[green]Wrote[/] {out}") - return - - n0 = int(ds.shape[0]) - row_elems = int(np.prod(ds.shape[1:])) if ds.ndim > 1 else 1 - # Convert element budget into a row count; fallback to 1 row if rows are larger. - step0 = max(1, int(chunk_elements) // max(1, row_elems)) - with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: - for start in range(0, n0, step0): - end = min(start + step0, n0) - status.update( - f"[magenta]Exporting {obj}: {start}-{end} of {n0}...[/]" - ) - mm[start:end, ...] = ds[start:end, ...] - console.print(f"[green]Wrote[/] {out}") - finally: - del mm + with open_store(file, "r") as store: + export_npy_format( + store.root, + obj=obj, + out=out, + chunk_elements=chunk_elements, + console=console, + ) def export_mtx( @@ -293,333 +61,49 @@ def export_mtx( in_memory: bool, console: Console, ) -> None: - """Export a CSR/CSC matrix group (AnnData encoding) to Matrix Market (.mtx). - - If out is None or "-", writes to stdout. The head parameter limits output lines. - chunk_elements controls how many rows/columns are processed per slice when - streaming. Use in_memory for small matrices to load everything at once. 
- - Args: - file: Path to the .h5ad file - obj: HDF5 path to the matrix group - out: Output .mtx file path (or None for stdout) - head: Output only the first n nonzero entries - chunk_elements: Number of rows/columns to process per chunk - in_memory: Load the entire sparse matrix into memory before exporting - console: Rich console for status output - - Raises: - ValueError: If the target object is not a valid CSR/CSC matrix group. - """ - with h5py.File(file, "r") as f: - h5obj = _resolve(f, obj) - if not isinstance(h5obj, h5py.Group): - raise ValueError( - "MTX export requires a CSR/CSC matrix group (not a dataset)." - ) - - enc = _get_encoding_type(h5obj) - if enc not in ("csr_matrix", "csc_matrix"): - raise ValueError( - f"Target group encoding-type is {enc!r}; expected 'csr_matrix' or 'csc_matrix'." - ) - - data = h5obj.get("data") - indices = h5obj.get("indices") - indptr = h5obj.get("indptr") - if ( - not isinstance(data, h5py.Dataset) - or not isinstance(indices, h5py.Dataset) - or not isinstance(indptr, h5py.Dataset) - ): - raise ValueError( - "Sparse matrix group must contain datasets: data, indices, indptr" - ) - - shape = h5obj.attrs.get("shape", None) - if shape is None: - raise ValueError( - "Sparse matrix group is missing required 'shape' attribute." - ) - n_rows, n_cols = (int(shape[0]), int(shape[1])) - - field = "real" if np.issubdtype(data.dtype, np.floating) else "integer" - - # Load sparse index pointers (1 per major axis row/col); used to slice data/indices. - indptr_arr = np.asarray(indptr[...], dtype=np.int64) - nnz_ptr = int(indptr_arr[-1]) if indptr_arr.size else 0 - nnz_data = int(data.shape[0]) - nnz_idx = int(indices.shape[0]) - - # Check consistency of sparse data - if not (nnz_ptr == nnz_data == nnz_idx): - raise ValueError( - f"Sparse matrix data inconsistency: indptr implies {nnz_ptr} nonzeros, " - f"but data has {nnz_data} and indices has {nnz_idx}." 
- ) - - # Determine number of nonzero entries to write - nnz = nnz_data - major_step = max(1, int(chunk_elements)) - if head is not None and head > 0: - nnz = min(nnz_data, head) - - # Write to stdout when out is None or "-", otherwise open a file on disk. - if out is None or str(out) == "-": - out_fh = sys.stdout - else: - out.parent.mkdir(parents=True, exist_ok=True) - out_fh = open(out, "w", encoding="utf-8", newline="\n") - - use_status = out_fh is not sys.stdout - status_ctx = ( - console.status(f"[magenta]Exporting {obj} to {out}...[/]") - if use_status - else nullcontext() + with open_store(file, "r") as store: + export_mtx_format( + store.root, + obj=obj, + out=out, + head=head, + chunk_elements=chunk_elements, + in_memory=in_memory, + console=console, ) - try: - # Matrix Market header: type, generator line, then shape and nnz. - out_fh.write(f"%%MatrixMarket matrix coordinate {field} general\n") - out_fh.write("% generated by h5ad-cli\n") - if head is not None and head > 0: - out_fh.write( - f"% output limited to first {nnz}/{nnz_data} nonzero entries\n" - ) - out_fh.write(f"{n_rows} {n_cols} {nnz}\n") - - if in_memory: - with status_ctx as status: - if use_status and status: - status.update( - f"[magenta]Loading entire matrix {obj} into memory...[/]" - ) - data_arr = np.asarray(data[...]) - indices_arr = np.asarray(indices[...], dtype=np.int64) - counts = np.diff(indptr_arr) - if int(counts.sum()) != nnz_data: - raise ValueError( - "Sparse matrix indptr does not match data/indices length." 
- ) - - if enc == "csr_matrix": - major_idx = np.repeat(np.arange(n_rows, dtype=np.int64), counts) - row_idx = major_idx - col_idx = indices_arr - else: - major_idx = np.repeat(np.arange(n_cols, dtype=np.int64), counts) - row_idx = indices_arr - col_idx = major_idx - - if head is not None and head > 0: - row_idx = row_idx[:nnz] - col_idx = col_idx[:nnz] - data_arr = data_arr[:nnz] - - data_fmt = "%.18g" if field == "real" else "%d" - coords = np.column_stack((row_idx + 1, col_idx + 1, data_arr)) - if use_status and status: - status.update(f"[magenta]Saving {nnz} entries to {out}...[/]") - np.savetxt(out_fh, coords, fmt=["%d", "%d", data_fmt], newline="\n") - else: - # Iterate over major axis (rows for CSR, cols for CSC) - major = n_rows if enc == "csr_matrix" else n_cols - max_lines = head if head is not None and head > 0 else None - written = 0 - with status_ctx as status: - for major_start in range(0, major, major_step): - major_end = min(major_start + major_step, major) - if use_status and status: - status.update( - f"[magenta]Exporting {obj}: {major_start+1}-{major_end} of {major}...[/]" - ) - for major_i in range(major_start, major_end): - start = min(int(indptr_arr[major_i]), nnz_data) - end = min(int(indptr_arr[major_i + 1]), nnz_data) - if end <= start: - continue - idx = np.asarray(indices[start:end], dtype=np.int64) - vals = np.asarray(data[start:end]) - m = min(len(idx), len(vals)) - if m == 0: - raise ValueError("Sparse matrix chunk has zero length.") - if max_lines is not None: - remaining = max_lines - written - if remaining <= 0: - break - if m > remaining: - m = remaining - idx = idx[:m] - vals = vals[:m] - idx_list = idx.tolist() - vals_list = vals.tolist() - if enc == "csr_matrix": - r = major_i + 1 - lines = [ - f"{r} {c + 1} {v}\n" - for c, v in zip(idx_list, vals_list) - ] - else: - c = major_i + 1 - lines = [ - f"{r + 1} {c} {v}\n" - for r, v in zip(idx_list, vals_list) - ] - out_fh.write("".join(lines)) - written += m - if max_lines is not 
None and written >= max_lines: - break - if max_lines is not None and written >= max_lines: - break - finally: - if out_fh is not sys.stdout: - out_fh.close() - if out_fh is not sys.stdout: - console.print(f"[green]Wrote[/] {out}") def export_json( file: Path, obj: str, - out: Path, + out: Optional[Path], max_elements: int, include_attrs: bool, console: Console, ) -> None: - """Export an HDF5 group/dataset to JSON (best-effort, with size limits).""" - with h5py.File(file, "r") as f: - h5obj = _resolve(f, obj) - - # Check if exportable before attempting - _check_json_exportable(h5obj, max_elements=max_elements) - - payload = _to_jsonable( - h5obj, max_elements=max_elements, include_attrs=include_attrs + with open_store(file, "r") as store: + export_json_format( + store.root, + obj=obj, + out=out, + max_elements=max_elements, + include_attrs=include_attrs, + console=console, ) - # Write to stdout when out is None or "-", otherwise open a file on disk. - if out is None or str(out) == "-": - out_fh = sys.stdout - else: - out.parent.mkdir(parents=True, exist_ok=True) - out_fh = open(out, "w", encoding="utf-8") - try: - json.dump(payload, out_fh, indent=2, ensure_ascii=False, sort_keys=True) - out_fh.write("\n") - finally: - if out_fh is not sys.stdout: - out_fh.close() - if out_fh is not sys.stdout: - console.print(f"[green]Wrote[/] {out}") - - -def _attrs_to_jsonable( - attrs: h5py.AttributeManager, max_elements: int -) -> Dict[str, Any]: - out: Dict[str, Any] = {} - for k in attrs.keys(): - v = attrs.get(k) - out[str(k)] = _pyify(v, max_elements=max_elements) - return out - - -def _pyify(value: Any, max_elements: int) -> Any: - if isinstance(value, bytes): - try: - return value.decode("utf-8") - except Exception: - return value.decode("utf-8", errors="replace") - if isinstance(value, np.generic): - return value.item() - if isinstance(value, np.ndarray): - if value.size > max_elements: - raise ValueError( - f"Refusing to convert array of size {value.size} (> 
{max_elements}) to JSON." - ) - if np.issubdtype(value.dtype, np.bytes_) or value.dtype.kind == "O": - value = decode_str_array(value) - return value.tolist() - return value - - -def _dataset_to_jsonable(ds: h5py.Dataset, max_elements: int) -> Any: - if ds.shape == (): - v = ds[()] - return _pyify(v, max_elements=max_elements) - n = int(np.prod(ds.shape)) if ds.shape else 0 - if n > max_elements: - raise ValueError( - f"Refusing to convert dataset {ds.name!r} with {n} elements (> {max_elements}) to JSON." - ) - arr = np.asarray(ds[...]) - return _pyify(arr, max_elements=max_elements) - - -def _to_jsonable(h5obj: H5Obj, max_elements: int, include_attrs: bool) -> Any: - if isinstance(h5obj, h5py.Dataset): - return _dataset_to_jsonable(h5obj, max_elements=max_elements) - - # Group - d: Dict[str, Any] = {} - if include_attrs and len(h5obj.attrs): - d["__attrs__"] = _attrs_to_jsonable(h5obj.attrs, max_elements=max_elements) - - for key in h5obj.keys(): - child = h5obj[key] - if isinstance(child, (h5py.Group, h5py.Dataset)): - d[str(key)] = _to_jsonable( - cast(H5Obj, child), - max_elements=max_elements, - include_attrs=include_attrs, - ) - return d def export_image(file: Path, obj: str, out: Path, console: Console) -> None: - """ - Export an image-like dataset (H,W) or (H,W,C) to PNG/JPG/TIFF. - Args: - file: Path to the .h5ad file - obj: HDF5 path to the dataset - out: Output image file path - console: Rich console for status output - Raises: - ValueError: If the target object is not a valid image array. 
- """ - # Load dataset - with h5py.File(file, "r") as f: - h5obj = _resolve(f, obj) - if not isinstance(h5obj, h5py.Dataset): - raise ValueError("Image export requires a dataset.") - arr = np.asarray(h5obj[...]) - - # Validate shape - if arr.ndim not in (2, 3): - raise ValueError(f"Expected 2D or 3D image array; got shape {arr.shape}.") - if arr.ndim == 3 and arr.shape[2] not in (1, 3, 4): - raise ValueError( - f"Expected last dimension (channels) to be 1, 3, or 4; got {arr.shape}." - ) - - # Convert to uint8 for common image formats - if np.issubdtype(arr.dtype, np.floating): - amax = float(np.nanmax(arr)) if arr.size else 0.0 - if amax <= 1.0: - arr = np.clip(arr, 0.0, 1.0) * 255.0 - else: - arr = np.clip(arr, 0.0, 255.0) - arr = arr.astype(np.uint8) - elif np.issubdtype(arr.dtype, np.integer): - arr = np.clip(arr, 0, 255).astype(np.uint8) - elif arr.dtype == np.bool_: - arr = arr.astype(np.uint8) * 255 - else: - raise ValueError(f"Unsupported image dtype: {arr.dtype}") - - # If single-channel 3D, convert to 2D - if arr.ndim == 3 and arr.shape[2] == 1: - arr = arr[:, :, 0] - - # Save image - img = Image.fromarray(arr) - out.parent.mkdir(parents=True, exist_ok=True) - img.save(out) - console.print(f"[green]Wrote[/] {out}") + with open_store(file, "r") as store: + export_image_format(store.root, obj=obj, out=out, console=console) + + +__all__ = [ + "EXPORTABLE_TYPES", + "IMAGE_EXTENSIONS", + "TYPE_EXTENSIONS", + "export_image", + "export_json", + "export_mtx", + "export_npy", + "export_table", +] diff --git a/src/h5ad/commands/import_data.py b/src/h5ad/commands/import_data.py index c208a9d..dad838a 100644 --- a/src/h5ad/commands/import_data.py +++ b/src/h5ad/commands/import_data.py @@ -1,19 +1,19 @@ -"""Import command for creating/replacing objects in h5ad files.""" +"""Import command helpers for creating/replacing objects in h5ad/zarr stores.""" from __future__ import annotations -import csv -import json -import shutil from pathlib import Path -from typing import 
Any, List, Optional, Tuple, cast +from typing import Optional -import h5py -import numpy as np from rich.console import Console +from h5ad.formats.array import import_npy +from h5ad.formats.dataframe import import_dataframe +from h5ad.formats.json_data import import_json +from h5ad.formats.sparse import import_mtx +from h5ad.storage import copy_path, copy_store_contents, detect_backend, open_store + -# Map file extensions to expected input formats EXTENSION_FORMAT = { ".csv": "csv", ".npy": "npy", @@ -21,229 +21,32 @@ ".json": "json", } -# Define which object paths expect which dimensions -# obs-axis: first dimension must match n_obs -# var-axis: first dimension must match n_var -# matrix: must match (n_obs, n_var) -OBS_AXIS_PREFIXES = ("obs", "obsm/", "obsp/") -VAR_AXIS_PREFIXES = ("var", "varm/", "varp/") -MATRIX_PREFIXES = ("X", "layers/") - - -def _norm_path(p: str) -> str: - p = p.strip() - if not p: - raise ValueError("Object path must be non-empty.") - return p.lstrip("/") - - -def _get_axis_length(file: h5py.File, axis: str) -> Optional[int]: - """Get the length of obs or var axis.""" - if axis not in file: - return None - group = file[axis] - if not isinstance(group, h5py.Group): - return None - index_name = group.attrs.get("_index", None) - if index_name is None: - index_name = "obs_names" if axis == "obs" else "var_names" - if isinstance(index_name, bytes): - index_name = index_name.decode("utf-8") - if index_name not in group: - return None - dataset = group[index_name] - if isinstance(dataset, h5py.Dataset) and dataset.shape: - return int(dataset.shape[0]) - return None - -def _validate_dimensions( - file: h5py.File, - obj_path: str, - data_shape: tuple, +def _prepare_target_path( + file: Path, + output_file: Optional[Path], + inplace: bool, console: Console, -) -> None: - """Validate that data dimensions match the target path requirements.""" - n_obs = _get_axis_length(file, "obs") - n_var = _get_axis_length(file, "var") - - # Check obs/var 
replacement (dataframe) - if obj_path == "obs": - if n_obs is not None and data_shape[0] != n_obs: - raise ValueError( - f"Row count mismatch: input has {data_shape[0]} rows, " - f"but obs has {n_obs} cells." - ) - return - if obj_path == "var": - if n_var is not None and data_shape[0] != n_var: - raise ValueError( - f"Row count mismatch: input has {data_shape[0]} rows, " - f"but var has {n_var} features." - ) - return - - # Check matrix (X, layers/*) - for prefix in MATRIX_PREFIXES: - if ( - obj_path == prefix - or obj_path.startswith(prefix + "/") - or obj_path.startswith(prefix) - ): - if obj_path == "X" or obj_path.startswith("layers/"): - if len(data_shape) < 2: - raise ValueError( - f"Matrix data requires 2D shape, got {len(data_shape)}D." - ) - if n_obs is not None and data_shape[0] != n_obs: - raise ValueError( - f"First dimension mismatch: input has {data_shape[0]} rows, " - f"but obs has {n_obs} cells." - ) - if n_var is not None and data_shape[1] != n_var: - raise ValueError( - f"Second dimension mismatch: input has {data_shape[1]} columns, " - f"but var has {n_var} features." - ) - return - - # Check obs-axis matrices (obsm/*, obsp/*) - for prefix in OBS_AXIS_PREFIXES: - if obj_path.startswith(prefix) and obj_path != "obs": - if n_obs is not None and data_shape[0] != n_obs: - raise ValueError( - f"First dimension mismatch: input has {data_shape[0]} rows, " - f"but obs has {n_obs} cells." - ) - # obsp should be square n_obs x n_obs - if obj_path.startswith("obsp/") and len(data_shape) >= 2: - if data_shape[1] != n_obs: - raise ValueError( - f"obsp matrix must be square (n_obs × n_obs): " - f"got {data_shape[0]}×{data_shape[1]}, expected {n_obs}×{n_obs}." 
- ) - return - - # Check var-axis matrices (varm/*, varp/*) - for prefix in VAR_AXIS_PREFIXES: - if obj_path.startswith(prefix) and obj_path != "var": - if n_var is not None and data_shape[0] != n_var: - raise ValueError( - f"First dimension mismatch: input has {data_shape[0]} rows, " - f"but var has {n_var} features." - ) - # varp should be square n_var x n_var - if obj_path.startswith("varp/") and len(data_shape) >= 2: - if data_shape[1] != n_var: - raise ValueError( - f"varp matrix must be square (n_var × n_var): " - f"got {data_shape[0]}×{data_shape[1]}, expected {n_var}×{n_var}." - ) - return - - # For other paths (like uns/*), no dimension validation - console.print(f"[dim]Note: No dimension validation for path '{obj_path}'[/]") - - -def _read_csv( - input_file: Path, - index_column: Optional[str], -) -> Tuple[List[dict], List[str], List[str], str]: - """ - Read CSV file and return rows, column names, index values, and index column name. - - Returns: - (rows, column_names, index_values, index_column_name) - """ - with open(input_file, "r", encoding="utf-8", newline="") as f: - reader = csv.DictReader(f) - if reader.fieldnames is None: - raise ValueError("CSV file has no header.") - fieldnames = list(reader.fieldnames) - - # Determine index column - if index_column: - if index_column not in fieldnames: - raise ValueError( - f"Index column '{index_column}' not found in CSV. " - f"Available columns: {', '.join(fieldnames)}" - ) - idx_col = index_column - else: - idx_col = fieldnames[0] - - # Read all rows - rows = list(reader) - - index_values = [row[idx_col] for row in rows] - data_columns = [c for c in fieldnames if c != idx_col] - - return rows, data_columns, index_values, idx_col - - -def _read_mtx( - input_file: Path, -) -> Tuple[List[Tuple[int, int, float]], Tuple[int, int], int]: - """ - Read Matrix Market file and return sparse matrix data. 
- - Returns: - (data, indices, indptr, shape, nnz, is_csr) - """ - with open(input_file, "r", encoding="utf-8") as fh: - header = fh.readline() - if not header.startswith("%%MatrixMarket"): - raise ValueError("Invalid MTX file: missing MatrixMarket header.") - - # Parse header for field type - parts = header.lower().split() - field = "real" - for p in parts: - if p in ("real", "integer", "complex", "pattern"): - field = p - break - - # Skip comments - line = fh.readline() - while line.startswith("%"): - line = fh.readline() - - # Read dimensions - dims = line.split() - n_rows, n_cols, nnz = int(dims[0]), int(dims[1]), int(dims[2]) - - # Read entries - entries = [] - for _ in range(nnz): - parts = fh.readline().split() - r, c = int(parts[0]) - 1, int(parts[1]) - 1 - if field == "pattern": - v = 1.0 - else: - v = float(parts[2]) - entries.append((r, c, v)) - - return entries, (n_rows, n_cols), nnz - - -def _create_csr_from_entries( - entries: List[Tuple[int, int, float]], shape: Tuple[int, int] -) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Convert coordinate entries to CSR format.""" - n_rows, _ = shape - # Sort by row, then column - entries.sort(key=lambda x: (x[0], x[1])) +) -> Path: + if inplace: + return file + if output_file is None: + raise ValueError("Output file is required unless --inplace is specified.") - data = np.array([e[2] for e in entries], dtype=np.float32) - indices = np.array([e[1] for e in entries], dtype=np.int32) + src_backend = detect_backend(file) + dst_backend = detect_backend(output_file) - # Build indptr - indptr = np.zeros(n_rows + 1, dtype=np.int32) - for r, _, _ in entries: - indptr[r + 1] += 1 - indptr = np.cumsum(indptr) + if src_backend == dst_backend: + copy_path(file, output_file) + console.print(f"[dim]Copied {file} → {output_file}[/]") + return output_file - return data, indices, indptr + with open_store(file, "r") as src_store, open_store(output_file, "w") as dst_store: + copy_store_contents(src_store.root, 
dst_store.root) + console.print( + f"[dim]Converted {file} ({src_backend}) → {output_file} ({dst_backend})[/]" + ) + return output_file def import_object( @@ -255,30 +58,7 @@ def import_object( index_column: Optional[str], console: Console, ) -> None: - """ - Import data from a file into an h5ad object. - - Args: - file: Path to the source h5ad file - obj: Object path to create/replace (e.g., 'obs', 'obsm/X_pca', 'X') - input_file: Input data file (.csv, .npy, .mtx, .json) - output_file: Path to output h5ad file (None if inplace) - inplace: If True, modify the source file directly - index_column: Column to use as index for obs/var CSV import - console: Console for output - """ - # Determine target file - if inplace: - target_file = file - else: - if output_file is None: - raise ValueError("Output file is required unless --inplace is specified.") - # Copy source to output first - shutil.copy2(file, output_file) - target_file = output_file - console.print(f"[dim]Copied {file} → {output_file}[/]") - - obj = _norm_path(obj) + target_file = _prepare_target_path(file, output_file, inplace, console) ext = input_file.suffix.lower() if ext not in EXTENSION_FORMAT: @@ -289,11 +69,8 @@ def import_object( fmt = EXTENSION_FORMAT[ext] - # Validate index_column is only used for obs/var CSV if index_column and (fmt != "csv" or obj not in ("obs", "var")): - raise ValueError( - "--index-column is only valid for CSV import into 'obs' or 'var'." - ) + raise ValueError("--index-column is only valid for CSV import into 'obs' or 'var'.") if fmt == "csv": _import_csv(target_file, obj, input_file, index_column, console) @@ -312,59 +89,15 @@ def _import_csv( index_column: Optional[str], console: Console, ) -> None: - """Import CSV data into obs or var.""" - if obj not in ("obs", "var"): - raise ValueError( - f"CSV import is only supported for 'obs' or 'var', not '{obj}'." 
- ) - - rows, data_columns, index_values, _ = _read_csv(input_file, index_column) - n_rows = len(rows) - - with h5py.File(file, "a") as f: - # Validate dimensions if the file already has obs/var - _validate_dimensions(f, obj, (n_rows,), console) - - # Delete existing group if present - if obj in f: - del f[obj] - - # Create new group - group = f.create_group(obj) - index_name = "obs_names" if obj == "obs" else "var_names" - group.attrs["_index"] = index_name - group.attrs["encoding-type"] = "dataframe" - group.attrs["encoding-version"] = "0.2.0" - group.attrs["column-order"] = np.array(data_columns, dtype="S") - - # Create index dataset - group.create_dataset( - index_name, - data=np.array(index_values, dtype="S"), + with open_store(file, "a") as store: + import_dataframe( + store.root, + obj=obj, + input_file=input_file, + index_column=index_column, + console=console, ) - # Create column datasets - for col in data_columns: - values = [row[col] for row in rows] - # Try to infer type - try: - arr = np.array(values, dtype=np.float64) - group.create_dataset(col, data=arr) - except (ValueError, TypeError): - try: - arr = np.array(values, dtype=np.int64) - group.create_dataset(col, data=arr) - except (ValueError, TypeError): - # Fallback to string - arr = np.array(values, dtype="S") - ds = group.create_dataset(col, data=arr) - ds.attrs["encoding-type"] = "string-array" - ds.attrs["encoding-version"] = "0.2.0" - - console.print( - f"[green]Imported[/] {n_rows} rows × {len(data_columns)} columns into '{obj}'" - ) - def _import_npy( file: Path, @@ -372,34 +105,8 @@ def _import_npy( input_file: Path, console: Console, ) -> None: - """Import NPY data into a dataset.""" - arr = np.load(input_file) - - with h5py.File(file, "a") as f: - _validate_dimensions(f, obj, arr.shape, console) - - # Handle nested paths - parts = obj.split("/") - parent_path = "/".join(parts[:-1]) - name = parts[-1] - - # Ensure parent groups exist - if parent_path: - if parent_path not in f: - 
f.create_group(parent_path) - parent = cast(h5py.Group, f[parent_path]) - else: - parent = f - - # Delete existing if present - if name in parent: - del parent[name] - - # Create dataset - parent.create_dataset(name, data=arr) - - shape_str = "×".join(str(d) for d in arr.shape) - console.print(f"[green]Imported[/] {shape_str} array into '{obj}'") + with open_store(file, "a") as store: + import_npy(store.root, obj=obj, input_file=input_file, console=console) def _import_mtx( @@ -408,42 +115,8 @@ def _import_mtx( input_file: Path, console: Console, ) -> None: - """Import MTX (Matrix Market) data as CSR sparse matrix.""" - entries, shape, nnz = _read_mtx(input_file) - data, indices, indptr = _create_csr_from_entries(entries, shape) - - with h5py.File(file, "a") as f: - _validate_dimensions(f, obj, shape, console) - - # Handle nested paths - parts = obj.split("/") - parent_path = "/".join(parts[:-1]) - name = parts[-1] - - if parent_path: - if parent_path not in f: - f.create_group(parent_path) - parent = cast(h5py.Group, f[parent_path]) - else: - parent = f - - # Delete existing if present - if name in parent: - del parent[name] - - # Create sparse matrix group - group = parent.create_group(name) - group.attrs["encoding-type"] = "csr_matrix" - group.attrs["encoding-version"] = "0.1.0" - group.attrs["shape"] = np.array(shape, dtype=np.int64) - - group.create_dataset("data", data=data) - group.create_dataset("indices", data=indices) - group.create_dataset("indptr", data=indptr) - - console.print( - f"[green]Imported[/] {shape[0]}×{shape[1]} sparse matrix ({nnz} non-zero) into '{obj}'" - ) + with open_store(file, "a") as store: + import_mtx(store.root, obj=obj, input_file=input_file, console=console) def _import_json( @@ -452,60 +125,5 @@ def _import_json( input_file: Path, console: Console, ) -> None: - """Import JSON data into uns or other dict-like groups.""" - with open(input_file, "r", encoding="utf-8") as fh: - payload = json.load(fh) - - with h5py.File(file, "a") 
as f: - # Handle nested paths - parts = obj.split("/") - parent_path = "/".join(parts[:-1]) - name = parts[-1] - - if parent_path: - if parent_path not in f: - f.create_group(parent_path) - parent = cast(h5py.Group, f[parent_path]) - else: - parent = f - - # Delete existing if present - if name in parent: - del parent[name] - - # Create from JSON - _write_json_to_h5(parent, name, payload) - - console.print(f"[green]Imported[/] JSON data into '{obj}'") - - -def _write_json_to_h5(parent: h5py.Group, name: str, value: Any) -> None: - """Recursively write JSON-like data to HDF5.""" - if isinstance(value, dict): - group = parent.create_group(name) - for k, v in value.items(): - _write_json_to_h5(group, k, v) - elif isinstance(value, list): - # Try to convert to array - try: - arr = np.array(value) - if arr.dtype.kind in ("U", "O"): - arr = np.array(value, dtype="S") - parent.create_dataset(name, data=arr) - except (ValueError, TypeError): - # Fallback: store as JSON string - parent.create_dataset(name, data=json.dumps(value).encode("utf-8")) - elif isinstance(value, str): - parent.create_dataset(name, data=np.array([value], dtype="S")) - elif isinstance(value, bool): - parent.create_dataset(name, data=np.array(value, dtype=bool)) - elif isinstance(value, int): - parent.create_dataset(name, data=np.array(value, dtype=np.int64)) - elif isinstance(value, float): - parent.create_dataset(name, data=np.array(value, dtype=np.float64)) - elif value is None: - # Store None as empty string attribute or special marker - ds = parent.create_dataset(name, data=np.array([], dtype="S")) - ds.attrs["_is_none"] = True - else: - raise ValueError(f"Cannot convert JSON value of type {type(value).__name__}") + with open_store(file, "a") as store: + import_json(store.root, obj=obj, input_file=input_file, console=console) diff --git a/src/h5ad/commands/info.py b/src/h5ad/commands/info.py index 11bd11d..b58b5b0 100644 --- a/src/h5ad/commands/info.py +++ b/src/h5ad/commands/info.py @@ -1,11 
+1,12 @@ from pathlib import Path -from typing import Optional, Union +from typing import Any, Optional -import h5py import rich from rich.console import Console from rich.tree import Tree -from h5ad.info import axis_len, get_entry_type, format_type_info + +from h5ad.core.info import axis_len, format_type_info, get_entry_type +from h5ad.storage import is_dataset, is_group, open_store # Preferred display order for top-level keys KEY_ORDER = ["X", "obs", "var", "obsm", "varm", "layers", "obsp", "varp", "uns"] @@ -33,7 +34,8 @@ def show_info( depth (Optional[int]): Maximum recursion depth for type display (only with show_types=True) entry_path (Optional[str]): Specific entry path to inspect (e.g., 'obsm/X_pca') """ - with h5py.File(file, "r") as f: + with open_store(file, "r") as store: + f = store.root # If a specific path is requested, show detailed info for that object if entry_path: _show_object_info(f, entry_path, console) @@ -47,13 +49,13 @@ def show_info( ) if show_types: - _show_types_tree(f, console, depth=depth) + _show_types_tree(f, console, root_label=str(file), depth=depth) else: # List top-level keys and their sub-keys (original behavior) for key in _sort_keys(list(f.keys())): obj = f[key] # Only process Groups, skip Datasets like X - if isinstance(obj, h5py.Group): + if is_group(obj): sub_keys = [ k for k in obj.keys() if k not in ("_index", "__categories") ] @@ -65,7 +67,7 @@ def show_info( def _show_types_tree( - f: h5py.File, console: Console, depth: Optional[int] = None + f: Any, console: Console, root_label: str, depth: Optional[int] = None ) -> None: """Show a tree view with type information for all entries. 
@@ -75,7 +77,7 @@ def _show_types_tree( - obsm/obsp/varm/varp/layers: 1 level (show matrices) - uns: 2 levels deep """ - tree = Tree(f"[bold]{f.filename}[/]") + tree = Tree(f"[bold]{root_label}[/]") # Define max depth for each top-level group max_depth_map = { @@ -93,14 +95,14 @@ def _show_types_tree( def add_node( parent_tree: Tree, name: str, - obj: Union[h5py.Group, h5py.Dataset], + obj: Any, current_depth: int, max_depth: int, ) -> None: info = get_entry_type(obj) type_str = format_type_info(info) - if isinstance(obj, h5py.Dataset): + if is_dataset(obj): shape_str = f"[dim]{obj.shape}[/]" if obj.shape else "" node_text = f"[bright_white]{name}[/] {shape_str} {type_str}" parent_tree.add(node_text) @@ -123,7 +125,7 @@ def add_node( for key in _sort_keys(list(f.keys())): obj = f[key] # Skip empty groups - if isinstance(obj, h5py.Group): + if is_group(obj): children = [k for k in obj.keys() if k not in ("_index", "__categories")] if not children: continue @@ -135,7 +137,7 @@ def add_node( console.print(tree) -def _show_object_info(f: h5py.File, entry_path: str, console: Console) -> None: +def _show_object_info(f: Any, entry_path: str, console: Console) -> None: """Show detailed info for a specific object path.""" # Normalize path entry_path = entry_path.strip().lstrip("/") @@ -171,7 +173,7 @@ def _show_object_info(f: h5py.File, entry_path: str, console: Console) -> None: console.print(f" [dim]{k}:[/] {v_str}") # If it's a group, show children - if isinstance(entry, h5py.Group): + if is_group(entry): children = [k for k in entry.keys() if k not in ("_index", "__categories")] if children: console.print(f"\n[bold cyan]Children:[/]") diff --git a/src/h5ad/commands/subset.py b/src/h5ad/commands/subset.py index ff20d6b..940ef07 100644 --- a/src/h5ad/commands/subset.py +++ b/src/h5ad/commands/subset.py @@ -1,686 +1,17 @@ -"""Subset operations for .h5ad files.""" - -from __future__ import annotations - -from pathlib import Path -from typing import Optional, Set, Tuple, 
List, Dict, Any - -import h5py -import numpy as np -import typer -from rich.console import Console -from rich.progress import ( - Progress, - SpinnerColumn, - TextColumn, - BarColumn, - TaskProgressColumn, - TimeElapsedColumn, +from h5ad.core.subset import ( + _read_name_file, + indices_from_name_set, + subset_axis_group, + subset_dense_matrix, + subset_h5ad, + subset_sparse_matrix_group, ) -from h5ad.read import decode_str_array - - -def _copy_attrs(src: h5py.AttributeManager, dst: h5py.AttributeManager) -> None: - """ - Copy HDF5 attributes from source to destination. - Args: - src (h5py.AttributeManager): Source attributes - dst (h5py.AttributeManager): Destination attributes - """ - for k, v in src.items(): - dst[k] = v - - -def _ds_create_kwargs(src: h5py.Dataset) -> Dict[str, Any]: - """ - Best-effort carryover of dataset creation properties. - (h5py doesn't expose everything perfectly; this covers the big ones.) - - Args: - src (h5py.Dataset): Source dataset - Returns: - Dict[str, Any]: Dataset creation keyword arguments - """ - kw: Dict[str, Any] = {} - if src.chunks is not None: - kw["chunks"] = src.chunks - if src.compression is not None: - kw["compression"] = src.compression - kw["compression_opts"] = src.compression_opts - kw["shuffle"] = bool(src.shuffle) - kw["fletcher32"] = bool(src.fletcher32) - if src.scaleoffset is not None: - kw["scaleoffset"] = src.scaleoffset - if src.fillvalue is not None: - kw["fillvalue"] = src.fillvalue - return kw - - -def _read_name_file(path: Path) -> Set[str]: - """ - Read one name per line from a file. Blank lines ignored. - """ - names: Set[str] = set() - with open(path, "r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if line: - names.add(line) - return names - - -def indices_from_name_set( - names_ds: h5py.Dataset, - keep: Set[str], - *, - chunk_size: int = 200_000, -) -> Tuple[np.ndarray, Set[str]]: - """ - Returns (indices_sorted, missing_names). - Chunked scan so we don't do names_ds[...] 
for huge datasets. - - Args: - names_ds (h5py.Dataset): Dataset containing names - keep (Set[str]): Set of names to find - chunk_size (int): Number of names to read per chunk - - Returns: - Tuple[np.ndarray, Set[str]]: (Array of found indices, set of missing names) - """ - if names_ds.ndim != 1: - # common h5ad uses 1D obs_names/var_names - flat_len = int(np.prod(names_ds.shape)) - else: - flat_len = names_ds.shape[0] - - remaining = set(keep) # we'll delete as we find - found_indices: List[int] = [] - - for start in range(0, flat_len, chunk_size): - end = min(start + chunk_size, flat_len) - chunk = names_ds[start:end] - chunk = decode_str_array(np.asarray(chunk)).astype(str) - - for i, name in enumerate(chunk): - if name in remaining: - found_indices.append(start + i) - remaining.remove(name) - - if not remaining: - break - - return np.asarray(found_indices, dtype=np.int64), remaining - - -def subset_axis_group( - src: h5py.Group, - dst: h5py.Group, - indices: Optional[np.ndarray], -) -> None: - """ - Subset obs/var group: - - datasets: subset along first axis (obj[indices, ...]) - - categorical groups: copy categories, subset codes - - unknown groups: copy as-is if indices is None; otherwise copy conservatively - - Args: - src (h5py.Group): Source axis group - dst (h5py.Group): Destination axis group - indices (Optional[np.ndarray]): Indices to keep; if None, copy as-is - """ - _copy_attrs(src.attrs, dst.attrs) - - for key in src.keys(): - obj = src[key] - - if isinstance(obj, h5py.Dataset): - if indices is None: - src.copy(key, dst, name=key) - else: - data = obj[indices, ...] 
- ds = dst.create_dataset(key, data=data) - _copy_attrs(obj.attrs, ds.attrs) - - elif isinstance(obj, h5py.Group): - enc = obj.attrs.get("encoding-type", b"") - if isinstance(enc, bytes): - enc = enc.decode("utf-8") - - if enc == "categorical": - gdst = dst.create_group(key) - _copy_attrs(obj.attrs, gdst.attrs) - obj.copy("categories", gdst, name="categories") - - codes = obj["codes"] - if indices is None: - obj.copy("codes", gdst, name="codes") - else: - codes_sub = codes[indices, ...] - ds = gdst.create_dataset("codes", data=codes_sub) - _copy_attrs(codes.attrs, ds.attrs) - else: - # Unknown group type - copy as-is - src.copy(key, dst, name=key) - - -def subset_dense_matrix( - src: h5py.Dataset, - dst_parent: h5py.Group, - name: str, - obs_idx: Optional[np.ndarray], - var_idx: Optional[np.ndarray], - *, - chunk_rows: int = 1024, -) -> None: - """ - Chunked write for dense 2D datasets. - Args: - src (h5py.Dataset): Source dense matrix dataset - dst_parent (h5py.Group): Destination parent group - name (str): Name for the destination dataset - obs_idx (Optional[np.ndarray]): Indices of observations to keep - var_idx (Optional[np.ndarray]): Indices of variables to keep - chunk_rows (int): Number of rows to read per chunk - """ - if src.ndim != 2: - # fallback: copy whole dataset - src.parent.copy(src.name.split("/")[-1], dst_parent, name=name) - return - - n_obs, n_var = src.shape - out_obs = len(obs_idx) if obs_idx is not None else n_obs - out_var = len(var_idx) if var_idx is not None else n_var - - kw = _ds_create_kwargs(src) - # adjust chunks to output shape if possible - if "chunks" in kw and kw["chunks"] is not None: - c0, c1 = kw["chunks"] - kw["chunks"] = (min(c0, out_obs), min(c1, out_var)) - - dst = dst_parent.create_dataset( - name, shape=(out_obs, out_var), dtype=src.dtype, **kw - ) - _copy_attrs(src.attrs, dst.attrs) - - # Write in blocks of output rows - for out_start in range(0, out_obs, chunk_rows): - out_end = min(out_start + chunk_rows, out_obs) - - 
if obs_idx is None: - block = src[out_start:out_end, :] - else: - rows = obs_idx[out_start:out_end] - block = src[rows, :] - - if var_idx is not None: - block = block[:, var_idx] - - dst[out_start:out_end, :] = block - - -def subset_sparse_matrix_group( - src: h5py.Group, - dst_parent: h5py.Group, - name: str, - obs_idx: Optional[np.ndarray], - var_idx: Optional[np.ndarray], -) -> None: - """ - Subset a sparse matrix stored as an h5ad group with datasets: - - data, indices, indptr - Supports both CSR (Compressed Sparse Row) and CSC (Compressed Sparse Column) formats. - - CSR: rows are compressed, efficient for row-wise operations - CSC: columns are compressed, efficient for column-wise operations - - Args: - src (h5py.Group): Source sparse matrix group - dst_parent (h5py.Group): Destination parent group - name (str): Name for the destination group - obs_idx (Optional[np.ndarray]): Indices of observations to keep - var_idx (Optional[np.ndarray]): Indices of variables to keep - """ - data = src["data"] - indices = src["indices"] - indptr = src["indptr"] - - # Determine format - encoding = src.attrs.get("encoding-type", b"") - if isinstance(encoding, bytes): - encoding = encoding.decode("utf-8") - - is_csr = encoding == "csr_matrix" - is_csc = encoding == "csc_matrix" - - if not is_csr and not is_csc: - raise ValueError(f"Unsupported sparse format: {encoding}") - - # Determine shape - shape = src.attrs.get("shape", None) - if shape is None: - # fallback: infer from indptr len and max index - major_dim = indptr.shape[0] - 1 - minor_dim = int(indices[...].max()) + 1 if indices.size else 0 - if is_csr: - n_obs, n_var = major_dim, minor_dim - else: # CSC - n_obs, n_var = minor_dim, major_dim - else: - n_obs, n_var = shape - - # For CSR: major axis = obs (rows), minor axis = var (cols) - # For CSC: major axis = var (cols), minor axis = obs (rows) - if is_csr: - major_idx = obs_idx if obs_idx is not None else np.arange(n_obs, dtype=np.int64) - minor_idx = var_idx - out_obs 
= major_idx.shape[0] - out_var = minor_idx.shape[0] if minor_idx is not None else n_var - else: # CSC - major_idx = var_idx if var_idx is not None else np.arange(n_var, dtype=np.int64) - minor_idx = obs_idx - out_obs = minor_idx.shape[0] if minor_idx is not None else n_obs - out_var = major_idx.shape[0] - - # Build minor axis remap if needed - minor_map = None - out_minor_dim = out_var if is_csr else out_obs - total_minor_dim = n_var if is_csr else n_obs - - if minor_idx is not None: - # array remap is fastest; if dimension is huge and memory matters, use dict instead - minor_map = np.full(total_minor_dim, -1, dtype=np.int64) - minor_map[minor_idx] = np.arange(minor_idx.shape[0], dtype=np.int64) - - # Pass 1: count nnz in output to preallocate - out_counts = np.zeros(len(major_idx), dtype=np.int64) - for i, major_pos in enumerate(major_idx): - s = int(indptr[major_pos]) - e = int(indptr[major_pos + 1]) - if s == e: - continue - minor_indices = indices[s:e] - if minor_map is None: - out_counts[i] = e - s - else: - mask = minor_map[minor_indices] >= 0 - out_counts[i] = mask.sum() - - out_indptr = np.zeros(len(major_idx) + 1, dtype=indptr.dtype) - np.cumsum(out_counts, out=out_indptr[1:]) - out_nnz = int(out_indptr[-1]) - - # Preallocate output arrays - out_data = np.empty(out_nnz, dtype=data.dtype) - out_indices = np.empty(out_nnz, dtype=indices.dtype) - - # Pass 2: fill - cursor = 0 - for i, major_pos in enumerate(major_idx): - s = int(indptr[major_pos]) - e = int(indptr[major_pos + 1]) - if s == e: - continue - - minor_indices = indices[s:e] - vals = data[s:e] - - if minor_map is None: - length = e - s - out_indices[cursor : cursor + length] = minor_indices - out_data[cursor : cursor + length] = vals - cursor += length - else: - mask = minor_map[minor_indices] >= 0 - new_minor = minor_map[minor_indices[mask]] - new_vals = vals[mask] - length = len(new_minor) - out_indices[cursor : cursor + length] = new_minor - out_data[cursor : cursor + length] = new_vals - cursor 
+= length - - # Create dst group - gdst = dst_parent.create_group(name) - _copy_attrs(src.attrs, gdst.attrs) - gdst.attrs["shape"] = (out_obs, out_var) - # Write encoding-type as bytes to match h5ad standard - gdst.attrs["encoding-type"] = ( - encoding.encode("utf-8") if isinstance(encoding, str) else encoding - ) - - # Write datasets (best-effort preserve compression/etc.) - # Adjust chunks to not exceed output size - data_kw = _ds_create_kwargs(data) - if "chunks" in data_kw and data_kw["chunks"] is not None: - data_kw["chunks"] = (min(data_kw["chunks"][0], out_nnz),) - d_data = gdst.create_dataset("data", data=out_data, **data_kw) - _copy_attrs(data.attrs, d_data.attrs) - - indices_kw = _ds_create_kwargs(indices) - if "chunks" in indices_kw and indices_kw["chunks"] is not None: - indices_kw["chunks"] = (min(indices_kw["chunks"][0], out_nnz),) - d_indices = gdst.create_dataset("indices", data=out_indices, **indices_kw) - _copy_attrs(indices.attrs, d_indices.attrs) - - indptr_kw = _ds_create_kwargs(indptr) - if "chunks" in indptr_kw and indptr_kw["chunks"] is not None: - indptr_kw["chunks"] = (min(indptr_kw["chunks"][0], len(out_indptr)),) - d_indptr = gdst.create_dataset("indptr", data=out_indptr, **indptr_kw) - _copy_attrs(indptr.attrs, d_indptr.attrs) - - -def subset_matrix_like( - src_obj: h5py.Dataset | h5py.Group, - dst_parent: h5py.Group, - name: str, - obs_idx: Optional[np.ndarray], - var_idx: Optional[np.ndarray], - *, - chunk_rows: int = 1024, -) -> None: - """ - Dispatch for dense dataset vs sparse (csr/csc) group. 
- Args: - src_obj (h5py.Dataset | h5py.Group): Source dataset or group - dst_parent (h5py.Group): Destination parent group - name (str): Name for the destination dataset/group - obs_idx (Optional[np.ndarray]): Indices of observations to keep - var_idx (Optional[np.ndarray]): Indices of variables to keep - chunk_rows (int): Number of rows to read per chunk when subsetting dense matrices - """ - if isinstance(src_obj, h5py.Dataset): - subset_dense_matrix( - src_obj, dst_parent, name, obs_idx, var_idx, chunk_rows=chunk_rows - ) - return - - # group - enc = src_obj.attrs.get("encoding-type", b"") - if isinstance(enc, bytes): - enc = enc.decode("utf-8") - - if enc in ("csr_matrix", "csc_matrix"): - subset_sparse_matrix_group(src_obj, dst_parent, name, obs_idx, var_idx) - else: - # unknown sparse type -> copy as-is (or raise) - src_obj.file.copy(src_obj, dst_parent, name) - - -def subset_h5ad( - file: Path, - output: Path, - obs_file: Optional[Path], - var_file: Optional[Path], - *, - chunk_rows: int = 1024, - console: Console, -) -> None: - """ - Subset an h5ad file by obs and/or var names. 
- Args: - file (Path): Input h5ad file path - output (Path): Output h5ad file path - obs_file (Optional[Path]): File with obs names to keep (one per line) - var_file (Optional[Path]): File with var names to keep (one per line) - chunk_rows (int): Number of rows to read per chunk when subsetting dense matrices - console (Console): Rich console for output - """ - # ---- Read keep-lists - obs_keep: Optional[Set[str]] = None - if obs_file is not None: - obs_keep = _read_name_file(obs_file) - console.print(f"[cyan]Found {len(obs_keep)} obs names to keep[/]") - - var_keep: Optional[Set[str]] = None - if var_file is not None: - var_keep = _read_name_file(var_file) - console.print(f"[cyan]Found {len(var_keep)} var names to keep[/]") - - if obs_keep is None and var_keep is None: - console.print( - "[bold red]Error:[/] At least one of [cyan]--obs[/] or [cyan]--var[/] must be provided.", - ) - raise typer.Exit(code=1) - - # ---- Open files - with console.status("[magenta]Opening files...[/]"): - src = h5py.File(file, "r") - dst = h5py.File(output, "w") - - try: - # ---- Compute indices - obs_idx = None - if obs_keep is not None: - console.print("[cyan]Matching obs names...[/]") - obs_names_ds = src["obs"].get("obs_names") or src["obs"].get( - src["obs"].attrs.get("_index", "obs_names") - ) - if obs_names_ds is None: - console.print("[bold red]Error:[/] Could not find obs names") - raise KeyError("Could not find obs names") - - obs_idx, missing_obs = indices_from_name_set(obs_names_ds, obs_keep) - if missing_obs: - console.print( - f"[yellow]Warning: {len(missing_obs)} obs names not found in file[/]" - ) - console.print( - f"[green]Selected {len(obs_idx)} obs (of {obs_names_ds.shape[0]})[/]" - ) - - var_idx = None - if var_keep is not None: - console.print("[cyan]Matching var names...[/]") - var_names_ds = src["var"].get("var_names") or src["var"].get( - src["var"].attrs.get("_index", "var_names") - ) - if var_names_ds is None: - console.print("[bold red]Error:[/] Could not 
find var names") - raise KeyError("Could not find var names") - - var_idx, missing_var = indices_from_name_set(var_names_ds, var_keep) - if missing_var: - console.print( - f"[yellow]Warning: {len(missing_var)} var names not found in file[/]" - ) - console.print( - f"[green]Selected {len(var_idx)} var (of {var_names_ds.shape[0]})[/]" - ) - - # ---- Build task list - tasks: List[str] = [] - if "obs" in src: - tasks.append("obs") - if "var" in src: - tasks.append("var") - if "X" in src: - tasks.append("X") - if "layers" in src: - tasks.extend([f"layer:{k}" for k in src["layers"].keys()]) - if "obsm" in src: - tasks.extend([f"obsm:{k}" for k in src["obsm"].keys()]) - if "varm" in src: - tasks.extend([f"varm:{k}" for k in src["varm"].keys()]) - if "obsp" in src: - tasks.extend([f"obsp:{k}" for k in src["obsp"].keys()]) - if "varp" in src: - tasks.extend([f"varp:{k}" for k in src["varp"].keys()]) - if "uns" in src: - tasks.append("uns") - - # ---- Progress bar for all operations - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TaskProgressColumn(), - TimeElapsedColumn(), - console=console, - ) as progress: - task_id = progress.add_task("[cyan]Subsetting...", total=len(tasks)) - processed_top: Set[str] = set() - - # obs - if "obs" in src: - progress.update(task_id, description="[cyan]Subsetting obs...[/]") - obs_dst = dst.create_group("obs") - subset_axis_group(src["obs"], obs_dst, obs_idx) - processed_top.add("obs") - progress.advance(task_id) - - # var - if "var" in src: - progress.update(task_id, description="[cyan]Subsetting var...[/]") - var_dst = dst.create_group("var") - subset_axis_group(src["var"], var_dst, var_idx) - processed_top.add("var") - progress.advance(task_id) - - # X - if "X" in src: - progress.update(task_id, description="[cyan]Subsetting X...[/]") - subset_matrix_like( - src["X"], dst, "X", obs_idx, var_idx, chunk_rows=chunk_rows - ) - processed_top.add("X") - progress.advance(task_id) - - 
# layers - if "layers" in src: - layers_dst = dst.create_group("layers") - processed_top.add("layers") - for lname in src["layers"].keys(): - progress.update( - task_id, description=f"[cyan]Subsetting layer: {lname}...[/]" - ) - subset_matrix_like( - src["layers"][lname], - layers_dst, - lname, - obs_idx, - var_idx, - chunk_rows=chunk_rows, - ) - progress.advance(task_id) - - # obsm - if "obsm" in src: - obsm_dst = dst.create_group("obsm") - processed_top.add("obsm") - for k in src["obsm"].keys(): - if obs_idx is None: - progress.update( - task_id, description=f"[cyan]Copying obsm: {k}...[/]" - ) - src["obsm"].copy(k, obsm_dst, name=k) - else: - progress.update( - task_id, description=f"[cyan]Subsetting obsm: {k}...[/]" - ) - obj = src["obsm"][k] - if isinstance(obj, h5py.Dataset): - data = obj[obs_idx, ...] - obsm_dst.create_dataset(k, data=data) - for ak, av in obj.attrs.items(): - obsm_dst[k].attrs[ak] = av - else: - subset_matrix_like( - obj, obsm_dst, k, obs_idx, None, chunk_rows=chunk_rows - ) - progress.advance(task_id) - - # varm - if "varm" in src: - varm_dst = dst.create_group("varm") - processed_top.add("varm") - for k in src["varm"].keys(): - if var_idx is None: - progress.update( - task_id, description=f"[cyan]Copying varm: {k}...[/]" - ) - src["varm"].copy(k, varm_dst, name=k) - else: - progress.update( - task_id, description=f"[cyan]Subsetting varm: {k}...[/]" - ) - obj = src["varm"][k] - if isinstance(obj, h5py.Dataset): - data = obj[var_idx, ...] 
- varm_dst.create_dataset(k, data=data) - for ak, av in obj.attrs.items(): - varm_dst[k].attrs[ak] = av - else: - subset_matrix_like( - obj, varm_dst, k, var_idx, None, chunk_rows=chunk_rows - ) - progress.advance(task_id) - - # obsp - if "obsp" in src: - obsp_dst = dst.create_group("obsp") - processed_top.add("obsp") - for k in src["obsp"].keys(): - if obs_idx is None: - progress.update( - task_id, description=f"[cyan]Copying obsp: {k}...[/]" - ) - src["obsp"].copy(k, obsp_dst, name=k) - else: - progress.update( - task_id, description=f"[cyan]Subsetting obsp: {k}...[/]" - ) - subset_matrix_like( - src["obsp"][k], - obsp_dst, - k, - obs_idx, - obs_idx, - chunk_rows=chunk_rows, - ) - progress.advance(task_id) - - # varp - if "varp" in src: - varp_dst = dst.create_group("varp") - processed_top.add("varp") - for k in src["varp"].keys(): - if var_idx is None: - progress.update( - task_id, description=f"[cyan]Copying varp: {k}...[/]" - ) - src["varp"].copy(k, varp_dst, name=k) - else: - progress.update( - task_id, description=f"[cyan]Subsetting varp: {k}...[/]" - ) - subset_matrix_like( - src["varp"][k], - varp_dst, - k, - var_idx, - var_idx, - chunk_rows=chunk_rows, - ) - progress.advance(task_id) - - # uns - if "uns" in src: - progress.update(task_id, description="[cyan]Copying uns...[/]") - src.copy("uns", dst) - processed_top.add("uns") - progress.advance(task_id) - - # copy any remaining top-level keys - for key in src.keys(): - if key not in processed_top: - src.copy(key, dst) - - # top-level attrs - for ak, av in src.attrs.items(): - dst.attrs[ak] = av - - console.print(f"[bold green]✓ Successfully created {output}[/]") - - finally: - dst.close() - src.close() +__all__ = [ + "_read_name_file", + "indices_from_name_set", + "subset_axis_group", + "subset_dense_matrix", + "subset_h5ad", + "subset_sparse_matrix_group", +] From 4b09cf50baf81c1991e251fce0f9e48320749f2b Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:32:27 +0000 Subject: [PATCH 39/62] 
HUGE REFACTOR:Add initial implementation of Store class and backend detection for HDF5 and Zarr --- src/h5ad/storage/__init__.py | 267 +++++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 src/h5ad/storage/__init__.py diff --git a/src/h5ad/storage/__init__.py b/src/h5ad/storage/__init__.py new file mode 100644 index 0000000..0b652d5 --- /dev/null +++ b/src/h5ad/storage/__init__.py @@ -0,0 +1,267 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable, Optional, Sequence +import shutil + +import h5py + +try: + import zarr +except Exception: # pragma: no cover - optional dependency + zarr = None + +import numpy as np + + +@dataclass +class Store: + backend: str + root: Any + path: Path + + def close(self) -> None: + if self.backend == "hdf5": + try: + self.root.close() + except Exception: + return + + def __enter__(self) -> "Store": + return self + + def __exit__(self, exc_type, exc, tb) -> None: + self.close() + + +def _require_zarr() -> None: + if zarr is None: # pragma: no cover - optional dependency + raise ImportError( + "zarr is required for .zarr support. 
Install with: uv sync --extra zarr" + ) + + +def is_hdf5_group(obj: Any) -> bool: + return isinstance(obj, (h5py.File, h5py.Group)) + + +def is_hdf5_dataset(obj: Any) -> bool: + return isinstance(obj, h5py.Dataset) + + +def is_zarr_group(obj: Any) -> bool: + return zarr is not None and isinstance(obj, zarr.Group) + + +def is_zarr_array(obj: Any) -> bool: + return zarr is not None and isinstance(obj, zarr.Array) + + +def is_group(obj: Any) -> bool: + return is_hdf5_group(obj) or is_zarr_group(obj) + + +def is_dataset(obj: Any) -> bool: + return is_hdf5_dataset(obj) or is_zarr_array(obj) + + +def is_zarr_path(path: Path) -> bool: + if not path.exists() or not path.is_dir(): + return False + if (path / "zarr.json").exists(): + return True + if (path / ".zgroup").exists() or (path / ".zattrs").exists(): + return True + return False + + +def detect_backend(path: Path) -> str: + if path.exists(): + if path.is_dir(): + if is_zarr_path(path): + return "zarr" + raise ValueError( + f"Path '{path}' is a directory but does not look like a Zarr store." 
+ ) + return "hdf5" + if path.suffix == ".zarr": + return "zarr" + return "hdf5" + + +def open_store(path: Path, mode: str) -> Store: + path = Path(path) + backend = detect_backend(path) + if backend == "zarr": + _require_zarr() + root = zarr.open_group(str(path), mode=mode) + return Store(backend="zarr", root=root, path=path) + root = h5py.File(path, mode) + return Store(backend="hdf5", root=root, path=path) + + +def _normalize_attr_value(value: Any, target_backend: str) -> Any: + if target_backend == "zarr": + if isinstance(value, bytes): + return value.decode("utf-8") + if isinstance(value, (list, tuple)): + return [ + v.decode("utf-8") if isinstance(v, bytes) else v for v in value + ] + if isinstance(value, np.ndarray): + if value.dtype.kind in ("S", "O"): + return [ + v.decode("utf-8") if isinstance(v, bytes) else v + for v in value.tolist() + ] + return value.tolist() + if isinstance(value, np.generic): + return value.item() + return value + + +def copy_attrs(src_attrs: Any, dst_attrs: Any, *, target_backend: str) -> None: + for k, v in src_attrs.items(): + dst_attrs[k] = _normalize_attr_value(v, target_backend) + + +def dataset_create_kwargs(src: Any, *, target_backend: str) -> dict: + kw: dict = {} + chunks = getattr(src, "chunks", None) + if chunks is not None: + kw["chunks"] = chunks + if target_backend == "hdf5" and is_hdf5_dataset(src): + if src.compression is not None: + kw["compression"] = src.compression + kw["compression_opts"] = src.compression_opts + kw["shuffle"] = bool(src.shuffle) + kw["fletcher32"] = bool(src.fletcher32) + if src.scaleoffset is not None: + kw["scaleoffset"] = src.scaleoffset + if src.fillvalue is not None: + kw["fillvalue"] = src.fillvalue + if target_backend == "zarr" and is_zarr_array(src): + src_zarr_format = getattr(getattr(src, "metadata", None), "zarr_format", None) + if src_zarr_format == 3: + compressors = None + try: + compressors = getattr(src, "compressors", None) + except Exception: + compressors = None + if 
compressors is not None: + kw["compressors"] = compressors + else: + try: + compressor = getattr(src, "compressor", None) + except Exception: + compressor = None + if compressor is not None: + kw["compressor"] = compressor + try: + filters = getattr(src, "filters", None) + except Exception: + filters = None + if filters is not None: + kw["filters"] = filters + try: + fill_value = getattr(src, "fill_value", None) + except Exception: + fill_value = None + if fill_value is not None: + kw["fill_value"] = fill_value + return kw + + +def create_dataset( + parent: Any, + name: str, + *, + data: Any = None, + shape: Optional[Sequence[int]] = None, + dtype: Any = None, + **kwargs: Any, +) -> Any: + if is_zarr_group(parent): + zarr_format = getattr(getattr(parent, "metadata", None), "zarr_format", None) + if zarr_format == 3: + kwargs = dict(kwargs) + kwargs.pop("compressor", None) + elif zarr_format == 2 and "compressors" in kwargs and "compressor" not in kwargs: + kwargs = dict(kwargs) + compressors = kwargs.pop("compressors") + if isinstance(compressors, (list, tuple)) and len(compressors) == 1: + kwargs["compressor"] = compressors[0] + if data is not None: + return parent.create_array(name, data=data, **kwargs) + return parent.create_array(name, shape=shape, dtype=dtype, **kwargs) + if data is not None: + return parent.create_dataset(name, data=data, **kwargs) + return parent.create_dataset(name, shape=shape, dtype=dtype, **kwargs) + + +def _chunk_step(shape: Sequence[int], chunks: Optional[Sequence[int]]) -> int: + if chunks is not None and len(chunks) > 0 and chunks[0]: + return int(chunks[0]) + if not shape: + return 1 + return max(1, min(1024, int(shape[0]))) + + +def copy_dataset(src: Any, dst_group: Any, name: str) -> Any: + shape = tuple(src.shape) if getattr(src, "shape", None) is not None else () + target_backend = "zarr" if is_zarr_group(dst_group) else "hdf5" + ds = create_dataset( + dst_group, + name, + shape=shape, + dtype=src.dtype, + 
**dataset_create_kwargs(src, target_backend=target_backend), + ) + copy_attrs(src.attrs, ds.attrs, target_backend=target_backend) + + if shape == (): + ds[()] = src[()] + return ds + + step = _chunk_step(shape, getattr(src, "chunks", None)) + for start in range(0, shape[0], step): + end = min(start + step, shape[0]) + if len(shape) == 1: + ds[start:end] = src[start:end] + else: + ds[start:end, ...] = src[start:end, ...] + return ds + + +def copy_tree(src_obj: Any, dst_group: Any, name: str, *, exclude: Iterable[str] = ()) -> Any: + if is_dataset(src_obj): + return copy_dataset(src_obj, dst_group, name) + if not is_group(src_obj): + raise TypeError(f"Unsupported object type for copy: {type(src_obj)}") + + target_backend = "zarr" if is_zarr_group(dst_group) else "hdf5" + grp = dst_group.create_group(name) + copy_attrs(src_obj.attrs, grp.attrs, target_backend=target_backend) + for key in src_obj.keys(): + if key in exclude: + continue + child = src_obj[key] + copy_tree(child, grp, key, exclude=exclude) + return grp + + +def copy_store_contents(src_root: Any, dst_root: Any) -> None: + for key in src_root.keys(): + copy_tree(src_root[key], dst_root, key) + + +def copy_path(src: Path, dst: Path) -> None: + src = Path(src) + dst = Path(dst) + if is_zarr_path(src): + if dst.exists(): + raise FileExistsError(f"Destination '{dst}' already exists.") + shutil.copytree(src, dst) + return + shutil.copy2(src, dst) From 55558f9fcf472ee2bbc06128ce6ae25ac0ae85dd Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:32:33 +0000 Subject: [PATCH 40/62] HUGE REFACTOR:Add utility functions for path normalization in h5ad modules --- src/h5ad/util/__init__.py | 1 + src/h5ad/util/path.py | 9 +++++++++ 2 files changed, 10 insertions(+) create mode 100644 src/h5ad/util/__init__.py create mode 100644 src/h5ad/util/path.py diff --git a/src/h5ad/util/__init__.py b/src/h5ad/util/__init__.py new file mode 100644 index 0000000..364e184 --- /dev/null +++ b/src/h5ad/util/__init__.py @@ -0,0 
+1 @@ +"""Utility helpers used across h5ad modules.""" diff --git a/src/h5ad/util/path.py b/src/h5ad/util/path.py new file mode 100644 index 0000000..c5c7102 --- /dev/null +++ b/src/h5ad/util/path.py @@ -0,0 +1,9 @@ +from __future__ import annotations + + +def norm_path(path: str) -> str: + """Normalize object paths used inside h5ad/zarr stores.""" + value = path.strip() + if not value: + raise ValueError("Object path must be non-empty.") + return value.lstrip("/") From d40e8490336a3a5ee04deac2e36f4b9175c03052 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:32:42 +0000 Subject: [PATCH 41/62] HUGE REFACTOR: Update CLI to support .zarr stores alongside .h5ad, enhancing file handling and command descriptions --- src/h5ad/cli.py | 132 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 99 insertions(+), 33 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index d947270..3d084a6 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -18,7 +18,7 @@ from h5ad.commands import export_image as export_image_cmd app = typer.Typer( - help="Streaming CLI for huge .h5ad files (info, subset, export, import)." + help="Streaming CLI for huge .h5ad and .zarr files (info, subset, export, import)." 
) # Use stderr for status/progress to keep stdout clean for data output # force_terminal=True ensures Rich output is visible even in non-TTY environments @@ -38,9 +38,11 @@ def info( file: Path = typer.Argument( ..., - help="Path to the .h5ad file", + help="Path to the .h5ad/.zarr store", exists=True, readable=True, + dir_okay=True, + file_okay=True, ), entry: Optional[str] = typer.Argument( None, @@ -82,8 +84,17 @@ def info( # ============================================================================ @app.command() def subset( - file: Path = typer.Argument(..., help="Input .h5ad", exists=True, readable=True), - output: Path = typer.Argument(..., help="Output .h5ad", writable=True), + file: Path = typer.Argument( + ..., + help="Input .h5ad/.zarr", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, + ), + output: Path = typer.Argument( + ..., help="Output .h5ad/.zarr", dir_okay=True, file_okay=True + ), obs: Optional[Path] = typer.Option( None, "--obs", @@ -99,7 +110,12 @@ def subset( readable=True, ), chunk_rows: int = typer.Option( - 1024, "--chunk", "-C", help="Row chunk size for dense matrices" + 1024, + "--chunk", + "-C", + "--chunk-rows", + "-r", + help="Row chunk size for dense matrices", ), ) -> None: """Subset an h5ad by obs and/or var names.""" @@ -129,7 +145,12 @@ def subset( @export_app.command("dataframe") def export_dataframe( file: Path = typer.Argument( - ..., help="Path to the .h5ad file", exists=True, readable=True + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), entry: str = typer.Argument(..., help="Entry path to export ('obs' or 'var')"), output: Path = typer.Option( @@ -142,7 +163,12 @@ def export_dataframe( help="Comma separated column names to include", ), chunk_rows: int = typer.Option( - 10_000, "--chunk", "-C", help="Number of rows to read per chunk" + 10_000, + "--chunk", + "-C", + "--chunk-rows", + "-r", + help="Number of rows to read per chunk", ), 
head: Optional[int] = typer.Option( None, "--head", "-n", help="Output only the first n entries" @@ -185,7 +211,12 @@ def export_dataframe( @export_app.command("array") def export_array( file: Path = typer.Argument( - ..., help="Path to the .h5ad file", exists=True, readable=True + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), entry: str = typer.Argument( ..., help="Entry path to export (e.g., 'obsm/X_pca', 'varm/PCs', 'X')" @@ -225,7 +256,12 @@ def export_array( @export_app.command("sparse") def export_sparse( file: Path = typer.Argument( - ..., help="Path to the .h5ad file", exists=True, readable=True + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), entry: str = typer.Argument( ..., help="Entry path to export (e.g., 'X', 'layers/counts')" @@ -280,11 +316,17 @@ def export_sparse( @export_app.command("dict") def export_dict( file: Path = typer.Argument( - ..., help="Path to the .h5ad file", exists=True, readable=True + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), entry: str = typer.Argument( ..., help="Entry path to export (e.g., 'uns', 'uns/colors')" ), + output_arg: Optional[Path] = typer.Argument(None, help="Output .json file path"), output: Optional[Path] = typer.Option( None, "--output", "-o", help="Output .json file path" ), @@ -306,10 +348,11 @@ def export_dict( """ try: + out_path = output if output is not None else output_arg export_json( file=file, obj=entry, - out=output, + out=out_path, max_elements=max_elements, include_attrs=include_attrs, console=console, @@ -322,7 +365,12 @@ def export_dict( @export_app.command("image") def export_image( file: Path = typer.Argument( - ..., help="Path to the .h5ad file", exists=True, readable=True + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), 
entry: str = typer.Argument(..., help="Entry path to export (2D or 3D array)"), output: Optional[Path] = typer.Option( @@ -349,22 +397,21 @@ def export_image( # IMPORT subcommands # ============================================================================ def _get_target_file(file: Path, output: Optional[Path], inplace: bool) -> Path: - """Determine target file and copy if needed.""" - import shutil + """Determine target path and copy/convert if needed.""" + from h5ad.commands.import_data import _prepare_target_path - if inplace: - return file - if output is None: - raise ValueError("Output file is required unless --inplace is specified.") - shutil.copy2(file, output) - console.print(f"[dim]Copied {file} → {output}[/]") - return output + return _prepare_target_path(file, output, inplace, console) @import_app.command("dataframe") def import_dataframe( file: Path = typer.Argument( - ..., help="Path to the source .h5ad file", exists=True, readable=True + ..., + help="Path to the source .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), entry: str = typer.Argument( ..., help="Entry path to create/replace ('obs' or 'var')" @@ -376,8 +423,9 @@ def import_dataframe( None, "--output", "-o", - help="Output .h5ad file path. Required unless --inplace.", - writable=True, + help="Output .h5ad/.zarr path. Required unless --inplace.", + dir_okay=True, + file_okay=True, ), inplace: bool = typer.Option( False, @@ -424,7 +472,12 @@ def import_dataframe( @import_app.command("array") def import_array( file: Path = typer.Argument( - ..., help="Path to the source .h5ad file", exists=True, readable=True + ..., + help="Path to the source .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), entry: str = typer.Argument( ..., help="Entry path to create/replace (e.g., 'X', 'obsm/X_pca')" @@ -436,8 +489,9 @@ def import_array( None, "--output", "-o", - help="Output .h5ad file path. 
Required unless --inplace.", - writable=True, + help="Output .h5ad/.zarr path. Required unless --inplace.", + dir_okay=True, + file_okay=True, ), inplace: bool = typer.Option( False, @@ -474,7 +528,12 @@ def import_array( @import_app.command("sparse") def import_sparse( file: Path = typer.Argument( - ..., help="Path to the source .h5ad file", exists=True, readable=True + ..., + help="Path to the source .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), obj: str = typer.Argument( ..., help="Object path to create/replace (e.g., 'X', 'layers/counts')" @@ -486,8 +545,9 @@ def import_sparse( None, "--output", "-o", - help="Output .h5ad file path. Required unless --inplace.", - writable=True, + help="Output .h5ad/.zarr path. Required unless --inplace.", + dir_okay=True, + file_okay=True, ), inplace: bool = typer.Option( False, @@ -524,7 +584,12 @@ def import_sparse( @import_app.command("dict") def import_dict( file: Path = typer.Argument( - ..., help="Path to the source .h5ad file", exists=True, readable=True + ..., + help="Path to the source .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), obj: str = typer.Argument( ..., help="Object path to create/replace (e.g., 'uns', 'uns/metadata')" @@ -536,8 +601,9 @@ def import_dict( None, "--output", "-o", - help="Output .h5ad file path. Required unless --inplace.", - writable=True, + help="Output .h5ad/.zarr path. 
Required unless --inplace.", + dir_okay=True, + file_okay=True, ), inplace: bool = typer.Option( False, From bfec2b21a768b2656720c328e0d62242a5ba58f9 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:33:01 +0000 Subject: [PATCH 42/62] HUGE REFACTOR: Remove unused functions and imports from info.py and read.py, streamlining codebase --- src/h5ad/info.py | 287 +---------------------------------------------- src/h5ad/read.py | 160 +------------------------- 2 files changed, 4 insertions(+), 443 deletions(-) diff --git a/src/h5ad/info.py b/src/h5ad/info.py index 6144b07..635b03a 100644 --- a/src/h5ad/info.py +++ b/src/h5ad/info.py @@ -1,286 +1,3 @@ -from typing import Optional, Tuple, Dict, Any, Union -import h5py -import numpy as np +from h5ad.core.info import axis_len, format_type_info, get_axis_group, get_entry_type - -def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: - """ - Determine the type/format of an HDF5 object for export guidance. - - Supports both: - - v0.2.0 (modern): Objects with encoding-type/encoding-version attributes - - v0.1.0 (legacy): Objects without encoding attributes, inferred from structure - - Returns a dict with: - - type: str (e.g., 'dataframe', 'sparse-matrix', 'dense-matrix', 'dict', 'image', 'array', 'scalar') - - export_as: str (suggested export format: csv, mtx, npy, json, image) - - encoding: str (h5ad encoding-type if present) - - shape: tuple or None - - dtype: str or None - - details: str (human-readable description) - - version: str ('0.2.0', '0.1.0', or None for unknown) - """ - result: Dict[str, Any] = { - "type": "unknown", - "export_as": None, - "encoding": None, - "shape": None, - "dtype": None, - "details": "", - "version": None, - } - - # Get encoding-type attribute if present - enc = entry.attrs.get("encoding-type", b"") - if isinstance(enc, bytes): - enc = enc.decode("utf-8") - result["encoding"] = enc if enc else None - - # Get encoding-version if present - enc_ver = 
entry.attrs.get("encoding-version", b"") - if isinstance(enc_ver, bytes): - enc_ver = enc_ver.decode("utf-8") - result["version"] = enc_ver if enc_ver else None - - # Infer the type for Dataset entry - if isinstance(entry, h5py.Dataset): - result["shape"] = entry.shape - result["dtype"] = str(entry.dtype) - - # Check for legacy categorical (v0.1.0): dataset with 'categories' attribute - if "categories" in entry.attrs: - result["type"] = "categorical" - result["export_as"] = "csv" - result["version"] = result["version"] or "0.1.0" - # Try to get category count from referenced dataset - try: - cats_ref = entry.attrs["categories"] - cats_ds = entry.file[cats_ref] - n_cats = cats_ds.shape[0] - except Exception: - n_cats = "?" - result["details"] = ( - f"Legacy categorical [{entry.shape[0]} values, {n_cats} categories]" - ) - return result - - # Scalar - if entry.shape == (): - result["type"] = "scalar" - result["export_as"] = "json" - result["details"] = f"Scalar value ({entry.dtype})" - return result - - # 1D or 2D numeric array -> dense matrix / array - if entry.ndim == 1: - result["type"] = "array" - result["export_as"] = "npy" - result["details"] = f"1D array [{entry.shape[0]}] ({entry.dtype})" - elif entry.ndim == 2: - result["type"] = "dense-matrix" - result["export_as"] = "npy" - result["details"] = ( - f"Dense matrix {entry.shape[0]}×{entry.shape[1]} ({entry.dtype})" - ) - elif entry.ndim == 3: - result["type"] = "array" - result["export_as"] = "npy" - result["details"] = f"3D array {entry.shape} ({entry.dtype})" - else: - result["type"] = "array" - result["export_as"] = "npy" - result["details"] = f"ND array {entry.shape} ({entry.dtype})" - return result - - # It's a Group - if isinstance(entry, h5py.Group): - # Check for sparse matrix (CSR/CSC) - same in both versions - if enc in ("csr_matrix", "csc_matrix"): - shape = entry.attrs.get("shape", None) - shape_str = f"{shape[0]}×{shape[1]}" if shape is not None else "?" 
- result["type"] = "sparse-matrix" - result["export_as"] = "mtx" - result["details"] = ( - f"Sparse {enc.replace('_matrix', '').upper()} matrix {shape_str}" - ) - return result - - # Check for v0.2.0 categorical (Group with codes/categories) - if enc == "categorical": - codes = entry.get("codes") - cats = entry.get("categories") - n_codes = codes.shape[0] if codes is not None else "?" - n_cats = cats.shape[0] if cats is not None else "?" - result["type"] = "categorical" - result["export_as"] = "csv" - result["details"] = f"Categorical [{n_codes} values, {n_cats} categories]" - return result - - # Check for dataframe (obs/var style) - # v0.2.0: has encoding-type="dataframe" - # v0.1.0: has _index attribute or obs_names/var_names dataset - if ( - enc == "dataframe" - or "_index" in entry.attrs - or "obs_names" in entry - or "var_names" in entry - ): - # Detect version - if enc == "dataframe": - df_version = result["version"] or "0.2.0" - else: - df_version = "0.1.0" # No encoding-type, legacy format - result["version"] = df_version - - # Check for __categories subgroup (v0.1.0 legacy) - has_legacy_cats = "__categories" in entry - n_cols = len( - [k for k in entry.keys() if k not in ("_index", "__categories")] - ) - - result["type"] = "dataframe" - result["export_as"] = "csv" - if has_legacy_cats: - result["details"] = f"DataFrame with {n_cols} columns (legacy v0.1.0)" - else: - result["details"] = f"DataFrame with {n_cols} columns" - return result - - # Check for nullable arrays (v0.2.0) - if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): - result["type"] = "array" - result["export_as"] = "npy" - result["details"] = f"Encoded array ({enc})" - return result - - # Check for string-array encoding - if enc == "string-array": - result["type"] = "array" - result["export_as"] = "npy" - result["details"] = "Encoded string array" - return result - - # Check for awkward-array (experimental) - if enc == "awkward-array": - length = 
entry.attrs.get("length", "?") - result["type"] = "awkward-array" - result["export_as"] = "json" - result["details"] = f"Awkward array (length={length})" - return result - - # Generic dict/group (v0.2.0 has encoding-type="dict", v0.1.0 has no attributes) - n_keys = len(list(entry.keys())) - result["type"] = "dict" - result["export_as"] = "json" - result["details"] = f"Group with {n_keys} keys" - return result - - return result - - -def format_type_info(info: Dict[str, Any]) -> str: - """Format type info as a colored string for display.""" - type_colors = { - "dataframe": "green", - "sparse-matrix": "magenta", - "dense-matrix": "blue", - "array": "blue", - "dict": "yellow", - "categorical": "green", - "scalar": "white", - "unknown": "red", - } - - color = type_colors.get(info["type"], "white") - return f"[{color}]<{info['type']}>[/]" - - -def axis_len(file: h5py.File, axis: str) -> int: - """ - Get the length of the specified axis ('obs' or 'var') in the h5ad file. - - Args: - file (h5py.File): Opened h5ad file object - axis (str): Axis name ('obs' or 'var') - - Returns: - int: Length of the axis - - Raises: - ValueError: If axis is not 'obs' or 'var' - KeyError: If axis or index dataset not found in file - TypeError: If axis is not a group or index is not a dataset - ValueError: If axis length cannot be determined - """ - # Check if the specified axis exists in the file - if axis not in file: - raise KeyError(f"'{axis}' not found in the file.") - - # Get the group corresponding to the axis - group = file[axis] - if not isinstance(group, h5py.Group): - raise TypeError(f"'{axis}' is not a group.") - - # Determine the index name for the axis - index_name = group.attrs.get("_index", None) - if index_name is None: - if axis == "obs": - index_name = "obs_names" - elif axis == "var": - index_name = "var_names" - else: - raise ValueError(f"Invalid axis '{axis}'. 
Must be 'obs' or 'var'.") - - # Decode bytes to string if necessary - if isinstance(index_name, bytes): - index_name = index_name.decode("utf-8") - - # Check if the index dataset exists - if index_name not in group: - raise KeyError(f"Index dataset '{index_name}' not found in '{axis}' group.") - - # Return the length of the index dataset - dataset = group[index_name] - if not isinstance(dataset, h5py.Dataset): - raise TypeError(f"Index '{index_name}' in '{axis}' is not a dataset.") - if dataset.shape: - return int(dataset.shape[0]) - raise ValueError( - f"Cannot determine length of '{axis}': index dataset has no shape." - ) - - -def get_axis_group(file: h5py.File, axis: str) -> Tuple[h5py.Group, int, str]: - """ - Get the axis group, its length, and index name. - - Args: - file (h5py.File): Opened h5ad file object - axis (str): Axis name ('obs' or 'var') - - Returns: - Tuple[h5py.Group, int, str]: Axis group, its length, and index name - - Raises: - ValueError: If axis is not 'obs' or 'var' - KeyError: If axis or index dataset not found in file - TypeError: If axis is not a group or index is not a dataset - ValueError: If axis length cannot be determined - """ - if axis not in ("obs", "var"): - raise ValueError("axis must be 'obs' or 'var'.") - - # axis_len will validate existence and get length (raises exceptions if issues) - n = axis_len(file, axis) - - # Get the group (already validated by axis_len) - group = file[axis] - - # Get the index name - index_name = group.attrs.get("_index", None) - if index_name is None: - index_name = "obs_names" if axis == "obs" else "var_names" - if isinstance(index_name, bytes): - index_name = index_name.decode("utf-8") - - return group, n, index_name +__all__ = ["axis_len", "format_type_info", "get_axis_group", "get_entry_type"] diff --git a/src/h5ad/read.py b/src/h5ad/read.py index 78fec0e..63f8c4d 100644 --- a/src/h5ad/read.py +++ b/src/h5ad/read.py @@ -1,159 +1,3 @@ -import numpy as np -import h5py -from typing import List, 
Dict +from h5ad.core.read import col_chunk_as_strings, decode_str_array, read_categorical_column - -def decode_str_array(array: np.ndarray) -> np.ndarray: - """ - Decode a numpy array of bytes or objects to strings. - Args: - array (np.ndarray): Input numpy array of bytes or objects - - Returns: - np.ndarray: Decoded numpy array of strings - """ - if np.issubdtype(array.dtype, np.bytes_): - return array.astype("U") - if array.dtype.kind == "O": - return array.astype(str) - return array.astype(str) - - -def read_categorical_column( - col: h5py.Group | h5py.Dataset, - start: int, - end: int, - cache: Dict[int, np.ndarray], - parent_group: h5py.Group | None = None, -) -> List[str]: - """ - Decode an AnnData 'categorical' column for a slice [start:end]. - - Supports both: - - v0.2.0 (modern): Group with 'codes' and 'categories' datasets - - v0.1.0 (legacy): Dataset with 'categories' attribute referencing __categories/ - - Args: - col: Column group (v0.2.0) or dataset (v0.1.0) - start: Start index of the slice - end: End index of the slice - cache: Cache for decoded categories - parent_group: Parent obs/var group (needed for v0.1.0 to resolve __categories) - - Returns: - List[str]: Decoded categorical values for the specified slice - """ - key = id(col) - - # v0.2.0 format: Group with 'codes' and 'categories' datasets - if isinstance(col, h5py.Group): - if key not in cache: - cats = col["categories"][...] - cats = decode_str_array(cats) - cache[key] = np.asarray(cats, dtype=str) - cats = cache[key] - - codes_ds = col["codes"] - codes = codes_ds[start:end] - codes = np.asarray(codes, dtype=np.int64) - return [cats[c] if 0 <= c < len(cats) else "" for c in codes] - - # v0.1.0 format: Dataset with 'categories' attribute (object reference) - if isinstance(col, h5py.Dataset): - if key not in cache: - cats_ref = col.attrs.get("categories", None) - if cats_ref is not None: - # Dereference the HDF5 object reference - cats_ds = col.file[cats_ref] - cats = cats_ds[...] 
- elif parent_group is not None and "__categories" in parent_group: - # Fallback: look for __categories subgroup - col_name = col.name.split("/")[-1] - cats_grp = parent_group["__categories"] - if col_name in cats_grp: - cats = cats_grp[col_name][...] - else: - raise KeyError( - f"Cannot find categories for legacy column {col.name}" - ) - else: - raise KeyError( - f"Cannot find categories for legacy column {col.name}" - ) - cats = decode_str_array(cats) - cache[key] = np.asarray(cats, dtype=str) - cats = cache[key] - - codes = col[start:end] - codes = np.asarray(codes, dtype=np.int64) - return [cats[c] if 0 <= c < len(cats) else "" for c in codes] - - raise TypeError(f"Unsupported categorical column type: {type(col)}") - - -def col_chunk_as_strings( - group: h5py.Group, - col_name: str, - start: int, - end: int, - cat_cache: Dict[int, np.ndarray], -) -> List[str]: - """ - Read a column from an obs/var group as strings. - - Supports both: - - v0.2.0 (modern): Columns with encoding-type attribute - - v0.1.0 (legacy): Categorical columns with 'categories' attribute referencing __categories - - Args: - group (h5py.Group): The obs/var group - col_name (str): Name of the column to read - start (int): Start index of the slice - end (int): End index of the slice - cat_cache (Dict[int, np.ndarray]): Cache for decoded categorical columns - - Returns: - List[str]: Column values as strings for the specified slice - """ - if col_name not in group: - raise KeyError(f"Column {col_name!r} not found in group {group.name}") - - col = group[col_name] - - # Case 1: Dataset (could be plain array or legacy categorical) - if isinstance(col, h5py.Dataset): - # Check for v0.1.0 legacy categorical (has 'categories' attribute) - if "categories" in col.attrs: - return read_categorical_column(col, start, end, cat_cache, group) - - # Plain dataset (numeric, string, etc.) 
- chunk = col[start:end] - if chunk.ndim != 1: - chunk = chunk.reshape(-1) - chunk = decode_str_array(np.asarray(chunk)) - return chunk.tolist() - - # Case 2: Group (v0.2.0 encoded types like categorical, nullable, etc.) - if isinstance(col, h5py.Group): - enc = col.attrs.get("encoding-type", b"") - if isinstance(enc, bytes): - enc = enc.decode("utf-8") - - if enc == "categorical": - return read_categorical_column(col, start, end, cat_cache) - - # Handle nullable arrays (nullable-integer, nullable-boolean, nullable-string-array) - if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): - values = col["values"][start:end] - mask = col["mask"][start:end] - values = decode_str_array(np.asarray(values)) - # Apply mask: masked values become empty string - return ["" if m else str(v) for v, m in zip(values, mask)] - - raise ValueError( - f"Unsupported group encoding {enc!r} for column {col_name!r}" - ) - - raise TypeError( - f"Unsupported column type for {col_name!r} in group {group.name}" - ) +__all__ = ["col_chunk_as_strings", "decode_str_array", "read_categorical_column"] From dd14d3e4119357bd8b6e33f104793740cc170a29 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:33:12 +0000 Subject: [PATCH 43/62] HUGE REFACTOR: Enhance export tests for Zarr support, adding new test cases and improving output handling --- tests/test_export.py | 88 ++++++++++++++++++++-- tests/test_zarr.py | 170 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 253 insertions(+), 5 deletions(-) create mode 100644 tests/test_zarr.py diff --git a/tests/test_export.py b/tests/test_export.py index 323167e..6a3fad9 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -28,11 +28,46 @@ def _read_mtx(path: Path) -> np.ndarray: return mat +def _read_mtx_header_and_data(path: Path) -> tuple[int, int, int, list[str]]: + with open(path, "r", encoding="utf-8") as fh: + header = fh.readline() + assert header.startswith("%%MatrixMarket") + line = 
fh.readline() + while line.startswith("%"): + line = fh.readline() + n_rows, n_cols, nnz = map(int, line.split()) + data_lines = [line.strip() for line in fh if line.strip()] + return n_rows, n_cols, nnz, data_lines + + class TestExportArray: def test_export_array_dense_X(self, sample_h5ad_file, temp_dir): out = temp_dir / "X.npy" result = runner.invoke( - app, ["export", "array", str(sample_h5ad_file), "X", str(out)] + app, ["export", "array", str(sample_h5ad_file), "X", "--output", str(out)] + ) + assert result.exit_code == 0 + assert out.exists() + + got = np.load(out) + with h5py.File(sample_h5ad_file, "r") as f: + expected = np.asarray(f["X"][...]) + np.testing.assert_allclose(got, expected) + + def test_export_array_chunk(self, sample_h5ad_file, temp_dir): + out = temp_dir / "X_chunk.npy" + result = runner.invoke( + app, + [ + "export", + "array", + str(sample_h5ad_file), + "X", + "--output", + str(out), + "--chunk", + "3", + ], ) assert result.exit_code == 0 assert out.exists() @@ -47,7 +82,15 @@ class TestExportSparse: def test_export_sparse_csr(self, sample_sparse_csr_h5ad, temp_dir): out = temp_dir / "X_csr.mtx" result = runner.invoke( - app, ["export", "sparse", str(sample_sparse_csr_h5ad), "X", str(out)] + app, + [ + "export", + "sparse", + str(sample_sparse_csr_h5ad), + "X", + "--output", + str(out), + ], ) assert result.exit_code == 0 assert out.exists() @@ -64,6 +107,31 @@ def test_export_sparse_csr(self, sample_sparse_csr_h5ad, temp_dir): ) np.testing.assert_allclose(got, expected) + def test_export_sparse_head_limits_entries(self, sample_sparse_csr_h5ad, temp_dir): + out = temp_dir / "X_csr_head.mtx" + result = runner.invoke( + app, + [ + "export", + "sparse", + str(sample_sparse_csr_h5ad), + "X", + "--output", + str(out), + "--head", + "2", + ], + ) + assert result.exit_code == 0 + assert out.exists() + + n_rows, n_cols, nnz, data_lines = _read_mtx_header_and_data(out) + assert (n_rows, n_cols) == (4, 3) + assert nnz == 2 + assert len(data_lines) 
== 2 + assert data_lines[0].startswith("1 1 ") + assert data_lines[1].startswith("1 3 ") + def test_export_sparse_csc(self, temp_dir): # Build a small, consistent CSC matrix group file_path = temp_dir / "test_csc.h5ad" @@ -79,7 +147,9 @@ def test_export_sparse_csc(self, temp_dir): X.create_dataset("indptr", data=indptr) out = temp_dir / "X_csc.mtx" - result = runner.invoke(app, ["export", "sparse", str(file_path), "X", str(out)]) + result = runner.invoke( + app, ["export", "sparse", str(file_path), "X", "--output", str(out)] + ) assert result.exit_code == 0 assert out.exists() @@ -162,7 +232,8 @@ def test_sparse_matrix_array_export(self, sample_sparse_csr_h5ad, temp_dir): """Test that sparse matrix requires sparse export.""" out = temp_dir / "X.npy" result = runner.invoke( - app, ["export", "array", str(sample_sparse_csr_h5ad), "X", str(out)] + app, + ["export", "array", str(sample_sparse_csr_h5ad), "X", "--output", str(out)], ) # Should fail because X is sparse, not dense assert result.exit_code == 1 @@ -172,7 +243,14 @@ def test_nonexistent_object(self, sample_h5ad_file, temp_dir): out = temp_dir / "output.npy" result = runner.invoke( app, - ["export", "array", str(sample_h5ad_file), "nonexistent/path", str(out)], + [ + "export", + "array", + str(sample_h5ad_file), + "nonexistent/path", + "--output", + str(out), + ], ) assert result.exit_code == 1 assert "not found" in result.output.lower() or "error" in result.output.lower() diff --git a/tests/test_zarr.py b/tests/test_zarr.py new file mode 100644 index 0000000..1008d6a --- /dev/null +++ b/tests/test_zarr.py @@ -0,0 +1,170 @@ +"""Tests for zarr auto-detection support (v2 and v3).""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Optional + +import numpy as np +import pytest +from typer.testing import CliRunner +from rich.console import Console + +from h5ad.cli import app +from h5ad.core.subset import subset_h5ad + + +zarr = pytest.importorskip("zarr") + +runner = 
CliRunner() + + +class UnsupportedZarrFormat(Exception): + pass + + +def _open_zarr_group(path: Path, zarr_format: Optional[int]) -> Any: + if zarr_format is None: + return zarr.open_group(path, mode="w") + + last_exc: Exception | None = None + for kw in ("zarr_format", "zarr_version"): + try: + return zarr.open_group(path, mode="w", **{kw: zarr_format}) + except (TypeError, ValueError) as exc: + last_exc = exc + continue + + raise UnsupportedZarrFormat(str(last_exc)) from last_exc + + +def _create_array(group: Any, name: str, data: np.ndarray) -> Any: + data = np.asarray(data) + if hasattr(group, "create_array"): + try: + return group.create_array(name, data=data) + except TypeError: + return group.create_array( + name, data=data, shape=data.shape, dtype=data.dtype + ) + try: + return group.create_dataset(name, data=data, shape=data.shape) + except TypeError: + return group.create_dataset(name, data=data) + + +def _create_zarr_store(path: Path, *, zarr_format: Optional[int]) -> None: + root = _open_zarr_group(path, zarr_format) + + obs = root.create_group("obs") + obs.attrs["_index"] = "obs_names" + obs_names = ["cell_1", "cell_2", "cell_3", "cell_4", "cell_5"] + _create_array(obs, "obs_names", np.array(obs_names, dtype="S")) + _create_array( + obs, + "cell_type", + np.array(["TypeA", "TypeB", "TypeA", "TypeC", "TypeB"], dtype="S"), + ) + + var = root.create_group("var") + var.attrs["_index"] = "var_names" + var_names = ["gene_1", "gene_2", "gene_3", "gene_4"] + _create_array(var, "var_names", np.array(var_names, dtype="S")) + + X = np.array( + [ + [1.0, 0.0, 2.5, 0.0], + [0.0, 3.2, 0.0, 1.1], + [2.1, 0.0, 1.8, 0.0], + [0.0, 4.5, 0.0, 2.3], + [1.5, 0.0, 3.0, 0.0], + ], + dtype=np.float32, + ) + _create_array(root, "X", X) + + uns = root.create_group("uns") + _create_array(uns, "description", np.array(["Test dataset"], dtype="S")) + + +@pytest.fixture(params=[None, 2], ids=["default", "v2"]) +def zarr_format(request) -> Optional[int]: + return request.param + + 
+def _skip_if_unsupported(exc: Exception, zarr_format: Optional[int]) -> None: + if zarr_format == 2: + pytest.skip("zarr v2 not supported by installed zarr") + raise exc + + +def test_info_zarr_auto_detect(temp_dir, zarr_format): + store_path = temp_dir / f"test_{zarr_format or 'default'}.zarr" + try: + _create_zarr_store(store_path, zarr_format=zarr_format) + except UnsupportedZarrFormat as exc: + _skip_if_unsupported(exc, zarr_format) + + result = runner.invoke(app, ["info", str(store_path)]) + output = result.stdout + (result.stderr or "") + assert result.exit_code == 0, output + assert "5 × 4" in output + + +def test_export_dataframe_zarr(temp_dir, zarr_format): + store_path = temp_dir / f"test_{zarr_format or 'default'}.zarr" + try: + _create_zarr_store(store_path, zarr_format=zarr_format) + except UnsupportedZarrFormat as exc: + _skip_if_unsupported(exc, zarr_format) + output = temp_dir / "obs.csv" + + result = runner.invoke( + app, + ["export", "dataframe", str(store_path), "obs", "--output", str(output)], + ) + if result.exit_code != 0: + raise AssertionError( + f"exit_code={result.exit_code} exception={result.exception!r} output={result.output}" + ) + assert output.exists() + assert "obs_names" in output.read_text(encoding="utf-8") + + +def test_export_dict_zarr(temp_dir, zarr_format): + store_path = temp_dir / f"test_{zarr_format or 'default'}.zarr" + try: + _create_zarr_store(store_path, zarr_format=zarr_format) + except UnsupportedZarrFormat as exc: + _skip_if_unsupported(exc, zarr_format) + output = temp_dir / "uns.json" + + result = runner.invoke( + app, ["export", "dict", str(store_path), "uns", str(output)] + ) + assert result.exit_code == 0 + assert output.exists() + + +def test_subset_zarr_output(temp_dir, zarr_format): + store_path = temp_dir / f"test_{zarr_format or 'default'}.zarr" + try: + _create_zarr_store(store_path, zarr_format=zarr_format) + except UnsupportedZarrFormat as exc: + _skip_if_unsupported(exc, zarr_format) + obs_file = 
temp_dir / "obs.txt" + obs_file.write_text("cell_1\ncell_3\n") + output = temp_dir / "subset.zarr" + + console = Console() + subset_h5ad( + file=store_path, + output=output, + obs_file=obs_file, + var_file=None, + chunk_rows=1024, + console=console, + ) + root = zarr.open_group(output, mode="r") + assert root["obs"]["obs_names"].shape[0] == 2 + assert root["X"].shape == (2, 4) From eee67ae5713e6ba8b56192af3c92f334e6222b26 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:33:19 +0000 Subject: [PATCH 44/62] HUGE REFACTOR: Rename job from 'test' to 'tests' and enhance test matrix for additional modules, improving test organization and coverage reporting --- .github/workflows/tests.yml | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7548bf7..f946c87 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ concurrency: cancel-in-progress: true jobs: - test: + tests: runs-on: ubuntu-latest timeout-minutes: 20 @@ -23,6 +23,21 @@ jobs: fail-fast: false matrix: python-version: ["3.12"] # add "3.13" if you want + module: + - name: cli + tests: tests/test_cli.py + - name: export + tests: tests/test_export.py + - name: import + tests: tests/test_import.py + - name: info-read + tests: tests/test_info_read.py + - name: subset + tests: tests/test_subset.py + - name: zarr + tests: tests/test_zarr.py + + name: tests (${{ matrix.module.name }}) steps: - uses: actions/checkout@v4 @@ -42,29 +57,29 @@ jobs: - name: Run tests with coverage run: | - uv run pytest -v \ + uv run pytest -v ${{ matrix.module.tests }} \ --cov=h5ad \ --cov-report=term-missing \ --cov-report=xml \ --cov-report=html \ - --junitxml=pytest-results.xml + --junitxml=pytest-results-${{ matrix.module.name }}.xml - name: Publish test results summary uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: - files: pytest-results.xml - check_name: 
Test Results + files: pytest-results-${{ matrix.module.name }}.xml + check_name: Test Results (${{ matrix.module.name }}) - name: Upload coverage artifacts uses: actions/upload-artifact@v4 if: always() with: - name: coverage + name: coverage-${{ matrix.module.name }} path: | coverage.xml htmlcov/ - pytest-results.xml + pytest-results-${{ matrix.module.name }}.xml retention-days: 30 - name: Upload coverage to Codecov From a2dc8f71ceaa2c117b97d20da8b415640fabe2f7 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:52:39 +0000 Subject: [PATCH 45/62] Update README to include support for .zarr stores and enhance feature descriptions for clarity --- README.md | 60 +++++++++++++++++++------------------------------------ 1 file changed, 21 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index eecee0f..b2396eb 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,20 @@ # h5ad CLI -A command-line tool for exploring huge `.h5ad` (AnnData) files without loading them fully into memory. Streams data directly from disk for efficient inspection of structure, metadata, and matrices. +A command-line tool for exploring huge AnnData stores (`.h5ad` and `.zarr`) without loading them fully into memory. Streams data directly from disk for efficient inspection of structure, metadata, and matrices. 
## Features -- **`info`** – Show file structure and dimensions (`n_obs × n_var`) -- **`table`** – Export obs/var metadata to CSV with chunked streaming -- **`subset`** – Filter h5ad files by cell/gene names (supports dense and sparse CSR/CSC matrices) -- Memory-efficient chunked processing for large files -- Rich terminal output with colors and progress bars +- Streaming access to very large `.h5ad` and `.zarr` stores +- Auto-detects `.h5ad` files vs `.zarr` directories +- Chunked processing for dense and sparse matrices (CSR/CSC) +- Rich terminal output with progress indicators ## Installation +Using [uv](https://docs.astral.sh/uv/) (recommended): ```bash +git clone https://github.com/cellgeni/h5ad-cli.git +cd h5ad-cli uv sync ``` @@ -21,45 +23,25 @@ For development and testing: uv sync --extra dev ``` -See [docs/TESTING.md](docs/TESTING.md) for testing documentation. - -## Usage -Invoke any subcommand via `uv run h5ad ...`: - +Alternative with pip: ```bash -uv run h5ad --help +git clone https://github.com/cellgeni/h5ad-cli.git +cd h5ad-cli +pip install . ``` -#### Examples - -**Inspect overall structure and axis sizes:** +For development and testing with pip: ```bash -uv run h5ad info data.h5ad +pip install -e ".[dev]" ``` -**Export full obs metadata to CSV:** -```bash -uv run h5ad table data.h5ad --axis obs --out obs_metadata.csv -``` - -**Export selected obs columns to stdout:** -```bash -uv run h5ad table data.h5ad --axis obs --cols cell_type,donor -``` - -**Export var metadata with custom chunk size:** -```bash -uv run h5ad table data.h5ad --axis var --chunk-rows 5000 --out var_metadata.csv -``` +See [docs/TESTING.md](docs/TESTING.md) for testing documentation. -**Subset by cell names:** -```bash -uv run h5ad subset input.h5ad output.h5ad --obs cells.txt -``` +## Commands (Overview) -**Subset by both cells and genes:** -```bash -uv run h5ad subset input.h5ad output.h5ad --obs cells.txt --var genes.txt -``` +Run help at any level (e.g. 
`uv run h5ad --help`, `uv run h5ad export --help`). -All commands stream from disk, so even multi-GB `.h5ad` files remain responsive. +- `info` – read-only inspection of store layout, shapes, and type hints; supports drilling into paths like `obsm/X_pca` or `uns`. +- `subset` – stream and write a filtered copy based on obs/var name lists, preserving dense and sparse matrix encodings. +- `export` – extract data from a store; subcommands: `dataframe` (obs/var to CSV), `array` (dense to `.npy`), `sparse` (CSR/CSC to `.mtx`), `dict` (JSON), `image` (PNG). +- `import` – write new data into a store; subcommands: `dataframe` (CSV → obs/var), `array` (`.npy`), `sparse` (`.mtx`), `dict` (JSON). \ No newline at end of file From b4c58a65b2e9e7374a629e530317784f1ae71c12 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:55:14 +0000 Subject: [PATCH 46/62] Renamed docs --- docs/{h5ad_elements_spec.md => ELEMENTS_h5ad.md} | 0 docs/{zarr_elements_spec.md => ELEMENTS_zarr.md} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename docs/{h5ad_elements_spec.md => ELEMENTS_h5ad.md} (100%) rename docs/{zarr_elements_spec.md => ELEMENTS_zarr.md} (100%) diff --git a/docs/h5ad_elements_spec.md b/docs/ELEMENTS_h5ad.md similarity index 100% rename from docs/h5ad_elements_spec.md rename to docs/ELEMENTS_h5ad.md diff --git a/docs/zarr_elements_spec.md b/docs/ELEMENTS_zarr.md similarity index 100% rename from docs/zarr_elements_spec.md rename to docs/ELEMENTS_zarr.md From d418e4d8d450d4438d093b601dcaa90922d92245 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:56:05 +0000 Subject: [PATCH 47/62] Update README to add tutorial reference and ensure proper formatting --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b2396eb..ff7a474 100644 --- a/README.md +++ b/README.md @@ -44,4 +44,6 @@ Run help at any level (e.g. `uv run h5ad --help`, `uv run h5ad export --help`). 
- `info` – read-only inspection of store layout, shapes, and type hints; supports drilling into paths like `obsm/X_pca` or `uns`. - `subset` – stream and write a filtered copy based on obs/var name lists, preserving dense and sparse matrix encodings. - `export` – extract data from a store; subcommands: `dataframe` (obs/var to CSV), `array` (dense to `.npy`), `sparse` (CSR/CSC to `.mtx`), `dict` (JSON), `image` (PNG). -- `import` – write new data into a store; subcommands: `dataframe` (CSV → obs/var), `array` (`.npy`), `sparse` (`.mtx`), `dict` (JSON). \ No newline at end of file +- `import` – write new data into a store; subcommands: `dataframe` (CSV → obs/var), `array` (`.npy`), `sparse` (`.mtx`), `dict` (JSON). + +See [docs/GET_STARTED.md](docs/GET_STARTED.md) for a short tutorial. \ No newline at end of file From 333925f1691b7ff38aed4bbd2a505596606c912f Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:56:13 +0000 Subject: [PATCH 48/62] Update dependencies in pyproject.toml: remove obsolete images section and ensure correct versions for pillow and zarr --- pyproject.toml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3df76b2..281812b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,8 +7,10 @@ requires-python = ">=3.12" dependencies = [ "h5py>=3.15.1", "numpy>=2.3.5", + "pillow>=12.1.0", "rich>=14.2.0", "typer>=0.20.0", + "zarr>=3.1.5", ] [project.optional-dependencies] @@ -16,9 +18,6 @@ dev = [ "pytest>=8.3.4", "pytest-cov>=6.0.0", ] -images = [ - "pillow>=10.0.0", -] [build-system] requires = ["uv_build>=0.8.0,<0.9.0"] From bc7711fc0b43a70e7aa6269c0e9f805095f45117 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 18:48:22 +0000 Subject: [PATCH 49/62] Add GET_STARTED.md for initial setup and usage instructions --- docs/GET_STARTED.md | 146 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 docs/GET_STARTED.md diff --git 
a/docs/GET_STARTED.md b/docs/GET_STARTED.md new file mode 100644 index 0000000..61488fb --- /dev/null +++ b/docs/GET_STARTED.md @@ -0,0 +1,146 @@ +# Get Started + +This short walkthrough shows the basic workflow: inspect a store, export metadata, and write a subset. + +## 1 Install + +Using uv (recommended): +```bash +git clone https://github.com/cellgeni/h5ad-cli.git +cd h5ad-cli +uv sync +``` + +With pip: +```bash +git clone https://github.com/cellgeni/h5ad-cli.git +cd h5ad-cli +pip install . +``` + +Additionally, it might be useful to install `csvkit` for inspecting exported CSV files: +```bash +# with uv +uv pip install csvkit + +# with pip +pip install csvkit +``` + +## 2 Inspect a files with `info` command + +Let's load an example `.h5ad` file: +```bash +wget -O visium.h5ad https://exampledata.scverse.org/squidpy/figshare/visium_hne_adata.h5ad +``` + +Now run `info` to see the file structure: +```bash +uv run h5ad info visium.h5ad +``` +``` +An object with n_obs × n_var: 2688 × 18078 + obs: array_col, array_row, cluster, in_tissue, leiden, log1p_n_genes_by_counts, log1p_total_counts, log1p_total_counts_mt, n_counts, n_genes_by_counts, pct_counts_in_top_100_genes, pct_counts_in_top_200_genes, pct_counts_in_top_500_genes, +pct_counts_in_top_50_genes, pct_counts_mt, total_counts, total_counts_mt + var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts, pct_dropout_by_counts, total_counts, variances, variances_norm + obsm: X_pca, X_umap, spatial + varm: PCs + obsp: connectivities, distances + uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap + raw: X, var +``` + +To inspect a specific entry: +```bash +uv run h5ad info visium.h5ad obsm/X_pca +``` +``` +Path: obsm/X_pca +Type: dense-matrix +Shape: (2688, 50) +Dtype: float32 +Details: Dense matrix 2688×50 (float32) +``` + +## 3 Export entries +View the first 
few lines of the `obs` dataframe: + +```bash +uv run h5ad export dataframe visium.h5ad obs --head 10 +``` +```csv +_index,array_col,array_row,cluster,in_tissue,leiden,log1p_n_genes_by_counts,log1p_total_counts,log1p_total_counts_mt,n_counts,n_genes_by_counts,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,pct_counts_in_top_50_genes,pct_counts_mt,total_counts,total_counts_mt +AAACAAGTATCTCCCA-1,102,50,Cortex_2,1,Cortex_3,8.502891406705377,9.869983,8.257904,19340.0,4928,43.13340227507756,49.21406411582213,60.449844881075485,38.42812823164426,19.943123,19340.0,3857.0 +AAACAATCTACTAGCA-1,43,3,Cortex_5,1,Pyramidal_layer_dentate_gyrus,8.145839612936841,9.528867,8.091933,13750.0,3448,55.14181818181818,60.95272727272727,70.57454545454546,50.516363636363636,23.76,13750.0,3267.0 +AAACACCAATAACTGC-1,19,59,Thalamus_2,1,Hypothalamus_1,8.70334075304372,10.395467,8.499233,32710.0,6022,47.071232039131765,54.56435340874351,65.0871293182513,40.48303271170896,15.010699,32710.0,4910.0 +AAACAGAGCGACTCCT-1,94,14,Cortex_5,1,Pyramidal_layer_dentate_gyrus,8.369157112588834,9.674704,8.092851,15909.0,4311,45.81054748884279,52.07744044251681,62.97693129675027,40.95794833113332,20.554403,15909.0,3270.0 +AAACCGGGTAGGTACC-1,28,42,Thalamus_2,1,Hypothalamus_1,8.663542087751374,10.369013,8.808967,31856.0,5787,45.887744851833254,52.98216976393771,64.24849321948768,40.287543947764945,21.01017,31856.0,6693.0 +AAACCGTTCGTCCAGG-1,42,52,Hypothalamus_2,1,Pyramidal_layer,8.682538124003075,10.337314,8.559678,30862.0,5898,43.79171797031949,51.18592443781998,62.65634113148856,37.80053139783553,16.901043,30862.0,5216.0 +AAACCTCATGAAGTTG-1,19,37,Thalamus_2,1,Hypothalamus_1,9.027858802380862,11.007419,8.849371,60319.0,8331,34.28770370861586,42.45594257199224,55.48997828213332,27.803842901904872,11.553574,60319.0,6969.0 
+AAACGAAGAACATACC-1,64,6,Cortex_4,1,Hypothalamus_2,8.84246002419529,10.578089,8.855521,39264.0,6921,37.99663814180929,44.75346373268134,56.6320293398533,32.95639771801141,17.858597,39264.0,7012.0 +AAACGAGACGGTTGAT-1,79,35,Fiber_tract,1,Cortex_5,8.80941494391005,10.458923,8.351847,34853.0,6696,39.947780678851174,47.52818982583996,58.838550483459095,33.7245000430379,12.156773,34853.0,4237.0 +AAACGGTTGCGAACTG-1,59,67,Lateral_ventricle,1,Striatum,8.718663567048953,10.254004,8.416489,28395.0,6115,41.67635147032928,49.20232435287903,60.556435992252155,35.562599049128366,15.918295,28395.0,4520.0 +``` + +Export cell metadata to a CSV file: +```bash +uv run h5ad export dataframe visium.h5ad obs --output cells.csv +wc -l cells.csv # 2689 cells.csv +``` + +## 4 Subset by names + +Let's get all cluster names from `cells.csv`: +```bash +awk -F ',' 'NR>1{print $4}' cells.csv | sort | uniq -c +``` +``` +284 Cortex_1 +257 Cortex_2 +244 Cortex_3 +164 Cortex_4 +129 Cortex_5 +226 Fiber_tract +222 Hippocampus +208 Hypothalamus_1 +133 Hypothalamus_2 +105 Lateral_ventricle +42 Pyramidal_layer +68 Pyramidal_layer_dentate_gyrus +153 Striatum +261 Thalamus_1 +192 Thalamus_2 +``` + +To get all obs names in "Cortex_2", you can use `csvsql` from `csvkit`: +```bash +csvsql -d ',' -I --query "SELECT _index FROM cells WHERE cluster='Cortex_2'" cells.csv > barcodes.txt +sed -i '1d' barcodes.txt # remove header +wc -l barcodes.txt # 257 barcodes.txt +``` + +Now you can use this list to create a subset `.h5ad` file: +```bash +uv run h5ad subset visium.h5ad cortex2.h5ad --obs barcodes.txt +``` + +Check the result: +```bash +uv run h5ad info cortex2.h5ad +``` + +## Import or replace data +You can also import new data into an existing store. For example, let's replace the `obs` dataframe with a modified version. 
First, leave only first 5 columns in `cells.csv`: +```bash +cut -d ',' -f 1-5 cells.csv > cells1to5.csv +``` + +Now import it back into `cortex2.h5ad`: +```bash +uv run h5ad import dataframe visium.h5ad obs cells1to5.csv +``` + +Check the updated `obs` structure: +```bash +uv run h5ad info visium.h5ad obs +``` \ No newline at end of file From e28d5d36ff046a182d314996adb51a7e0953a92a Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 11:10:10 +0000 Subject: [PATCH 50/62] Rename option '--types' to '--tree' in info command for clarity and update help text accordingly --- src/h5ad/cli.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 3d084a6..250aa55 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -48,32 +48,32 @@ def info( None, help="Entry path to inspect (e.g., 'obsm/X_pca', 'X', 'uns')", ), - types: bool = typer.Option( + tree: bool = typer.Option( False, - "--types", + "--tree", "-t", - help="Show detailed type information for all entries", + help="Show a tree of all entries", ), depth: int = typer.Option( None, "--depth", "-d", - help="Maximum recursion depth for type display (only with --types)", + help="Maximum recursion depth for tree display (only with --tree)", ), ) -> None: """ Show high-level information about the .h5ad file. - Use --types to see type information for each entry. + Use --tree to see a tree of all entries. Use --entry to inspect a specific entry in detail. 
Examples: h5ad info data.h5ad - h5ad info --types data.h5ad + h5ad info --tree data.h5ad h5ad info obsm/X_pca data.h5ad """ try: - show_info(file, console, show_types=types, depth=depth, entry_path=entry) + show_info(file, console, show_types=tree, depth=depth, entry_path=entry) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) From 06740184b90b6f90ed5ff62331b9b27fe7e4fdb8 Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 11:54:25 +0000 Subject: [PATCH 51/62] Add support for copying HDF5 groups in copy_tree function --- src/h5ad/storage/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/h5ad/storage/__init__.py b/src/h5ad/storage/__init__.py index 0b652d5..43d876d 100644 --- a/src/h5ad/storage/__init__.py +++ b/src/h5ad/storage/__init__.py @@ -235,6 +235,10 @@ def copy_dataset(src: Any, dst_group: Any, name: str) -> Any: def copy_tree(src_obj: Any, dst_group: Any, name: str, *, exclude: Iterable[str] = ()) -> Any: + if is_hdf5_group(dst_group) and (is_hdf5_group(src_obj) or is_hdf5_dataset(src_obj)): + if not exclude: + dst_group.copy(src_obj, dst_group, name) + return dst_group[name] if is_dataset(src_obj): return copy_dataset(src_obj, dst_group, name) if not is_group(src_obj): From e66004748b1c044cf6edf0dde2dc3f43e40269eb Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 11:54:30 +0000 Subject: [PATCH 52/62] Implement subset_matrix_entry function for handling dense and sparse matrix subsetting --- src/h5ad/core/subset.py | 95 ++++++++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 30 deletions(-) diff --git a/src/h5ad/core/subset.py b/src/h5ad/core/subset.py index ee254cd..d6bccc9 100644 --- a/src/h5ad/core/subset.py +++ b/src/h5ad/core/subset.py @@ -283,6 +283,34 @@ def subset_sparse_matrix_group( create_dataset(group, "indptr", data=np.array(new_indptr, dtype=indptr.dtype)) +def subset_matrix_entry( + obj: Any, + dst_parent: Any, + name: str, + obs_idx: 
Optional[np.ndarray], + var_idx: Optional[np.ndarray], + *, + chunk_rows: int, + entry_label: str, +) -> None: + if is_dataset(obj): + subset_dense_matrix( + obj, dst_parent, name, obs_idx, var_idx, chunk_rows=chunk_rows + ) + return + + if is_group(obj): + enc = obj.attrs.get("encoding-type", b"") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + if enc in ("csr_matrix", "csc_matrix"): + subset_sparse_matrix_group(obj, dst_parent, name, obs_idx, var_idx) + return + raise ValueError(f"Unsupported {entry_label} encoding type: {enc}") + + raise ValueError(f"Unsupported {entry_label} object type") + + def subset_h5ad( file: Path, output: Path, @@ -371,16 +399,15 @@ def subset_h5ad( tasks.append("uns") with Progress( - SpinnerColumn(), + SpinnerColumn(finished_text="[green]✓[/]"), TextColumn("[progress.description]{task.description}"), - BarColumn(), - TaskProgressColumn(), - TimeElapsedColumn(), console=console, + transient=False, ) as progress: - task_id = progress.add_task("[cyan]Subsetting...", total=len(tasks)) - for task in tasks: + task_id = progress.add_task( + f"[cyan]Subsetting {task}...[/]", total=None + ) if task == "obs": obs_dst = dst.create_group("obs") subset_axis_group(src["obs"], obs_dst, obs_idx) @@ -400,65 +427,73 @@ def subset_h5ad( elif task.startswith("layer:"): key = task.split(":", 1)[1] layer_src = src["layers"][key] - if is_dataset(layer_src): - layers_dst = _ensure_group(dst, "layers") - subset_dense_matrix( - layer_src, - layers_dst, - key, - obs_idx, - var_idx, - chunk_rows=chunk_rows, - ) - elif is_group(layer_src): - layers_dst = _ensure_group(dst, "layers") - subset_sparse_matrix_group( - layer_src, layers_dst, key, obs_idx, var_idx - ) + layers_dst = _ensure_group(dst, "layers") + subset_matrix_entry( + layer_src, + layers_dst, + key, + obs_idx, + var_idx, + chunk_rows=chunk_rows, + entry_label=f"layer:{key}", + ) elif task.startswith("obsm:"): key = task.split(":", 1)[1] obsm_dst = _ensure_group(dst, "obsm") - 
subset_dense_matrix( - src["obsm"][key], + obsm_obj = src["obsm"][key] + subset_matrix_entry( + obsm_obj, obsm_dst, key, obs_idx, None, chunk_rows=chunk_rows, + entry_label=f"obsm:{key}", ) elif task.startswith("varm:"): key = task.split(":", 1)[1] varm_dst = _ensure_group(dst, "varm") - subset_dense_matrix( - src["varm"][key], + varm_obj = src["varm"][key] + subset_matrix_entry( + varm_obj, varm_dst, key, var_idx, None, chunk_rows=chunk_rows, + entry_label=f"varm:{key}", ) elif task.startswith("obsp:"): key = task.split(":", 1)[1] obsp_dst = _ensure_group(dst, "obsp") - subset_dense_matrix( - src["obsp"][key], + obsp_obj = src["obsp"][key] + subset_matrix_entry( + obsp_obj, obsp_dst, key, obs_idx, obs_idx, chunk_rows=chunk_rows, + entry_label=f"obsp:{key}", ) elif task.startswith("varp:"): key = task.split(":", 1)[1] varp_dst = _ensure_group(dst, "varp") - subset_dense_matrix( - src["varp"][key], + varp_obj = src["varp"][key] + subset_matrix_entry( + varp_obj, varp_dst, key, var_idx, var_idx, chunk_rows=chunk_rows, + entry_label=f"varp:{key}", ) elif task == "uns": copy_tree(src["uns"], dst, "uns") - progress.advance(task_id) + progress.update( + task_id, + description=f"[green]Subsetting {task}[/]", + completed=1, + total=1, + ) From 949f37ea8b89d43781f217d385ac89f215f0472c Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 11:54:35 +0000 Subject: [PATCH 53/62] Rename --types flag to --tree in info command tests for clarity --- tests/test_cli.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 2b3bd90..50cc137 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -33,9 +33,9 @@ def test_info_function_direct(self, sample_h5ad_file): # Should not raise exception show_info(sample_h5ad_file, console) - def test_info_types_flag(self, sample_h5ad_file): - """Test info command with --types flag.""" - result = runner.invoke(app, ["info", "--types", str(sample_h5ad_file)]) + def 
test_info_tree_flag(self, sample_h5ad_file): + """Test info command with --tree flag.""" + result = runner.invoke(app, ["info", "--tree", str(sample_h5ad_file)]) assert result.exit_code == 0 # Should show type annotations in angle brackets # Output may go to stdout or stderr depending on console config @@ -43,7 +43,7 @@ def test_info_types_flag(self, sample_h5ad_file): assert "<" in output assert ">" in output - def test_info_types_short_flag(self, sample_h5ad_file): + def test_info_tree_short_flag(self, sample_h5ad_file): """Test info command with -t short flag.""" result = runner.invoke(app, ["info", "-t", str(sample_h5ad_file)]) assert result.exit_code == 0 @@ -53,7 +53,7 @@ def test_info_types_short_flag(self, sample_h5ad_file): def test_info_depth_flag(self, sample_h5ad_file): """Test info command with --depth flag.""" result = runner.invoke( - app, ["info", "--types", "--depth", "1", str(sample_h5ad_file)] + app, ["info", "--tree", "--depth", "1", str(sample_h5ad_file)] ) assert result.exit_code == 0 output = result.stdout + (result.stderr or "") From 3056ed4f16c1f2743e3b85428cd29d92c71fb25b Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 11:54:42 +0000 Subject: [PATCH 54/62] Add tests for subsetting H5AD files with sparse matrices and variable-length strings --- tests/test_subset.py | 177 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/tests/test_subset.py b/tests/test_subset.py index 2aa9264..c2db94d 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -414,3 +414,180 @@ def test_subset_h5ad_sparse_csc(self, sample_sparse_csc_h5ad, temp_dir): if isinstance(encoding, bytes): encoding = encoding.decode("utf-8") assert encoding == "csc_matrix" + + def test_subset_h5ad_obsp_sparse_group(self, temp_dir): + """Test subsetting obsp sparse matrix groups.""" + file_path = temp_dir / "obsp_sparse.h5ad" + with h5py.File(file_path, "w") as f: + obs = f.create_group("obs") + obs.attrs["_index"] = 
"obs_names" + obs_names = ["cell_1", "cell_2", "cell_3", "cell_4"] + obs.create_dataset("obs_names", data=np.array(obs_names, dtype="S")) + + var = f.create_group("var") + var.attrs["_index"] = "var_names" + var_names = ["gene_1", "gene_2"] + var.create_dataset("var_names", data=np.array(var_names, dtype="S")) + + f.create_dataset("X", data=np.zeros((4, 2), dtype=np.float32)) + + obsp = f.create_group("obsp") + conn = obsp.create_group("connectivities") + conn.attrs["encoding-type"] = "csr_matrix" + conn.attrs["encoding-version"] = "0.1.0" + conn.attrs["shape"] = np.array([4, 4], dtype=np.int64) + conn.create_dataset("data", data=np.array([1.0, 2.0, 3.0, 4.0])) + conn.create_dataset("indices", data=np.array([0, 1, 2, 3], dtype=np.int64)) + conn.create_dataset("indptr", data=np.array([0, 1, 2, 3, 4], dtype=np.int64)) + + obs_file = temp_dir / "obs_names.txt" + obs_file.write_text("cell_1\ncell_3\n") + + output = temp_dir / "subset.h5ad" + console = Console(stderr=True) + + subset_h5ad( + file=file_path, + output=output, + obs_file=obs_file, + var_file=None, + chunk_rows=1024, + console=console, + ) + + with h5py.File(output, "r") as f: + conn = f["obsp"]["connectivities"] + encoding = conn.attrs["encoding-type"] + if isinstance(encoding, bytes): + encoding = encoding.decode("utf-8") + assert encoding == "csr_matrix" + assert tuple(conn.attrs["shape"]) == (2, 2) + assert conn["indptr"].shape[0] == 3 + + def test_subset_h5ad_uns_vlen_strings(self, temp_dir): + """Test copying uns datasets with variable-length strings.""" + file_path = temp_dir / "uns_strings.h5ad" + with h5py.File(file_path, "w") as f: + obs = f.create_group("obs") + obs.attrs["_index"] = "obs_names" + obs.create_dataset( + "obs_names", data=np.array(["cell_1", "cell_2"], dtype="S") + ) + + var = f.create_group("var") + var.attrs["_index"] = "var_names" + var.create_dataset( + "var_names", data=np.array(["gene_1", "gene_2"], dtype="S") + ) + + f.create_dataset("X", data=np.zeros((2, 2), 
dtype=np.float32)) + + uns = f.create_group("uns") + vlen = h5py.string_dtype(encoding="utf-8") + uns.create_dataset("labels", data=np.array(["a", "b", "c"]), dtype=vlen) + meta = uns.create_group("meta") + meta.create_dataset("method", data="test", dtype=vlen) + + obs_file = temp_dir / "obs_names.txt" + obs_file.write_text("cell_1\n") + + output = temp_dir / "subset.h5ad" + console = Console(stderr=True) + + subset_h5ad( + file=file_path, + output=output, + obs_file=obs_file, + var_file=None, + chunk_rows=1024, + console=console, + ) + + with h5py.File(output, "r") as f: + labels = [ + v.decode("utf-8") if isinstance(v, bytes) else v + for v in f["uns"]["labels"][...] + ] + assert labels == ["a", "b", "c"] + method = f["uns"]["meta"]["method"][()] + if isinstance(method, bytes): + method = method.decode("utf-8") + assert method == "test" + + def test_subset_h5ad_sparse_entries(self, temp_dir): + """Test sparse matrices in layers, obsm, varm, obsp, and varp.""" + file_path = temp_dir / "sparse_entries.h5ad" + + def _csr_group(parent, name, shape): + group = parent.create_group(name) + group.attrs["encoding-type"] = "csr_matrix" + group.attrs["encoding-version"] = "0.1.0" + group.attrs["shape"] = np.array(shape, dtype=np.int64) + n_rows, n_cols = shape + data = [] + indices = [] + indptr = [0] + for r in range(n_rows): + c = r % n_cols + data.append(float(r + 1)) + indices.append(c) + indptr.append(len(indices)) + group.create_dataset("data", data=np.array(data, dtype=np.float32)) + group.create_dataset("indices", data=np.array(indices, dtype=np.int64)) + group.create_dataset("indptr", data=np.array(indptr, dtype=np.int64)) + return group + + with h5py.File(file_path, "w") as f: + obs = f.create_group("obs") + obs.attrs["_index"] = "obs_names" + obs.create_dataset( + "obs_names", data=np.array(["cell_1", "cell_2", "cell_3", "cell_4"], dtype="S") + ) + + var = f.create_group("var") + var.attrs["_index"] = "var_names" + var.create_dataset( + "var_names", 
data=np.array(["gene_1", "gene_2", "gene_3"], dtype="S") + ) + + f.create_dataset("X", data=np.zeros((4, 3), dtype=np.float32)) + + layers = f.create_group("layers") + _csr_group(layers, "counts", (4, 3)) + + obsm = f.create_group("obsm") + _csr_group(obsm, "pca", (4, 2)) + + varm = f.create_group("varm") + _csr_group(varm, "pca", (3, 2)) + + obsp = f.create_group("obsp") + _csr_group(obsp, "connectivities", (4, 4)) + + varp = f.create_group("varp") + _csr_group(varp, "correlations", (3, 3)) + + obs_file = temp_dir / "obs_names.txt" + obs_file.write_text("cell_1\ncell_3\n") + + var_file = temp_dir / "var_names.txt" + var_file.write_text("gene_1\ngene_3\n") + + output = temp_dir / "subset.h5ad" + console = Console(stderr=True) + + subset_h5ad( + file=file_path, + output=output, + obs_file=obs_file, + var_file=var_file, + chunk_rows=1024, + console=console, + ) + + with h5py.File(output, "r") as f: + assert tuple(f["layers"]["counts"].attrs["shape"]) == (2, 2) + assert tuple(f["obsm"]["pca"].attrs["shape"]) == (2, 2) + assert tuple(f["varm"]["pca"].attrs["shape"]) == (2, 2) + assert tuple(f["obsp"]["connectivities"].attrs["shape"]) == (2, 2) + assert tuple(f["varp"]["correlations"].attrs["shape"]) == (2, 2) From 3047c7a514a93f61a9b46387f660c7aeb06060af Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 12:08:52 +0000 Subject: [PATCH 55/62] Exclude 'obs_names' and 'var_names' from keys in group processing for improved data handling --- src/h5ad/commands/info.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/h5ad/commands/info.py b/src/h5ad/commands/info.py index b58b5b0..76b56da 100644 --- a/src/h5ad/commands/info.py +++ b/src/h5ad/commands/info.py @@ -57,7 +57,9 @@ def show_info( # Only process Groups, skip Datasets like X if is_group(obj): sub_keys = [ - k for k in obj.keys() if k not in ("_index", "__categories") + k + for k in obj.keys() + if k not in ("_index", "__categories", "obs_names", "var_names") ] if 
sub_keys and key != "X": rich.print( @@ -126,7 +128,11 @@ def add_node( obj = f[key] # Skip empty groups if is_group(obj): - children = [k for k in obj.keys() if k not in ("_index", "__categories")] + children = [ + k + for k in obj.keys() + if k not in ("_index", "__categories", "obs_names", "var_names") + ] if not children: continue max_depth = ( @@ -174,7 +180,11 @@ def _show_object_info(f: Any, entry_path: str, console: Console) -> None: # If it's a group, show children if is_group(entry): - children = [k for k in entry.keys() if k not in ("_index", "__categories")] + children = [ + k + for k in entry.keys() + if k not in ("_index", "__categories", "obs_names", "var_names") + ] if children: console.print(f"\n[bold cyan]Children:[/]") for child_name in sorted(children): From 361a2aedaf6c41fc3ec190131d75d112107ba0e0 Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 12:08:56 +0000 Subject: [PATCH 56/62] Update GET_STARTED.md to include additional output examples for `info` command and clarify import options --- docs/GET_STARTED.md | 49 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/docs/GET_STARTED.md b/docs/GET_STARTED.md index 61488fb..a4024ac 100644 --- a/docs/GET_STARTED.md +++ b/docs/GET_STARTED.md @@ -128,6 +128,17 @@ Check the result: ```bash uv run h5ad info cortex2.h5ad ``` +``` +An object with n_obs × n_var: 257 × 18078 + obs: array_col, array_row, cluster, in_tissue, leiden, log1p_n_genes_by_counts, log1p_total_counts, log1p_total_counts_mt, n_counts, n_genes_by_counts, +pct_counts_in_top_100_genes, pct_counts_in_top_200_genes, pct_counts_in_top_500_genes, pct_counts_in_top_50_genes, pct_counts_mt, total_counts, total_counts_mt + var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts, +pct_dropout_by_counts, total_counts, variances, variances_norm + obsm: X_pca, X_umap, 
spatial + varm: PCs + obsp: connectivities, distances + uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap +``` ## Import or replace data You can also import new data into an existing store. For example, let's replace the `obs` dataframe with a modified version. First, leave only first 5 columns in `cells.csv`: @@ -135,12 +146,44 @@ You can also import new data into an existing store. For example, let's replace cut -d ',' -f 1-5 cells.csv > cells1to5.csv ``` -Now import it back into `cortex2.h5ad`: +Now import it back into `cortex2.h5ad` with the `_index` column as index: +```bash +uv run h5ad import dataframe visium.h5ad obs cells1to5.csv --index-column _index --output visium_obs1to5.h5ad +``` + +Check the updated `obs` structure: +```bash +uv run h5ad info visium_obs1to5.h5ad +``` +``` +An object with n_obs × n_var: 2688 × 18078 + obs: array_col, array_row, cluster, in_tissue + var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts, +pct_dropout_by_counts, total_counts, variances, variances_norm + obsm: X_pca, X_umap, spatial + varm: PCs + obsp: connectivities, distances + uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap + raw: X, var +``` + +You can also import the data into existing file: ```bash -uv run h5ad import dataframe visium.h5ad obs cells1to5.csv +uv run h5ad import dataframe visium.h5ad obs cells1to5.csv --index-column _index --inplace ``` Check the updated `obs` structure: ```bash -uv run h5ad info visium.h5ad obs +uv run h5ad info visium.h5ad +``` +``` +An object with n_obs × n_var: 2688 × 18078 + obs: array_col, array_row, cluster, in_tissue + var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts, +pct_dropout_by_counts, 
total_counts, variances, variances_norm + obsm: X_pca, X_umap, spatial + varm: PCs + obsp: connectivities, distances + uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap + raw: X, var ``` \ No newline at end of file From eeb34d952f3bfdca0d52ac8e7b71c2745ebb98de Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 12:20:08 +0000 Subject: [PATCH 57/62] Refactor subset command to require output path or use --inplace option for file modification --- src/h5ad/cli.py | 22 ++++++++++++++++++++-- src/h5ad/core/subset.py | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 250aa55..66bbd22 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -92,8 +92,18 @@ def subset( dir_okay=True, file_okay=True, ), - output: Path = typer.Argument( - ..., help="Output .h5ad/.zarr", dir_okay=True, file_okay=True + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output .h5ad/.zarr path. Required unless --inplace.", + dir_okay=True, + file_okay=True, + ), + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify source file directly.", ), obs: Optional[Path] = typer.Option( None, @@ -125,6 +135,13 @@ def subset( ) raise typer.Exit(code=1) + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. 
" + "Use --output/-o or --inplace.", + ) + raise typer.Exit(code=1) + try: subset_h5ad( file=file, @@ -133,6 +150,7 @@ def subset( var_file=var, chunk_rows=chunk_rows, console=console, + inplace=inplace, ) except Exception as e: console.print(f"[bold red]Error:[/] {e}") diff --git a/src/h5ad/core/subset.py b/src/h5ad/core/subset.py index d6bccc9..d9e7829 100644 --- a/src/h5ad/core/subset.py +++ b/src/h5ad/core/subset.py @@ -3,6 +3,7 @@ from __future__ import annotations from pathlib import Path +import shutil from typing import Optional, Set, Tuple, List, Dict, Any import numpy as np @@ -22,6 +23,7 @@ copy_attrs, copy_tree, dataset_create_kwargs, + detect_backend, is_dataset, is_group, is_zarr_group, @@ -313,12 +315,13 @@ def subset_matrix_entry( def subset_h5ad( file: Path, - output: Path, + output: Optional[Path], obs_file: Optional[Path], var_file: Optional[Path], *, chunk_rows: int = 1024, console: Console, + inplace: bool = False, ) -> None: obs_keep: Optional[Set[str]] = None if obs_file is not None: @@ -333,8 +336,24 @@ def subset_h5ad( if obs_keep is None and var_keep is None: raise ValueError("At least one of --obs or --var must be provided.") + if not inplace and output is None: + raise ValueError("Output file is required unless --inplace is specified.") + + if inplace: + src_backend = detect_backend(file) + if src_backend == "zarr": + base_name = file.stem if file.suffix else file.name + tmp_path = file.with_name(f"{base_name}.subset-tmp.zarr") + else: + tmp_path = file.with_name(f"{file.name}.subset-tmp") + if tmp_path.exists(): + raise FileExistsError(f"Temporary path already exists: {tmp_path}") + dst_path = tmp_path + else: + dst_path = output + with console.status("[magenta]Opening files...[/]"): - with open_store(file, "r") as src_store, open_store(output, "w") as dst_store: + with open_store(file, "r") as src_store, open_store(dst_path, "w") as dst_store: src = src_store.root dst = dst_store.root @@ -497,3 +516,14 @@ def subset_h5ad( completed=1, 
total=1, ) + + if inplace: + if file.exists(): + if file.is_dir(): + shutil.rmtree(file) + else: + file.unlink() + if dst_path.is_dir(): + shutil.move(str(dst_path), str(file)) + else: + dst_path.replace(file) From cb36c17b6da5a7c00dcf037a09d34d3327a788e9 Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 12:20:12 +0000 Subject: [PATCH 58/62] Update GET_STARTED.md to modify subset command syntax for clarity --- docs/GET_STARTED.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/GET_STARTED.md b/docs/GET_STARTED.md index a4024ac..2ca023c 100644 --- a/docs/GET_STARTED.md +++ b/docs/GET_STARTED.md @@ -121,7 +121,7 @@ wc -l barcodes.txt # 257 barcodes.txt Now you can use this list to create a subset `.h5ad` file: ```bash -uv run h5ad subset visium.h5ad cortex2.h5ad --obs barcodes.txt +uv run h5ad subset visium.h5ad --output cortex2.h5ad --obs barcodes.txt ``` Check the result: From 6ba8336273653472da260387820be45196419d3b Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 12:20:19 +0000 Subject: [PATCH 59/62] Add inplace subsetting test for subset_h5ad function and fix dataset creation --- tests/test_subset.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tests/test_subset.py b/tests/test_subset.py index c2db94d..78c5cf8 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -484,7 +484,7 @@ def test_subset_h5ad_uns_vlen_strings(self, temp_dir): uns = f.create_group("uns") vlen = h5py.string_dtype(encoding="utf-8") - uns.create_dataset("labels", data=np.array(["a", "b", "c"]), dtype=vlen) + uns.create_dataset("labels", data=["a", "b", "c"], dtype=vlen) meta = uns.create_group("meta") meta.create_dataset("method", data="test", dtype=vlen) @@ -514,6 +514,27 @@ def test_subset_h5ad_uns_vlen_strings(self, temp_dir): method = method.decode("utf-8") assert method == "test" + def test_subset_h5ad_inplace(self, sample_h5ad_file, temp_dir): + """Test subsetting with --inplace 
behavior.""" + obs_file = temp_dir / "obs_names.txt" + obs_file.write_text("cell_1\ncell_3\n") + + console = Console(stderr=True) + + subset_h5ad( + file=sample_h5ad_file, + output=None, + obs_file=obs_file, + var_file=None, + chunk_rows=1024, + console=console, + inplace=True, + ) + + with h5py.File(sample_h5ad_file, "r") as f: + assert f["obs"]["obs_names"].shape[0] == 2 + assert f["X"].shape[0] == 2 + def test_subset_h5ad_sparse_entries(self, temp_dir): """Test sparse matrices in layers, obsm, varm, obsp, and varp.""" file_path = temp_dir / "sparse_entries.h5ad" From a4cbb6bfcd3c8eac1696988a4e51024dacd8d943 Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 12:24:28 +0000 Subject: [PATCH 60/62] Refactor subset command tests to use --output flag for output file specification --- .github/workflows/tests.yml | 2 +- tests/test_cli.py | 27 ++++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f946c87..cd6a3bc 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -57,7 +57,7 @@ jobs: - name: Run tests with coverage run: | - uv run pytest -v ${{ matrix.module.tests }} \ + uv run pytest -v -W default ${{ matrix.module.tests }} \ --cov=h5ad \ --cov-report=term-missing \ --cov-report=xml \ diff --git a/tests/test_cli.py b/tests/test_cli.py index 50cc137..9a27d0c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -367,7 +367,15 @@ def test_subset_command_obs(self, sample_h5ad_file, temp_dir): output = temp_dir / "subset.h5ad" result = runner.invoke( - app, ["subset", str(sample_h5ad_file), str(output), "--obs", str(obs_file)] + app, + [ + "subset", + str(sample_h5ad_file), + "--output", + str(output), + "--obs", + str(obs_file), + ], ) assert result.exit_code == 0 assert output.exists() @@ -379,7 +387,15 @@ def test_subset_command_var(self, sample_h5ad_file, temp_dir): output = temp_dir / "subset.h5ad" result = runner.invoke( - app, 
["subset", str(sample_h5ad_file), str(output), "--var", str(var_file)] + app, + [ + "subset", + str(sample_h5ad_file), + "--output", + str(output), + "--var", + str(var_file), + ], ) assert result.exit_code == 0 assert output.exists() @@ -398,6 +414,7 @@ def test_subset_command_both(self, sample_h5ad_file, temp_dir): [ "subset", str(sample_h5ad_file), + "--output", str(output), "--obs", str(obs_file), @@ -411,7 +428,9 @@ def test_subset_command_both(self, sample_h5ad_file, temp_dir): def test_subset_command_no_filters(self, sample_h5ad_file, temp_dir): """Test subset command without any filters (should fail).""" output = temp_dir / "subset.h5ad" - result = runner.invoke(app, ["subset", str(sample_h5ad_file), str(output)]) + result = runner.invoke( + app, ["subset", str(sample_h5ad_file), "--output", str(output)] + ) assert result.exit_code == 1 # Check both stdout and stderr since Console uses stderr=True output_text = result.stdout + result.stderr @@ -428,6 +447,7 @@ def test_subset_command_chunk_rows(self, sample_h5ad_file, temp_dir): [ "subset", str(sample_h5ad_file), + "--output", str(output), "--obs", str(obs_file), @@ -449,6 +469,7 @@ def test_subset_command_sparse(self, sample_sparse_csr_h5ad, temp_dir): [ "subset", str(sample_sparse_csr_h5ad), + "--output", str(output), "--obs", str(obs_file), From 982efd46ab4b6a0bebfd7d663c3a9dbf09d5b6e1 Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 12:24:41 +0000 Subject: [PATCH 61/62] Update uv.lock --- uv.lock | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 199 insertions(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index 71c45c8..266fa80 100644 --- a/uv.lock +++ b/uv.lock @@ -97,15 +97,52 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/48/d9f421cb8da5afaa1a64570d9989e00fb7955e6acddc5a12979f7666ef60/coverage-7.13.1-py3-none-any.whl", hash = "sha256:2016745cb3ba554469d02819d78958b571792bb68e31302610e898f80dd3a573", size = 210722, upload-time = 
"2025-12-28T15:42:54.901Z" }, ] +[[package]] +name = "donfig" +version = "0.8.1.post1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/25/71/80cc718ff6d7abfbabacb1f57aaa42e9c1552bfdd01e64ddd704e4a03638/donfig-0.8.1.post1.tar.gz", hash = "sha256:3bef3413a4c1c601b585e8d297256d0c1470ea012afa6e8461dc28bfb7c23f52", size = 19506, upload-time = "2024-05-23T14:14:31.513Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/d5/c5db1ea3394c6e1732fb3286b3bd878b59507a8f77d32a2cebda7d7b7cd4/donfig-0.8.1.post1-py3-none-any.whl", hash = "sha256:2a3175ce74a06109ff9307d90a230f81215cbac9a751f4d1c6194644b8204f9d", size = 21592, upload-time = "2024-05-23T14:13:55.283Z" }, +] + +[[package]] +name = "google-crc32c" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/03/41/4b9c02f99e4c5fb477122cd5437403b552873f014616ac1d19ac8221a58d/google_crc32c-1.8.0.tar.gz", hash = "sha256:a428e25fb7691024de47fecfbff7ff957214da51eddded0da0ae0e0f03a2cf79", size = 14192, upload-time = "2025-12-16T00:35:25.142Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/5f/7307325b1198b59324c0fa9807cafb551afb65e831699f2ce211ad5c8240/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:4b8286b659c1335172e39563ab0a768b8015e88e08329fa5321f774275fc3113", size = 31300, upload-time = "2025-12-16T00:21:56.723Z" }, + { url = "https://files.pythonhosted.org/packages/21/8e/58c0d5d86e2220e6a37befe7e6a94dd2f6006044b1a33edf1ff6d9f7e319/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:2a3dc3318507de089c5384cc74d54318401410f82aa65b2d9cdde9d297aca7cb", size = 30867, upload-time = "2025-12-16T00:38:31.302Z" }, + { url = 
"https://files.pythonhosted.org/packages/ce/a9/a780cc66f86335a6019f557a8aaca8fbb970728f0efd2430d15ff1beae0e/google_crc32c-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:14f87e04d613dfa218d6135e81b78272c3b904e2a7053b841481b38a7d901411", size = 33364, upload-time = "2025-12-16T00:40:22.96Z" }, + { url = "https://files.pythonhosted.org/packages/21/3f/3457ea803db0198c9aaca2dd373750972ce28a26f00544b6b85088811939/google_crc32c-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cb5c869c2923d56cb0c8e6bcdd73c009c36ae39b652dbe46a05eb4ef0ad01454", size = 33740, upload-time = "2025-12-16T00:40:23.96Z" }, + { url = "https://files.pythonhosted.org/packages/df/c0/87c2073e0c72515bb8733d4eef7b21548e8d189f094b5dad20b0ecaf64f6/google_crc32c-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:3cc0c8912038065eafa603b238abf252e204accab2a704c63b9e14837a854962", size = 34437, upload-time = "2025-12-16T00:35:21.395Z" }, + { url = "https://files.pythonhosted.org/packages/d1/db/000f15b41724589b0e7bc24bc7a8967898d8d3bc8caf64c513d91ef1f6c0/google_crc32c-1.8.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:3ebb04528e83b2634857f43f9bb8ef5b2bbe7f10f140daeb01b58f972d04736b", size = 31297, upload-time = "2025-12-16T00:23:20.709Z" }, + { url = "https://files.pythonhosted.org/packages/d7/0d/8ebed0c39c53a7e838e2a486da8abb0e52de135f1b376ae2f0b160eb4c1a/google_crc32c-1.8.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:450dc98429d3e33ed2926fc99ee81001928d63460f8538f21a5d6060912a8e27", size = 30867, upload-time = "2025-12-16T00:43:14.628Z" }, + { url = "https://files.pythonhosted.org/packages/ce/42/b468aec74a0354b34c8cbf748db20d6e350a68a2b0912e128cabee49806c/google_crc32c-1.8.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3b9776774b24ba76831609ffbabce8cdf6fa2bd5e9df37b594221c7e333a81fa", size = 33344, upload-time = 
"2025-12-16T00:40:24.742Z" }, + { url = "https://files.pythonhosted.org/packages/1c/e8/b33784d6fc77fb5062a8a7854e43e1e618b87d5ddf610a88025e4de6226e/google_crc32c-1.8.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:89c17d53d75562edfff86679244830599ee0a48efc216200691de8b02ab6b2b8", size = 33694, upload-time = "2025-12-16T00:40:25.505Z" }, + { url = "https://files.pythonhosted.org/packages/92/b1/d3cbd4d988afb3d8e4db94ca953df429ed6db7282ed0e700d25e6c7bfc8d/google_crc32c-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:57a50a9035b75643996fbf224d6661e386c7162d1dfdab9bc4ca790947d1007f", size = 34435, upload-time = "2025-12-16T00:35:22.107Z" }, + { url = "https://files.pythonhosted.org/packages/21/88/8ecf3c2b864a490b9e7010c84fd203ec8cf3b280651106a3a74dd1b0ca72/google_crc32c-1.8.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:e6584b12cb06796d285d09e33f63309a09368b9d806a551d8036a4207ea43697", size = 31301, upload-time = "2025-12-16T00:24:48.527Z" }, + { url = "https://files.pythonhosted.org/packages/36/c6/f7ff6c11f5ca215d9f43d3629163727a272eabc356e5c9b2853df2bfe965/google_crc32c-1.8.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:f4b51844ef67d6cf2e9425983274da75f18b1597bb2c998e1c0a0e8d46f8f651", size = 30868, upload-time = "2025-12-16T00:48:12.163Z" }, + { url = "https://files.pythonhosted.org/packages/56/15/c25671c7aad70f8179d858c55a6ae8404902abe0cdcf32a29d581792b491/google_crc32c-1.8.0-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b0d1a7afc6e8e4635564ba8aa5c0548e3173e41b6384d7711a9123165f582de2", size = 33381, upload-time = "2025-12-16T00:40:26.268Z" }, + { url = "https://files.pythonhosted.org/packages/42/fa/f50f51260d7b0ef5d4898af122d8a7ec5a84e2984f676f746445f783705f/google_crc32c-1.8.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8b3f68782f3cbd1bce027e48768293072813469af6a61a86f6bb4977a4380f21", size = 33734, upload-time = 
"2025-12-16T00:40:27.028Z" }, + { url = "https://files.pythonhosted.org/packages/08/a5/7b059810934a09fb3ccb657e0843813c1fee1183d3bc2c8041800374aa2c/google_crc32c-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:d511b3153e7011a27ab6ee6bb3a5404a55b994dc1a7322c0b87b29606d9790e2", size = 34878, upload-time = "2025-12-16T00:35:23.142Z" }, +] + [[package]] name = "h5ad" -version = "0.1.0" +version = "0.2.0" source = { editable = "." } dependencies = [ { name = "h5py" }, { name = "numpy" }, + { name = "pillow" }, { name = "rich" }, { name = "typer" }, + { name = "zarr" }, ] [package.optional-dependencies] @@ -118,10 +155,12 @@ dev = [ requires-dist = [ { name = "h5py", specifier = ">=3.15.1" }, { name = "numpy", specifier = ">=2.3.5" }, + { name = "pillow", specifier = ">=12.1.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.3.4" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=6.0.0" }, { name = "rich", specifier = ">=14.2.0" }, { name = "typer", specifier = ">=0.20.0" }, + { name = "zarr", specifier = ">=3.1.5" }, ] provides-extras = ["dev"] @@ -190,6 +229,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, ] +[[package]] +name = "numcodecs" +version = "0.16.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/44/bd/8a391e7c356366224734efd24da929cc4796fff468bfb179fe1af6548535/numcodecs-0.16.5.tar.gz", hash = "sha256:0d0fb60852f84c0bd9543cc4d2ab9eefd37fc8efcc410acd4777e62a1d300318", size = 6276387, upload-time = "2025-11-21T02:49:48.986Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/75/cc/55420f3641a67f78392dc0bc5d02cb9eb0a9dcebf2848d1ac77253ca61fa/numcodecs-0.16.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:24e675dc8d1550cd976a99479b87d872cb142632c75cc402fea04c08c4898523", size = 1656287, upload-time = "2025-11-21T02:49:25.755Z" }, + { url = "https://files.pythonhosted.org/packages/f5/6c/86644987505dcb90ba6d627d6989c27bafb0699f9fd00187e06d05ea8594/numcodecs-0.16.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:94ddfa4341d1a3ab99989d13b01b5134abb687d3dab2ead54b450aefe4ad5bd6", size = 1148899, upload-time = "2025-11-21T02:49:26.87Z" }, + { url = "https://files.pythonhosted.org/packages/97/1e/98aaddf272552d9fef1f0296a9939d1487914a239e98678f6b20f8b0a5c8/numcodecs-0.16.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b554ab9ecf69de7ca2b6b5e8bc696bd9747559cb4dd5127bd08d7a28bec59c3a", size = 8534814, upload-time = "2025-11-21T02:49:28.547Z" }, + { url = "https://files.pythonhosted.org/packages/fb/53/78c98ef5c8b2b784453487f3e4d6c017b20747c58b470393e230c78d18e8/numcodecs-0.16.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ad1a379a45bd3491deab8ae6548313946744f868c21d5340116977ea3be5b1d6", size = 9173471, upload-time = "2025-11-21T02:49:30.444Z" }, + { url = "https://files.pythonhosted.org/packages/1c/20/2fdec87fc7f8cec950d2b0bea603c12dc9f05b4966dc5924ba5a36a61bf6/numcodecs-0.16.5-cp312-cp312-win_amd64.whl", hash = "sha256:845a9857886ffe4a3172ba1c537ae5bcc01e65068c31cf1fce1a844bd1da050f", size = 801412, upload-time = "2025-11-21T02:49:32.123Z" }, + { url = "https://files.pythonhosted.org/packages/38/38/071ced5a5fd1c85ba0e14ba721b66b053823e5176298c2f707e50bed11d9/numcodecs-0.16.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:25be3a516ab677dad890760d357cfe081a371d9c0a2e9a204562318ac5969de3", size = 1654359, upload-time = "2025-11-21T02:49:33.673Z" }, + { url = 
"https://files.pythonhosted.org/packages/d1/c0/5f84ba7525577c1b9909fc2d06ef11314825fc4ad4378f61d0e4c9883b4a/numcodecs-0.16.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0107e839ef75b854e969cb577e140b1aadb9847893937636582d23a2a4c6ce50", size = 1144237, upload-time = "2025-11-21T02:49:35.294Z" }, + { url = "https://files.pythonhosted.org/packages/0b/00/787ea5f237b8ea7bc67140c99155f9c00b5baf11c49afc5f3bfefa298f95/numcodecs-0.16.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:015a7c859ecc2a06e2a548f64008c0ec3aaecabc26456c2c62f4278d8fc20597", size = 8483064, upload-time = "2025-11-21T02:49:36.454Z" }, + { url = "https://files.pythonhosted.org/packages/c4/e6/d359fdd37498e74d26a167f7a51e54542e642ea47181eb4e643a69a066c3/numcodecs-0.16.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:84230b4b9dad2392f2a84242bd6e3e659ac137b5a1ce3571d6965fca673e0903", size = 9126063, upload-time = "2025-11-21T02:49:38.018Z" }, + { url = "https://files.pythonhosted.org/packages/27/72/6663cc0382ddbb866136c255c837bcb96cc7ce5e83562efec55e1b995941/numcodecs-0.16.5-cp313-cp313-win_amd64.whl", hash = "sha256:5088145502ad1ebf677ec47d00eb6f0fd600658217db3e0c070c321c85d6cf3d", size = 799275, upload-time = "2025-11-21T02:49:39.558Z" }, + { url = "https://files.pythonhosted.org/packages/3c/9e/38e7ca8184c958b51f45d56a4aeceb1134ecde2d8bd157efadc98502cc42/numcodecs-0.16.5-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b05647b8b769e6bc8016e9fd4843c823ce5c9f2337c089fb5c9c4da05e5275de", size = 1654721, upload-time = "2025-11-21T02:49:40.602Z" }, + { url = "https://files.pythonhosted.org/packages/a1/37/260fa42e7b2b08e6e00ad632f8dd620961a60a459426c26cea390f8c68d0/numcodecs-0.16.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3832bd1b5af8bb3e413076b7d93318c8e7d7b68935006b9fa36ca057d1725a8f", size = 1146887, upload-time = "2025-11-21T02:49:41.721Z" }, + { url = 
"https://files.pythonhosted.org/packages/4e/15/e2e1151b5a8b14a15dfd4bb4abccce7fff7580f39bc34092780088835f3a/numcodecs-0.16.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49f7b7d24f103187f53135bed28bb9f0ed6b2e14c604664726487bb6d7c882e1", size = 8476987, upload-time = "2025-11-21T02:49:43.363Z" }, + { url = "https://files.pythonhosted.org/packages/6d/30/16a57fc4d9fb0ba06c600408bd6634f2f1753c54a7a351c99c5e09b51ee2/numcodecs-0.16.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aec9736d81b70f337d89c4070ee3ffeff113f386fd789492fa152d26a15043e4", size = 9102377, upload-time = "2025-11-21T02:49:45.508Z" }, + { url = "https://files.pythonhosted.org/packages/31/a5/a0425af36c20d55a3ea884db4b4efca25a43bea9214ba69ca7932dd997b4/numcodecs-0.16.5-cp314-cp314-win_amd64.whl", hash = "sha256:b16a14303800e9fb88abc39463ab4706c037647ac17e49e297faa5f7d7dbbf1d", size = 819022, upload-time = "2025-11-21T02:49:47.39Z" }, +] + [[package]] name = "numpy" version = "2.3.5" @@ -262,6 +328,75 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, ] +[[package]] +name = "pillow" +version = "12.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d0/02/d52c733a2452ef1ffcc123b68e6606d07276b0e358db70eabad7e40042b7/pillow-12.1.0.tar.gz", hash = "sha256:5c5ae0a06e9ea030ab786b0251b32c7e4ce10e58d983c0d5c56029455180b5b9", size = 46977283, upload-time = "2026-01-02T09:13:29.892Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/31/dc53fe21a2f2996e1b7d92bf671cdb157079385183ef7c1ae08b485db510/pillow-12.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:a332ac4ccb84b6dde65dbace8431f3af08874bf9770719d32a635c4ef411b18b", size = 5262642, upload-time = "2026-01-02T09:11:10.138Z" }, + { url = "https://files.pythonhosted.org/packages/ab/c1/10e45ac9cc79419cedf5121b42dcca5a50ad2b601fa080f58c22fb27626e/pillow-12.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:907bfa8a9cb790748a9aa4513e37c88c59660da3bcfffbd24a7d9e6abf224551", size = 4657464, upload-time = "2026-01-02T09:11:12.319Z" }, + { url = "https://files.pythonhosted.org/packages/ad/26/7b82c0ab7ef40ebede7a97c72d473bda5950f609f8e0c77b04af574a0ddb/pillow-12.1.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:efdc140e7b63b8f739d09a99033aa430accce485ff78e6d311973a67b6bf3208", size = 6234878, upload-time = "2026-01-02T09:11:14.096Z" }, + { url = "https://files.pythonhosted.org/packages/76/25/27abc9792615b5e886ca9411ba6637b675f1b77af3104710ac7353fe5605/pillow-12.1.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bef9768cab184e7ae6e559c032e95ba8d07b3023c289f79a2bd36e8bf85605a5", size = 8044868, upload-time = "2026-01-02T09:11:15.903Z" }, + { url = "https://files.pythonhosted.org/packages/0a/ea/f200a4c36d836100e7bc738fc48cd963d3ba6372ebc8298a889e0cfc3359/pillow-12.1.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:742aea052cf5ab5034a53c3846165bc3ce88d7c38e954120db0ab867ca242661", size = 6349468, upload-time = "2026-01-02T09:11:17.631Z" }, + { url = "https://files.pythonhosted.org/packages/11/8f/48d0b77ab2200374c66d344459b8958c86693be99526450e7aee714e03e4/pillow-12.1.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6dfc2af5b082b635af6e08e0d1f9f1c4e04d17d4e2ca0ef96131e85eda6eb17", size = 7041518, upload-time = "2026-01-02T09:11:19.389Z" }, + { url = "https://files.pythonhosted.org/packages/1d/23/c281182eb986b5d31f0a76d2a2c8cd41722d6fb8ed07521e802f9bba52de/pillow-12.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:609e89d9f90b581c8d16358c9087df76024cf058fa693dd3e1e1620823f39670", size = 6462829, upload-time = "2026-01-02T09:11:21.28Z" }, + { url = "https://files.pythonhosted.org/packages/25/ef/7018273e0faac099d7b00982abdcc39142ae6f3bd9ceb06de09779c4a9d6/pillow-12.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:43b4899cfd091a9693a1278c4982f3e50f7fb7cff5153b05174b4afc9593b616", size = 7166756, upload-time = "2026-01-02T09:11:23.559Z" }, + { url = "https://files.pythonhosted.org/packages/8f/c8/993d4b7ab2e341fe02ceef9576afcf5830cdec640be2ac5bee1820d693d4/pillow-12.1.0-cp312-cp312-win32.whl", hash = "sha256:aa0c9cc0b82b14766a99fbe6084409972266e82f459821cd26997a488a7261a7", size = 6328770, upload-time = "2026-01-02T09:11:25.661Z" }, + { url = "https://files.pythonhosted.org/packages/a7/87/90b358775a3f02765d87655237229ba64a997b87efa8ccaca7dd3e36e7a7/pillow-12.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:d70534cea9e7966169ad29a903b99fc507e932069a881d0965a1a84bb57f6c6d", size = 7033406, upload-time = "2026-01-02T09:11:27.474Z" }, + { url = "https://files.pythonhosted.org/packages/5d/cf/881b457eccacac9e5b2ddd97d5071fb6d668307c57cbf4e3b5278e06e536/pillow-12.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:65b80c1ee7e14a87d6a068dd3b0aea268ffcabfe0498d38661b00c5b4b22e74c", size = 2452612, upload-time = "2026-01-02T09:11:29.309Z" }, + { url = "https://files.pythonhosted.org/packages/dd/c7/2530a4aa28248623e9d7f27316b42e27c32ec410f695929696f2e0e4a778/pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:7b5dd7cbae20285cdb597b10eb5a2c13aa9de6cde9bb64a3c1317427b1db1ae1", size = 4062543, upload-time = "2026-01-02T09:11:31.566Z" }, + { url = "https://files.pythonhosted.org/packages/8f/1f/40b8eae823dc1519b87d53c30ed9ef085506b05281d313031755c1705f73/pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:29a4cef9cb672363926f0470afc516dbf7305a14d8c54f7abbb5c199cd8f8179", size = 4138373, upload-time = "2026-01-02T09:11:33.367Z" }, + { url = 
"https://files.pythonhosted.org/packages/d4/77/6fa60634cf06e52139fd0e89e5bbf055e8166c691c42fb162818b7fda31d/pillow-12.1.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:681088909d7e8fa9e31b9799aaa59ba5234c58e5e4f1951b4c4d1082a2e980e0", size = 3601241, upload-time = "2026-01-02T09:11:35.011Z" }, + { url = "https://files.pythonhosted.org/packages/4f/bf/28ab865de622e14b747f0cd7877510848252d950e43002e224fb1c9ababf/pillow-12.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:983976c2ab753166dc66d36af6e8ec15bb511e4a25856e2227e5f7e00a160587", size = 5262410, upload-time = "2026-01-02T09:11:36.682Z" }, + { url = "https://files.pythonhosted.org/packages/1c/34/583420a1b55e715937a85bd48c5c0991598247a1fd2eb5423188e765ea02/pillow-12.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:db44d5c160a90df2d24a24760bbd37607d53da0b34fb546c4c232af7192298ac", size = 4657312, upload-time = "2026-01-02T09:11:38.535Z" }, + { url = "https://files.pythonhosted.org/packages/1d/fd/f5a0896839762885b3376ff04878f86ab2b097c2f9a9cdccf4eda8ba8dc0/pillow-12.1.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6b7a9d1db5dad90e2991645874f708e87d9a3c370c243c2d7684d28f7e133e6b", size = 6232605, upload-time = "2026-01-02T09:11:40.602Z" }, + { url = "https://files.pythonhosted.org/packages/98/aa/938a09d127ac1e70e6ed467bd03834350b33ef646b31edb7452d5de43792/pillow-12.1.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6258f3260986990ba2fa8a874f8b6e808cf5abb51a94015ca3dc3c68aa4f30ea", size = 8041617, upload-time = "2026-01-02T09:11:42.721Z" }, + { url = "https://files.pythonhosted.org/packages/17/e8/538b24cb426ac0186e03f80f78bc8dc7246c667f58b540bdd57c71c9f79d/pillow-12.1.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e115c15e3bc727b1ca3e641a909f77f8ca72a64fff150f666fcc85e57701c26c", size = 6346509, upload-time = "2026-01-02T09:11:44.955Z" }, + { url = 
"https://files.pythonhosted.org/packages/01/9a/632e58ec89a32738cabfd9ec418f0e9898a2b4719afc581f07c04a05e3c9/pillow-12.1.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6741e6f3074a35e47c77b23a4e4f2d90db3ed905cb1c5e6e0d49bff2045632bc", size = 7038117, upload-time = "2026-01-02T09:11:46.736Z" }, + { url = "https://files.pythonhosted.org/packages/c7/a2/d40308cf86eada842ca1f3ffa45d0ca0df7e4ab33c83f81e73f5eaed136d/pillow-12.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:935b9d1aed48fcfb3f838caac506f38e29621b44ccc4f8a64d575cb1b2a88644", size = 6460151, upload-time = "2026-01-02T09:11:48.625Z" }, + { url = "https://files.pythonhosted.org/packages/f1/88/f5b058ad6453a085c5266660a1417bdad590199da1b32fb4efcff9d33b05/pillow-12.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5fee4c04aad8932da9f8f710af2c1a15a83582cfb884152a9caa79d4efcdbf9c", size = 7164534, upload-time = "2026-01-02T09:11:50.445Z" }, + { url = "https://files.pythonhosted.org/packages/19/ce/c17334caea1db789163b5d855a5735e47995b0b5dc8745e9a3605d5f24c0/pillow-12.1.0-cp313-cp313-win32.whl", hash = "sha256:a786bf667724d84aa29b5db1c61b7bfdde380202aaca12c3461afd6b71743171", size = 6332551, upload-time = "2026-01-02T09:11:52.234Z" }, + { url = "https://files.pythonhosted.org/packages/e5/07/74a9d941fa45c90a0d9465098fe1ec85de3e2afbdc15cc4766622d516056/pillow-12.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:461f9dfdafa394c59cd6d818bdfdbab4028b83b02caadaff0ffd433faf4c9a7a", size = 7040087, upload-time = "2026-01-02T09:11:54.822Z" }, + { url = "https://files.pythonhosted.org/packages/88/09/c99950c075a0e9053d8e880595926302575bc742b1b47fe1bbcc8d388d50/pillow-12.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:9212d6b86917a2300669511ed094a9406888362e085f2431a7da985a6b124f45", size = 2452470, upload-time = "2026-01-02T09:11:56.522Z" }, + { url = 
"https://files.pythonhosted.org/packages/b5/ba/970b7d85ba01f348dee4d65412476321d40ee04dcb51cd3735b9dc94eb58/pillow-12.1.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:00162e9ca6d22b7c3ee8e61faa3c3253cd19b6a37f126cad04f2f88b306f557d", size = 5264816, upload-time = "2026-01-02T09:11:58.227Z" }, + { url = "https://files.pythonhosted.org/packages/10/60/650f2fb55fdba7a510d836202aa52f0baac633e50ab1cf18415d332188fb/pillow-12.1.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7d6daa89a00b58c37cb1747ec9fb7ac3bc5ffd5949f5888657dfddde6d1312e0", size = 4660472, upload-time = "2026-01-02T09:12:00.798Z" }, + { url = "https://files.pythonhosted.org/packages/2b/c0/5273a99478956a099d533c4f46cbaa19fd69d606624f4334b85e50987a08/pillow-12.1.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e2479c7f02f9d505682dc47df8c0ea1fc5e264c4d1629a5d63fe3e2334b89554", size = 6268974, upload-time = "2026-01-02T09:12:02.572Z" }, + { url = "https://files.pythonhosted.org/packages/b4/26/0bf714bc2e73d5267887d47931d53c4ceeceea6978148ed2ab2a4e6463c4/pillow-12.1.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f188d580bd870cda1e15183790d1cc2fa78f666e76077d103edf048eed9c356e", size = 8073070, upload-time = "2026-01-02T09:12:04.75Z" }, + { url = "https://files.pythonhosted.org/packages/43/cf/1ea826200de111a9d65724c54f927f3111dc5ae297f294b370a670c17786/pillow-12.1.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0fde7ec5538ab5095cc02df38ee99b0443ff0e1c847a045554cf5f9af1f4aa82", size = 6380176, upload-time = "2026-01-02T09:12:06.626Z" }, + { url = "https://files.pythonhosted.org/packages/03/e0/7938dd2b2013373fd85d96e0f38d62b7a5a262af21ac274250c7ca7847c9/pillow-12.1.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0ed07dca4a8464bada6139ab38f5382f83e5f111698caf3191cb8dbf27d908b4", size = 7067061, upload-time = "2026-01-02T09:12:08.624Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/ad/a2aa97d37272a929a98437a8c0ac37b3cf012f4f8721e1bd5154699b2518/pillow-12.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f45bd71d1fa5e5749587613037b172e0b3b23159d1c00ef2fc920da6f470e6f0", size = 6491824, upload-time = "2026-01-02T09:12:10.488Z" }, + { url = "https://files.pythonhosted.org/packages/a4/44/80e46611b288d51b115826f136fb3465653c28f491068a72d3da49b54cd4/pillow-12.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:277518bf4fe74aa91489e1b20577473b19ee70fb97c374aa50830b279f25841b", size = 7190911, upload-time = "2026-01-02T09:12:12.772Z" }, + { url = "https://files.pythonhosted.org/packages/86/77/eacc62356b4cf81abe99ff9dbc7402750044aed02cfd6a503f7c6fc11f3e/pillow-12.1.0-cp313-cp313t-win32.whl", hash = "sha256:7315f9137087c4e0ee73a761b163fc9aa3b19f5f606a7fc08d83fd3e4379af65", size = 6336445, upload-time = "2026-01-02T09:12:14.775Z" }, + { url = "https://files.pythonhosted.org/packages/e7/3c/57d81d0b74d218706dafccb87a87ea44262c43eef98eb3b164fd000e0491/pillow-12.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:0ddedfaa8b5f0b4ffbc2fa87b556dc59f6bb4ecb14a53b33f9189713ae8053c0", size = 7045354, upload-time = "2026-01-02T09:12:16.599Z" }, + { url = "https://files.pythonhosted.org/packages/ac/82/8b9b97bba2e3576a340f93b044a3a3a09841170ab4c1eb0d5c93469fd32f/pillow-12.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:80941e6d573197a0c28f394753de529bb436b1ca990ed6e765cf42426abc39f8", size = 2454547, upload-time = "2026-01-02T09:12:18.704Z" }, + { url = "https://files.pythonhosted.org/packages/8c/87/bdf971d8bbcf80a348cc3bacfcb239f5882100fe80534b0ce67a784181d8/pillow-12.1.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:5cb7bc1966d031aec37ddb9dcf15c2da5b2e9f7cc3ca7c54473a20a927e1eb91", size = 4062533, upload-time = "2026-01-02T09:12:20.791Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/4f/5eb37a681c68d605eb7034c004875c81f86ec9ef51f5be4a63eadd58859a/pillow-12.1.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:97e9993d5ed946aba26baf9c1e8cf18adbab584b99f452ee72f7ee8acb882796", size = 4138546, upload-time = "2026-01-02T09:12:23.664Z" }, + { url = "https://files.pythonhosted.org/packages/11/6d/19a95acb2edbace40dcd582d077b991646b7083c41b98da4ed7555b59733/pillow-12.1.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:414b9a78e14ffeb98128863314e62c3f24b8a86081066625700b7985b3f529bd", size = 3601163, upload-time = "2026-01-02T09:12:26.338Z" }, + { url = "https://files.pythonhosted.org/packages/fc/36/2b8138e51cb42e4cc39c3297713455548be855a50558c3ac2beebdc251dd/pillow-12.1.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e6bdb408f7c9dd2a5ff2b14a3b0bb6d4deb29fb9961e6eb3ae2031ae9a5cec13", size = 5266086, upload-time = "2026-01-02T09:12:28.782Z" }, + { url = "https://files.pythonhosted.org/packages/53/4b/649056e4d22e1caa90816bf99cef0884aed607ed38075bd75f091a607a38/pillow-12.1.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3413c2ae377550f5487991d444428f1a8ae92784aac79caa8b1e3b89b175f77e", size = 4657344, upload-time = "2026-01-02T09:12:31.117Z" }, + { url = "https://files.pythonhosted.org/packages/6c/6b/c5742cea0f1ade0cd61485dc3d81f05261fc2276f537fbdc00802de56779/pillow-12.1.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e5dcbe95016e88437ecf33544ba5db21ef1b8dd6e1b434a2cb2a3d605299e643", size = 6232114, upload-time = "2026-01-02T09:12:32.936Z" }, + { url = "https://files.pythonhosted.org/packages/bf/8f/9f521268ce22d63991601aafd3d48d5ff7280a246a1ef62d626d67b44064/pillow-12.1.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d0a7735df32ccbcc98b98a1ac785cc4b19b580be1bdf0aeb5c03223220ea09d5", size = 8042708, upload-time = "2026-01-02T09:12:34.78Z" }, + { url = 
"https://files.pythonhosted.org/packages/1a/eb/257f38542893f021502a1bbe0c2e883c90b5cff26cc33b1584a841a06d30/pillow-12.1.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0c27407a2d1b96774cbc4a7594129cc027339fd800cd081e44497722ea1179de", size = 6347762, upload-time = "2026-01-02T09:12:36.748Z" }, + { url = "https://files.pythonhosted.org/packages/c4/5a/8ba375025701c09b309e8d5163c5a4ce0102fa86bbf8800eb0d7ac87bc51/pillow-12.1.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15c794d74303828eaa957ff8070846d0efe8c630901a1c753fdc63850e19ecd9", size = 7039265, upload-time = "2026-01-02T09:12:39.082Z" }, + { url = "https://files.pythonhosted.org/packages/cf/dc/cf5e4cdb3db533f539e88a7bbf9f190c64ab8a08a9bc7a4ccf55067872e4/pillow-12.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c990547452ee2800d8506c4150280757f88532f3de2a58e3022e9b179107862a", size = 6462341, upload-time = "2026-01-02T09:12:40.946Z" }, + { url = "https://files.pythonhosted.org/packages/d0/47/0291a25ac9550677e22eda48510cfc4fa4b2ef0396448b7fbdc0a6946309/pillow-12.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b63e13dd27da389ed9475b3d28510f0f954bca0041e8e551b2a4eb1eab56a39a", size = 7165395, upload-time = "2026-01-02T09:12:42.706Z" }, + { url = "https://files.pythonhosted.org/packages/4f/4c/e005a59393ec4d9416be06e6b45820403bb946a778e39ecec62f5b2b991e/pillow-12.1.0-cp314-cp314-win32.whl", hash = "sha256:1a949604f73eb07a8adab38c4fe50791f9919344398bdc8ac6b307f755fc7030", size = 6431413, upload-time = "2026-01-02T09:12:44.944Z" }, + { url = "https://files.pythonhosted.org/packages/1c/af/f23697f587ac5f9095d67e31b81c95c0249cd461a9798a061ed6709b09b5/pillow-12.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:4f9f6a650743f0ddee5593ac9e954ba1bdbc5e150bc066586d4f26127853ab94", size = 7176779, upload-time = "2026-01-02T09:12:46.727Z" }, + { url = 
"https://files.pythonhosted.org/packages/b3/36/6a51abf8599232f3e9afbd16d52829376a68909fe14efe29084445db4b73/pillow-12.1.0-cp314-cp314-win_arm64.whl", hash = "sha256:808b99604f7873c800c4840f55ff389936ef1948e4e87645eaf3fccbc8477ac4", size = 2543105, upload-time = "2026-01-02T09:12:49.243Z" }, + { url = "https://files.pythonhosted.org/packages/82/54/2e1dd20c8749ff225080d6ba465a0cab4387f5db0d1c5fb1439e2d99923f/pillow-12.1.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:bc11908616c8a283cf7d664f77411a5ed2a02009b0097ff8abbba5e79128ccf2", size = 5268571, upload-time = "2026-01-02T09:12:51.11Z" }, + { url = "https://files.pythonhosted.org/packages/57/61/571163a5ef86ec0cf30d265ac2a70ae6fc9e28413d1dc94fa37fae6bda89/pillow-12.1.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:896866d2d436563fa2a43a9d72f417874f16b5545955c54a64941e87c1376c61", size = 4660426, upload-time = "2026-01-02T09:12:52.865Z" }, + { url = "https://files.pythonhosted.org/packages/5e/e1/53ee5163f794aef1bf84243f755ee6897a92c708505350dd1923f4afec48/pillow-12.1.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8e178e3e99d3c0ea8fc64b88447f7cac8ccf058af422a6cedc690d0eadd98c51", size = 6269908, upload-time = "2026-01-02T09:12:54.884Z" }, + { url = "https://files.pythonhosted.org/packages/bc/0b/b4b4106ff0ee1afa1dc599fde6ab230417f800279745124f6c50bcffed8e/pillow-12.1.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:079af2fb0c599c2ec144ba2c02766d1b55498e373b3ac64687e43849fbbef5bc", size = 8074733, upload-time = "2026-01-02T09:12:56.802Z" }, + { url = "https://files.pythonhosted.org/packages/19/9f/80b411cbac4a732439e629a26ad3ef11907a8c7fc5377b7602f04f6fe4e7/pillow-12.1.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bdec5e43377761c5dbca620efb69a77f6855c5a379e32ac5b158f54c84212b14", size = 6381431, upload-time = "2026-01-02T09:12:58.823Z" }, + { url = 
"https://files.pythonhosted.org/packages/8f/b7/d65c45db463b66ecb6abc17c6ba6917a911202a07662247e1355ce1789e7/pillow-12.1.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:565c986f4b45c020f5421a4cea13ef294dde9509a8577f29b2fc5edc7587fff8", size = 7068529, upload-time = "2026-01-02T09:13:00.885Z" }, + { url = "https://files.pythonhosted.org/packages/50/96/dfd4cd726b4a45ae6e3c669fc9e49deb2241312605d33aba50499e9d9bd1/pillow-12.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:43aca0a55ce1eefc0aefa6253661cb54571857b1a7b2964bd8a1e3ef4b729924", size = 6492981, upload-time = "2026-01-02T09:13:03.314Z" }, + { url = "https://files.pythonhosted.org/packages/4d/1c/b5dc52cf713ae46033359c5ca920444f18a6359ce1020dd3e9c553ea5bc6/pillow-12.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0deedf2ea233722476b3a81e8cdfbad786f7adbed5d848469fa59fe52396e4ef", size = 7191878, upload-time = "2026-01-02T09:13:05.276Z" }, + { url = "https://files.pythonhosted.org/packages/53/26/c4188248bd5edaf543864fe4834aebe9c9cb4968b6f573ce014cc42d0720/pillow-12.1.0-cp314-cp314t-win32.whl", hash = "sha256:b17fbdbe01c196e7e159aacb889e091f28e61020a8abeac07b68079b6e626988", size = 6438703, upload-time = "2026-01-02T09:13:07.491Z" }, + { url = "https://files.pythonhosted.org/packages/b8/0e/69ed296de8ea05cb03ee139cee600f424ca166e632567b2d66727f08c7ed/pillow-12.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27b9baecb428899db6c0de572d6d305cfaf38ca1596b5c0542a5182e3e74e8c6", size = 7182927, upload-time = "2026-01-02T09:13:09.841Z" }, + { url = "https://files.pythonhosted.org/packages/fc/f5/68334c015eed9b5cff77814258717dec591ded209ab5b6fb70e2ae873d1d/pillow-12.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f61333d817698bdcdd0f9d7793e365ac3d2a21c1f1eb02b32ad6aefb8d8ea831", size = 2545104, upload-time = "2026-01-02T09:13:12.068Z" }, +] + [[package]] name = "pluggy" version = "1.6.0" @@ -310,6 +445,52 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = "2025-09-09T10:57:00.695Z" }, ] +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" }, + { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" }, + { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = 
"2025-09-25T21:32:15.21Z" }, + { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" }, + { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" }, + { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" }, + { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" }, + { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" }, + { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, + { url = 
"https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" }, + { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" }, + { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" }, + { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" }, + { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" }, + { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" }, + { url = 
"https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" }, + { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" }, + { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" }, + { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" }, + { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, + { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, + { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, + { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, + { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, + { url = 
"https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, + { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, + { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, + { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" }, + { url = 
"https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, + { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, + { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, + { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, +] + [[package]] name = "rich" version = "14.2.0" @@ -355,3 +536,20 @@ sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac8 wheels = [ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ] + +[[package]] +name = "zarr" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "donfig" }, + { name = "google-crc32c" }, + { name = "numcodecs" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "typing-extensions" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/fc/76/7fa87f57c112c7b9c82f0a730f8b6f333e792574812872e2cd45ab604199/zarr-3.1.5.tar.gz", hash = "sha256:fbe0c79675a40c996de7ca08e80a1c0a20537bd4a9f43418b6d101395c0bba2b", size = 366825, upload-time = "2025-11-21T14:06:01.492Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/44/15/bb13b4913ef95ad5448490821eee4671d0e67673342e4d4070854e5fe081/zarr-3.1.5-py3-none-any.whl", hash = "sha256:29cd905afb6235b94c09decda4258c888fcb79bb6c862ef7c0b8fe009b5c8563", size = 284067, upload-time = "2025-11-21T14:05:59.235Z" }, +] From e3acef8c51da0358bd7ecc895afebb450a962d81 Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 13:17:05 +0000 Subject: [PATCH 62/62] Disable caching in UV setup for consistent test environment --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index cd6a3bc..dde3803 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -50,7 +50,7 @@ jobs: - name: Set up uv uses: astral-sh/setup-uv@v3 with: - enable-cache: true + enable-cache: false - name: Install dependencies (frozen) run: uv sync --extra dev --frozen