From 2a8e673a00ec3debf660b619c1509633c6729c60 Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 17:41:54 +0000 Subject: [PATCH 01/62] Tutorial template --- docs/TUTORIAL.md | 382 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 382 insertions(+) create mode 100644 docs/TUTORIAL.md diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md new file mode 100644 index 0000000..a19a416 --- /dev/null +++ b/docs/TUTORIAL.md @@ -0,0 +1,382 @@ +# Tutorial: Using h5ad CLI with csvkit + +This tutorial demonstrates how to combine `h5ad` CLI with `csvkit` to explore, analyze, and subset large `.h5ad` files efficiently without loading them into memory. + +## Introduction + +### h5ad CLI +A command-line tool for working with AnnData (`.h5ad`) files. It streams data directly from disk, making it perfect for exploring huge single-cell datasets without memory constraints. + +**Key features:** +- `info` - Inspect file structure and dimensions +- `table` - Export metadata to CSV +- `subset` - Filter files by cell/gene names + +### csvkit +A suite of command-line tools for working with CSV files. Think of it as `awk`, `sed`, and `grep` but specifically designed for CSV data. + +**Key tools we'll use:** +- `csvcut` - Select specific columns +- `csvsql` - Execute SQL queries on CSV files +- `csvgrep` - Filter rows by pattern +- `csvlook` - Pretty-print CSV in terminal + +**Installation:** +```bash +pip install csvkit +``` + +## 1. Inspect File Structure with `info` + +First, let's see what's in our `.h5ad` file: + +```bash +h5ad info dataset.h5ad +``` + +**Example output:** +``` +File: dataset.h5ad +Dimensions: 50000 obs × 20000 var + +Top-level groups: + obs/ + - cell_type + - sample_id + - donor_id + - tissue + - n_genes + var/ + - gene_name + - highly_variable + X (sparse matrix) + layers/ + obsm/ + uns/ +``` + +This shows us that we have 50,000 cells with metadata including cell types, samples, and donor information. + +## 2. 
Export Metadata with `table` + +### 2.1 Basic Metadata Export + +Export all cell metadata (observations) to CSV: + +```bash +h5ad table dataset.h5ad --axis obs --output cell_metadata.csv +``` + +Export just specific columns: + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,sample_id,donor_id --output cells.csv +``` + +Preview the first few rows in a nice table format: + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,sample_id,donor_id --head 10 | csvlook +``` + +**Example output:** +``` +| obs_names | cell_type | sample_id | donor_id | +| ------------------- | ------------ | --------- | -------- | +| AAACCTGAGAAACCAT-1 | T cell | sample_1 | donor_A | +| AAACCTGAGACAGACC-1 | B cell | sample_1 | donor_A | +| AAACCTGAGGCATGGT-1 | NK cell | sample_2 | donor_B | +| AAACCTGCAAGCCGCT-1 | T cell | sample_2 | donor_B | +| AAACCTGCACATTAGC-1 | Monocyte | sample_1 | donor_A | +``` + +### 2.2 Calculate Statistics with `csvsql` + +Now let's analyze the metadata using SQL queries! 
+ +**Count cells per cell type:** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type | \ + csvsql --query "SELECT cell_type, COUNT(*) as n_cells FROM stdin GROUP BY cell_type ORDER BY n_cells DESC" | \ + csvlook +``` + +**Example output:** +``` +| cell_type | n_cells | +| ------------ | ------- | +| T cell | 15234 | +| Monocyte | 12456 | +| B cell | 8932 | +| NK cell | 5621 | +| DC | 3456 | +| Macrophage | 2301 | +``` + +**Count cells per cell type and sample:** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,sample_id | \ + csvsql --query "SELECT cell_type, sample_id, COUNT(*) as n_cells + FROM stdin + GROUP BY cell_type, sample_id + ORDER BY cell_type, sample_id" | \ + csvlook +``` + +**Example output:** +``` +| cell_type | sample_id | n_cells | +| ------------ | --------- | ------- | +| B cell | sample_1 | 4521 | +| B cell | sample_2 | 4411 | +| Monocyte | sample_1 | 6234 | +| Monocyte | sample_2 | 6222 | +| T cell | sample_1 | 7645 | +| T cell | sample_2 | 7589 | +``` + +**Calculate average gene count per cell type:** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,n_genes | \ + csvsql --query "SELECT cell_type, + AVG(n_genes) as avg_genes, + MIN(n_genes) as min_genes, + MAX(n_genes) as max_genes + FROM stdin + GROUP BY cell_type + ORDER BY avg_genes DESC" | \ + csvlook +``` + +**Find samples with low cell counts:** + +```bash +h5ad table dataset.h5ad --axis obs --columns sample_id | \ + csvsql --query "SELECT sample_id, COUNT(*) as n_cells + FROM stdin + GROUP BY sample_id + HAVING COUNT(*) < 1000 + ORDER BY n_cells" | \ + csvlook +``` + +## 3. Filter and Subset Data + +### 3.1 Extract Cell Names for a Specific Cell Type + +Let's say we want to create a subset containing only T cells. 
+ +**Step 1: Export metadata and filter for T cells** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type --output cell_metadata.csv +``` + +**Step 2: Use csvgrep to find T cells and extract their names** + +```bash +csvgrep -c cell_type -m "T cell" cell_metadata.csv | \ + csvcut -c obs_names | \ + tail -n +2 > tcell_names.txt +``` + +This creates a file `tcell_names.txt` with one cell barcode per line. + +**Alternative: Use csvsql for more complex filters** + +Get T cells from a specific donor: + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,donor_id --output cell_metadata.csv + +csvsql --query "SELECT obs_names + FROM cell_metadata + WHERE cell_type = 'T cell' + AND donor_id = 'donor_A'" \ + cell_metadata.csv | \ + tail -n +2 > tcell_donor_A.txt +``` + +Get cells with high gene counts (>2000 genes): + +```bash +h5ad table dataset.h5ad --axis obs --columns n_genes --output cell_metadata.csv + +csvsql --query "SELECT obs_names + FROM cell_metadata + WHERE n_genes > 2000" \ + cell_metadata.csv | \ + tail -n +2 > high_quality_cells.txt +``` + +### 3.2 Create the Subset + +Now use the filtered cell list to create a new `.h5ad` file: + +```bash +h5ad subset dataset.h5ad tcells_only.h5ad --obs tcell_names.txt +``` + +**Verify the subset:** + +```bash +h5ad info tcells_only.h5ad +``` + +**Check the cell type distribution:** + +```bash +h5ad table tcells_only.h5ad --axis obs --columns cell_type | \ + csvsql --query "SELECT cell_type, COUNT(*) as n_cells FROM stdin GROUP BY cell_type" | \ + csvlook +``` + +### 3.3 Advanced: Subset by Both Cells and Genes + +Let's create a subset with specific cell types and a curated gene list. 
+ +**Step 1: Filter cells (multiple cell types)** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type --output cell_metadata.csv + +csvsql --query "SELECT obs_names + FROM cell_metadata + WHERE cell_type IN ('T cell', 'NK cell', 'B cell')" \ + cell_metadata.csv | \ + tail -n +2 > lymphocytes.txt +``` + +**Step 2: Create a gene list** + +You might have a predefined list or extract genes from the file: + +```bash +# Export all genes +h5ad table dataset.h5ad --axis var --columns gene_name --output genes.csv + +# Filter for specific genes (e.g., markers) +echo "CD3D +CD3E +CD4 +CD8A +CD8B +CD19 +CD20 +NCAM1" > marker_genes.txt +``` + +**Step 3: Create the subset** + +```bash +h5ad subset dataset.h5ad lymphocytes_markers.h5ad \ + --obs lymphocytes.txt \ + --var marker_genes.txt +``` + +**Verify:** + +```bash +h5ad info lymphocytes_markers.h5ad +``` + +## 4. Complete Example Workflow + +Here's a complete workflow combining everything: + +```bash +# 1. Inspect the file +h5ad info large_dataset.h5ad + +# 2. Export and analyze metadata +h5ad table large_dataset.h5ad --axis obs \ + --columns cell_type,sample_id,donor_id,n_genes \ + --output all_metadata.csv + +# 3. Generate statistics +echo "Cell type distribution:" +csvsql --query "SELECT cell_type, COUNT(*) as n_cells + FROM all_metadata + GROUP BY cell_type + ORDER BY n_cells DESC" \ + all_metadata.csv | csvlook + +echo "Sample distribution:" +csvsql --query "SELECT sample_id, donor_id, COUNT(*) as n_cells + FROM all_metadata + GROUP BY sample_id, donor_id" \ + all_metadata.csv | csvlook + +# 4. Filter for high-quality T cells from a specific donor +csvsql --query "SELECT obs_names + FROM all_metadata + WHERE cell_type = 'T cell' + AND donor_id = 'donor_A' + AND n_genes > 1500" \ + all_metadata.csv | \ + tail -n +2 > selected_cells.txt + +echo "Selected $(wc -l < selected_cells.txt) cells" + +# 5. Create subset +h5ad subset large_dataset.h5ad tcells_subset.h5ad --obs selected_cells.txt + +# 6. 
Verify result +h5ad info tcells_subset.h5ad +h5ad table tcells_subset.h5ad --axis obs --columns cell_type,donor_id | \ + csvsql --query "SELECT cell_type, donor_id, COUNT(*) as n_cells FROM stdin GROUP BY cell_type, donor_id" | \ + csvlook +``` + +## Tips and Best Practices + +1. **Use `--head` for quick previews** before exporting large files: + ```bash + h5ad table data.h5ad --axis obs --head 100 | csvlook + ``` + +2. **Pipe directly to csvkit** to avoid creating intermediate files: + ```bash + h5ad table data.h5ad --axis obs --columns cell_type | csvsql --query "..." + ``` + +3. **Check cell counts** before subsetting: + ```bash + wc -l selected_cells.txt # Should be > 0! + ``` + +4. **Use csvstat** for quick summary statistics: + ```bash + h5ad table data.h5ad --axis obs --columns n_genes,n_counts | csvstat + ``` + +5. **Combine with standard Unix tools**: + ```bash + # Get unique cell types + h5ad table data.h5ad --axis obs --columns cell_type | tail -n +2 | sort -u + + # Count samples + h5ad table data.h5ad --axis obs --columns sample_id | tail -n +2 | sort | uniq -c + ``` + +## Conclusion + +By combining `h5ad` CLI with `csvkit`, you can: +- ✅ Explore huge datasets without loading them into memory +- ✅ Perform complex queries and aggregations on metadata +- ✅ Create filtered subsets based on sophisticated criteria +- ✅ Work entirely on the command line without Python/R + +This workflow is especially powerful for: +- Initial data exploration +- Quality control analysis +- Creating test datasets +- Preparing data for downstream analysis +- Batch processing multiple files + +For more information: +- h5ad CLI: [README.md](../README.md) +- csvkit documentation: https://csvkit.readthedocs.io/ From 0aa445308f42b78adf6c4c8fa79dcbf7aab03f2d Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 19:34:18 +0000 Subject: [PATCH 02/62] Extended info command --- src/h5ad/cli.py | 32 ++++++-- src/h5ad/commands/info.py | 164 +++++++++++++++++++++++++++++++++++--- 
src/h5ad/info.py | 139 +++++++++++++++++++++++++++++++- 3 files changed, 317 insertions(+), 18 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index bb4749d..48ecd53 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -9,13 +9,15 @@ import h5py import numpy as np -from h5ad.commands import show_info, export_table, subset_h5ad app = typer.Typer( - help="Streaming CLI for huge .h5ad files (info, table, subset)." + help="Streaming CLI for huge .h5ad files (info, table, subset, export)." ) console = Console(stderr=True) +export_app = typer.Typer(help="Export objects from an .h5ad file to common formats.") +app.add_typer(export_app, name="export") + @app.command() def info( @@ -24,14 +26,32 @@ def info( help="Path to the .h5ad file", exists=True, readable=True, - ) + ), + obj: Optional[str] = typer.Option( + None, + "--object", + "-o", + help="Object path to inspect (e.g., 'obsm/X_pca', 'X', 'uns')", + ), + types: bool = typer.Option( + False, + "--types", + "-t", + help="Show detailed type information for all entries", + ), ) -> None: """ Show high-level information about the .h5ad file. - Args: - file (Path): Path to the .h5ad file + + Use --types to see type information for each entry. + Use --object to inspect a specific object in detail. 
+ + Examples: + h5ad info data.h5ad + h5ad info --types data.h5ad + h5ad info --object obsm/X_pca data.h5ad """ - show_info(file, console) + show_info(file, console, show_types=types, obj_path=obj) @app.command() diff --git a/src/h5ad/commands/info.py b/src/h5ad/commands/info.py index 95c3c72..29c94ba 100644 --- a/src/h5ad/commands/info.py +++ b/src/h5ad/commands/info.py @@ -1,32 +1,174 @@ from pathlib import Path +from typing import Optional, Union import h5py import rich from rich.console import Console -from h5ad.info import axis_len +from rich.tree import Tree +from h5ad.info import axis_len, get_entry_type, format_type_info +# Preferred display order for top-level keys +KEY_ORDER = ["X", "obs", "var", "obsm", "varm", "layers", "obsp", "varp", "uns"] -def show_info(file: Path, console: Console) -> None: + +def _sort_keys(keys: list) -> list: + """Sort keys according to KEY_ORDER, with unknown keys at the end.""" + order_map = {k: i for i, k in enumerate(KEY_ORDER)} + return sorted(keys, key=lambda k: (order_map.get(k, len(KEY_ORDER)), k)) + + +def show_info( + file: Path, + console: Console, + show_types: bool = False, + obj_path: Optional[str] = None, +) -> None: """ Show high-level information about the .h5ad file. 
Args: file (Path): Path to the .h5ad file console (Console): Rich console for output + show_types (bool): Show detailed type information for each entry + obj_path (Optional[str]): Specific object path to inspect (e.g., 'obsm/X_pca') """ with h5py.File(file, "r") as f: + # If a specific path is requested, show detailed info for that object + if obj_path: + _show_object_info(f, obj_path, console) + return + # Get n_obs and n_var n_obs = axis_len(f, "obs") n_var = axis_len(f, "var") rich.print( f"[bold cyan]An object with n_obs × n_var: {n_obs if n_obs is not None else '?'} × {n_var if n_var is not None else '?'}[/]" ) - # List top-level keys and their sub-keys - for key, obj in sorted(f.items(), key=lambda x: len(x[0])): - # Only process Groups, skip Datasets like X - if isinstance(obj, h5py.Group): - sub_keys = [k for k in obj.keys() if k != "_index"] - if sub_keys and key != "X": - rich.print( - f"\t[bold yellow]{key}:[/]\t" - + ", ".join(f"[bright_white]{sub}[/]" for sub in sub_keys) + + if show_types: + _show_types_tree(f, console) + else: + # List top-level keys and their sub-keys (original behavior) + for key in _sort_keys(list(f.keys())): + obj = f[key] + # Only process Groups, skip Datasets like X + if isinstance(obj, h5py.Group): + sub_keys = [k for k in obj.keys() if k != "_index"] + if sub_keys and key != "X": + rich.print( + f"\t[bold yellow]{key}:[/]\t" + + ", ".join(f"[bright_white]{sub}[/]" for sub in sub_keys) + ) + + +def _show_types_tree(f: h5py.File, console: Console) -> None: + """Show a tree view with type information for all entries. 
+ + Recursion depth by group: + - obs/var: top level only (no children) + - X: top level only + - obsm/obsp/varm/varp/layers: 1 level (show matrices) + - uns: 2 levels deep + """ + tree = Tree(f"[bold]{f.filename}[/]") + + # Define max depth for each top-level group + max_depth_map = { + "obs": 0, + "var": 0, + "X": 0, + "obsm": 1, + "obsp": 1, + "varm": 1, + "varp": 1, + "layers": 1, + "uns": 2, + } + + def add_node( + parent_tree: Tree, + name: str, + obj: Union[h5py.Group, h5py.Dataset], + current_depth: int, + max_depth: int, + ) -> None: + info = get_entry_type(obj) + type_str = format_type_info(info) + + if isinstance(obj, h5py.Dataset): + shape_str = f"[dim]{obj.shape}[/]" if obj.shape else "" + node_text = f"[bright_white]{name}[/] {shape_str} {type_str}" + parent_tree.add(node_text) + else: + # Group + node_text = f"[bold yellow]{name}/[/] {type_str}" + subtree = parent_tree.add(node_text) + + # Recurse only if within allowed depth + if current_depth < max_depth: + for child_name in sorted(obj.keys()): + if child_name == "_index": + continue + child_obj = obj[child_name] + add_node( + subtree, child_name, child_obj, current_depth + 1, max_depth ) + + # Add top-level items in preferred order + for key in _sort_keys(list(f.keys())): + obj = f[key] + # Skip empty groups + if isinstance(obj, h5py.Group): + children = [k for k in obj.keys() if k != "_index"] + if not children: + continue + max_depth = max_depth_map.get(key, 1) # default to 1 level for unknown groups + add_node(tree, key, obj, current_depth=0, max_depth=max_depth) + + console.print(tree) + + +def _show_object_info(f: h5py.File, obj_path: str, console: Console) -> None: + """Show detailed info for a specific object path.""" + # Normalize path + obj_path = obj_path.strip().lstrip("/") + + if obj_path not in f: + console.print(f"[bold red]Error:[/] '{obj_path}' not found in the file.") + return + + obj = f[obj_path] + info = get_entry_type(obj) + + console.print(f"\n[bold cyan]Path:[/] {obj_path}") 
+ console.print(f"[bold cyan]Type:[/] {info['type']}") + + if info["encoding"]: + console.print(f"[bold cyan]Encoding:[/] {info['encoding']}") + + if info["shape"]: + console.print(f"[bold cyan]Shape:[/] {info['shape']}") + + if info["dtype"]: + console.print(f"[bold cyan]Dtype:[/] {info['dtype']}") + + console.print(f"[bold cyan]Details:[/] {info['details']}") + + # Show attributes if any + if obj.attrs: + console.print(f"\n[bold cyan]Attributes:[/]") + for k, v in obj.attrs.items(): + v_str = v.decode("utf-8") if isinstance(v, bytes) else str(v) + if len(v_str) > 80: + v_str = v_str[:77] + "..." + console.print(f" [dim]{k}:[/] {v_str}") + + # If it's a group, show children + if isinstance(obj, h5py.Group): + children = [k for k in obj.keys() if k != "_index"] + if children: + console.print(f"\n[bold cyan]Children:[/]") + for child_name in sorted(children): + child_obj = obj[child_name] + child_info = get_entry_type(child_obj) + type_str = format_type_info(child_info) + console.print(f" [bright_white]{child_name}[/] {type_str}") diff --git a/src/h5ad/info.py b/src/h5ad/info.py index 3535303..56cd90a 100644 --- a/src/h5ad/info.py +++ b/src/h5ad/info.py @@ -1,5 +1,142 @@ -from typing import Optional, Tuple +from typing import Optional, Tuple, Dict, Any, Union import h5py +import numpy as np + + +def get_entry_type(obj: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: + """ + Determine the type/format of an HDF5 object for export guidance. 
+ + Returns a dict with: + - type: str (e.g., 'dataframe', 'sparse-matrix', 'dense-matrix', 'dict', 'image', 'array', 'scalar') + - export_as: str (suggested export format: csv, mtx, npy, json, image) + - encoding: str (h5ad encoding-type if present) + - shape: tuple or None + - dtype: str or None + - details: str (human-readable description) + """ + result: Dict[str, Any] = { + "type": "unknown", + "export_as": None, + "encoding": None, + "shape": None, + "dtype": None, + "details": "", + } + + # Get encoding-type attribute if present + enc = obj.attrs.get("encoding-type", b"") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + result["encoding"] = enc if enc else None + + if isinstance(obj, h5py.Dataset): + result["shape"] = obj.shape + result["dtype"] = str(obj.dtype) + + # Scalar + if obj.shape == (): + result["type"] = "scalar" + result["export_as"] = "json" + result["details"] = f"Scalar value ({obj.dtype})" + return result + + # Check if it looks like an image (2D or 3D with small last dim) + if obj.ndim in (2, 3): + if obj.ndim == 2 or (obj.ndim == 3 and obj.shape[2] in (1, 3, 4)): + # Could be an image if dtype is numeric and reasonable size + if np.issubdtype(obj.dtype, np.number) or obj.dtype == np.bool_: + if obj.shape[0] <= 10000 and obj.shape[1] <= 10000: + result["type"] = "image" + result["export_as"] = "image" + result["details"] = ( + f"Image-like array {obj.shape} ({obj.dtype})" + ) + return result + + # 1D or 2D numeric array -> dense matrix / array + if obj.ndim == 1: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"1D array [{obj.shape[0]}] ({obj.dtype})" + elif obj.ndim == 2: + result["type"] = "dense-matrix" + result["export_as"] = "npy" + result["details"] = ( + f"Dense matrix {obj.shape[0]}×{obj.shape[1]} ({obj.dtype})" + ) + else: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"ND array {obj.shape} ({obj.dtype})" + + return result + + # It's a Group + if 
isinstance(obj, h5py.Group): + # Check for sparse matrix (CSR/CSC) + if enc in ("csr_matrix", "csc_matrix"): + shape = obj.attrs.get("shape", None) + shape_str = f"{shape[0]}×{shape[1]}" if shape is not None else "?" + result["type"] = "sparse-matrix" + result["export_as"] = "mtx" + result["details"] = ( + f"Sparse {enc.replace('_matrix', '').upper()} matrix {shape_str}" + ) + return result + + # Check for categorical + if enc == "categorical": + codes = obj.get("codes") + cats = obj.get("categories") + n_codes = codes.shape[0] if codes is not None else "?" + n_cats = cats.shape[0] if cats is not None else "?" + result["type"] = "categorical" + result["export_as"] = "csv" + result["details"] = f"Categorical [{n_codes} values, {n_cats} categories]" + return result + + # Check for dataframe (obs/var style with _index) + if "_index" in obj.attrs or "obs_names" in obj or "var_names" in obj: + n_cols = len([k for k in obj.keys() if k != "_index"]) + result["type"] = "dataframe" + result["export_as"] = "csv" + result["details"] = f"DataFrame with {n_cols} columns" + return result + + # Check for array-like groups (nullable integer, string array, etc.) 
+ if enc in ("nullable-integer", "string-array"): + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"Encoded array ({enc})" + return result + + # Generic dict/group + n_keys = len(list(obj.keys())) + result["type"] = "dict" + result["export_as"] = "json" + result["details"] = f"Group with {n_keys} keys" + return result + + return result + + +def format_type_info(info: Dict[str, Any]) -> str: + """Format type info as a colored string for display.""" + type_colors = { + "dataframe": "green", + "sparse-matrix": "magenta", + "dense-matrix": "blue", + "array": "blue", + "dict": "yellow", + "image": "cyan", + "categorical": "green", + "scalar": "white", + "unknown": "red", + } + + color = type_colors.get(info["type"], "white") + return f"[{color}]<{info['type']}>[/]" def axis_len(file: h5py.File, axis: str) -> Optional[int]: From 7c736034b7dc58cbd3c2740ccabd935bea379c9b Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 19:52:10 +0000 Subject: [PATCH 03/62] Add export command to CLI for exporting HDF5 objects in various formats --- src/h5ad/cli.py | 81 ++++++- src/h5ad/commands/__init__.py | 1 + src/h5ad/commands/export.py | 439 ++++++++++++++++++++++++++++++++++ src/h5ad/info.py | 30 +-- 4 files changed, 534 insertions(+), 17 deletions(-) create mode 100644 src/h5ad/commands/export.py diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 48ecd53..a0fd811 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -9,15 +9,18 @@ import h5py import numpy as np +from h5ad.commands import ( + show_info, + export_table, + subset_h5ad, + export_object, +) app = typer.Typer( help="Streaming CLI for huge .h5ad files (info, table, subset, export)." 
) console = Console(stderr=True) -export_app = typer.Typer(help="Export objects from an .h5ad file to common formats.") -app.add_typer(export_app, name="export") - @app.command() def info( @@ -158,5 +161,77 @@ def subset( raise typer.Exit(code=1) +@app.command("export") +def export_cmd( + file: Path = typer.Argument( + ..., help="Path to the .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument( + ..., help="Object path to export (e.g., 'obs', 'X', 'obsm/X_pca', 'uns')" + ), + out: Path = typer.Argument( + ..., + help="Output file path. Extension determines format: .csv, .npy, .mtx, .json, .png/.jpg/.tiff", + ), + columns: Optional[str] = typer.Option( + None, + "--columns", + "-c", + help="Comma separated column names (for dataframe/CSV export only)", + ), + chunk_rows: int = typer.Option( + 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" + ), + head: Optional[int] = typer.Option( + None, "--head", "-n", help="Output only the first n rows (for CSV export)" + ), + max_elements: int = typer.Option( + 1_000_000, + "--max-elements", + help="Maximum array elements for JSON export", + ), + include_attrs: bool = typer.Option( + False, "--include-attrs", help="Include HDF5 attributes in JSON export" + ), +) -> None: + """ + Export an object from the h5ad file to a common format. + + The output format is auto-detected from the file extension: + - .csv : DataFrames (obs, var) + - .npy : Dense arrays/matrices (obsm/X_pca, varm/PCs, etc.) + - .mtx : Sparse matrices (X if sparse) + - .json : Dictionaries/scalars (uns, uns/colors, etc.) + - .png/.jpg/.tiff : Image-like arrays + + The object type is auto-detected and validated against the extension. 
+ + Examples: + h5ad export data.h5ad obs obs.csv + h5ad export data.h5ad obsm/X_pca pca.npy + h5ad export data.h5ad X matrix.mtx + h5ad export data.h5ad uns metadata.json + """ + col_list: Optional[List[str]] = None + if columns: + col_list = [col.strip() for col in columns.split(",") if col.strip()] + + try: + export_object( + file=file, + obj=obj, + out=out, + columns=col_list, + chunk_rows=chunk_rows, + head=head, + max_elements=max_elements, + include_attrs=include_attrs, + console=console, + ) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + def main(argv: Optional[Sequence[str]] = None) -> None: app(standalone_mode=True) diff --git a/src/h5ad/commands/__init__.py b/src/h5ad/commands/__init__.py index e681fea..4c5fa9a 100644 --- a/src/h5ad/commands/__init__.py +++ b/src/h5ad/commands/__init__.py @@ -1,3 +1,4 @@ from h5ad.commands.info import show_info from h5ad.commands.table import export_table from h5ad.commands.subset import subset_h5ad +from h5ad.commands.export import export_object diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py new file mode 100644 index 0000000..8d237c9 --- /dev/null +++ b/src/h5ad/commands/export.py @@ -0,0 +1,439 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Dict, List, Optional, Union, cast + +import h5py +import numpy as np +from rich.console import Console + +from h5ad.commands.table import export_table +from h5ad.read import decode_str_array +from h5ad.info import get_entry_type + + +H5Obj = Union[h5py.Group, h5py.Dataset] + +# Map object types to valid output extensions +TYPE_EXTENSIONS = { + "dataframe": {".csv"}, + "sparse-matrix": {".mtx"}, + "dense-matrix": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}, + "array": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}, + "dict": {".json"}, + "scalar": {".json"}, + "categorical": {".csv"}, +} + +# Image extensions for validation 
+IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tif", ".tiff"} + +# Known exportable types +EXPORTABLE_TYPES = set(TYPE_EXTENSIONS.keys()) + + +def _norm_path(p: str) -> str: + p = p.strip() + if not p: + raise ValueError("Object path must be non-empty.") + return p.lstrip("/") + + +def _get_encoding_type(group: h5py.Group) -> str: + enc = group.attrs.get("encoding-type", "") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + return str(enc) + + +def _resolve(file: h5py.File, obj: str) -> H5Obj: + obj = _norm_path(obj) + if obj not in file: + raise KeyError(f"'{obj}' not found in the file.") + return cast(H5Obj, file[obj]) + + +def _check_json_exportable(h5obj: H5Obj, max_elements: int, path: str = "") -> None: + """ + Recursively check if a group/dataset can be exported to JSON. + Raises ValueError if it contains non-exportable structures. + """ + if isinstance(h5obj, h5py.Dataset): + if h5obj.shape == (): + return # scalar is fine + n = int(np.prod(h5obj.shape)) if h5obj.shape else 0 + if n > max_elements: + raise ValueError( + f"Cannot export to JSON: '{path or h5obj.name}' has {n} elements " + f"(max {max_elements}). Use --max-elements to increase limit." + ) + return + + # It's a Group - check encoding + enc = _get_encoding_type(h5obj) + if enc in ("csr_matrix", "csc_matrix"): + raise ValueError( + f"Cannot export to JSON: '{path or h5obj.name}' is a sparse matrix. " + f"Export it as .mtx instead." 
+ ) + + # Check children recursively + for key in h5obj.keys(): + child = h5obj[key] + child_path = f"{path}/{key}" if path else key + if isinstance(child, (h5py.Group, h5py.Dataset)): + _check_json_exportable( + cast(H5Obj, child), max_elements=max_elements, path=child_path + ) + + +def export_object( + file: Path, + obj: str, + out: Path, + columns: Optional[List[str]], + chunk_rows: int, + head: Optional[int], + max_elements: int, + include_attrs: bool, + console: Console, +) -> None: + """ + Export an HDF5 object to an appropriate format based on its type. + + Auto-detects the object type and validates the output file extension. + """ + obj = _norm_path(obj) + out_ext = out.suffix.lower() + + with h5py.File(file, "r") as f: + h5obj = _resolve(f, obj) + info = get_entry_type(h5obj) + obj_type = info["type"] + + # Check if type is exportable + if obj_type not in EXPORTABLE_TYPES: + raise ValueError( + f"Cannot export object of type '{obj_type}'. " + f"Exportable types: {', '.join(sorted(EXPORTABLE_TYPES))}." + ) + + # Check if extension matches the type + valid_exts = TYPE_EXTENSIONS.get(obj_type, set()) + if out_ext not in valid_exts: + ext_list = ", ".join(sorted(valid_exts)) + raise ValueError( + f"Output extension '{out_ext}' does not match object type '{obj_type}'. " + f"Expected: {ext_list}." + ) + + # Dispatch to appropriate export function + if obj_type == "dataframe": + # For dataframe, obj must be obs or var + if obj not in ("obs", "var"): + raise ValueError( + f"CSV export for dataframes currently supports only 'obs' or 'var', " + f"not '{obj}'." + ) + export_table( + file=file, + axis=obj, + columns=columns, + out=out, + chunk_rows=chunk_rows, + head=head, + console=console, + ) + + elif obj_type == "categorical": + # Categorical is also exported via table if it's a column in obs/var + raise ValueError( + f"Categorical objects should be exported as part of 'obs' or 'var' table. 
" + f"Use: h5ad export obs " + ) + + elif obj_type in ("dense-matrix", "array"): + if out_ext in IMAGE_EXTENSIONS: + # User wants image output - validate dimensions + _export_image(file=file, obj=obj, out=out, console=console) + else: + _export_npy( + file=file, obj=obj, out=out, chunk_rows=chunk_rows, console=console + ) + + elif obj_type == "sparse-matrix": + _export_mtx(file=file, obj=obj, out=out, console=console) + + elif obj_type in ("dict", "scalar"): + _export_json( + file=file, + obj=obj, + out=out, + max_elements=max_elements, + include_attrs=include_attrs, + console=console, + ) + + +def _export_npy( + file: Path, + obj: str, + out: Path, + chunk_rows: int, + console: Console, +) -> None: + """Export a dense HDF5 dataset to NumPy .npy without loading it all at once.""" + with h5py.File(file, "r") as f: + h5obj = _resolve(f, obj) + if isinstance(h5obj, h5py.Group): + raise ValueError("Target is a group; cannot export as .npy.") + + ds = h5obj + out.parent.mkdir(parents=True, exist_ok=True) + mm = np.lib.format.open_memmap(out, mode="w+", dtype=ds.dtype, shape=ds.shape) + try: + if ds.shape == (): + mm[...] = ds[()] + console.print(f"[green]Wrote[/] {out}") + return + + if ds.ndim == 1: + n = int(ds.shape[0]) + step = max(1, int(chunk_rows)) + with console.status( + f"[magenta]Exporting {obj} to {out}...[/]" + ) as status: + for start in range(0, n, step): + end = min(start + step, n) + status.update( + f"[magenta]Exporting {obj}: {start}-{end} of {n}...[/]" + ) + mm[start:end] = ds[start:end] + console.print(f"[green]Wrote[/] {out}") + return + + n0 = int(ds.shape[0]) + step0 = max(1, int(chunk_rows)) + with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: + for start in range(0, n0, step0): + end = min(start + step0, n0) + status.update( + f"[magenta]Exporting {obj}: {start}-{end} of {n0}...[/]" + ) + mm[start:end, ...] = ds[start:end, ...] 
+ console.print(f"[green]Wrote[/] {out}") + finally: + del mm + + +def _export_mtx(file: Path, obj: str, out: Path, console: Console) -> None: + """Export a CSR/CSC matrix group (AnnData encoding) to Matrix Market (.mtx).""" + with h5py.File(file, "r") as f: + h5obj = _resolve(f, obj) + if not isinstance(h5obj, h5py.Group): + raise ValueError( + "MTX export requires a CSR/CSC matrix group (not a dataset)." + ) + + enc = _get_encoding_type(h5obj) + if enc not in ("csr_matrix", "csc_matrix"): + raise ValueError( + f"Target group encoding-type is {enc!r}; expected 'csr_matrix' or 'csc_matrix'." + ) + + data = h5obj.get("data") + indices = h5obj.get("indices") + indptr = h5obj.get("indptr") + if ( + not isinstance(data, h5py.Dataset) + or not isinstance(indices, h5py.Dataset) + or not isinstance(indptr, h5py.Dataset) + ): + raise RuntimeError( + "Sparse matrix group must contain datasets: data, indices, indptr" + ) + + shape = h5obj.attrs.get("shape", None) + if shape is None: + raise RuntimeError( + "Sparse matrix group is missing required 'shape' attribute." 
+ ) + n_rows, n_cols = (int(shape[0]), int(shape[1])) + + field = "real" if np.issubdtype(data.dtype, np.floating) else "integer" + + out.parent.mkdir(parents=True, exist_ok=True) + + indptr_arr = np.asarray(indptr[...], dtype=np.int64) + nnz_ptr = int(indptr_arr[-1]) if indptr_arr.size else 0 + nnz_data = int(data.shape[0]) + nnz_idx = int(indices.shape[0]) + nnz = min(nnz_ptr, nnz_data, nnz_idx) + + with open(out, "w", encoding="utf-8", newline="\n") as fh: + fh.write(f"%%MatrixMarket matrix coordinate {field} general\n") + fh.write("% generated by h5ad-cli\n") + fh.write(f"{n_rows} {n_cols} {nnz}\n") + + major = n_rows if enc == "csr_matrix" else n_cols + with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: + for major_i in range(major): + start = min(int(indptr_arr[major_i]), nnz) + end = min(int(indptr_arr[major_i + 1]), nnz) + if end <= start: + continue + status.update( + f"[magenta]Exporting {obj}: block {major_i+1}/{major}...[/]" + ) + idx = np.asarray(indices[start:end], dtype=np.int64) + vals = np.asarray(data[start:end]) + m = min(len(idx), len(vals)) + if m == 0: + continue + idx = idx[:m] + vals = vals[:m] + for k in range(m): + if enc == "csr_matrix": + r = major_i + 1 + c = int(idx[k]) + 1 + else: + r = int(idx[k]) + 1 + c = major_i + 1 + v = vals[k] + if isinstance(v, np.generic): + v = v.item() + fh.write(f"{r} {c} {v}\n") + console.print(f"[green]Wrote[/] {out}") + + +def _export_json( + file: Path, + obj: str, + out: Path, + max_elements: int, + include_attrs: bool, + console: Console, +) -> None: + """Export an HDF5 group/dataset to JSON (best-effort, with size limits).""" + with h5py.File(file, "r") as f: + h5obj = _resolve(f, obj) + + # Check if exportable before attempting + _check_json_exportable(h5obj, max_elements=max_elements) + + payload = _to_jsonable( + h5obj, max_elements=max_elements, include_attrs=include_attrs + ) + out.parent.mkdir(parents=True, exist_ok=True) + with open(out, "w", encoding="utf-8") as fh: + 
json.dump(payload, fh, indent=2, ensure_ascii=False, sort_keys=True) + console.print(f"[green]Wrote[/] {out}") + + +def _attrs_to_jsonable( + attrs: h5py.AttributeManager, max_elements: int +) -> Dict[str, Any]: + out: Dict[str, Any] = {} + for k in attrs.keys(): + v = attrs.get(k) + out[str(k)] = _pyify(v, max_elements=max_elements) + return out + + +def _pyify(value: Any, max_elements: int) -> Any: + if isinstance(value, bytes): + try: + return value.decode("utf-8") + except Exception: + return value.decode("utf-8", errors="replace") + if isinstance(value, np.generic): + return value.item() + if isinstance(value, np.ndarray): + if value.size > max_elements: + raise ValueError( + f"Refusing to convert array of size {value.size} (> {max_elements}) to JSON." + ) + if np.issubdtype(value.dtype, np.bytes_) or value.dtype.kind == "O": + value = decode_str_array(value) + return value.tolist() + return value + + +def _dataset_to_jsonable(ds: h5py.Dataset, max_elements: int) -> Any: + if ds.shape == (): + v = ds[()] + return _pyify(v, max_elements=max_elements) + n = int(np.prod(ds.shape)) if ds.shape else 0 + if n > max_elements: + raise ValueError( + f"Refusing to convert dataset {ds.name!r} with {n} elements (> {max_elements}) to JSON." 
+ ) + arr = np.asarray(ds[...]) + return _pyify(arr, max_elements=max_elements) + + +def _to_jsonable(h5obj: H5Obj, max_elements: int, include_attrs: bool) -> Any: + if isinstance(h5obj, h5py.Dataset): + return _dataset_to_jsonable(h5obj, max_elements=max_elements) + + # Group + d: Dict[str, Any] = {} + if include_attrs and len(h5obj.attrs): + d["__attrs__"] = _attrs_to_jsonable(h5obj.attrs, max_elements=max_elements) + + for key in h5obj.keys(): + child = h5obj[key] + if isinstance(child, (h5py.Group, h5py.Dataset)): + d[str(key)] = _to_jsonable( + cast(H5Obj, child), + max_elements=max_elements, + include_attrs=include_attrs, + ) + return d + + +def _export_image(file: Path, obj: str, out: Path, console: Console) -> None: + """Export an image-like dataset (H,W) or (H,W,C) to PNG/JPG/TIFF.""" + try: + from PIL import Image # type: ignore + except Exception as e: # pragma: no cover + raise RuntimeError( + "Pillow is required for image export. Install with: pip install h5ad[images]" + ) from e + + with h5py.File(file, "r") as f: + h5obj = _resolve(f, obj) + if not isinstance(h5obj, h5py.Dataset): + raise ValueError("Image export requires a dataset.") + arr = np.asarray(h5obj[...]) + + if arr.ndim not in (2, 3): + raise ValueError(f"Expected 2D or 3D image array; got shape {arr.shape}.") + if arr.ndim == 3 and arr.shape[2] not in (1, 3, 4): + raise ValueError( + f"Expected last dimension (channels) to be 1, 3, or 4; got {arr.shape}." 
+ ) + + # Convert to uint8 for common image formats + if np.issubdtype(arr.dtype, np.floating): + amax = float(np.nanmax(arr)) if arr.size else 0.0 + if amax <= 1.0: + arr = np.clip(arr, 0.0, 1.0) * 255.0 + else: + arr = np.clip(arr, 0.0, 255.0) + arr = arr.astype(np.uint8) + elif np.issubdtype(arr.dtype, np.integer): + arr = np.clip(arr, 0, 255).astype(np.uint8) + elif arr.dtype == np.bool_: + arr = arr.astype(np.uint8) * 255 + else: + raise ValueError(f"Unsupported image dtype: {arr.dtype}") + + if arr.ndim == 3 and arr.shape[2] == 1: + arr = arr[:, :, 0] + + img = Image.fromarray(arr) + out.parent.mkdir(parents=True, exist_ok=True) + img.save(out) + console.print(f"[green]Wrote[/] {out}") diff --git a/src/h5ad/info.py b/src/h5ad/info.py index 56cd90a..94022a0 100644 --- a/src/h5ad/info.py +++ b/src/h5ad/info.py @@ -41,30 +41,33 @@ def get_entry_type(obj: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: result["details"] = f"Scalar value ({obj.dtype})" return result - # Check if it looks like an image (2D or 3D with small last dim) - if obj.ndim in (2, 3): - if obj.ndim == 2 or (obj.ndim == 3 and obj.shape[2] in (1, 3, 4)): - # Could be an image if dtype is numeric and reasonable size - if np.issubdtype(obj.dtype, np.number) or obj.dtype == np.bool_: - if obj.shape[0] <= 10000 and obj.shape[1] <= 10000: - result["type"] = "image" - result["export_as"] = "image" - result["details"] = ( - f"Image-like array {obj.shape} ({obj.dtype})" - ) - return result - # 1D or 2D numeric array -> dense matrix / array if obj.ndim == 1: result["type"] = "array" result["export_as"] = "npy" result["details"] = f"1D array [{obj.shape[0]}] ({obj.dtype})" elif obj.ndim == 2: + # Check if it looks like an image (2D with reasonable image dimensions) + # Minimum 16x16, maximum 10000x10000, numeric dtype + if ( + obj.shape[0] >= 16 + and obj.shape[1] >= 16 + and obj.shape[0] <= 10000 + and obj.shape[1] <= 10000 + and (np.issubdtype(obj.dtype, np.number) or obj.dtype == np.bool_) + ): + 
# Could be an image, but default to dense-matrix + # Image export can still be used if user provides image extension + pass result["type"] = "dense-matrix" result["export_as"] = "npy" result["details"] = ( f"Dense matrix {obj.shape[0]}×{obj.shape[1]} ({obj.dtype})" ) + elif obj.ndim == 3: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"3D array {obj.shape} ({obj.dtype})" else: result["type"] = "array" result["export_as"] = "npy" @@ -129,7 +132,6 @@ def format_type_info(info: Dict[str, Any]) -> str: "dense-matrix": "blue", "array": "blue", "dict": "yellow", - "image": "cyan", "categorical": "green", "scalar": "white", "unknown": "red", From 48e7efc694fbea77b69c1075b96aab56961ab866 Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 19:52:18 +0000 Subject: [PATCH 04/62] Add tests for info command and entry type detection --- tests/test_cli.py | 51 ++++++++++++ tests/test_export.py | 172 ++++++++++++++++++++++++++++++++++++++++ tests/test_info_read.py | 81 ++++++++++++++++++- 3 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 tests/test_export.py diff --git a/tests/test_cli.py b/tests/test_cli.py index 1659104..07031d7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -33,6 +33,57 @@ def test_info_function_direct(self, sample_h5ad_file): # Should not raise exception show_info(sample_h5ad_file, console) + def test_info_types_flag(self, sample_h5ad_file): + """Test info command with --types flag.""" + result = runner.invoke(app, ["info", "--types", str(sample_h5ad_file)]) + assert result.exit_code == 0 + # Should show type annotations in angle brackets + # Output may go to stdout or stderr depending on console config + output = result.stdout + (result.stderr or "") + assert "<" in output + assert ">" in output + + def test_info_types_short_flag(self, sample_h5ad_file): + """Test info command with -t short flag.""" + result = runner.invoke(app, ["info", "-t", str(sample_h5ad_file)]) + assert 
result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "<" in output + + def test_info_object_flag(self, sample_h5ad_file): + """Test info command with --object flag.""" + result = runner.invoke(app, ["info", "--object", "X", str(sample_h5ad_file)]) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "Path:" in output + assert "Type:" in output + + def test_info_object_short_flag(self, sample_h5ad_file): + """Test info command with -o short flag.""" + result = runner.invoke(app, ["info", "-o", "obs", str(sample_h5ad_file)]) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "Path:" in output + assert "dataframe" in output + + def test_info_object_nested_path(self, sample_h5ad_file): + """Test info command with nested object path.""" + result = runner.invoke( + app, ["info", "-o", "uns/description", str(sample_h5ad_file)] + ) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "Path:" in output + + def test_info_object_not_found(self, sample_h5ad_file): + """Test info command with non-existent object path.""" + result = runner.invoke( + app, ["info", "-o", "nonexistent", str(sample_h5ad_file)] + ) + assert result.exit_code == 0 # Doesn't exit with error, just shows message + output = result.stdout + (result.stderr or "") + assert "not found" in output + class TestTableCommand: """Tests for table command.""" diff --git a/tests/test_export.py b/tests/test_export.py new file mode 100644 index 0000000..1c2d88f --- /dev/null +++ b/tests/test_export.py @@ -0,0 +1,172 @@ +"""Tests for the export command.""" + +import json +from pathlib import Path + +import h5py +import numpy as np +from typer.testing import CliRunner + +from h5ad.cli import app + + +runner = CliRunner() + + +def _read_mtx(path: Path) -> np.ndarray: + with open(path, "r", encoding="utf-8") as fh: + header = fh.readline() + assert header.startswith("%%MatrixMarket") + line 
= fh.readline() + while line.startswith("%"): + line = fh.readline() + n_rows, n_cols, nnz = map(int, line.split()) + mat = np.zeros((n_rows, n_cols), dtype=np.float32) + for _ in range(nnz): + r, c, v = fh.readline().split() + mat[int(r) - 1, int(c) - 1] = float(v) + return mat + + +class TestExportNpy: + def test_export_npy_dense_X(self, sample_h5ad_file, temp_dir): + out = temp_dir / "X.npy" + result = runner.invoke(app, ["export", str(sample_h5ad_file), "X", str(out)]) + assert result.exit_code == 0 + assert out.exists() + + got = np.load(out) + with h5py.File(sample_h5ad_file, "r") as f: + expected = np.asarray(f["X"][...]) + np.testing.assert_allclose(got, expected) + + +class TestExportMtx: + def test_export_mtx_csr(self, sample_sparse_csr_h5ad, temp_dir): + out = temp_dir / "X_csr.mtx" + result = runner.invoke( + app, ["export", str(sample_sparse_csr_h5ad), "X", str(out)] + ) + assert result.exit_code == 0 + assert out.exists() + + got = _read_mtx(out) + expected = np.array( + [ + [1.0, 0.0, 2.0], + [0.0, 0.0, 0.0], + [3.0, 4.0, 0.0], + [0.0, 5.0, 6.0], + ], + dtype=np.float32, + ) + np.testing.assert_allclose(got, expected) + + def test_export_mtx_csc(self, temp_dir): + # Build a small, consistent CSC matrix group + file_path = temp_dir / "test_csc.h5ad" + with h5py.File(file_path, "w") as f: + X = f.create_group("X") + X.attrs["encoding-type"] = "csc_matrix" + X.attrs["shape"] = (3, 4) + data = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], dtype=np.float32) + indices = np.array([0, 2, 0, 1, 1, 2], dtype=np.int32) + indptr = np.array([0, 2, 2, 4, 6], dtype=np.int32) + X.create_dataset("data", data=data) + X.create_dataset("indices", data=indices) + X.create_dataset("indptr", data=indptr) + + out = temp_dir / "X_csc.mtx" + result = runner.invoke(app, ["export", str(file_path), "X", str(out)]) + assert result.exit_code == 0 + assert out.exists() + + got = _read_mtx(out) + expected = np.array( + [ + [1.0, 0.0, 3.0, 0.0], + [0.0, 0.0, 4.0, 5.0], + [2.0, 0.0, 0.0, 
6.0], + ], + dtype=np.float32, + ) + np.testing.assert_allclose(got, expected) + + +class TestExportJson: + def test_export_json_uns(self, sample_h5ad_file, temp_dir): + out = temp_dir / "uns.json" + result = runner.invoke(app, ["export", str(sample_h5ad_file), "uns", str(out)]) + assert result.exit_code == 0 + assert out.exists() + payload = json.loads(out.read_text(encoding="utf-8")) + assert "description" in payload + assert payload["description"] == ["Test dataset"] + + +class TestExportCsv: + def test_export_csv_obs(self, sample_h5ad_file, temp_dir): + out = temp_dir / "obs.csv" + result = runner.invoke(app, ["export", str(sample_h5ad_file), "obs", str(out)]) + assert result.exit_code == 0 + assert out.exists() + text = out.read_text(encoding="utf-8") + assert "obs_names" in text + + +class TestExportValidation: + def test_wrong_extension_for_type(self, sample_h5ad_file, temp_dir): + """Test that wrong extension is rejected.""" + out = temp_dir / "obs.npy" # obs is a dataframe, should be .csv + result = runner.invoke(app, ["export", str(sample_h5ad_file), "obs", str(out)]) + assert result.exit_code == 1 + assert "does not match" in result.output or "Expected" in result.output + + def test_sparse_matrix_wrong_extension(self, sample_sparse_csr_h5ad, temp_dir): + """Test that sparse matrix rejects .npy extension.""" + out = temp_dir / "X.npy" # sparse matrix should be .mtx + result = runner.invoke( + app, ["export", str(sample_sparse_csr_h5ad), "X", str(out)] + ) + assert result.exit_code == 1 + assert "does not match" in result.output or ".mtx" in result.output + + def test_dense_matrix_wrong_extension(self, sample_h5ad_file, temp_dir): + """Test that dense matrix rejects .csv extension.""" + out = temp_dir / "X.csv" # dense matrix should be .npy + result = runner.invoke(app, ["export", str(sample_h5ad_file), "X", str(out)]) + assert result.exit_code == 1 + assert "does not match" in result.output or ".npy" in result.output + + def 
test_json_wrong_extension(self, sample_h5ad_file, temp_dir): + """Test that dict rejects .npy extension.""" + out = temp_dir / "uns.npy" # uns is dict, should be .json + result = runner.invoke(app, ["export", str(sample_h5ad_file), "uns", str(out)]) + assert result.exit_code == 1 + assert "does not match" in result.output or ".json" in result.output + + def test_nonexistent_object(self, sample_h5ad_file, temp_dir): + """Test that nonexistent object path is rejected.""" + out = temp_dir / "output.csv" + result = runner.invoke( + app, ["export", str(sample_h5ad_file), "nonexistent/path", str(out)] + ) + assert result.exit_code == 1 + assert "not found" in result.output + + def test_unknown_type_rejected(self, temp_dir): + """Test that unknown/complex types are rejected.""" + file_path = temp_dir / "test_unknown.h5ad" + with h5py.File(file_path, "w") as f: + g = f.create_group("obs") + g.create_dataset("obs_names", data=np.array([b"cell1"])) + g.attrs["_index"] = "obs_names" + # Create a group without known encoding + weird = f.create_group("weird_group") + weird.attrs["encoding-type"] = "some_unknown_encoding" + + out = temp_dir / "weird.json" + result = runner.invoke(app, ["export", str(file_path), "weird_group", str(out)]) + # Should succeed as it's detected as dict + # but if it had sparse inside, it would fail + assert result.exit_code == 0 diff --git a/tests/test_info_read.py b/tests/test_info_read.py index 07b9a13..8ad47b4 100644 --- a/tests/test_info_read.py +++ b/tests/test_info_read.py @@ -3,10 +3,89 @@ import pytest import h5py import numpy as np -from h5ad.info import axis_len, get_axis_group +from h5ad.info import axis_len, get_axis_group, get_entry_type, format_type_info from h5ad.read import decode_str_array, read_categorical_column, col_chunk_as_strings +class TestGetEntryType: + """Tests for get_entry_type function.""" + + def test_get_entry_type_dataframe(self, sample_h5ad_file): + """Test type detection for dataframe (obs/var).""" + with 
h5py.File(sample_h5ad_file, "r") as f: + info = get_entry_type(f["obs"]) + assert info["type"] == "dataframe" + assert info["export_as"] == "csv" + + def test_get_entry_type_dense_matrix(self, sample_h5ad_file): + """Test type detection for dense matrix.""" + with h5py.File(sample_h5ad_file, "r") as f: + info = get_entry_type(f["X"]) + assert info["type"] == "dense-matrix" + assert info["export_as"] == "npy" + assert info["shape"] == (5, 4) + + def test_get_entry_type_sparse_matrix(self, sample_sparse_csr_h5ad): + """Test type detection for sparse matrix.""" + with h5py.File(sample_sparse_csr_h5ad, "r") as f: + info = get_entry_type(f["X"]) + assert info["type"] == "sparse-matrix" + assert info["export_as"] == "mtx" + assert info["encoding"] == "csr_matrix" + + def test_get_entry_type_dict(self, sample_h5ad_file): + """Test type detection for dict/group.""" + with h5py.File(sample_h5ad_file, "r") as f: + info = get_entry_type(f["uns"]) + assert info["type"] == "dict" + assert info["export_as"] == "json" + + def test_get_entry_type_1d_array(self, temp_dir): + """Test type detection for 1D array.""" + file_path = temp_dir / "test.h5ad" + with h5py.File(file_path, "w") as f: + f.create_dataset("arr", data=np.array([1, 2, 3, 4, 5])) + with h5py.File(file_path, "r") as f: + info = get_entry_type(f["arr"]) + assert info["type"] == "array" + assert info["export_as"] == "npy" + + def test_get_entry_type_scalar(self, temp_dir): + """Test type detection for scalar.""" + file_path = temp_dir / "test.h5ad" + with h5py.File(file_path, "w") as f: + f.create_dataset("scalar", data=42) + with h5py.File(file_path, "r") as f: + info = get_entry_type(f["scalar"]) + assert info["type"] == "scalar" + assert info["export_as"] == "json" + + +class TestFormatTypeInfo: + """Tests for format_type_info function.""" + + def test_format_type_info_dataframe(self): + """Test formatting dataframe type info.""" + info = {"type": "dataframe", "export_as": "csv"} + result = format_type_info(info) + 
assert "" in result + assert "green" in result + + def test_format_type_info_sparse(self): + """Test formatting sparse matrix type info.""" + info = {"type": "sparse-matrix", "export_as": "mtx"} + result = format_type_info(info) + assert "" in result + assert "magenta" in result + + def test_format_type_info_unknown(self): + """Test formatting unknown type info.""" + info = {"type": "unknown", "export_as": None} + result = format_type_info(info) + assert "" in result + assert "red" in result + + class TestAxisLen: """Tests for axis_len function.""" From cf8bb0840f2c79e4ef247b1c041d5cbf9658f817 Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 19:52:24 +0000 Subject: [PATCH 05/62] Add optional dependency for images support --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index c18faa4..3df76b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,9 @@ dev = [ "pytest>=8.3.4", "pytest-cov>=6.0.0", ] +images = [ + "pillow>=10.0.0", +] [build-system] requires = ["uv_build>=0.8.0,<0.9.0"] From ec482264cfad7441e2df885150347c2c46d00e06 Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 20:10:30 +0000 Subject: [PATCH 06/62] Add import command for importing data into h5ad files --- src/h5ad/cli.py | 82 +++++ src/h5ad/commands/__init__.py | 1 + src/h5ad/commands/import_data.py | 511 +++++++++++++++++++++++++++++++ tests/test_import.py | 444 +++++++++++++++++++++++++++ 4 files changed, 1038 insertions(+) create mode 100644 src/h5ad/commands/import_data.py create mode 100644 tests/test_import.py diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index a0fd811..9ea5d26 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -14,6 +14,7 @@ export_table, subset_h5ad, export_object, + import_object, ) app = typer.Typer( @@ -233,5 +234,86 @@ def export_cmd( raise typer.Exit(code=1) +@app.command("import") +def import_cmd( + file: Path = typer.Argument( + ..., help="Path to the source .h5ad file", 
exists=True, readable=True + ), + obj: str = typer.Argument( + ..., + help="Object path to create/replace (e.g., 'obs', 'X', 'obsm/X_pca', 'uns')", + ), + input_file: Path = typer.Argument( + ..., + help="Input data file. Extension determines format: .csv, .npy, .mtx, .json", + exists=True, + readable=True, + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output .h5ad file path. Required unless --inplace is specified.", + writable=True, + ), + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify the source file directly instead of creating a new file.", + ), + index_column: Optional[str] = typer.Option( + None, + "--index-column", + "-i", + help="Column to use as index when importing CSV into obs/var. Defaults to first column.", + ), +) -> None: + """ + Import data from a file into the h5ad file. + + Creates or replaces an object at the specified path. By default, creates + a new output file. Use --inplace to modify the source file directly. + + The input format is auto-detected from the file extension: + - .csv : DataFrames (obs, var) + - .npy : Dense arrays/matrices (X, obsm/X_pca, varm/PCs, etc.) + - .mtx : Sparse matrices (X, layers/*) + - .json : Dictionaries (uns, uns/metadata, etc.) + + Dimensions are validated against existing obs/var: + - obs: row count must match n_obs + - var: row count must match n_var + - X, layers/*: must match (n_obs, n_var) + - obsm/*, obsp/*: first dimension must match n_obs + - varm/*, varp/*: first dimension must match n_var + + Examples: + h5ad import data.h5ad obs cells.csv -o output.h5ad -i cell_id + h5ad import data.h5ad obsm/X_pca pca.npy -o output.h5ad + h5ad import data.h5ad X matrix.mtx --inplace + h5ad import data.h5ad uns/metadata config.json -o new.h5ad + """ + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. 
" + "Use --output/-o to specify output file, or --inplace to modify source.", + ) + raise typer.Exit(code=1) + + try: + import_object( + file=file, + obj=obj, + input_file=input_file, + output_file=output, + inplace=inplace, + index_column=index_column, + console=console, + ) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + def main(argv: Optional[Sequence[str]] = None) -> None: app(standalone_mode=True) diff --git a/src/h5ad/commands/__init__.py b/src/h5ad/commands/__init__.py index 4c5fa9a..b4d6016 100644 --- a/src/h5ad/commands/__init__.py +++ b/src/h5ad/commands/__init__.py @@ -2,3 +2,4 @@ from h5ad.commands.table import export_table from h5ad.commands.subset import subset_h5ad from h5ad.commands.export import export_object +from h5ad.commands.import_data import import_object diff --git a/src/h5ad/commands/import_data.py b/src/h5ad/commands/import_data.py new file mode 100644 index 0000000..c208a9d --- /dev/null +++ b/src/h5ad/commands/import_data.py @@ -0,0 +1,511 @@ +"""Import command for creating/replacing objects in h5ad files.""" + +from __future__ import annotations + +import csv +import json +import shutil +from pathlib import Path +from typing import Any, List, Optional, Tuple, cast + +import h5py +import numpy as np +from rich.console import Console + + +# Map file extensions to expected input formats +EXTENSION_FORMAT = { + ".csv": "csv", + ".npy": "npy", + ".mtx": "mtx", + ".json": "json", +} + +# Define which object paths expect which dimensions +# obs-axis: first dimension must match n_obs +# var-axis: first dimension must match n_var +# matrix: must match (n_obs, n_var) +OBS_AXIS_PREFIXES = ("obs", "obsm/", "obsp/") +VAR_AXIS_PREFIXES = ("var", "varm/", "varp/") +MATRIX_PREFIXES = ("X", "layers/") + + +def _norm_path(p: str) -> str: + p = p.strip() + if not p: + raise ValueError("Object path must be non-empty.") + return p.lstrip("/") + + +def _get_axis_length(file: h5py.File, axis: str) -> 
Optional[int]: + """Get the length of obs or var axis.""" + if axis not in file: + return None + group = file[axis] + if not isinstance(group, h5py.Group): + return None + index_name = group.attrs.get("_index", None) + if index_name is None: + index_name = "obs_names" if axis == "obs" else "var_names" + if isinstance(index_name, bytes): + index_name = index_name.decode("utf-8") + if index_name not in group: + return None + dataset = group[index_name] + if isinstance(dataset, h5py.Dataset) and dataset.shape: + return int(dataset.shape[0]) + return None + + +def _validate_dimensions( + file: h5py.File, + obj_path: str, + data_shape: tuple, + console: Console, +) -> None: + """Validate that data dimensions match the target path requirements.""" + n_obs = _get_axis_length(file, "obs") + n_var = _get_axis_length(file, "var") + + # Check obs/var replacement (dataframe) + if obj_path == "obs": + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"Row count mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + return + if obj_path == "var": + if n_var is not None and data_shape[0] != n_var: + raise ValueError( + f"Row count mismatch: input has {data_shape[0]} rows, " + f"but var has {n_var} features." + ) + return + + # Check matrix (X, layers/*) + for prefix in MATRIX_PREFIXES: + if ( + obj_path == prefix + or obj_path.startswith(prefix + "/") + or obj_path.startswith(prefix) + ): + if obj_path == "X" or obj_path.startswith("layers/"): + if len(data_shape) < 2: + raise ValueError( + f"Matrix data requires 2D shape, got {len(data_shape)}D." + ) + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + if n_var is not None and data_shape[1] != n_var: + raise ValueError( + f"Second dimension mismatch: input has {data_shape[1]} columns, " + f"but var has {n_var} features." 
+ ) + return + + # Check obs-axis matrices (obsm/*, obsp/*) + for prefix in OBS_AXIS_PREFIXES: + if obj_path.startswith(prefix) and obj_path != "obs": + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + # obsp should be square n_obs x n_obs + if obj_path.startswith("obsp/") and len(data_shape) >= 2: + if data_shape[1] != n_obs: + raise ValueError( + f"obsp matrix must be square (n_obs × n_obs): " + f"got {data_shape[0]}×{data_shape[1]}, expected {n_obs}×{n_obs}." + ) + return + + # Check var-axis matrices (varm/*, varp/*) + for prefix in VAR_AXIS_PREFIXES: + if obj_path.startswith(prefix) and obj_path != "var": + if n_var is not None and data_shape[0] != n_var: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but var has {n_var} features." + ) + # varp should be square n_var x n_var + if obj_path.startswith("varp/") and len(data_shape) >= 2: + if data_shape[1] != n_var: + raise ValueError( + f"varp matrix must be square (n_var × n_var): " + f"got {data_shape[0]}×{data_shape[1]}, expected {n_var}×{n_var}." + ) + return + + # For other paths (like uns/*), no dimension validation + console.print(f"[dim]Note: No dimension validation for path '{obj_path}'[/]") + + +def _read_csv( + input_file: Path, + index_column: Optional[str], +) -> Tuple[List[dict], List[str], List[str], str]: + """ + Read CSV file and return rows, column names, index values, and index column name. + + Returns: + (rows, column_names, index_values, index_column_name) + """ + with open(input_file, "r", encoding="utf-8", newline="") as f: + reader = csv.DictReader(f) + if reader.fieldnames is None: + raise ValueError("CSV file has no header.") + fieldnames = list(reader.fieldnames) + + # Determine index column + if index_column: + if index_column not in fieldnames: + raise ValueError( + f"Index column '{index_column}' not found in CSV. 
" + f"Available columns: {', '.join(fieldnames)}" + ) + idx_col = index_column + else: + idx_col = fieldnames[0] + + # Read all rows + rows = list(reader) + + index_values = [row[idx_col] for row in rows] + data_columns = [c for c in fieldnames if c != idx_col] + + return rows, data_columns, index_values, idx_col + + +def _read_mtx( + input_file: Path, +) -> Tuple[List[Tuple[int, int, float]], Tuple[int, int], int]: + """ + Read Matrix Market file and return sparse matrix data. + + Returns: + (data, indices, indptr, shape, nnz, is_csr) + """ + with open(input_file, "r", encoding="utf-8") as fh: + header = fh.readline() + if not header.startswith("%%MatrixMarket"): + raise ValueError("Invalid MTX file: missing MatrixMarket header.") + + # Parse header for field type + parts = header.lower().split() + field = "real" + for p in parts: + if p in ("real", "integer", "complex", "pattern"): + field = p + break + + # Skip comments + line = fh.readline() + while line.startswith("%"): + line = fh.readline() + + # Read dimensions + dims = line.split() + n_rows, n_cols, nnz = int(dims[0]), int(dims[1]), int(dims[2]) + + # Read entries + entries = [] + for _ in range(nnz): + parts = fh.readline().split() + r, c = int(parts[0]) - 1, int(parts[1]) - 1 + if field == "pattern": + v = 1.0 + else: + v = float(parts[2]) + entries.append((r, c, v)) + + return entries, (n_rows, n_cols), nnz + + +def _create_csr_from_entries( + entries: List[Tuple[int, int, float]], shape: Tuple[int, int] +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """Convert coordinate entries to CSR format.""" + n_rows, _ = shape + # Sort by row, then column + entries.sort(key=lambda x: (x[0], x[1])) + + data = np.array([e[2] for e in entries], dtype=np.float32) + indices = np.array([e[1] for e in entries], dtype=np.int32) + + # Build indptr + indptr = np.zeros(n_rows + 1, dtype=np.int32) + for r, _, _ in entries: + indptr[r + 1] += 1 + indptr = np.cumsum(indptr) + + return data, indices, indptr + + +def 
import_object( + file: Path, + obj: str, + input_file: Path, + output_file: Optional[Path], + inplace: bool, + index_column: Optional[str], + console: Console, +) -> None: + """ + Import data from a file into an h5ad object. + + Args: + file: Path to the source h5ad file + obj: Object path to create/replace (e.g., 'obs', 'obsm/X_pca', 'X') + input_file: Input data file (.csv, .npy, .mtx, .json) + output_file: Path to output h5ad file (None if inplace) + inplace: If True, modify the source file directly + index_column: Column to use as index for obs/var CSV import + console: Console for output + """ + # Determine target file + if inplace: + target_file = file + else: + if output_file is None: + raise ValueError("Output file is required unless --inplace is specified.") + # Copy source to output first + shutil.copy2(file, output_file) + target_file = output_file + console.print(f"[dim]Copied {file} → {output_file}[/]") + + obj = _norm_path(obj) + ext = input_file.suffix.lower() + + if ext not in EXTENSION_FORMAT: + raise ValueError( + f"Unsupported input file extension '{ext}'. " + f"Supported: {', '.join(sorted(EXTENSION_FORMAT.keys()))}" + ) + + fmt = EXTENSION_FORMAT[ext] + + # Validate index_column is only used for obs/var CSV + if index_column and (fmt != "csv" or obj not in ("obs", "var")): + raise ValueError( + "--index-column is only valid for CSV import into 'obs' or 'var'." 
+ ) + + if fmt == "csv": + _import_csv(target_file, obj, input_file, index_column, console) + elif fmt == "npy": + _import_npy(target_file, obj, input_file, console) + elif fmt == "mtx": + _import_mtx(target_file, obj, input_file, console) + elif fmt == "json": + _import_json(target_file, obj, input_file, console) + + +def _import_csv( + file: Path, + obj: str, + input_file: Path, + index_column: Optional[str], + console: Console, +) -> None: + """Import CSV data into obs or var.""" + if obj not in ("obs", "var"): + raise ValueError( + f"CSV import is only supported for 'obs' or 'var', not '{obj}'." + ) + + rows, data_columns, index_values, _ = _read_csv(input_file, index_column) + n_rows = len(rows) + + with h5py.File(file, "a") as f: + # Validate dimensions if the file already has obs/var + _validate_dimensions(f, obj, (n_rows,), console) + + # Delete existing group if present + if obj in f: + del f[obj] + + # Create new group + group = f.create_group(obj) + index_name = "obs_names" if obj == "obs" else "var_names" + group.attrs["_index"] = index_name + group.attrs["encoding-type"] = "dataframe" + group.attrs["encoding-version"] = "0.2.0" + group.attrs["column-order"] = np.array(data_columns, dtype="S") + + # Create index dataset + group.create_dataset( + index_name, + data=np.array(index_values, dtype="S"), + ) + + # Create column datasets + for col in data_columns: + values = [row[col] for row in rows] + # Try to infer type + try: + arr = np.array(values, dtype=np.float64) + group.create_dataset(col, data=arr) + except (ValueError, TypeError): + try: + arr = np.array(values, dtype=np.int64) + group.create_dataset(col, data=arr) + except (ValueError, TypeError): + # Fallback to string + arr = np.array(values, dtype="S") + ds = group.create_dataset(col, data=arr) + ds.attrs["encoding-type"] = "string-array" + ds.attrs["encoding-version"] = "0.2.0" + + console.print( + f"[green]Imported[/] {n_rows} rows × {len(data_columns)} columns into '{obj}'" + ) + + +def 
_import_npy( + file: Path, + obj: str, + input_file: Path, + console: Console, +) -> None: + """Import NPY data into a dataset.""" + arr = np.load(input_file) + + with h5py.File(file, "a") as f: + _validate_dimensions(f, obj, arr.shape, console) + + # Handle nested paths + parts = obj.split("/") + parent_path = "/".join(parts[:-1]) + name = parts[-1] + + # Ensure parent groups exist + if parent_path: + if parent_path not in f: + f.create_group(parent_path) + parent = cast(h5py.Group, f[parent_path]) + else: + parent = f + + # Delete existing if present + if name in parent: + del parent[name] + + # Create dataset + parent.create_dataset(name, data=arr) + + shape_str = "×".join(str(d) for d in arr.shape) + console.print(f"[green]Imported[/] {shape_str} array into '{obj}'") + + +def _import_mtx( + file: Path, + obj: str, + input_file: Path, + console: Console, +) -> None: + """Import MTX (Matrix Market) data as CSR sparse matrix.""" + entries, shape, nnz = _read_mtx(input_file) + data, indices, indptr = _create_csr_from_entries(entries, shape) + + with h5py.File(file, "a") as f: + _validate_dimensions(f, obj, shape, console) + + # Handle nested paths + parts = obj.split("/") + parent_path = "/".join(parts[:-1]) + name = parts[-1] + + if parent_path: + if parent_path not in f: + f.create_group(parent_path) + parent = cast(h5py.Group, f[parent_path]) + else: + parent = f + + # Delete existing if present + if name in parent: + del parent[name] + + # Create sparse matrix group + group = parent.create_group(name) + group.attrs["encoding-type"] = "csr_matrix" + group.attrs["encoding-version"] = "0.1.0" + group.attrs["shape"] = np.array(shape, dtype=np.int64) + + group.create_dataset("data", data=data) + group.create_dataset("indices", data=indices) + group.create_dataset("indptr", data=indptr) + + console.print( + f"[green]Imported[/] {shape[0]}×{shape[1]} sparse matrix ({nnz} non-zero) into '{obj}'" + ) + + +def _import_json( + file: Path, + obj: str, + input_file: Path, + 
console: Console, +) -> None: + """Import JSON data into uns or other dict-like groups.""" + with open(input_file, "r", encoding="utf-8") as fh: + payload = json.load(fh) + + with h5py.File(file, "a") as f: + # Handle nested paths + parts = obj.split("/") + parent_path = "/".join(parts[:-1]) + name = parts[-1] + + if parent_path: + if parent_path not in f: + f.create_group(parent_path) + parent = cast(h5py.Group, f[parent_path]) + else: + parent = f + + # Delete existing if present + if name in parent: + del parent[name] + + # Create from JSON + _write_json_to_h5(parent, name, payload) + + console.print(f"[green]Imported[/] JSON data into '{obj}'") + + +def _write_json_to_h5(parent: h5py.Group, name: str, value: Any) -> None: + """Recursively write JSON-like data to HDF5.""" + if isinstance(value, dict): + group = parent.create_group(name) + for k, v in value.items(): + _write_json_to_h5(group, k, v) + elif isinstance(value, list): + # Try to convert to array + try: + arr = np.array(value) + if arr.dtype.kind in ("U", "O"): + arr = np.array(value, dtype="S") + parent.create_dataset(name, data=arr) + except (ValueError, TypeError): + # Fallback: store as JSON string + parent.create_dataset(name, data=json.dumps(value).encode("utf-8")) + elif isinstance(value, str): + parent.create_dataset(name, data=np.array([value], dtype="S")) + elif isinstance(value, bool): + parent.create_dataset(name, data=np.array(value, dtype=bool)) + elif isinstance(value, int): + parent.create_dataset(name, data=np.array(value, dtype=np.int64)) + elif isinstance(value, float): + parent.create_dataset(name, data=np.array(value, dtype=np.float64)) + elif value is None: + # Store None as empty string attribute or special marker + ds = parent.create_dataset(name, data=np.array([], dtype="S")) + ds.attrs["_is_none"] = True + else: + raise ValueError(f"Cannot convert JSON value of type {type(value).__name__}") diff --git a/tests/test_import.py b/tests/test_import.py new file mode 100644 index 
"""Tests for the import command."""

import json
from pathlib import Path

import h5py
import numpy as np
from typer.testing import CliRunner

from h5ad.cli import app


runner = CliRunner()

# Shared CSV payloads: 5 obs rows / 4 var rows, sized to the sample fixture.
OBS_CSV = (
    "cell_id,score,label\n"
    "cell_1,1.5,A\n"
    "cell_2,2.5,B\n"
    "cell_3,3.5,A\n"
    "cell_4,4.5,C\n"
    "cell_5,5.5,B\n"
)
VAR_CSV = (
    "gene_id,mean,std\n"
    "gene_1,0.1,0.01\n"
    "gene_2,0.2,0.02\n"
    "gene_3,0.3,0.03\n"
    "gene_4,0.4,0.04\n"
)


def _invoke(*cli_args):
    """Run the h5ad CLI app with the given argument list."""
    return runner.invoke(app, list(cli_args))


class TestImportCsv:
    def test_import_csv_obs_inplace(self, sample_h5ad_file, temp_dir):
        """CSV import into obs with --inplace adds the new columns in place."""
        csv_path = temp_dir / "new_obs.csv"
        csv_path.write_text(OBS_CSV)

        result = _invoke(
            "import", str(sample_h5ad_file), "obs", str(csv_path),
            "--inplace", "-i", "cell_id",
        )
        assert result.exit_code == 0
        assert "5 rows" in result.output
        assert "2 columns" in result.output

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "obs" in f
            obs = f["obs"]
            assert "score" in obs
            assert "label" in obs

    def test_import_csv_obs_output(self, sample_h5ad_file, temp_dir):
        """CSV import with -o writes a new file and leaves the source intact."""
        csv_path = temp_dir / "new_obs.csv"
        csv_path.write_text(OBS_CSV)
        out_path = temp_dir / "output.h5ad"

        result = _invoke(
            "import", str(sample_h5ad_file), "obs", str(csv_path),
            "-o", str(out_path), "-i", "cell_id",
        )
        assert result.exit_code == 0
        assert out_path.exists()

        # The output file carries the imported column...
        with h5py.File(out_path, "r") as f:
            assert "obs" in f
            assert "score" in f["obs"]

        # ...while the source file is untouched.
        with h5py.File(sample_h5ad_file, "r") as f:
            assert "score" not in f["obs"]

    def test_import_csv_var(self, sample_h5ad_file, temp_dir):
        """CSV import into var succeeds with matching row count."""
        csv_path = temp_dir / "new_var.csv"
        csv_path.write_text(VAR_CSV)

        result = _invoke(
            "import", str(sample_h5ad_file), "var", str(csv_path),
            "--inplace", "-i", "gene_id",
        )
        assert result.exit_code == 0
        assert "4 rows" in result.output

    def test_import_csv_dimension_mismatch(self, sample_h5ad_file, temp_dir):
        """A CSV whose row count differs from n_obs is rejected."""
        csv_path = temp_dir / "wrong_obs.csv"
        csv_path.write_text("cell_id,score\ncell_1,1.0\ncell_2,2.0\n")

        result = _invoke(
            "import", str(sample_h5ad_file), "obs", str(csv_path),
            "--inplace", "-i", "cell_id",
        )
        assert result.exit_code == 1
        assert "mismatch" in result.output.lower()

    def test_import_csv_invalid_index_column(self, sample_h5ad_file, temp_dir):
        """An index column absent from the CSV header is rejected."""
        csv_path = temp_dir / "obs.csv"
        csv_path.write_text("a,b,c\n1,2,3\n")

        result = _invoke(
            "import", str(sample_h5ad_file), "obs", str(csv_path),
            "--inplace", "-i", "nonexistent",
        )
        assert result.exit_code == 1
        assert "not found" in result.output.lower()

    def test_import_csv_not_obs_var(self, sample_h5ad_file, temp_dir):
        """CSV import targets other than obs/var are rejected."""
        csv_path = temp_dir / "data.csv"
        csv_path.write_text("a,b\n1,2\n")

        result = _invoke(
            "import", str(sample_h5ad_file), "uns/data", str(csv_path), "--inplace"
        )
        assert result.exit_code == 1
        assert "only supported for 'obs' or 'var'" in result.output

    def test_import_requires_output_or_inplace(self, sample_h5ad_file, temp_dir):
        """Omitting both --output and --inplace is an error."""
        csv_path = temp_dir / "obs.csv"
        csv_path.write_text("a,b\n1,2\n")

        result = _invoke("import", str(sample_h5ad_file), "obs", str(csv_path))
        assert result.exit_code == 1
        assert "Output file is required" in result.output


class TestImportNpy:
    def test_import_npy_obsm(self, sample_h5ad_file, temp_dir):
        """NPY import into obsm stores the array verbatim."""
        npy_path = temp_dir / "pca.npy"
        embedding = np.random.randn(5, 10).astype(np.float32)
        np.save(npy_path, embedding)

        result = _invoke(
            "import", str(sample_h5ad_file), "obsm/X_pca", str(npy_path), "--inplace"
        )
        assert result.exit_code == 0
        assert "5×10" in result.output

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "obsm/X_pca" in f
            np.testing.assert_allclose(f["obsm/X_pca"][...], embedding)

    def test_import_npy_varm(self, sample_h5ad_file, temp_dir):
        """NPY import into varm succeeds with matching first dimension."""
        npy_path = temp_dir / "pcs.npy"
        np.save(npy_path, np.random.randn(4, 5).astype(np.float32))

        result = _invoke(
            "import", str(sample_h5ad_file), "varm/PCs", str(npy_path), "--inplace"
        )
        assert result.exit_code == 0

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "varm/PCs" in f

    def test_import_npy_X(self, sample_h5ad_file, temp_dir):
        """NPY import can replace X with a dense matrix."""
        npy_path = temp_dir / "X.npy"
        matrix = np.random.randn(5, 4).astype(np.float32)
        np.save(npy_path, matrix)

        result = _invoke(
            "import", str(sample_h5ad_file), "X", str(npy_path), "--inplace"
        )
        assert result.exit_code == 0

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "X" in f
            np.testing.assert_allclose(f["X"][...], matrix)

    def test_import_npy_dimension_mismatch_obsm(self, sample_h5ad_file, temp_dir):
        """An obsm array with the wrong first dimension is rejected."""
        npy_path = temp_dir / "bad_pca.npy"
        np.save(npy_path, np.random.randn(10, 5).astype(np.float32))

        result = _invoke(
            "import", str(sample_h5ad_file), "obsm/X_pca", str(npy_path), "--inplace"
        )
        assert result.exit_code == 1
        assert "mismatch" in result.output.lower()

    def test_import_npy_dimension_mismatch_X(self, sample_h5ad_file, temp_dir):
        """An X array whose shape differs from (n_obs, n_var) is rejected."""
        npy_path = temp_dir / "bad_X.npy"
        np.save(npy_path, np.random.randn(5, 10).astype(np.float32))

        result = _invoke(
            "import", str(sample_h5ad_file), "X", str(npy_path), "--inplace"
        )
        assert result.exit_code == 1
        assert "mismatch" in result.output.lower()


class TestImportMtx:
    def test_import_mtx_X(self, sample_h5ad_file, temp_dir):
        """MTX import into X produces a CSR-encoded group."""
        mtx_path = temp_dir / "X.mtx"
        mtx_path.write_text(
            "%%MatrixMarket matrix coordinate real general\n"
            "% test matrix\n"
            "5 4 5\n"
            "1 1 1.0\n"
            "2 2 2.0\n"
            "3 3 3.0\n"
            "4 4 4.0\n"
            "5 1 5.0\n"
        )

        result = _invoke(
            "import", str(sample_h5ad_file), "X", str(mtx_path), "--inplace"
        )
        assert result.exit_code == 0
        assert "5×4" in result.output
        assert "5 non-zero" in result.output

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "X" in f
            enc = f["X"].attrs.get("encoding-type")
            if isinstance(enc, bytes):
                enc = enc.decode("utf-8")
            assert enc == "csr_matrix"

    def test_import_mtx_layer(self, sample_h5ad_file, temp_dir):
        """MTX import into layers/* succeeds."""
        mtx_path = temp_dir / "layer.mtx"
        mtx_path.write_text(
            "%%MatrixMarket matrix coordinate real general\n"
            "5 4 3\n"
            "1 1 1.0\n"
            "3 2 2.0\n"
            "5 4 3.0\n"
        )

        result = _invoke(
            "import", str(sample_h5ad_file), "layers/counts", str(mtx_path),
            "--inplace",
        )
        assert result.exit_code == 0

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "layers/counts" in f

    def test_import_mtx_dimension_mismatch(self, sample_h5ad_file, temp_dir):
        """An MTX header with mismatching dimensions is rejected."""
        mtx_path = temp_dir / "bad.mtx"
        mtx_path.write_text(
            "%%MatrixMarket matrix coordinate real general\n" "10 4 1\n" "1 1 1.0\n"
        )

        result = _invoke(
            "import", str(sample_h5ad_file), "X", str(mtx_path), "--inplace"
        )
        assert result.exit_code == 1
        assert "mismatch" in result.output.lower()


class TestImportJson:
    def test_import_json_uns(self, sample_h5ad_file, temp_dir):
        """JSON import into uns materializes each key."""
        json_path = temp_dir / "metadata.json"
        payload = {
            "version": "1.0",
            "colors": ["red", "green", "blue"],
            "n_pcs": 50,
        }
        json_path.write_text(json.dumps(payload))

        result = _invoke(
            "import", str(sample_h5ad_file), "uns/metadata", str(json_path),
            "--inplace",
        )
        assert result.exit_code == 0
        assert "JSON data" in result.output

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "uns/metadata" in f
            assert "colors" in f["uns/metadata"]
            assert "n_pcs" in f["uns/metadata"]

    def test_import_json_nested(self, sample_h5ad_file, temp_dir):
        """Nested JSON dicts become nested HDF5 groups."""
        json_path = temp_dir / "config.json"
        payload = {
            "settings": {
                "threshold": 0.5,
                "enabled": True,
            },
            "labels": ["A", "B", "C"],
        }
        json_path.write_text(json.dumps(payload))

        result = _invoke(
            "import", str(sample_h5ad_file), "uns/config", str(json_path),
            "--inplace",
        )
        assert result.exit_code == 0

        with h5py.File(sample_h5ad_file, "r") as f:
            assert "uns/config/settings" in f
            assert "uns/config/labels" in f


class TestImportValidation:
    def test_unsupported_extension(self, sample_h5ad_file, temp_dir):
        """Input files with unknown extensions are rejected."""
        bad_path = temp_dir / "data.xlsx"
        bad_path.write_text("dummy")

        result = _invoke(
            "import", str(sample_h5ad_file), "obs", str(bad_path), "--inplace"
        )
        assert result.exit_code == 1
        assert "Unsupported" in result.output
str(sample_h5ad_file), + "uns/data", + str(npy_file), + "--inplace", + "-i", + "col", + ], + ) + assert result.exit_code == 1 + assert "only valid for CSV" in result.output + + def test_replace_existing_object(self, sample_h5ad_file, temp_dir): + """Test that existing objects can be replaced.""" + with h5py.File(sample_h5ad_file, "r") as f: + original_X = np.array(f["X"][...]) + + npy_file = temp_dir / "new_X.npy" + new_arr = np.ones((5, 4), dtype=np.float32) * 999 + np.save(npy_file, new_arr) + + result = runner.invoke( + app, + ["import", str(sample_h5ad_file), "X", str(npy_file), "--inplace"], + ) + assert result.exit_code == 0 + + with h5py.File(sample_h5ad_file, "r") as f: + np.testing.assert_allclose(f["X"][...], new_arr) + assert not np.allclose(f["X"][...], original_X) From e1db36f748ee70a69323023c4450caa57ef3e76e Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 20:41:06 +0000 Subject: [PATCH 07/62] Refactor CLI commands for exporting and importing dataframes, arrays, and dictionaries - Renamed and updated tests for exporting dataframes, replacing the previous table command. - Introduced new command structure for exporting arrays and dictionaries. - Updated import tests to reflect changes in command structure for dataframes and arrays. - Ensured proper validation for unsupported types and dimension mismatches during import. - Enhanced help command tests for new export and import functionalities. 
--- src/h5ad/cli.py | 491 ++++++++++++++++++++++++++++++------------- tests/test_cli.py | 90 +++++--- tests/test_export.py | 86 ++++---- tests/test_import.py | 215 ++++++++++++++----- 4 files changed, 606 insertions(+), 276 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 9ea5d26..f42768a 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -1,28 +1,28 @@ -import sys -import csv +"""CLI for h5ad files with export and import subcommands.""" + from pathlib import Path -from typing import Optional, Sequence, Tuple, Dict, List +from typing import Optional, Sequence, List -import rich from rich.console import Console import typer -import h5py -import numpy as np - -from h5ad.commands import ( - show_info, - export_table, - subset_h5ad, - export_object, - import_object, -) + +from h5ad.commands import show_info, subset_h5ad app = typer.Typer( - help="Streaming CLI for huge .h5ad files (info, table, subset, export)." + help="Streaming CLI for huge .h5ad files (info, subset, export, import)." 
) console = Console(stderr=True) +# Create sub-apps for export and import +export_app = typer.Typer(help="Export objects from h5ad files.") +import_app = typer.Typer(help="Import objects into h5ad files.") +app.add_typer(export_app, name="export") +app.add_typer(import_app, name="import") + +# ============================================================================ +# INFO command +# ============================================================================ @app.command() def info( file: Path = typer.Argument( @@ -58,67 +58,9 @@ def info( show_info(file, console, show_types=types, obj_path=obj) -@app.command() -def table( - file: Path = typer.Argument( - ..., - help="Path to the .h5ad file", - exists=True, - readable=True, - ), - axis: str = typer.Option("obs", help="Axis to read from ('obs' or 'var')"), - columns: Optional[str] = typer.Option( - None, - "--columns", - "-c", - help="Comma separated column names to include in the output table", - ), - out: Optional[Path] = typer.Option( - None, - "--output", - "-o", - help="Output file path (defaults to stdout)", - writable=True, - ), - chunk_rows: int = typer.Option( - 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" - ), - head: Optional[int] = typer.Option( - None, "--head", "-n", help="Output only the first n rows" - ), -) -> None: - """ - Export a table of the specified axis ('obs' or 'var') to CSV format. - Args: - file (Path): Path to the .h5ad file - axis (str): Axis to read from ('obs' or 'var') - columns (Optional[str]): Comma separated column names to include in the output table - out (Optional[Path]): Output file path (defaults to stdout) - chunk_rows (int): Number of rows to read per chunk - head (Optional[int]): Output only the first n rows - """ - # Validate axis parameter - if axis not in ("obs", "var"): - console.print( - f"[bold red]Error:[/] Invalid axis '{axis}'. 
Must be either 'obs' or 'var'.", - ) - raise typer.Exit(code=1) - - col_list: Optional[List[str]] = None - if columns: - col_list = [col.strip() for col in columns.split(",") if col.strip()] - - export_table( - file=file, - axis=axis, - columns=col_list, - out=out, - chunk_rows=chunk_rows, - head=head, - console=console, - ) - - +# ============================================================================ +# SUBSET command +# ============================================================================ @app.command() def subset( file: Path = typer.Argument(..., help="Input .h5ad", exists=True, readable=True), @@ -162,30 +104,135 @@ def subset( raise typer.Exit(code=1) -@app.command("export") -def export_cmd( +# ============================================================================ +# EXPORT subcommands +# ============================================================================ +@export_app.command("dataframe") +def export_dataframe( file: Path = typer.Argument( ..., help="Path to the .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument( - ..., help="Object path to export (e.g., 'obs', 'X', 'obsm/X_pca', 'uns')" - ), - out: Path = typer.Argument( - ..., - help="Output file path. Extension determines format: .csv, .npy, .mtx, .json, .png/.jpg/.tiff", - ), + obj: str = typer.Argument(..., help="Object path to export ('obs' or 'var')"), + out: Path = typer.Argument(..., help="Output CSV file path"), columns: Optional[str] = typer.Option( None, "--columns", "-c", - help="Comma separated column names (for dataframe/CSV export only)", + help="Comma separated column names to include", ), chunk_rows: int = typer.Option( 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" ), head: Optional[int] = typer.Option( - None, "--head", "-n", help="Output only the first n rows (for CSV export)" + None, "--head", "-n", help="Output only the first n rows" ), +) -> None: + """ + Export a dataframe (obs or var) to CSV. 
+ + Examples: + h5ad export dataframe data.h5ad obs obs.csv + h5ad export dataframe data.h5ad var var.csv --columns gene_id,mean + h5ad export dataframe data.h5ad obs - --head 100 + """ + from h5ad.commands import export_table + + if obj not in ("obs", "var"): + console.print( + f"[bold red]Error:[/] Object must be 'obs' or 'var', not '{obj}'.", + ) + raise typer.Exit(code=1) + + col_list: Optional[List[str]] = None + if columns: + col_list = [col.strip() for col in columns.split(",") if col.strip()] + + try: + export_table( + file=file, + axis=obj, + columns=col_list, + out=out if str(out) != "-" else None, + chunk_rows=chunk_rows, + head=head, + console=console, + ) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@export_app.command("array") +def export_array( + file: Path = typer.Argument( + ..., help="Path to the .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument( + ..., help="Object path to export (e.g., 'obsm/X_pca', 'varm/PCs', 'X')" + ), + out: Path = typer.Argument(..., help="Output .npy file path"), + chunk_rows: int = typer.Option( + 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" + ), +) -> None: + """ + Export a dense array or matrix to NumPy .npy format. 
+ + Examples: + h5ad export array data.h5ad obsm/X_pca pca.npy + h5ad export array data.h5ad X matrix.npy + h5ad export array data.h5ad varm/PCs loadings.npy + """ + from h5ad.commands.export import _export_npy + + try: + _export_npy( + file=file, + obj=obj, + out=out, + chunk_rows=chunk_rows, + console=console, + ) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@export_app.command("sparse") +def export_sparse( + file: Path = typer.Argument( + ..., help="Path to the .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument( + ..., help="Object path to export (e.g., 'X', 'layers/counts')" + ), + out: Path = typer.Argument(..., help="Output .mtx file path"), +) -> None: + """ + Export a sparse matrix (CSR/CSC) to Matrix Market (.mtx) format. + + Examples: + h5ad export sparse data.h5ad X matrix.mtx + h5ad export sparse data.h5ad layers/counts counts.mtx + """ + from h5ad.commands.export import _export_mtx + + try: + _export_mtx(file=file, obj=obj, out=out, console=console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@export_app.command("dict") +def export_dict( + file: Path = typer.Argument( + ..., help="Path to the .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument( + ..., help="Object path to export (e.g., 'uns', 'uns/colors')" + ), + out: Path = typer.Argument(..., help="Output .json file path"), max_elements: int = typer.Option( 1_000_000, "--max-elements", @@ -196,35 +243,19 @@ def export_cmd( ), ) -> None: """ - Export an object from the h5ad file to a common format. - - The output format is auto-detected from the file extension: - - .csv : DataFrames (obs, var) - - .npy : Dense arrays/matrices (obsm/X_pca, varm/PCs, etc.) - - .mtx : Sparse matrices (X if sparse) - - .json : Dictionaries/scalars (uns, uns/colors, etc.) 
- - .png/.jpg/.tiff : Image-like arrays - - The object type is auto-detected and validated against the extension. + Export a dict/group or scalar to JSON format. Examples: - h5ad export data.h5ad obs obs.csv - h5ad export data.h5ad obsm/X_pca pca.npy - h5ad export data.h5ad X matrix.mtx - h5ad export data.h5ad uns metadata.json + h5ad export dict data.h5ad uns metadata.json + h5ad export dict data.h5ad uns/colors colors.json """ - col_list: Optional[List[str]] = None - if columns: - col_list = [col.strip() for col in columns.split(",") if col.strip()] + from h5ad.commands.export import _export_json try: - export_object( + _export_json( file=file, obj=obj, out=out, - columns=col_list, - chunk_rows=chunk_rows, - head=head, max_elements=max_elements, include_attrs=include_attrs, console=console, @@ -234,82 +265,250 @@ def export_cmd( raise typer.Exit(code=1) -@app.command("import") -def import_cmd( +@export_app.command("image") +def export_image( + file: Path = typer.Argument( + ..., help="Path to the .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument(..., help="Object path to export (2D or 3D array)"), + out: Path = typer.Argument(..., help="Output image file (.png, .jpg, .tiff)"), +) -> None: + """ + Export an image-like array to PNG/JPG/TIFF format. + + The array should be 2D (H,W) or 3D (H,W,C) with C in {1,3,4}. 
+ + Examples: + h5ad export image data.h5ad uns/spatial/image tissue.png + """ + from h5ad.commands.export import _export_image + + try: + _export_image(file=file, obj=obj, out=out, console=console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +# ============================================================================ +# IMPORT subcommands +# ============================================================================ +def _get_target_file(file: Path, output: Optional[Path], inplace: bool) -> Path: + """Determine target file and copy if needed.""" + import shutil + + if inplace: + return file + if output is None: + raise ValueError("Output file is required unless --inplace is specified.") + shutil.copy2(file, output) + console.print(f"[dim]Copied {file} → {output}[/]") + return output + + +@import_app.command("dataframe") +def import_dataframe( file: Path = typer.Argument( ..., help="Path to the source .h5ad file", exists=True, readable=True ), obj: str = typer.Argument( - ..., - help="Object path to create/replace (e.g., 'obs', 'X', 'obsm/X_pca', 'uns')", + ..., help="Object path to create/replace ('obs' or 'var')" ), input_file: Path = typer.Argument( - ..., - help="Input data file. Extension determines format: .csv, .npy, .mtx, .json", - exists=True, - readable=True, + ..., help="Input CSV file", exists=True, readable=True ), output: Optional[Path] = typer.Option( None, "--output", "-o", - help="Output .h5ad file path. Required unless --inplace is specified.", + help="Output .h5ad file path. Required unless --inplace.", writable=True, ), inplace: bool = typer.Option( False, "--inplace", - help="Modify the source file directly instead of creating a new file.", + help="Modify source file directly.", ), index_column: Optional[str] = typer.Option( None, "--index-column", "-i", - help="Column to use as index when importing CSV into obs/var. Defaults to first column.", + help="Column to use as index. 
Defaults to first column.", ), ) -> None: """ - Import data from a file into the h5ad file. + Import a CSV file into obs or var. - Creates or replaces an object at the specified path. By default, creates - a new output file. Use --inplace to modify the source file directly. + Examples: + h5ad import dataframe data.h5ad obs cells.csv -o output.h5ad -i cell_id + h5ad import dataframe data.h5ad var genes.csv --inplace -i gene_id + """ + from h5ad.commands.import_data import _import_csv - The input format is auto-detected from the file extension: - - .csv : DataFrames (obs, var) - - .npy : Dense arrays/matrices (X, obsm/X_pca, varm/PCs, etc.) - - .mtx : Sparse matrices (X, layers/*) - - .json : Dictionaries (uns, uns/metadata, etc.) + if obj not in ("obs", "var"): + console.print( + f"[bold red]Error:[/] Object must be 'obs' or 'var', not '{obj}'.", + ) + raise typer.Exit(code=1) - Dimensions are validated against existing obs/var: - - obs: row count must match n_obs - - var: row count must match n_var - - X, layers/*: must match (n_obs, n_var) - - obsm/*, obsp/*: first dimension must match n_obs - - varm/*, varp/*: first dimension must match n_var + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. 
" + "Use --output/-o or --inplace.", + ) + raise typer.Exit(code=1) + + try: + target = _get_target_file(file, output, inplace) + _import_csv(target, obj, input_file, index_column, console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@import_app.command("array") +def import_array( + file: Path = typer.Argument( + ..., help="Path to the source .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument( + ..., help="Object path to create/replace (e.g., 'X', 'obsm/X_pca')" + ), + input_file: Path = typer.Argument( + ..., help="Input .npy file", exists=True, readable=True + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output .h5ad file path. Required unless --inplace.", + writable=True, + ), + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify source file directly.", + ), +) -> None: + """ + Import a NumPy .npy file as a dense array. + + Dimensions are validated against existing obs/var. Examples: - h5ad import data.h5ad obs cells.csv -o output.h5ad -i cell_id - h5ad import data.h5ad obsm/X_pca pca.npy -o output.h5ad - h5ad import data.h5ad X matrix.mtx --inplace - h5ad import data.h5ad uns/metadata config.json -o new.h5ad + h5ad import array data.h5ad obsm/X_pca pca.npy -o output.h5ad + h5ad import array data.h5ad X matrix.npy --inplace """ + from h5ad.commands.import_data import _import_npy + if not inplace and output is None: console.print( "[bold red]Error:[/] Output file is required. 
" - "Use --output/-o to specify output file, or --inplace to modify source.", + "Use --output/-o or --inplace.", ) raise typer.Exit(code=1) try: - import_object( - file=file, - obj=obj, - input_file=input_file, - output_file=output, - inplace=inplace, - index_column=index_column, - console=console, + target = _get_target_file(file, output, inplace) + _import_npy(target, obj, input_file, console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@import_app.command("sparse") +def import_sparse( + file: Path = typer.Argument( + ..., help="Path to the source .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument( + ..., help="Object path to create/replace (e.g., 'X', 'layers/counts')" + ), + input_file: Path = typer.Argument( + ..., help="Input .mtx file", exists=True, readable=True + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output .h5ad file path. Required unless --inplace.", + writable=True, + ), + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify source file directly.", + ), +) -> None: + """ + Import a Matrix Market (.mtx) file as a CSR sparse matrix. + + Dimensions are validated against existing obs/var. + + Examples: + h5ad import sparse data.h5ad X matrix.mtx -o output.h5ad + h5ad import sparse data.h5ad layers/counts counts.mtx --inplace + """ + from h5ad.commands.import_data import _import_mtx + + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. 
" + "Use --output/-o or --inplace.", ) + raise typer.Exit(code=1) + + try: + target = _get_target_file(file, output, inplace) + _import_mtx(target, obj, input_file, console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@import_app.command("dict") +def import_dict( + file: Path = typer.Argument( + ..., help="Path to the source .h5ad file", exists=True, readable=True + ), + obj: str = typer.Argument( + ..., help="Object path to create/replace (e.g., 'uns', 'uns/metadata')" + ), + input_file: Path = typer.Argument( + ..., help="Input .json file", exists=True, readable=True + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output .h5ad file path. Required unless --inplace.", + writable=True, + ), + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify source file directly.", + ), +) -> None: + """ + Import a JSON file into uns or other dict-like groups. + + Examples: + h5ad import dict data.h5ad uns/metadata config.json -o output.h5ad + h5ad import dict data.h5ad uns settings.json --inplace + """ + from h5ad.commands.import_data import _import_json + + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. 
" + "Use --output/-o or --inplace.", + ) + raise typer.Exit(code=1) + + try: + target = _get_target_file(file, output, inplace) + _import_json(target, obj, input_file, console) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) diff --git a/tests/test_cli.py b/tests/test_cli.py index 07031d7..7b327e5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -85,15 +85,21 @@ def test_info_object_not_found(self, sample_h5ad_file): assert "not found" in output -class TestTableCommand: - """Tests for table command.""" +class TestExportDataframeCommand: + """Tests for export dataframe command (replaces table command).""" - def test_table_command_obs(self, sample_h5ad_file, temp_dir): - """Test table command for obs axis.""" + def test_export_dataframe_obs(self, sample_h5ad_file, temp_dir): + """Test export dataframe for obs axis.""" output = temp_dir / "obs_table.csv" result = runner.invoke( app, - ["table", str(sample_h5ad_file), "--axis", "obs", "--output", str(output)], + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + str(output), + ], ) assert result.exit_code == 0 assert output.exists() @@ -105,12 +111,18 @@ def test_table_command_obs(self, sample_h5ad_file, temp_dir): assert len(rows) == 6 # header + 5 rows assert "obs_names" in rows[0] - def test_table_command_var(self, sample_h5ad_file, temp_dir): - """Test table command for var axis.""" + def test_export_dataframe_var(self, sample_h5ad_file, temp_dir): + """Test export dataframe for var axis.""" output = temp_dir / "var_table.csv" result = runner.invoke( app, - ["table", str(sample_h5ad_file), "--axis", "var", "--output", str(output)], + [ + "export", + "dataframe", + str(sample_h5ad_file), + "var", + str(output), + ], ) assert result.exit_code == 0 assert output.exists() @@ -120,20 +132,19 @@ def test_table_command_var(self, sample_h5ad_file, temp_dir): rows = list(reader) assert len(rows) == 5 # header + 4 rows - def 
test_table_command_columns_filter(self, sample_h5ad_file, temp_dir): - """Test table command with column filter.""" + def test_export_dataframe_columns_filter(self, sample_h5ad_file, temp_dir): + """Test export dataframe with column filter.""" output = temp_dir / "table.csv" result = runner.invoke( app, [ - "table", + "export", + "dataframe", str(sample_h5ad_file), - "--axis", "obs", + str(output), "--columns", "obs_names,cell_type", - "--output", - str(output), ], ) assert result.exit_code == 0 @@ -146,20 +157,19 @@ def test_table_command_columns_filter(self, sample_h5ad_file, temp_dir): assert "cell_type" in header assert "n_counts" not in header - def test_table_command_head(self, sample_h5ad_file, temp_dir): - """Test table command with head limit.""" + def test_export_dataframe_head(self, sample_h5ad_file, temp_dir): + """Test export dataframe with head limit.""" output = temp_dir / "table.csv" result = runner.invoke( app, [ - "table", + "export", + "dataframe", str(sample_h5ad_file), - "--axis", "obs", + str(output), "--head", "2", - "--output", - str(output), ], ) assert result.exit_code == 0 @@ -169,15 +179,23 @@ def test_table_command_head(self, sample_h5ad_file, temp_dir): rows = list(reader) assert len(rows) == 3 # header + 2 rows - def test_table_command_invalid_axis(self, sample_h5ad_file): - """Test table command with invalid axis.""" + def test_export_dataframe_invalid_axis(self, sample_h5ad_file, temp_dir): + """Test export dataframe with invalid axis.""" + output = temp_dir / "table.csv" result = runner.invoke( - app, ["table", str(sample_h5ad_file), "--axis", "invalid"] + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "invalid", + str(output), + ], ) assert result.exit_code == 1 # Check both stdout and stderr since Console uses stderr=True - output = result.stdout + result.stderr - assert "Invalid axis" in output + output_text = result.stdout + result.stderr + assert "obs" in output_text or "var" in output_text def 
test_export_table_function(self, sample_h5ad_file, temp_dir): """Test export_table function directly.""" @@ -317,11 +335,25 @@ def test_info_help(self): assert result.exit_code == 0 assert "Show high-level information" in result.stdout - def test_table_help(self): - """Test table command help.""" - result = runner.invoke(app, ["table", "--help"]) + def test_export_help(self): + """Test export command help.""" + result = runner.invoke(app, ["export", "--help"]) + assert result.exit_code == 0 + assert "dataframe" in result.stdout + assert "array" in result.stdout + + def test_export_dataframe_help(self): + """Test export dataframe command help.""" + result = runner.invoke(app, ["export", "dataframe", "--help"]) + assert result.exit_code == 0 + assert "Export a dataframe" in result.stdout + + def test_import_help(self): + """Test import command help.""" + result = runner.invoke(app, ["import", "--help"]) assert result.exit_code == 0 - assert "Export a table" in result.stdout + assert "dataframe" in result.stdout + assert "array" in result.stdout def test_subset_help(self): """Test subset command help.""" diff --git a/tests/test_export.py b/tests/test_export.py index 1c2d88f..8ab14cd 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -28,10 +28,12 @@ def _read_mtx(path: Path) -> np.ndarray: return mat -class TestExportNpy: - def test_export_npy_dense_X(self, sample_h5ad_file, temp_dir): +class TestExportArray: + def test_export_array_dense_X(self, sample_h5ad_file, temp_dir): out = temp_dir / "X.npy" - result = runner.invoke(app, ["export", str(sample_h5ad_file), "X", str(out)]) + result = runner.invoke( + app, ["export", "array", str(sample_h5ad_file), "X", str(out)] + ) assert result.exit_code == 0 assert out.exists() @@ -41,11 +43,11 @@ def test_export_npy_dense_X(self, sample_h5ad_file, temp_dir): np.testing.assert_allclose(got, expected) -class TestExportMtx: - def test_export_mtx_csr(self, sample_sparse_csr_h5ad, temp_dir): +class TestExportSparse: + 
def test_export_sparse_csr(self, sample_sparse_csr_h5ad, temp_dir): out = temp_dir / "X_csr.mtx" result = runner.invoke( - app, ["export", str(sample_sparse_csr_h5ad), "X", str(out)] + app, ["export", "sparse", str(sample_sparse_csr_h5ad), "X", str(out)] ) assert result.exit_code == 0 assert out.exists() @@ -62,7 +64,7 @@ def test_export_mtx_csr(self, sample_sparse_csr_h5ad, temp_dir): ) np.testing.assert_allclose(got, expected) - def test_export_mtx_csc(self, temp_dir): + def test_export_sparse_csc(self, temp_dir): # Build a small, consistent CSC matrix group file_path = temp_dir / "test_csc.h5ad" with h5py.File(file_path, "w") as f: @@ -77,7 +79,7 @@ def test_export_mtx_csc(self, temp_dir): X.create_dataset("indptr", data=indptr) out = temp_dir / "X_csc.mtx" - result = runner.invoke(app, ["export", str(file_path), "X", str(out)]) + result = runner.invoke(app, ["export", "sparse", str(file_path), "X", str(out)]) assert result.exit_code == 0 assert out.exists() @@ -93,10 +95,12 @@ def test_export_mtx_csc(self, temp_dir): np.testing.assert_allclose(got, expected) -class TestExportJson: - def test_export_json_uns(self, sample_h5ad_file, temp_dir): +class TestExportDict: + def test_export_dict_uns(self, sample_h5ad_file, temp_dir): out = temp_dir / "uns.json" - result = runner.invoke(app, ["export", str(sample_h5ad_file), "uns", str(out)]) + result = runner.invoke( + app, ["export", "dict", str(sample_h5ad_file), "uns", str(out)] + ) assert result.exit_code == 0 assert out.exists() payload = json.loads(out.read_text(encoding="utf-8")) @@ -104,10 +108,12 @@ def test_export_json_uns(self, sample_h5ad_file, temp_dir): assert payload["description"] == ["Test dataset"] -class TestExportCsv: - def test_export_csv_obs(self, sample_h5ad_file, temp_dir): +class TestExportDataframe: + def test_export_dataframe_obs(self, sample_h5ad_file, temp_dir): out = temp_dir / "obs.csv" - result = runner.invoke(app, ["export", str(sample_h5ad_file), "obs", str(out)]) + result = 
runner.invoke( + app, ["export", "dataframe", str(sample_h5ad_file), "obs", str(out)] + ) assert result.exit_code == 0 assert out.exists() text = out.read_text(encoding="utf-8") @@ -115,47 +121,36 @@ def test_export_csv_obs(self, sample_h5ad_file, temp_dir): class TestExportValidation: - def test_wrong_extension_for_type(self, sample_h5ad_file, temp_dir): - """Test that wrong extension is rejected.""" - out = temp_dir / "obs.npy" # obs is a dataframe, should be .csv - result = runner.invoke(app, ["export", str(sample_h5ad_file), "obs", str(out)]) - assert result.exit_code == 1 - assert "does not match" in result.output or "Expected" in result.output - - def test_sparse_matrix_wrong_extension(self, sample_sparse_csr_h5ad, temp_dir): - """Test that sparse matrix rejects .npy extension.""" - out = temp_dir / "X.npy" # sparse matrix should be .mtx + def test_wrong_type_for_dataframe(self, sample_h5ad_file, temp_dir): + """Test that wrong object type is rejected for dataframe export.""" + out = temp_dir / "X.csv" result = runner.invoke( - app, ["export", str(sample_sparse_csr_h5ad), "X", str(out)] + app, ["export", "dataframe", str(sample_h5ad_file), "X", str(out)] ) assert result.exit_code == 1 - assert "does not match" in result.output or ".mtx" in result.output + assert "obs" in result.output or "var" in result.output - def test_dense_matrix_wrong_extension(self, sample_h5ad_file, temp_dir): - """Test that dense matrix rejects .csv extension.""" - out = temp_dir / "X.csv" # dense matrix should be .npy - result = runner.invoke(app, ["export", str(sample_h5ad_file), "X", str(out)]) - assert result.exit_code == 1 - assert "does not match" in result.output or ".npy" in result.output - - def test_json_wrong_extension(self, sample_h5ad_file, temp_dir): - """Test that dict rejects .npy extension.""" - out = temp_dir / "uns.npy" # uns is dict, should be .json - result = runner.invoke(app, ["export", str(sample_h5ad_file), "uns", str(out)]) + def 
test_sparse_matrix_array_export(self, sample_sparse_csr_h5ad, temp_dir): + """Test that sparse matrix requires sparse export.""" + out = temp_dir / "X.npy" + result = runner.invoke( + app, ["export", "array", str(sample_sparse_csr_h5ad), "X", str(out)] + ) + # Should fail because X is sparse, not dense assert result.exit_code == 1 - assert "does not match" in result.output or ".json" in result.output def test_nonexistent_object(self, sample_h5ad_file, temp_dir): """Test that nonexistent object path is rejected.""" - out = temp_dir / "output.csv" + out = temp_dir / "output.npy" result = runner.invoke( - app, ["export", str(sample_h5ad_file), "nonexistent/path", str(out)] + app, + ["export", "array", str(sample_h5ad_file), "nonexistent/path", str(out)], ) assert result.exit_code == 1 - assert "not found" in result.output + assert "not found" in result.output.lower() or "error" in result.output.lower() - def test_unknown_type_rejected(self, temp_dir): - """Test that unknown/complex types are rejected.""" + def test_export_dict_unknown_type(self, temp_dir): + """Test that unknown/complex types can be exported as dict.""" file_path = temp_dir / "test_unknown.h5ad" with h5py.File(file_path, "w") as f: g = f.create_group("obs") @@ -166,7 +161,8 @@ def test_unknown_type_rejected(self, temp_dir): weird.attrs["encoding-type"] = "some_unknown_encoding" out = temp_dir / "weird.json" - result = runner.invoke(app, ["export", str(file_path), "weird_group", str(out)]) + result = runner.invoke( + app, ["export", "dict", str(file_path), "weird_group", str(out)] + ) # Should succeed as it's detected as dict - # but if it had sparse inside, it would fail assert result.exit_code == 0 diff --git a/tests/test_import.py b/tests/test_import.py index 895a488..736d4e2 100644 --- a/tests/test_import.py +++ b/tests/test_import.py @@ -13,8 +13,8 @@ runner = CliRunner() -class TestImportCsv: - def test_import_csv_obs_inplace(self, sample_h5ad_file, temp_dir): +class TestImportDataframe: + def 
test_import_dataframe_obs_inplace(self, sample_h5ad_file, temp_dir): """Test importing CSV into obs with --inplace.""" csv_file = temp_dir / "new_obs.csv" csv_file.write_text( @@ -30,6 +30,7 @@ def test_import_csv_obs_inplace(self, sample_h5ad_file, temp_dir): app, [ "import", + "dataframe", str(sample_h5ad_file), "obs", str(csv_file), @@ -48,7 +49,7 @@ def test_import_csv_obs_inplace(self, sample_h5ad_file, temp_dir): assert "score" in obs assert "label" in obs - def test_import_csv_obs_output(self, sample_h5ad_file, temp_dir): + def test_import_dataframe_obs_output(self, sample_h5ad_file, temp_dir): """Test importing CSV into obs with output file.""" csv_file = temp_dir / "new_obs.csv" csv_file.write_text( @@ -65,6 +66,7 @@ def test_import_csv_obs_output(self, sample_h5ad_file, temp_dir): app, [ "import", + "dataframe", str(sample_h5ad_file), "obs", str(csv_file), @@ -88,7 +90,7 @@ def test_import_csv_obs_output(self, sample_h5ad_file, temp_dir): obs = f["obs"] assert "score" not in obs - def test_import_csv_var(self, sample_h5ad_file, temp_dir): + def test_import_dataframe_var(self, sample_h5ad_file, temp_dir): """Test importing CSV into var.""" csv_file = temp_dir / "new_var.csv" csv_file.write_text( @@ -103,6 +105,7 @@ def test_import_csv_var(self, sample_h5ad_file, temp_dir): app, [ "import", + "dataframe", str(sample_h5ad_file), "var", str(csv_file), @@ -114,7 +117,7 @@ def test_import_csv_var(self, sample_h5ad_file, temp_dir): assert result.exit_code == 0 assert "4 rows" in result.output - def test_import_csv_dimension_mismatch(self, sample_h5ad_file, temp_dir): + def test_import_dataframe_dimension_mismatch(self, sample_h5ad_file, temp_dir): """Test that dimension mismatch is rejected.""" csv_file = temp_dir / "wrong_obs.csv" csv_file.write_text("cell_id,score\ncell_1,1.0\ncell_2,2.0\n") @@ -123,6 +126,7 @@ def test_import_csv_dimension_mismatch(self, sample_h5ad_file, temp_dir): app, [ "import", + "dataframe", str(sample_h5ad_file), "obs", str(csv_file), 
@@ -134,7 +138,7 @@ def test_import_csv_dimension_mismatch(self, sample_h5ad_file, temp_dir): assert result.exit_code == 1 assert "mismatch" in result.output.lower() - def test_import_csv_invalid_index_column(self, sample_h5ad_file, temp_dir): + def test_import_dataframe_invalid_index_column(self, sample_h5ad_file, temp_dir): """Test that invalid index column is rejected.""" csv_file = temp_dir / "obs.csv" csv_file.write_text("a,b,c\n1,2,3\n") @@ -143,6 +147,7 @@ def test_import_csv_invalid_index_column(self, sample_h5ad_file, temp_dir): app, [ "import", + "dataframe", str(sample_h5ad_file), "obs", str(csv_file), @@ -154,33 +159,48 @@ def test_import_csv_invalid_index_column(self, sample_h5ad_file, temp_dir): assert result.exit_code == 1 assert "not found" in result.output.lower() - def test_import_csv_not_obs_var(self, sample_h5ad_file, temp_dir): - """Test that CSV import is only allowed for obs/var.""" + def test_import_dataframe_not_obs_var(self, sample_h5ad_file, temp_dir): + """Test that dataframe import is only allowed for obs/var.""" csv_file = temp_dir / "data.csv" csv_file.write_text("a,b\n1,2\n") result = runner.invoke( app, - ["import", str(sample_h5ad_file), "uns/data", str(csv_file), "--inplace"], + [ + "import", + "dataframe", + str(sample_h5ad_file), + "uns/data", + str(csv_file), + "--inplace", + ], ) assert result.exit_code == 1 - assert "only supported for 'obs' or 'var'" in result.output + assert "obs" in result.output or "var" in result.output - def test_import_requires_output_or_inplace(self, sample_h5ad_file, temp_dir): + def test_import_dataframe_requires_output_or_inplace( + self, sample_h5ad_file, temp_dir + ): """Test that either --output or --inplace is required.""" csv_file = temp_dir / "obs.csv" csv_file.write_text("a,b\n1,2\n") result = runner.invoke( app, - ["import", str(sample_h5ad_file), "obs", str(csv_file)], + [ + "import", + "dataframe", + str(sample_h5ad_file), + "obs", + str(csv_file), + ], ) assert result.exit_code == 1 
assert "Output file is required" in result.output -class TestImportNpy: - def test_import_npy_obsm(self, sample_h5ad_file, temp_dir): +class TestImportArray: + def test_import_array_obsm(self, sample_h5ad_file, temp_dir): """Test importing NPY into obsm.""" npy_file = temp_dir / "pca.npy" arr = np.random.randn(5, 10).astype(np.float32) @@ -188,7 +208,14 @@ def test_import_npy_obsm(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "obsm/X_pca", str(npy_file), "--inplace"], + [ + "import", + "array", + str(sample_h5ad_file), + "obsm/X_pca", + str(npy_file), + "--inplace", + ], ) assert result.exit_code == 0 assert "5×10" in result.output @@ -197,7 +224,7 @@ def test_import_npy_obsm(self, sample_h5ad_file, temp_dir): assert "obsm/X_pca" in f np.testing.assert_allclose(f["obsm/X_pca"][...], arr) - def test_import_npy_varm(self, sample_h5ad_file, temp_dir): + def test_import_array_varm(self, sample_h5ad_file, temp_dir): """Test importing NPY into varm.""" npy_file = temp_dir / "pcs.npy" arr = np.random.randn(4, 5).astype(np.float32) @@ -205,14 +232,21 @@ def test_import_npy_varm(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "varm/PCs", str(npy_file), "--inplace"], + [ + "import", + "array", + str(sample_h5ad_file), + "varm/PCs", + str(npy_file), + "--inplace", + ], ) assert result.exit_code == 0 with h5py.File(sample_h5ad_file, "r") as f: assert "varm/PCs" in f - def test_import_npy_X(self, sample_h5ad_file, temp_dir): + def test_import_array_X(self, sample_h5ad_file, temp_dir): """Test importing NPY into X.""" npy_file = temp_dir / "X.npy" arr = np.random.randn(5, 4).astype(np.float32) @@ -220,7 +254,14 @@ def test_import_npy_X(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "X", str(npy_file), "--inplace"], + [ + "import", + "array", + str(sample_h5ad_file), + "X", + str(npy_file), + "--inplace", + ], ) assert 
result.exit_code == 0 @@ -228,7 +269,7 @@ def test_import_npy_X(self, sample_h5ad_file, temp_dir): assert "X" in f np.testing.assert_allclose(f["X"][...], arr) - def test_import_npy_dimension_mismatch_obsm(self, sample_h5ad_file, temp_dir): + def test_import_array_dimension_mismatch_obsm(self, sample_h5ad_file, temp_dir): """Test that obsm dimension mismatch is rejected.""" npy_file = temp_dir / "bad_pca.npy" arr = np.random.randn(10, 5).astype(np.float32) @@ -236,12 +277,19 @@ def test_import_npy_dimension_mismatch_obsm(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "obsm/X_pca", str(npy_file), "--inplace"], + [ + "import", + "array", + str(sample_h5ad_file), + "obsm/X_pca", + str(npy_file), + "--inplace", + ], ) assert result.exit_code == 1 assert "mismatch" in result.output.lower() - def test_import_npy_dimension_mismatch_X(self, sample_h5ad_file, temp_dir): + def test_import_array_dimension_mismatch_X(self, sample_h5ad_file, temp_dir): """Test that X dimension mismatch is rejected.""" npy_file = temp_dir / "bad_X.npy" arr = np.random.randn(5, 10).astype(np.float32) @@ -249,14 +297,39 @@ def test_import_npy_dimension_mismatch_X(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "X", str(npy_file), "--inplace"], + [ + "import", + "array", + str(sample_h5ad_file), + "X", + str(npy_file), + "--inplace", + ], ) assert result.exit_code == 1 assert "mismatch" in result.output.lower() + def test_import_array_requires_output_or_inplace(self, sample_h5ad_file, temp_dir): + """Test that either --output or --inplace is required.""" + npy_file = temp_dir / "data.npy" + np.save(npy_file, np.array([1, 2, 3])) -class TestImportMtx: - def test_import_mtx_X(self, sample_h5ad_file, temp_dir): + result = runner.invoke( + app, + [ + "import", + "array", + str(sample_h5ad_file), + "obsm/X_pca", + str(npy_file), + ], + ) + assert result.exit_code == 1 + assert "Output file is 
required" in result.output + + +class TestImportSparse: + def test_import_sparse_X(self, sample_h5ad_file, temp_dir): """Test importing MTX into X.""" mtx_file = temp_dir / "X.mtx" mtx_file.write_text( @@ -272,7 +345,14 @@ def test_import_mtx_X(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "X", str(mtx_file), "--inplace"], + [ + "import", + "sparse", + str(sample_h5ad_file), + "X", + str(mtx_file), + "--inplace", + ], ) assert result.exit_code == 0 assert "5×4" in result.output @@ -286,7 +366,7 @@ def test_import_mtx_X(self, sample_h5ad_file, temp_dir): enc = enc.decode("utf-8") assert enc == "csr_matrix" - def test_import_mtx_layer(self, sample_h5ad_file, temp_dir): + def test_import_sparse_layer(self, sample_h5ad_file, temp_dir): """Test importing MTX into layers.""" mtx_file = temp_dir / "layer.mtx" mtx_file.write_text( @@ -301,6 +381,7 @@ def test_import_mtx_layer(self, sample_h5ad_file, temp_dir): app, [ "import", + "sparse", str(sample_h5ad_file), "layers/counts", str(mtx_file), @@ -312,7 +393,7 @@ def test_import_mtx_layer(self, sample_h5ad_file, temp_dir): with h5py.File(sample_h5ad_file, "r") as f: assert "layers/counts" in f - def test_import_mtx_dimension_mismatch(self, sample_h5ad_file, temp_dir): + def test_import_sparse_dimension_mismatch(self, sample_h5ad_file, temp_dir): """Test that MTX dimension mismatch is rejected.""" mtx_file = temp_dir / "bad.mtx" mtx_file.write_text( @@ -321,14 +402,41 @@ def test_import_mtx_dimension_mismatch(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "X", str(mtx_file), "--inplace"], + [ + "import", + "sparse", + str(sample_h5ad_file), + "X", + str(mtx_file), + "--inplace", + ], ) assert result.exit_code == 1 assert "mismatch" in result.output.lower() + def test_import_sparse_requires_output_or_inplace(self, sample_h5ad_file, temp_dir): + """Test that either --output or --inplace is required.""" + mtx_file = 
temp_dir / "data.mtx" + mtx_file.write_text( + "%%MatrixMarket matrix coordinate real general\n" "5 4 1\n" "1 1 1.0\n" + ) + + result = runner.invoke( + app, + [ + "import", + "sparse", + str(sample_h5ad_file), + "X", + str(mtx_file), + ], + ) + assert result.exit_code == 1 + assert "Output file is required" in result.output + -class TestImportJson: - def test_import_json_uns(self, sample_h5ad_file, temp_dir): +class TestImportDict: + def test_import_dict_uns(self, sample_h5ad_file, temp_dir): """Test importing JSON into uns.""" json_file = temp_dir / "metadata.json" json_file.write_text( @@ -345,6 +453,7 @@ def test_import_json_uns(self, sample_h5ad_file, temp_dir): app, [ "import", + "dict", str(sample_h5ad_file), "uns/metadata", str(json_file), @@ -359,7 +468,7 @@ def test_import_json_uns(self, sample_h5ad_file, temp_dir): assert "colors" in f["uns/metadata"] assert "n_pcs" in f["uns/metadata"] - def test_import_json_nested(self, sample_h5ad_file, temp_dir): + def test_import_dict_nested(self, sample_h5ad_file, temp_dir): """Test importing nested JSON.""" json_file = temp_dir / "config.json" json_file.write_text( @@ -378,6 +487,7 @@ def test_import_json_nested(self, sample_h5ad_file, temp_dir): app, [ "import", + "dict", str(sample_h5ad_file), "uns/config", str(json_file), @@ -390,40 +500,26 @@ def test_import_json_nested(self, sample_h5ad_file, temp_dir): assert "uns/config/settings" in f assert "uns/config/labels" in f - -class TestImportValidation: - def test_unsupported_extension(self, sample_h5ad_file, temp_dir): - """Test that unsupported extensions are rejected.""" - bad_file = temp_dir / "data.xlsx" - bad_file.write_text("dummy") - - result = runner.invoke( - app, - ["import", str(sample_h5ad_file), "obs", str(bad_file), "--inplace"], - ) - assert result.exit_code == 1 - assert "Unsupported" in result.output - - def test_index_column_only_for_csv_obs_var(self, sample_h5ad_file, temp_dir): - """Test that --index-column is only valid for CSV obs/var.""" - 
npy_file = temp_dir / "data.npy" - np.save(npy_file, np.array([1, 2, 3])) + def test_import_dict_requires_output_or_inplace(self, sample_h5ad_file, temp_dir): + """Test that either --output or --inplace is required.""" + json_file = temp_dir / "data.json" + json_file.write_text('{"key": "value"}') result = runner.invoke( app, [ "import", + "dict", str(sample_h5ad_file), "uns/data", - str(npy_file), - "--inplace", - "-i", - "col", + str(json_file), ], ) assert result.exit_code == 1 - assert "only valid for CSV" in result.output + assert "Output file is required" in result.output + +class TestImportValidation: def test_replace_existing_object(self, sample_h5ad_file, temp_dir): """Test that existing objects can be replaced.""" with h5py.File(sample_h5ad_file, "r") as f: @@ -435,7 +531,14 @@ def test_replace_existing_object(self, sample_h5ad_file, temp_dir): result = runner.invoke( app, - ["import", str(sample_h5ad_file), "X", str(npy_file), "--inplace"], + [ + "import", + "array", + str(sample_h5ad_file), + "X", + str(npy_file), + "--inplace", + ], ) assert result.exit_code == 0 From ea404a8644f79168d50f317901f713f4585d6083 Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 20:43:23 +0000 Subject: [PATCH 08/62] Remove export_table function from CLI commands --- src/h5ad/commands/table.py | 90 -------------------------------------- 1 file changed, 90 deletions(-) delete mode 100644 src/h5ad/commands/table.py diff --git a/src/h5ad/commands/table.py b/src/h5ad/commands/table.py deleted file mode 100644 index 16b7686..0000000 --- a/src/h5ad/commands/table.py +++ /dev/null @@ -1,90 +0,0 @@ -import sys -import csv -from pathlib import Path -from typing import List, Optional, Dict - -import h5py -import numpy as np -from rich.console import Console -from h5ad.info import get_axis_group -from h5ad.read import col_chunk_as_strings - - -def export_table( - file: Path, - axis: str, - columns: Optional[List[str]], - out: Optional[Path], - chunk_rows: int, - head: 
Optional[int], - console: Console, -) -> None: - """ - Export a table of the specified axis to CSV format. - Args: - file (Path): Path to the .h5ad file - axis (str): Axis to read from ('obs' or 'var') - columns (Optional[List[str]]): List of column names to include in the output table - out (Optional[Path]): Output file path (defaults to stdout) - chunk_rows (int): Number of rows to read per chunk - head (Optional[int]): Output only the first n rows - """ - with h5py.File(file, "r") as f: - group, n_rows, index_name = get_axis_group(f, axis) - - # Determine columns to read - if columns: - col_names = list(columns) - else: - col_names = [k for k in group.keys() if k != "_index" and k != index_name] - # Add index name if not already present - if index_name and index_name not in col_names: - col_names.insert(0, index_name) - - if isinstance(index_name, bytes): - index_name = index_name.decode("utf-8") - - if index_name not in col_names: - col_names.insert(0, index_name) - else: - col_names = [index_name] + [c for c in col_names if c != index_name] - - # Limit rows if head option is specified - if head is not None and head > 0: - n_rows = min(n_rows, head) - - # Open writer - if out is None or str(out) == "-": - out_fh = sys.stdout - else: - out_fh = open(out, "w", newline="", encoding="utf-8") - writer = csv.writer(out_fh) - - # Write data in chunks - try: - writer.writerow(col_names) - cat_cache: Dict[int, np.ndarray] = {} - with console.status( - f"[magenta]Exporting {axis} table...[/] to {'stdout' if out_fh is sys.stdout else out}" - ) as status: - for start in range(0, n_rows, chunk_rows): - end = min(start + chunk_rows, n_rows) - status.update( - f"[magenta]Exporting rows {start}-{end} of {n_rows}...[/]" - ) - cols_data: List[List[str]] = [] - # Read each column for the current chunk - for col in col_names: - cols_data.append( - col_chunk_as_strings(group, col, start, end, cat_cache) - ) - # Write rows - for row_idx in range(end - start): - row = [ - 
cols_data[col_idx][row_idx] - for col_idx in range(len(col_names)) - ] - writer.writerow(row) - finally: - if out_fh is not sys.stdout: - out_fh.close() From f87cc77816fa9b9c97e1d902f12e94b372d25872 Mon Sep 17 00:00:00 2001 From: Aljes Date: Thu, 15 Jan 2026 20:46:26 +0000 Subject: [PATCH 09/62] Refactor export_table function and update imports in CLI commands --- src/h5ad/commands/__init__.py | 3 +- src/h5ad/commands/export.py | 96 +++++++++++++++++++++++++++++++++-- tests/test_cli.py | 2 +- 3 files changed, 95 insertions(+), 6 deletions(-) diff --git a/src/h5ad/commands/__init__.py b/src/h5ad/commands/__init__.py index b4d6016..7b60c31 100644 --- a/src/h5ad/commands/__init__.py +++ b/src/h5ad/commands/__init__.py @@ -1,5 +1,4 @@ from h5ad.commands.info import show_info -from h5ad.commands.table import export_table from h5ad.commands.subset import subset_h5ad -from h5ad.commands.export import export_object +from h5ad.commands.export import export_object, export_table from h5ad.commands.import_data import import_object diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index 8d237c9..cf0c64b 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -1,6 +1,8 @@ from __future__ import annotations +import csv import json +import sys from pathlib import Path from typing import Any, Dict, List, Optional, Union, cast @@ -8,13 +10,101 @@ import numpy as np from rich.console import Console -from h5ad.commands.table import export_table -from h5ad.read import decode_str_array -from h5ad.info import get_entry_type +from h5ad.read import col_chunk_as_strings, decode_str_array +from h5ad.info import get_axis_group, get_entry_type H5Obj = Union[h5py.Group, h5py.Dataset] + +# ============================================================================ +# DATAFRAME EXPORT (CSV) +# ============================================================================ +def export_table( + file: Path, + axis: str, + columns: Optional[List[str]], + out: 
Optional[Path], + chunk_rows: int, + head: Optional[int], + console: Console, +) -> None: + """ + Export a dataframe (obs or var) to CSV format. + + Args: + file: Path to the .h5ad file + axis: Axis to read from ('obs' or 'var') + columns: List of column names to include in the output table + out: Output file path (defaults to stdout if None) + chunk_rows: Number of rows to read per chunk + head: Output only the first n rows + console: Rich console for status output + """ + with h5py.File(file, "r") as f: + group, n_rows, index_name = get_axis_group(f, axis) + + # Determine columns to read + if columns: + col_names = list(columns) + else: + col_names = [k for k in group.keys() if k != "_index" and k != index_name] + # Add index name if not already present + if index_name and index_name not in col_names: + col_names.insert(0, index_name) + + if isinstance(index_name, bytes): + index_name = index_name.decode("utf-8") + + if index_name not in col_names: + col_names.insert(0, index_name) + else: + col_names = [index_name] + [c for c in col_names if c != index_name] + + # Limit rows if head option is specified + if head is not None and head > 0: + n_rows = min(n_rows, head) + + # Open writer + if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out_fh = open(out, "w", newline="", encoding="utf-8") + writer = csv.writer(out_fh) + + # Write data in chunks + try: + writer.writerow(col_names) + cat_cache: Dict[int, np.ndarray] = {} + with console.status( + f"[magenta]Exporting {axis} table...[/] to {'stdout' if out_fh is sys.stdout else out}" + ) as status: + for start in range(0, n_rows, chunk_rows): + end = min(start + chunk_rows, n_rows) + status.update( + f"[magenta]Exporting rows {start}-{end} of {n_rows}...[/]" + ) + cols_data: List[List[str]] = [] + # Read each column for the current chunk + for col in col_names: + cols_data.append( + col_chunk_as_strings(group, col, start, end, cat_cache) + ) + # Write rows + for row_idx in range(end - start): + row = 
[ + cols_data[col_idx][row_idx] + for col_idx in range(len(col_names)) + ] + writer.writerow(row) + finally: + if out_fh is not sys.stdout: + out_fh.close() + + +# ============================================================================ +# TYPE DETECTION AND VALIDATION +# ============================================================================ # Map object types to valid output extensions TYPE_EXTENSIONS = { "dataframe": {".csv"}, diff --git a/tests/test_cli.py b/tests/test_cli.py index 7b327e5..4105546 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,7 +6,7 @@ from typer.testing import CliRunner from h5ad.cli import app from h5ad.commands.info import show_info -from h5ad.commands.table import export_table +from h5ad.commands.export import export_table from rich.console import Console From ddf6f2185736aec57bb952fcc633fbf624966b42 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 13:07:44 +0000 Subject: [PATCH 10/62] Rename 'object' option to 'entry' in info command and add depth option for recursion control --- src/h5ad/cli.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index f42768a..856224b 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -31,11 +31,11 @@ def info( exists=True, readable=True, ), - obj: Optional[str] = typer.Option( + entry: Optional[str] = typer.Option( None, - "--object", - "-o", - help="Object path to inspect (e.g., 'obsm/X_pca', 'X', 'uns')", + "--entry", + "-e", + help="Entry path to inspect (e.g., 'obsm/X_pca', 'X', 'uns')", ), types: bool = typer.Option( False, @@ -43,19 +43,25 @@ def info( "-t", help="Show detailed type information for all entries", ), + depth: int = typer.Option( + None, + "--depth", + "-d", + help="Maximum recursion depth for type display (only with --types)", + ), ) -> None: """ Show high-level information about the .h5ad file. Use --types to see type information for each entry. 
- Use --object to inspect a specific object in detail. + Use --entry to inspect a specific entry in detail. Examples: h5ad info data.h5ad h5ad info --types data.h5ad - h5ad info --object obsm/X_pca data.h5ad + h5ad info --entry obsm/X_pca data.h5ad """ - show_info(file, console, show_types=types, obj_path=obj) + show_info(file, console, show_types=types, depth=depth, entry_path=entry) # ============================================================================ From 99b6335445c900c621bdb493dc5a59f1c89dc041 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 13:07:55 +0000 Subject: [PATCH 11/62] Refactor show_info function to replace obj_path with entry_path and add depth parameter for recursion control --- src/h5ad/commands/info.py | 46 ++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/src/h5ad/commands/info.py b/src/h5ad/commands/info.py index 29c94ba..e1d2bbe 100644 --- a/src/h5ad/commands/info.py +++ b/src/h5ad/commands/info.py @@ -21,7 +21,8 @@ def show_info( file: Path, console: Console, show_types: bool = False, - obj_path: Optional[str] = None, + depth: Optional[int] = None, + entry_path: Optional[str] = None, ) -> None: """ Show high-level information about the .h5ad file. 
@@ -29,12 +30,13 @@ def show_info( file (Path): Path to the .h5ad file console (Console): Rich console for output show_types (bool): Show detailed type information for each entry - obj_path (Optional[str]): Specific object path to inspect (e.g., 'obsm/X_pca') + depth (Optional[int]): Maximum recursion depth for type display (only with show_types=True) + entry_path (Optional[str]): Specific entry path to inspect (e.g., 'obsm/X_pca') """ with h5py.File(file, "r") as f: # If a specific path is requested, show detailed info for that object - if obj_path: - _show_object_info(f, obj_path, console) + if entry_path: + _show_object_info(f, entry_path, console) return # Get n_obs and n_var @@ -45,7 +47,7 @@ def show_info( ) if show_types: - _show_types_tree(f, console) + _show_types_tree(f, console, depth=depth) else: # List top-level keys and their sub-keys (original behavior) for key in _sort_keys(list(f.keys())): @@ -60,7 +62,9 @@ def show_info( ) -def _show_types_tree(f: h5py.File, console: Console) -> None: +def _show_types_tree( + f: h5py.File, console: Console, depth: Optional[int] = None +) -> None: """Show a tree view with type information for all entries. 
Recursion depth by group: @@ -121,25 +125,27 @@ def add_node( children = [k for k in obj.keys() if k != "_index"] if not children: continue - max_depth = max_depth_map.get(key, 1) # default to 1 level for unknown groups + max_depth = ( + depth if depth is not None else max_depth_map.get(key, 1) + ) # default to 1 level for unknown groups add_node(tree, key, obj, current_depth=0, max_depth=max_depth) console.print(tree) -def _show_object_info(f: h5py.File, obj_path: str, console: Console) -> None: +def _show_object_info(f: h5py.File, entry_path: str, console: Console) -> None: """Show detailed info for a specific object path.""" # Normalize path - obj_path = obj_path.strip().lstrip("/") + entry_path = entry_path.strip().lstrip("/") - if obj_path not in f: - console.print(f"[bold red]Error:[/] '{obj_path}' not found in the file.") + if entry_path not in f: + console.print(f"[bold red]Error:[/] '{entry_path}' not found in the file.") return - obj = f[obj_path] - info = get_entry_type(obj) + entry = f[entry_path] + info = get_entry_type(entry) - console.print(f"\n[bold cyan]Path:[/] {obj_path}") + console.print(f"\n[bold cyan]Path:[/] {entry_path}") console.print(f"[bold cyan]Type:[/] {info['type']}") if info["encoding"]: @@ -154,21 +160,21 @@ def _show_object_info(f: h5py.File, obj_path: str, console: Console) -> None: console.print(f"[bold cyan]Details:[/] {info['details']}") # Show attributes if any - if obj.attrs: + if entry.attrs: console.print(f"\n[bold cyan]Attributes:[/]") - for k, v in obj.attrs.items(): + for k, v in entry.attrs.items(): v_str = v.decode("utf-8") if isinstance(v, bytes) else str(v) if len(v_str) > 80: v_str = v_str[:77] + "..." 
console.print(f" [dim]{k}:[/] {v_str}") # If it's a group, show children - if isinstance(obj, h5py.Group): - children = [k for k in obj.keys() if k != "_index"] + if isinstance(entry, h5py.Group): + children = [k for k in entry.keys() if k != "_index"] if children: console.print(f"\n[bold cyan]Children:[/]") for child_name in sorted(children): - child_obj = obj[child_name] - child_info = get_entry_type(child_obj) + child_entry = entry[child_name] + child_info = get_entry_type(child_entry) type_str = format_type_info(child_info) console.print(f" [bright_white]{child_name}[/] {type_str}") From 99a9834e4a85dc765ca84e1d93445ff082db0f07 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 13:08:11 +0000 Subject: [PATCH 12/62] Refactor get_entry_type function to replace 'obj' with 'entry' and improve encoding handling; update axis_len function to raise exceptions for error cases --- src/h5ad/info.py | 106 ++++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 51 deletions(-) diff --git a/src/h5ad/info.py b/src/h5ad/info.py index 94022a0..25cdc31 100644 --- a/src/h5ad/info.py +++ b/src/h5ad/info.py @@ -3,7 +3,7 @@ import numpy as np -def get_entry_type(obj: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: +def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: """ Determine the type/format of an HDF5 object for export guidance. 
@@ -25,61 +25,49 @@ def get_entry_type(obj: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: } # Get encoding-type attribute if present - enc = obj.attrs.get("encoding-type", b"") + enc = entry.attrs.get("encoding-type", b"") if isinstance(enc, bytes): enc = enc.decode("utf-8") result["encoding"] = enc if enc else None - if isinstance(obj, h5py.Dataset): - result["shape"] = obj.shape - result["dtype"] = str(obj.dtype) + # Infer the type for Dataset entry + if isinstance(entry, h5py.Dataset): + result["shape"] = entry.shape + result["dtype"] = str(entry.dtype) # Scalar - if obj.shape == (): + if entry.shape == (): result["type"] = "scalar" result["export_as"] = "json" - result["details"] = f"Scalar value ({obj.dtype})" + result["details"] = f"Scalar value ({entry.dtype})" return result # 1D or 2D numeric array -> dense matrix / array - if obj.ndim == 1: + if entry.ndim == 1: result["type"] = "array" result["export_as"] = "npy" - result["details"] = f"1D array [{obj.shape[0]}] ({obj.dtype})" - elif obj.ndim == 2: - # Check if it looks like an image (2D with reasonable image dimensions) - # Minimum 16x16, maximum 10000x10000, numeric dtype - if ( - obj.shape[0] >= 16 - and obj.shape[1] >= 16 - and obj.shape[0] <= 10000 - and obj.shape[1] <= 10000 - and (np.issubdtype(obj.dtype, np.number) or obj.dtype == np.bool_) - ): - # Could be an image, but default to dense-matrix - # Image export can still be used if user provides image extension - pass + result["details"] = f"1D array [{entry.shape[0]}] ({entry.dtype})" + elif entry.ndim == 2: result["type"] = "dense-matrix" result["export_as"] = "npy" result["details"] = ( - f"Dense matrix {obj.shape[0]}×{obj.shape[1]} ({obj.dtype})" + f"Dense matrix {entry.shape[0]}×{entry.shape[1]} ({entry.dtype})" ) - elif obj.ndim == 3: + elif entry.ndim == 3: result["type"] = "array" result["export_as"] = "npy" - result["details"] = f"3D array {obj.shape} ({obj.dtype})" + result["details"] = f"3D array {entry.shape} ({entry.dtype})" 
else: result["type"] = "array" result["export_as"] = "npy" - result["details"] = f"ND array {obj.shape} ({obj.dtype})" - + result["details"] = f"ND array {entry.shape} ({entry.dtype})" return result # It's a Group - if isinstance(obj, h5py.Group): + if isinstance(entry, h5py.Group): # Check for sparse matrix (CSR/CSC) if enc in ("csr_matrix", "csc_matrix"): - shape = obj.attrs.get("shape", None) + shape = entry.attrs.get("shape", None) shape_str = f"{shape[0]}×{shape[1]}" if shape is not None else "?" result["type"] = "sparse-matrix" result["export_as"] = "mtx" @@ -90,8 +78,8 @@ def get_entry_type(obj: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: # Check for categorical if enc == "categorical": - codes = obj.get("codes") - cats = obj.get("categories") + codes = entry.get("codes") + cats = entry.get("categories") n_codes = codes.shape[0] if codes is not None else "?" n_cats = cats.shape[0] if cats is not None else "?" result["type"] = "categorical" @@ -100,8 +88,8 @@ def get_entry_type(obj: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: return result # Check for dataframe (obs/var style with _index) - if "_index" in obj.attrs or "obs_names" in obj or "var_names" in obj: - n_cols = len([k for k in obj.keys() if k != "_index"]) + if "_index" in entry.attrs or "obs_names" in entry or "var_names" in entry: + n_cols = len([k for k in entry.keys() if k != "_index"]) result["type"] = "dataframe" result["export_as"] = "csv" result["details"] = f"DataFrame with {n_cols} columns" @@ -115,7 +103,7 @@ def get_entry_type(obj: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: return result # Generic dict/group - n_keys = len(list(obj.keys())) + n_keys = len(list(entry.keys())) result["type"] = "dict" result["export_as"] = "json" result["details"] = f"Group with {n_keys} keys" @@ -141,24 +129,31 @@ def format_type_info(info: Dict[str, Any]) -> str: return f"[{color}]<{info['type']}>[/]" -def axis_len(file: h5py.File, axis: str) -> Optional[int]: +def axis_len(file: 
h5py.File, axis: str) -> int: """ Get the length of the specified axis ('obs' or 'var') in the h5ad file. + Args: file (h5py.File): Opened h5ad file object axis (str): Axis name ('obs' or 'var') Returns: - Optional[int]: Length of the axis, or None if not found + int: Length of the axis + + Raises: + ValueError: If axis is not 'obs' or 'var' + KeyError: If axis or index dataset not found in file + TypeError: If axis is not a group or index is not a dataset + RuntimeError: If axis length cannot be determined """ # Check if the specified axis exists in the file if axis not in file: - return None + raise KeyError(f"'{axis}' not found in the file.") # Get the group corresponding to the axis group = file[axis] if not isinstance(group, h5py.Group): - return None + raise TypeError(f"'{axis}' is not a group.") # Determine the index name for the axis index_name = group.attrs.get("_index", None) @@ -168,49 +163,58 @@ def axis_len(file: h5py.File, axis: str) -> Optional[int]: elif axis == "var": index_name = "var_names" else: - return None + raise ValueError(f"Invalid axis '{axis}'. Must be 'obs' or 'var'.") + # Decode bytes to string if necessary if isinstance(index_name, bytes): index_name = index_name.decode("utf-8") + # Check if the index dataset exists if index_name not in group: - return None + raise KeyError(f"Index dataset '{index_name}' not found in '{axis}' group.") # Return the length of the index dataset dataset = group[index_name] if not isinstance(dataset, h5py.Dataset): - return None + raise TypeError(f"Index '{index_name}' in '{axis}' is not a dataset.") if dataset.shape: return int(dataset.shape[0]) - return None + raise RuntimeError( + f"Cannot determine length of '{axis}': index dataset has no shape." + ) def get_axis_group(file: h5py.File, axis: str) -> Tuple[h5py.Group, int, str]: """ Get the axis group, its length, and index name. 
+ Args: file (h5py.File): Opened h5ad file object axis (str): Axis name ('obs' or 'var') Returns: - Tuple[h5py.Group, int, str]: Axis group, its length, and index + Tuple[h5py.Group, int, str]: Axis group, its length, and index name + + Raises: + ValueError: If axis is not 'obs' or 'var' + KeyError: If axis or index dataset not found in file + TypeError: If axis is not a group or index is not a dataset + RuntimeError: If axis length cannot be determined """ if axis not in ("obs", "var"): raise ValueError("axis must be 'obs' or 'var'.") - if axis not in file: - raise KeyError(f"'{axis}' not found in the file.") - - group = file[axis] - if not isinstance(group, h5py.Group): - raise TypeError(f"'{axis}' is not a group.") + # axis_len will validate existence and get length (raises exceptions if issues) n = axis_len(file, axis) - if n is None: - raise RuntimeError(f"Could not determine length of axis '{axis}'.") + # Get the group (already validated by axis_len) + group = file[axis] + + # Get the index name index_name = group.attrs.get("_index", None) if index_name is None: index_name = "obs_names" if axis == "obs" else "var_names" if isinstance(index_name, bytes): index_name = index_name.decode("utf-8") + return group, n, index_name From 82de2689eff9d70ba9ce098710c895308fffd7d3 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 13:52:54 +0000 Subject: [PATCH 13/62] Refactor info command tests to replace 'object' with 'entry' flag and update related assertions --- tests/test_cli.py | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 4105546..203830f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -50,35 +50,51 @@ def test_info_types_short_flag(self, sample_h5ad_file): output = result.stdout + (result.stderr or "") assert "<" in output - def test_info_object_flag(self, sample_h5ad_file): - """Test info command with --object flag.""" - result = 
runner.invoke(app, ["info", "--object", "X", str(sample_h5ad_file)]) + def test_info_depth_flag(self, sample_h5ad_file): + """Test info command with --depth flag.""" + result = runner.invoke( + app, ["info", "--types", "--depth", "1", str(sample_h5ad_file)] + ) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "<" in output + + def test_info_depth_short_flag(self, sample_h5ad_file): + """Test info command with -d short flag.""" + result = runner.invoke(app, ["info", "-t", "-d", "2", str(sample_h5ad_file)]) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "<" in output + + def test_info_entry_flag(self, sample_h5ad_file): + """Test info command with --entry flag.""" + result = runner.invoke(app, ["info", "--entry", "X", str(sample_h5ad_file)]) assert result.exit_code == 0 output = result.stdout + (result.stderr or "") assert "Path:" in output assert "Type:" in output - def test_info_object_short_flag(self, sample_h5ad_file): - """Test info command with -o short flag.""" - result = runner.invoke(app, ["info", "-o", "obs", str(sample_h5ad_file)]) + def test_info_entry_short_flag(self, sample_h5ad_file): + """Test info command with -e short flag.""" + result = runner.invoke(app, ["info", "-e", "obs", str(sample_h5ad_file)]) assert result.exit_code == 0 output = result.stdout + (result.stderr or "") assert "Path:" in output assert "dataframe" in output - def test_info_object_nested_path(self, sample_h5ad_file): + def test_info_entry_nested_path(self, sample_h5ad_file): """Test info command with nested object path.""" result = runner.invoke( - app, ["info", "-o", "uns/description", str(sample_h5ad_file)] + app, ["info", "-e", "uns/description", str(sample_h5ad_file)] ) assert result.exit_code == 0 output = result.stdout + (result.stderr or "") assert "Path:" in output - def test_info_object_not_found(self, sample_h5ad_file): + def test_info_entry_not_found(self, sample_h5ad_file): """Test info 
command with non-existent object path.""" result = runner.invoke( - app, ["info", "-o", "nonexistent", str(sample_h5ad_file)] + app, ["info", "-e", "nonexistent", str(sample_h5ad_file)] ) assert result.exit_code == 0 # Doesn't exit with error, just shows message output = result.stdout + (result.stderr or "") From aaeddb2bf65691636acbd1623cb40f08bb64b9f0 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 14:05:26 +0000 Subject: [PATCH 14/62] Enhance axis_len tests to validate error handling for non-existent axes, non-group types, and missing index datasets --- tests/test_info_read.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/tests/test_info_read.py b/tests/test_info_read.py index 8ad47b4..5b69fe3 100644 --- a/tests/test_info_read.py +++ b/tests/test_info_read.py @@ -102,10 +102,28 @@ def test_axis_len_var(self, sample_h5ad_file): assert length == 4 def test_axis_len_nonexistent(self, sample_h5ad_file): - """Test getting length of non-existent axis.""" + """Test getting length of non-existent axis raises KeyError.""" with h5py.File(sample_h5ad_file, "r") as f: - length = axis_len(f, "nonexistent") - assert length is None + with pytest.raises(KeyError, match="'nonexistent' not found"): + axis_len(f, "nonexistent") + + def test_axis_len_not_a_group(self, temp_dir): + """Test that axis_len raises TypeError when axis is not a group.""" + file_path = temp_dir / "test.h5ad" + with h5py.File(file_path, "w") as f: + f.create_dataset("obs", data=np.array([1, 2, 3])) + with h5py.File(file_path, "r") as f: + with pytest.raises(TypeError, match="'obs' is not a group"): + axis_len(f, "obs") + + def test_axis_len_missing_index(self, temp_dir): + """Test that axis_len raises KeyError when index dataset is missing.""" + file_path = temp_dir / "test.h5ad" + with h5py.File(file_path, "w") as f: + f.create_group("obs") + with h5py.File(file_path, "r") as f: + with pytest.raises(KeyError, match="Index dataset 'obs_names' not 
found"): + axis_len(f, "obs") class TestGetAxisGroup: From a7d23e23de67d3106006d01b8ad1052157ef951a Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 17:11:36 +0000 Subject: [PATCH 15/62] Added element specs for .h5ad files --- docs/h5ad_elements_spec.md | 274 +++++++++++++++++++++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100644 docs/h5ad_elements_spec.md diff --git a/docs/h5ad_elements_spec.md b/docs/h5ad_elements_spec.md new file mode 100644 index 0000000..acb491d --- /dev/null +++ b/docs/h5ad_elements_spec.md @@ -0,0 +1,274 @@ +# AnnData on-disk element specifications — HDF5 (`.h5ad`) + +This document describes how *elements* are encoded inside an AnnData **HDF5** container (`.h5ad`). +It is intended to be GitHub-renderable Markdown (no Sphinx/MyST directives). + +> **Scope** +> +> - “Modern” encoding metadata (`encoding-type`, `encoding-version`) is the convention used by **anndata ≥ 0.8**. +> - “Legacy” conventions (notably DataFrame categorical handling) are described for **anndata 0.7.x** files, which are still commonly encountered. 
+ +## Table of contents + +- [Encoding metadata](#encoding-metadata) +- [AnnData group](#anndata-group) +- [Dense arrays](#dense-arrays) +- [Sparse arrays (CSR/CSC)](#sparse-arrays-csrcsc) +- [DataFrames](#dataframes) + - [DataFrame v0.2.0](#dataframe-v020) + - [DataFrame v0.1.0 (legacy: anndata 0.7.x)](#dataframe-v010-legacy-anndata-07x) + - [Legacy categorical columns (Series-level)](#legacy-categorical-columns-series-level) +- [Mappings / dict](#mappings--dict) +- [Scalars](#scalars) +- [Categorical arrays](#categorical-arrays) +- [String arrays](#string-arrays) +- [Nullable arrays](#nullable-arrays) + - [Missing value semantics](#missing-value-semantics) +- [Awkward arrays (experimental)](#awkward-arrays-experimental) +- [Sources](#sources) + +## Encoding metadata + +**Modern convention (anndata ≥ 0.8):** + +- Any element (HDF5 *group* or *dataset*) that participates in the element-dispatch system: + - **MUST** have attribute `encoding-type` (string) + - **MUST** have attribute `encoding-version` (string, parseable as a version) + +Readers should dispatch first on `encoding-type`, then on `encoding-version`. + +**Legacy convention (anndata ≤ 0.7.x):** + +- Many objects do *not* have `encoding-type`/`encoding-version`. +- Some elements (e.g. CSR/CSC sparse matrices, legacy DataFrames) *do* use `encoding-type`/`encoding-version`. +- Readers typically infer element kinds from: + - known AnnData keys (`X`, `obs`, `var`, …), + - group structure, and/or + - legacy attributes (e.g. the `categories` attribute on categorical columns). 
+ +## AnnData group + +### `encoding-type: anndata`, `encoding-version: 0.1.0` + +An `AnnData` object **MUST** be stored as an HDF5 **group** with attributes: + +- `encoding-type: "anndata"` +- `encoding-version: "0.1.0"` + +Required members: + +- `obs` — a [DataFrame](#dataframes) +- `var` — a [DataFrame](#dataframes) + +Optional members (if present, they must satisfy these constraints): + +- `X` — dense array or sparse array; shape `(n_obs, n_var)` +- `layers` — mapping; values dense or sparse arrays; each shape `(n_obs, n_var)` +- `obsm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_obs` +- `varm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_var` +- `obsp` — mapping; values dense or sparse arrays; first two dims `n_obs` +- `varp` — mapping; values dense or sparse arrays; first two dims `n_var` +- `uns` — mapping/dict-like container (recursive) + +## Dense arrays + +### `encoding-type: array`, `encoding-version: 0.2.0` + +- A dense array **MUST** be an HDF5 **dataset**. +- The dataset **MUST** have attributes: + - `encoding-type: "array"` + - `encoding-version: "0.2.0"` + +> **Legacy note** +> +> In anndata 0.7.x, dense arrays were typically stored as plain datasets *without* `encoding-type`/`encoding-version`. + +## Sparse arrays (CSR/CSC) + +### `encoding-type: csr_matrix|csc_matrix`, `encoding-version: 0.1.0` + +A sparse matrix **MUST** be stored as an HDF5 **group**. + +- Group attributes: + - `encoding-type: "csr_matrix"` **or** `"csc_matrix"` + - `encoding-version: "0.1.0"` + - `shape`: integer array of length 2 (matrix shape) +- Group members (datasets): + - `data` + - `indices` + - `indptr` + +The exact CSR/CSC semantics follow SciPy’s conventions. + +## DataFrames + +DataFrames are stored column-wise: each column is stored as a dataset (or group, if the column itself is an encoded element). 
+ + +### DataFrame v0.2.0 + +#### `encoding-type: dataframe`, `encoding-version: 0.2.0` + +A dataframe **MUST** be stored as an HDF5 **group**. + +- Group attributes: + - `_index`: string — the key of the dataset to be used as the row index + - `column-order`: array of strings — original column order + - `encoding-type: "dataframe"` + - `encoding-version: "0.2.0"` +- Group members: + - the index dataset (named by `_index`) + - one member per column +- All column entries **MUST** have the same length in their first dimension. +- Columns **SHOULD** share chunking along the first dimension. + +Columns are independently encoded: +- simple numeric/bool columns are commonly `encoding-type: array` +- categorical columns are commonly `encoding-type: categorical` + + +### DataFrame v0.1.0 (legacy: anndata 0.7.x) + +#### `encoding-type: dataframe`, `encoding-version: 0.1.0` + +A legacy dataframe is stored as an HDF5 **group** where: + +- Group attributes include: + - `_index` + - `column-order` + - `encoding-type: "dataframe"` + - `encoding-version: "0.1.0"` +- Each column is a dataset. +- Categorical columns are stored as **integer code datasets**, and their category labels are stored in a reserved subgroup named `__categories`. + +**Reserved subgroup:** + +- `__categories/` stores the array of category labels for column ``. + + +### Legacy categorical columns (Series-level) + +In v0.1.0 DataFrames, a categorical column dataset (e.g. `obs/cell_type`) can be identified by the presence of an attribute: + +- `categories`: an **HDF5 object reference** pointing to the corresponding `__categories/` dataset. + +## Mappings / dict + +### `encoding-type: dict`, `encoding-version: 0.1.0` + +- A mapping **MUST** be stored as an HDF5 **group**. +- Group attributes: + - `encoding-type: "dict"` + - `encoding-version: "0.1.0"` +- Each entry in the group is another element (recursively). 
+ +> **Legacy note** +> +> In anndata 0.7.x, groups used as mappings often had **no special attributes**. + +## Scalars + +### `encoding-version: 0.2.0` + +Scalars are stored as **0-dimensional datasets**. + +- Numeric scalars: + - `encoding-type: "numeric-scalar"` + - `encoding-version: "0.2.0"` + - value is numeric (including boolean, ints, floats, complex) +- String scalars: + - `encoding-type: "string"` + - `encoding-version: "0.2.0"` + - **HDF5 requirement:** variable-length UTF-8 string dtype + +> **Legacy note** +> +> In anndata 0.7.x, scalar strings were commonly stored as `|O` datasets without `encoding-type`/`encoding-version`. + +## Categorical arrays + +### `encoding-type: categorical`, `encoding-version: 0.2.0` + +Categorical arrays are stored as an HDF5 **group** with members: + +- `codes`: integer dataset + - values are zero-based indices into `categories` + - signed integer arrays **MAY** use `-1` to denote missing values +- `categories`: array of labels + +Group attributes: + +- `encoding-type: "categorical"` +- `encoding-version: "0.2.0"` +- `ordered`: boolean (whether the categories are ordered) + +## String arrays + +### `encoding-type: string-array`, `encoding-version: 0.2.0` + +- String arrays **MUST** be stored as HDF5 datasets. +- Dataset attributes: + - `encoding-type: "string-array"` + - `encoding-version: "0.2.0"` +- **HDF5 requirement:** variable-length UTF-8 string dtype + +## Nullable arrays + +These encodings support Pandas nullable integer/boolean/string arrays by storing a `values` array plus a boolean `mask` array. + +### `encoding-type: nullable-integer`, `encoding-version: 0.1.0` + +- Stored as an HDF5 group with datasets: + - `values` (integer) + - `mask` (boolean) + +### `encoding-type: nullable-boolean`, `encoding-version: 0.1.0` + +- Stored as an HDF5 group with datasets: + - `values` (boolean) + - `mask` (boolean) +- `values` and `mask` **MUST** have the same shape. 
+ +### `encoding-type: nullable-string-array`, `encoding-version: 0.1.0` + +- Stored as an HDF5 group with datasets: + - `values` (string array) + - `mask` (boolean) +- Group attributes: + - `encoding-type: "nullable-string-array"` + - `encoding-version: "0.1.0"` + - optional `na-value`: `"NA"` or `"NaN"` (default `"NA"`) + + +#### Missing value semantics + +For elements supporting a `na-value` attribute: + +- `"NA"`: comparisons propagate missingness (e.g. `"x" == NA` → `NA`) +- `"NaN"`: comparisons yield boolean results (e.g. `"x" == NaN` → `false`) + +Readers should preserve semantics when the runtime model supports it. + +## Awkward arrays (experimental) + +### `encoding-type: awkward-array`, `encoding-version: 0.1.0` + +Ragged arrays are stored by decomposing an Awkward Array into constituent buffers (via `ak.to_buffers`), then storing those buffers as datasets within a group. + +Group attributes: + +- `encoding-type: "awkward-array"` +- `encoding-version: "0.1.0"` +- `form`: string — serialized Awkward “form” +- `length`: integer — logical length + +Group members: datasets for the buffers (often named like `nodeX-*`). + +> **Experimental** +> +> This encoding is considered experimental in the anndata 0.9.x series and later. + +## Sources + +- AnnData “on-disk format” prose docs (modern, ≥0.8): https://anndata.readthedocs.io/en/stable/fileformat-prose.html +- AnnData 0.7.8 “on-disk format” prose docs (legacy): https://dokk.org/documentation/anndata/0.7.8/fileformat-prose/ From 1bda8d88ccdb55db04baf728729f3cf03de804e3 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 17:12:04 +0000 Subject: [PATCH 16/62] Refactor info and export_dataframe commands to use arguments instead of options for entry paths; add error handling for show_info function. 
--- src/h5ad/cli.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 856224b..a09761a 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -31,10 +31,8 @@ def info( exists=True, readable=True, ), - entry: Optional[str] = typer.Option( + entry: Optional[str] = typer.Argument( None, - "--entry", - "-e", help="Entry path to inspect (e.g., 'obsm/X_pca', 'X', 'uns')", ), types: bool = typer.Option( @@ -59,9 +57,13 @@ def info( Examples: h5ad info data.h5ad h5ad info --types data.h5ad - h5ad info --entry obsm/X_pca data.h5ad + h5ad info obsm/X_pca data.h5ad """ - show_info(file, console, show_types=types, depth=depth, entry_path=entry) + try: + show_info(file, console, show_types=types, depth=depth, entry_path=entry) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) # ============================================================================ @@ -118,8 +120,10 @@ def export_dataframe( file: Path = typer.Argument( ..., help="Path to the .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument(..., help="Object path to export ('obs' or 'var')"), - out: Path = typer.Argument(..., help="Output CSV file path"), + entry: str = typer.Argument(..., help="Entry path to export ('obs' or 'var')"), + output: Path = typer.Option( + None, "--output", "-o", writable=True, help="Output CSV file path" + ), columns: Optional[str] = typer.Option( None, "--columns", @@ -137,15 +141,15 @@ def export_dataframe( Export a dataframe (obs or var) to CSV. 
Examples: - h5ad export dataframe data.h5ad obs obs.csv - h5ad export dataframe data.h5ad var var.csv --columns gene_id,mean - h5ad export dataframe data.h5ad obs - --head 100 + h5ad export dataframe data.h5ad obs --output obs.csv + h5ad export dataframe data.h5ad var --output var.csv --columns gene_id,mean + h5ad export dataframe data.h5ad obs --head 100 """ from h5ad.commands import export_table - if obj not in ("obs", "var"): + if entry not in ("obs", "var"): console.print( - f"[bold red]Error:[/] Object must be 'obs' or 'var', not '{obj}'.", + f"[bold red]Error:[/] Dataframe export is only supported for 'obs' or 'var' at this point, not '{entry}'.", ) raise typer.Exit(code=1) @@ -156,9 +160,9 @@ def export_dataframe( try: export_table( file=file, - axis=obj, + axis=entry, columns=col_list, - out=out if str(out) != "-" else None, + out=output, chunk_rows=chunk_rows, head=head, console=console, From 4c6b18577c27bccd241095a0da519acbc05c606a Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 17:12:17 +0000 Subject: [PATCH 17/62] Enhance get_entry_type function to support legacy categorical and dataframe formats; improve version detection and details for various entry types. --- src/h5ad/info.py | 84 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 75 insertions(+), 9 deletions(-) diff --git a/src/h5ad/info.py b/src/h5ad/info.py index 25cdc31..7abb3e9 100644 --- a/src/h5ad/info.py +++ b/src/h5ad/info.py @@ -7,6 +7,10 @@ def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: """ Determine the type/format of an HDF5 object for export guidance. 
+ Supports both: + - v0.2.0 (modern): Objects with encoding-type/encoding-version attributes + - v0.1.0 (legacy): Objects without encoding attributes, inferred from structure + Returns a dict with: - type: str (e.g., 'dataframe', 'sparse-matrix', 'dense-matrix', 'dict', 'image', 'array', 'scalar') - export_as: str (suggested export format: csv, mtx, npy, json, image) @@ -14,6 +18,7 @@ def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: - shape: tuple or None - dtype: str or None - details: str (human-readable description) + - version: str ('0.2.0', '0.1.0', or None for unknown) """ result: Dict[str, Any] = { "type": "unknown", @@ -22,6 +27,7 @@ def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: "shape": None, "dtype": None, "details": "", + "version": None, } # Get encoding-type attribute if present @@ -30,11 +36,34 @@ def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: enc = enc.decode("utf-8") result["encoding"] = enc if enc else None + # Get encoding-version if present + enc_ver = entry.attrs.get("encoding-version", b"") + if isinstance(enc_ver, bytes): + enc_ver = enc_ver.decode("utf-8") + result["version"] = enc_ver if enc_ver else None + # Infer the type for Dataset entry if isinstance(entry, h5py.Dataset): result["shape"] = entry.shape result["dtype"] = str(entry.dtype) + # Check for legacy categorical (v0.1.0): dataset with 'categories' attribute + if "categories" in entry.attrs: + result["type"] = "categorical" + result["export_as"] = "csv" + result["version"] = result["version"] or "0.1.0" + # Try to get category count from referenced dataset + try: + cats_ref = entry.attrs["categories"] + cats_ds = entry.file[cats_ref] + n_cats = cats_ds.shape[0] + except Exception: + n_cats = "?" 
+ result["details"] = ( + f"Legacy categorical [{entry.shape[0]} values, {n_cats} categories]" + ) + return result + # Scalar if entry.shape == (): result["type"] = "scalar" @@ -65,7 +94,7 @@ def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: # It's a Group if isinstance(entry, h5py.Group): - # Check for sparse matrix (CSR/CSC) + # Check for sparse matrix (CSR/CSC) - same in both versions if enc in ("csr_matrix", "csc_matrix"): shape = entry.attrs.get("shape", None) shape_str = f"{shape[0]}×{shape[1]}" if shape is not None else "?" @@ -76,7 +105,7 @@ def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: ) return result - # Check for categorical + # Check for v0.2.0 categorical (Group with codes/categories) if enc == "categorical": codes = entry.get("codes") cats = entry.get("categories") @@ -87,22 +116,59 @@ def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: result["details"] = f"Categorical [{n_codes} values, {n_cats} categories]" return result - # Check for dataframe (obs/var style with _index) - if "_index" in entry.attrs or "obs_names" in entry or "var_names" in entry: - n_cols = len([k for k in entry.keys() if k != "_index"]) + # Check for dataframe (obs/var style) + # v0.2.0: has encoding-type="dataframe" + # v0.1.0: has _index attribute or obs_names/var_names dataset + if ( + enc == "dataframe" + or "_index" in entry.attrs + or "obs_names" in entry + or "var_names" in entry + ): + # Detect version + if enc == "dataframe": + df_version = result["version"] or "0.2.0" + else: + df_version = "0.1.0" # No encoding-type, legacy format + result["version"] = df_version + + # Check for __categories subgroup (v0.1.0 legacy) + has_legacy_cats = "__categories" in entry + n_cols = len( + [k for k in entry.keys() if k not in ("_index", "__categories")] + ) + result["type"] = "dataframe" result["export_as"] = "csv" - result["details"] = f"DataFrame with {n_cols} columns" + if has_legacy_cats: + 
result["details"] = f"DataFrame with {n_cols} columns (legacy v0.1.0)" + else: + result["details"] = f"DataFrame with {n_cols} columns" return result - # Check for array-like groups (nullable integer, string array, etc.) - if enc in ("nullable-integer", "string-array"): + # Check for nullable arrays (v0.2.0) + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): result["type"] = "array" result["export_as"] = "npy" result["details"] = f"Encoded array ({enc})" return result - # Generic dict/group + # Check for string-array encoding + if enc == "string-array": + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = "Encoded string array" + return result + + # Check for awkward-array (experimental) + if enc == "awkward-array": + length = entry.attrs.get("length", "?") + result["type"] = "awkward-array" + result["export_as"] = "json" + result["details"] = f"Awkward array (length={length})" + return result + + # Generic dict/group (v0.2.0 has encoding-type="dict", v0.1.0 has no attributes) n_keys = len(list(entry.keys())) result["type"] = "dict" result["export_as"] = "json" From d46981fe8b88eb0f0bccee454ffa90f799b965b9 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 17:12:28 +0000 Subject: [PATCH 18/62] Refactor read_categorical_column and col_chunk_as_strings to support both modern and legacy formats; enhance error handling and caching for categorical data retrieval. 
--- src/h5ad/read.py | 123 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 100 insertions(+), 23 deletions(-) diff --git a/src/h5ad/read.py b/src/h5ad/read.py index 5abec06..36f2e58 100644 --- a/src/h5ad/read.py +++ b/src/h5ad/read.py @@ -20,29 +20,75 @@ def decode_str_array(array: np.ndarray) -> np.ndarray: def read_categorical_column( - col_group: h5py.Group, start: int, end: int, cache: Dict[int, np.ndarray] + col: h5py.Group | h5py.Dataset, + start: int, + end: int, + cache: Dict[int, np.ndarray], + parent_group: h5py.Group | None = None, ) -> List[str]: """ Decode an AnnData 'categorical' column for a slice [start:end]. + + Supports both: + - v0.2.0 (modern): Group with 'codes' and 'categories' datasets + - v0.1.0 (legacy): Dataset with 'categories' attribute referencing __categories/ + Args: - col_group (h5py.Group): Column group containing 'categories' and 'codes' - start (int): Start index of the slice - end (int): End index of the slice - cache (Dict[int, np.ndarray]): Cache for decoded categories + col: Column group (v0.2.0) or dataset (v0.1.0) + start: Start index of the slice + end: End index of the slice + cache: Cache for decoded categories + parent_group: Parent obs/var group (needed for v0.1.0 to resolve __categories) + Returns: List[str]: Decoded categorical values for the specified slice """ - key = id(col_group) - if key not in cache: - cats = col_group["categories"][...] - cats = decode_str_array(cats) - cache[key] = np.asarray(cats, dtype=str) - cats = cache[key] + key = id(col) + + # v0.2.0 format: Group with 'codes' and 'categories' datasets + if isinstance(col, h5py.Group): + if key not in cache: + cats = col["categories"][...] 
+ cats = decode_str_array(cats) + cache[key] = np.asarray(cats, dtype=str) + cats = cache[key] + + codes_ds = col["codes"] + codes = codes_ds[start:end] + codes = np.asarray(codes, dtype=np.int64) + return [cats[c] if 0 <= c < len(cats) else "" for c in codes] + + # v0.1.0 format: Dataset with 'categories' attribute (object reference) + if isinstance(col, h5py.Dataset): + if key not in cache: + cats_ref = col.attrs.get("categories", None) + if cats_ref is not None: + # Dereference the HDF5 object reference + cats_ds = col.file[cats_ref] + cats = cats_ds[...] + elif parent_group is not None and "__categories" in parent_group: + # Fallback: look for __categories subgroup + col_name = col.name.split("/")[-1] + cats_grp = parent_group["__categories"] + if col_name in cats_grp: + cats = cats_grp[col_name][...] + else: + raise RuntimeError( + f"Cannot find categories for legacy column {col.name}" + ) + else: + raise RuntimeError( + f"Cannot find categories for legacy column {col.name}" + ) + cats = decode_str_array(cats) + cache[key] = np.asarray(cats, dtype=str) + cats = cache[key] - codes_ds = col_group["codes"] - codes = codes_ds[start:end] - codes = np.asarray(codes, dtype=np.int64) - return [cats[c] if 0 <= c < len(cats) else "" for c in codes] + codes = col[start:end] + codes = np.asarray(codes, dtype=np.int64) + return [cats[c] if 0 <= c < len(cats) else "" for c in codes] + + raise RuntimeError(f"Unsupported categorical column type: {type(col)}") def col_chunk_as_strings( @@ -54,29 +100,60 @@ def col_chunk_as_strings( ) -> List[str]: """ Read a column from an obs/var group as strings. 
+ + Supports both: + - v0.2.0 (modern): Columns with encoding-type attribute + - v0.1.0 (legacy): Categorical columns with 'categories' attribute referencing __categories + Args: group (h5py.Group): The obs/var group col_name (str): Name of the column to read start (int): Start index of the slice end (int): End index of the slice cat_cache (Dict[int, np.ndarray]): Cache for decoded categorical columns + Returns: List[str]: Column values as strings for the specified slice """ - if col_name in group and isinstance(group[col_name], h5py.Dataset): - dataset = group[col_name] - chunk = dataset[start:end] + if col_name not in group: + raise RuntimeError(f"Column {col_name!r} not found in group {group.name}") + + col = group[col_name] + + # Case 1: Dataset (could be plain array or legacy categorical) + if isinstance(col, h5py.Dataset): + # Check for v0.1.0 legacy categorical (has 'categories' attribute) + if "categories" in col.attrs: + return read_categorical_column(col, start, end, cat_cache, group) + + # Plain dataset (numeric, string, etc.) + chunk = col[start:end] if chunk.ndim != 1: chunk = chunk.reshape(-1) chunk = decode_str_array(np.asarray(chunk)) return chunk.tolist() - if col_name in group and isinstance(group[col_name], h5py.Group): - col_group = group[col_name] - enc = col_group.attrs.get("encoding-type", b"") + # Case 2: Group (v0.2.0 encoded types like categorical, nullable, etc.) 
+ if isinstance(col, h5py.Group): + enc = col.attrs.get("encoding-type", b"") if isinstance(enc, bytes): enc = enc.decode("utf-8") + if enc == "categorical": - return read_categorical_column(col_group, start, end, cat_cache) + return read_categorical_column(col, start, end, cat_cache) + + # Handle nullable arrays (nullable-integer, nullable-boolean, nullable-string-array) + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): + values = col["values"][start:end] + mask = col["mask"][start:end] + values = decode_str_array(np.asarray(values)) + # Apply mask: masked values become empty string + return ["" if m else str(v) for v, m in zip(values, mask)] + + raise RuntimeError( + f"Unsupported group encoding {enc!r} for column {col_name!r}" + ) - raise RuntimeError(f"Unsupported column {col_name!r} in group {group.name}") + raise RuntimeError( + f"Unsupported column type for {col_name!r} in group {group.name}" + ) From 1cebbbf315abf948d10c351f29848072e7bec2cf Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 17:12:39 +0000 Subject: [PATCH 19/62] Add support for legacy v0.1.0 h5ad files; implement tests for legacy categorical and dataframe formats --- tests/conftest.py | 50 ++++++++++++++++++++++++++++++++++++ tests/test_cli.py | 25 +++++++++--------- tests/test_export.py | 6 +++-- tests/test_info_read.py | 56 ++++++++++++++++++++++++++++++++++++++--- 4 files changed, 120 insertions(+), 17 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index bff9605..e3b710f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -172,3 +172,53 @@ def sample_categorical_h5ad(temp_dir): f.create_dataset("X", data=X) return file_path + + +@pytest.fixture +def sample_legacy_v010_h5ad(temp_dir): + """Create a sample h5ad file with legacy v0.1.0 categorical columns. 
+ + In v0.1.0, categorical columns are stored as: + - Integer code datasets with a 'categories' attribute (HDF5 object reference) + - Categories stored in __categories/ subgroup + """ + file_path = temp_dir / "test_legacy_v010.h5ad" + + with h5py.File(file_path, "w") as f: + # Create obs with legacy categorical column + obs = f.create_group("obs") + obs.attrs["_index"] = "obs_names" + obs.attrs["encoding-type"] = "dataframe" + obs.attrs["encoding-version"] = "0.1.0" + obs_names = ["cell_1", "cell_2", "cell_3", "cell_4"] + obs.create_dataset("obs_names", data=np.array(obs_names, dtype="S")) + + # Create __categories subgroup (v0.1.0 convention) + categories_group = obs.create_group("__categories") + cell_type_cats = np.array(["TypeA", "TypeB", "TypeC"], dtype="S") + cats_ds = categories_group.create_dataset("cell_type", data=cell_type_cats) + + # Create categorical column as integer codes with reference to categories + codes = np.array([0, 1, 0, 2], dtype=np.int8) + cell_type_ds = obs.create_dataset("cell_type", data=codes) + # Store HDF5 object reference to categories + cell_type_ds.attrs["categories"] = cats_ds.ref + + # Add a regular non-categorical column + obs.create_dataset( + "n_counts", data=np.array([100, 200, 150, 300], dtype=np.int32) + ) + + # Create var + var = f.create_group("var") + var.attrs["_index"] = "var_names" + var.attrs["encoding-type"] = "dataframe" + var.attrs["encoding-version"] = "0.1.0" + var_names = ["gene_1", "gene_2"] + var.create_dataset("var_names", data=np.array(var_names, dtype="S")) + + # Create X matrix (no encoding-type for legacy) + X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]], dtype=np.float32) + f.create_dataset("X", data=X) + + return file_path diff --git a/tests/test_cli.py b/tests/test_cli.py index 203830f..7f06a4f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -66,17 +66,17 @@ def test_info_depth_short_flag(self, sample_h5ad_file): output = result.stdout + (result.stderr or "") assert "<" in 
output - def test_info_entry_flag(self, sample_h5ad_file): - """Test info command with --entry flag.""" - result = runner.invoke(app, ["info", "--entry", "X", str(sample_h5ad_file)]) + def test_info_entry_positional(self, sample_h5ad_file): + """Test info command with entry as positional argument.""" + result = runner.invoke(app, ["info", str(sample_h5ad_file), "X"]) assert result.exit_code == 0 output = result.stdout + (result.stderr or "") assert "Path:" in output assert "Type:" in output - def test_info_entry_short_flag(self, sample_h5ad_file): - """Test info command with -e short flag.""" - result = runner.invoke(app, ["info", "-e", "obs", str(sample_h5ad_file)]) + def test_info_entry_obs(self, sample_h5ad_file): + """Test info command with obs entry.""" + result = runner.invoke(app, ["info", str(sample_h5ad_file), "obs"]) assert result.exit_code == 0 output = result.stdout + (result.stderr or "") assert "Path:" in output @@ -84,18 +84,14 @@ def test_info_entry_short_flag(self, sample_h5ad_file): def test_info_entry_nested_path(self, sample_h5ad_file): """Test info command with nested object path.""" - result = runner.invoke( - app, ["info", "-e", "uns/description", str(sample_h5ad_file)] - ) + result = runner.invoke(app, ["info", str(sample_h5ad_file), "uns/description"]) assert result.exit_code == 0 output = result.stdout + (result.stderr or "") assert "Path:" in output def test_info_entry_not_found(self, sample_h5ad_file): """Test info command with non-existent object path.""" - result = runner.invoke( - app, ["info", "-e", "nonexistent", str(sample_h5ad_file)] - ) + result = runner.invoke(app, ["info", str(sample_h5ad_file), "nonexistent"]) assert result.exit_code == 0 # Doesn't exit with error, just shows message output = result.stdout + (result.stderr or "") assert "not found" in output @@ -114,6 +110,7 @@ def test_export_dataframe_obs(self, sample_h5ad_file, temp_dir): "dataframe", str(sample_h5ad_file), "obs", + "--output", str(output), ], ) @@ -137,6 
+134,7 @@ def test_export_dataframe_var(self, sample_h5ad_file, temp_dir): "dataframe", str(sample_h5ad_file), "var", + "--output", str(output), ], ) @@ -158,6 +156,7 @@ def test_export_dataframe_columns_filter(self, sample_h5ad_file, temp_dir): "dataframe", str(sample_h5ad_file), "obs", + "--output", str(output), "--columns", "obs_names,cell_type", @@ -183,6 +182,7 @@ def test_export_dataframe_head(self, sample_h5ad_file, temp_dir): "dataframe", str(sample_h5ad_file), "obs", + "--output", str(output), "--head", "2", @@ -205,6 +205,7 @@ def test_export_dataframe_invalid_axis(self, sample_h5ad_file, temp_dir): "dataframe", str(sample_h5ad_file), "invalid", + "--output", str(output), ], ) diff --git a/tests/test_export.py b/tests/test_export.py index 8ab14cd..730ce95 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -112,7 +112,8 @@ class TestExportDataframe: def test_export_dataframe_obs(self, sample_h5ad_file, temp_dir): out = temp_dir / "obs.csv" result = runner.invoke( - app, ["export", "dataframe", str(sample_h5ad_file), "obs", str(out)] + app, + ["export", "dataframe", str(sample_h5ad_file), "obs", "--output", str(out)], ) assert result.exit_code == 0 assert out.exists() @@ -125,7 +126,8 @@ def test_wrong_type_for_dataframe(self, sample_h5ad_file, temp_dir): """Test that wrong object type is rejected for dataframe export.""" out = temp_dir / "X.csv" result = runner.invoke( - app, ["export", "dataframe", str(sample_h5ad_file), "X", str(out)] + app, + ["export", "dataframe", str(sample_h5ad_file), "X", "--output", str(out)], ) assert result.exit_code == 1 assert "obs" in result.output or "var" in result.output diff --git a/tests/test_info_read.py b/tests/test_info_read.py index 5b69fe3..e708fac 100644 --- a/tests/test_info_read.py +++ b/tests/test_info_read.py @@ -245,9 +245,59 @@ def test_col_chunk_categorical(self, sample_categorical_h5ad): result = col_chunk_as_strings(f["obs"], "cell_type", 0, 4, cache) assert result == ["TypeA", "TypeB", 
"TypeA", "TypeC"] - def test_col_chunk_unsupported(self, sample_h5ad_file): - """Test reading unsupported column.""" + def test_col_chunk_not_found(self, sample_h5ad_file): + """Test reading non-existent column.""" with h5py.File(sample_h5ad_file, "r") as f: cache = {} - with pytest.raises(RuntimeError, match="Unsupported column"): + with pytest.raises(RuntimeError, match="not found in group"): col_chunk_as_strings(f["obs"], "nonexistent", 0, 5, cache) + + +class TestLegacyV010Support: + """Tests for legacy v0.1.0 format support.""" + + def test_get_entry_type_legacy_categorical(self, sample_legacy_v010_h5ad): + """Test type detection for legacy categorical column (v0.1.0).""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + info = get_entry_type(f["obs"]["cell_type"]) + assert info["type"] == "categorical" + assert info["version"] == "0.1.0" + assert "Legacy" in info["details"] + + def test_get_entry_type_legacy_dataframe(self, sample_legacy_v010_h5ad): + """Test type detection for legacy dataframe (v0.1.0).""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + info = get_entry_type(f["obs"]) + assert info["type"] == "dataframe" + assert info["version"] == "0.1.0" + assert "legacy" in info["details"].lower() + + def test_read_legacy_categorical_column(self, sample_legacy_v010_h5ad): + """Test reading legacy categorical column.""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + cache = {} + result = read_categorical_column( + f["obs"]["cell_type"], 0, 4, cache, f["obs"] + ) + assert result == ["TypeA", "TypeB", "TypeA", "TypeC"] + + def test_col_chunk_legacy_categorical(self, sample_legacy_v010_h5ad): + """Test col_chunk_as_strings with legacy categorical column.""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + cache = {} + result = col_chunk_as_strings(f["obs"], "cell_type", 0, 4, cache) + assert result == ["TypeA", "TypeB", "TypeA", "TypeC"] + + def test_col_chunk_legacy_numeric(self, sample_legacy_v010_h5ad): + """Test 
col_chunk_as_strings with legacy numeric column.""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + cache = {} + result = col_chunk_as_strings(f["obs"], "n_counts", 0, 4, cache) + assert result == ["100", "200", "150", "300"] + + def test_legacy_categorical_slice(self, sample_legacy_v010_h5ad): + """Test reading slice of legacy categorical column.""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + cache = {} + result = col_chunk_as_strings(f["obs"], "cell_type", 1, 3, cache) + assert result == ["TypeB", "TypeA"] From b5ea89827669c3679362ac17e982715c001a95a2 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 18:19:30 +0000 Subject: [PATCH 20/62] Update show_info and _show_types_tree functions to exclude '__categories' from key processing; enhance child key filtering. --- src/h5ad/commands/info.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/h5ad/commands/info.py b/src/h5ad/commands/info.py index e1d2bbe..11bd11d 100644 --- a/src/h5ad/commands/info.py +++ b/src/h5ad/commands/info.py @@ -54,7 +54,9 @@ def show_info( obj = f[key] # Only process Groups, skip Datasets like X if isinstance(obj, h5py.Group): - sub_keys = [k for k in obj.keys() if k != "_index"] + sub_keys = [ + k for k in obj.keys() if k not in ("_index", "__categories") + ] if sub_keys and key != "X": rich.print( f"\t[bold yellow]{key}:[/]\t" @@ -110,7 +112,7 @@ def add_node( # Recurse only if within allowed depth if current_depth < max_depth: for child_name in sorted(obj.keys()): - if child_name == "_index": + if child_name in ("_index", "__categories"): continue child_obj = obj[child_name] add_node( @@ -122,7 +124,7 @@ def add_node( obj = f[key] # Skip empty groups if isinstance(obj, h5py.Group): - children = [k for k in obj.keys() if k != "_index"] + children = [k for k in obj.keys() if k not in ("_index", "__categories")] if not children: continue max_depth = ( @@ -170,7 +172,7 @@ def _show_object_info(f: h5py.File, entry_path: str, console: 
Console) -> None: # If it's a group, show children if isinstance(entry, h5py.Group): - children = [k for k in entry.keys() if k != "_index"] + children = [k for k in entry.keys() if k not in ("_index", "__categories")] if children: console.print(f"\n[bold cyan]Children:[/]") for child_name in sorted(children): From e0c727a39c63d4aa7554e1876c01cbee23c7f0cd Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 18:19:36 +0000 Subject: [PATCH 21/62] Enhance export_table function to support both modern and legacy dataframe formats; exclude reserved keys from column list and improve status reporting during export. --- src/h5ad/commands/export.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index cf0c64b..1f60458 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -3,6 +3,7 @@ import csv import json import sys +from contextlib import nullcontext from pathlib import Path from typing import Any, Dict, List, Optional, Union, cast @@ -40,15 +41,23 @@ def export_table( chunk_rows: Number of rows to read per chunk head: Output only the first n rows console: Rich console for status output + + Supports both v0.2.0 (modern) and v0.1.0 (legacy) dataframe formats. 
""" with h5py.File(file, "r") as f: group, n_rows, index_name = get_axis_group(f, axis) + # Reserved keys to exclude from column list + # __categories is used in v0.1.0 for storing categorical labels + reserved_keys = {"_index", "__categories"} + # Determine columns to read if columns: col_names = list(columns) else: - col_names = [k for k in group.keys() if k != "_index" and k != index_name] + col_names = [ + k for k in group.keys() if k not in reserved_keys and k != index_name + ] # Add index name if not already present if index_name and index_name not in col_names: col_names.insert(0, index_name) @@ -76,14 +85,22 @@ def export_table( try: writer.writerow(col_names) cat_cache: Dict[int, np.ndarray] = {} - with console.status( - f"[magenta]Exporting {axis} table...[/] to {'stdout' if out_fh is sys.stdout else out}" - ) as status: + + # Use status spinner only when writing to file (not stdout) + use_status = out_fh is not sys.stdout + status_ctx = ( + console.status(f"[magenta]Exporting {axis} table to {out}...[/]") + if use_status + else nullcontext() + ) + + with status_ctx as status: for start in range(0, n_rows, chunk_rows): end = min(start + chunk_rows, n_rows) - status.update( - f"[magenta]Exporting rows {start}-{end} of {n_rows}...[/]" - ) + if use_status and status: + status.update( + f"[magenta]Exporting rows {start}-{end} of {n_rows}...[/]" + ) cols_data: List[List[str]] = [] # Read each column for the current chunk for col in col_names: From f14868f54e8bf94a05d56216d90e71b3ddf2da13 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 18:19:46 +0000 Subject: [PATCH 22/62] Enhance console initialization in CLI to ensure Rich output is visible in non-TTY environments. 
--- src/h5ad/cli.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index a09761a..772fcee 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -11,7 +11,9 @@ app = typer.Typer( help="Streaming CLI for huge .h5ad files (info, subset, export, import)." ) -console = Console(stderr=True) +# Use stderr for status/progress to keep stdout clean for data output +# force_terminal=True ensures Rich output is visible even in non-TTY environments +console = Console(stderr=True, force_terminal=True) # Create sub-apps for export and import export_app = typer.Typer(help="Export objects from h5ad files.") From 9929e5bbe882a2e971bb9b93eefae317bfd982fb Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 18:21:53 +0000 Subject: [PATCH 23/62] Add tests for export_dataframe command with various options and flags --- tests/test_cli.py | 121 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 7f06a4f..2b3bd90 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -195,6 +195,127 @@ def test_export_dataframe_head(self, sample_h5ad_file, temp_dir): rows = list(reader) assert len(rows) == 3 # header + 2 rows + def test_export_dataframe_head_short_flag(self, sample_h5ad_file, temp_dir): + """Test export dataframe with -n short flag.""" + output = temp_dir / "table.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + "--output", + str(output), + "-n", + "3", + ], + ) + assert result.exit_code == 0 + + with open(output, "r") as f: + reader = csv.reader(f) + rows = list(reader) + assert len(rows) == 4 # header + 3 rows + + def test_export_dataframe_stdout(self, sample_h5ad_file): + """Test export dataframe to stdout (no --output).""" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + "--head", + "2", + ], + ) + assert result.exit_code == 0 + # 
Output should go to stdout + assert "obs_names" in result.stdout + assert "cell_" in result.stdout + + def test_export_dataframe_columns_short_flag(self, sample_h5ad_file, temp_dir): + """Test export dataframe with -c short flag for columns.""" + output = temp_dir / "table.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + "-o", + str(output), + "-c", + "obs_names", + ], + ) + assert result.exit_code == 0 + + with open(output, "r") as f: + reader = csv.reader(f) + rows = list(reader) + header = rows[0] + assert len(header) == 1 + assert "obs_names" in header + + def test_export_dataframe_chunk_rows(self, sample_h5ad_file, temp_dir): + """Test export dataframe with custom chunk size.""" + output = temp_dir / "table.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + "--output", + str(output), + "--chunk-rows", + "2", + ], + ) + assert result.exit_code == 0 + assert output.exists() + + with open(output, "r") as f: + reader = csv.reader(f) + rows = list(reader) + assert len(rows) == 6 # header + 5 rows + + def test_export_dataframe_combined_options(self, sample_h5ad_file, temp_dir): + """Test export dataframe with multiple options combined.""" + output = temp_dir / "table.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + "-o", + str(output), + "-c", + "obs_names,cell_type", + "-n", + "3", + "-r", + "1", + ], + ) + assert result.exit_code == 0 + + with open(output, "r") as f: + reader = csv.reader(f) + rows = list(reader) + assert len(rows) == 4 # header + 3 rows + header = rows[0] + assert "obs_names" in header + assert "cell_type" in header + assert "n_counts" not in header + def test_export_dataframe_invalid_axis(self, sample_h5ad_file, temp_dir): """Test export dataframe with invalid axis.""" output = temp_dir / "table.csv" From bf672893e903b899ce4c0301a11e92d511d67457 Mon Sep 17 00:00:00 2001 From: Aljes 
Date: Fri, 23 Jan 2026 19:24:08 +0000 Subject: [PATCH 24/62] Update import statements in __init__.py to include additional export functions --- src/h5ad/commands/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/h5ad/commands/__init__.py b/src/h5ad/commands/__init__.py index 7b60c31..70d960f 100644 --- a/src/h5ad/commands/__init__.py +++ b/src/h5ad/commands/__init__.py @@ -1,4 +1,4 @@ from h5ad.commands.info import show_info from h5ad.commands.subset import subset_h5ad -from h5ad.commands.export import export_object, export_table +from h5ad.commands.export import export_table, export_image, export_json, export_mtx, export_npy from h5ad.commands.import_data import import_object From 295548d0a23271536238dca4f0adc770f47e8f5e Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 19:24:19 +0000 Subject: [PATCH 25/62] Add Zarr element specifications documentation --- docs/zarr_elements_spec.md | 276 +++++++++++++++++++++++++++++++++++++ 1 file changed, 276 insertions(+) create mode 100644 docs/zarr_elements_spec.md diff --git a/docs/zarr_elements_spec.md b/docs/zarr_elements_spec.md new file mode 100644 index 0000000..ce309e6 --- /dev/null +++ b/docs/zarr_elements_spec.md @@ -0,0 +1,276 @@ +# AnnData on-disk element specifications — Zarr (`.zarr`) + +This document describes how *elements* are encoded inside an AnnData **Zarr** container (`.zarr`). +It is intended to be GitHub-renderable Markdown (no Sphinx/MyST directives). + +> **Scope** +> +> - “Modern” encoding metadata (`encoding-type`, `encoding-version`) is the convention used by **anndata ≥ 0.8**. +> - “Legacy” conventions (notably DataFrame categorical handling) are described for **anndata 0.7.x** files, which are still commonly encountered. 
+ +## Table of contents + +- [Encoding metadata](#encoding-metadata) +- [AnnData group](#anndata-group) +- [Dense arrays](#dense-arrays) +- [Sparse arrays (CSR/CSC)](#sparse-arrays-csrcsc) +- [DataFrames](#dataframes) + - [DataFrame v0.2.0](#dataframe-v020) + - [DataFrame v0.1.0 (legacy: anndata 0.7.x)](#dataframe-v010-legacy-anndata-07x) + - [Legacy categorical columns (Series-level)](#legacy-categorical-columns-series-level) +- [Mappings / dict](#mappings--dict) +- [Scalars](#scalars) +- [Categorical arrays](#categorical-arrays) +- [String arrays](#string-arrays) +- [Nullable arrays](#nullable-arrays) + - [Missing value semantics](#missing-value-semantics) +- [Awkward arrays (experimental)](#awkward-arrays-experimental) +- [Sources](#sources) + +## Encoding metadata + +**Modern convention (anndata ≥ 0.8):** + +- Any element (Zarr *group* or *array*) that participates in the element-dispatch system: + - **MUST** have attribute `encoding-type` (string) + - **MUST** have attribute `encoding-version` (string, parseable as a version) + +Readers should dispatch first on `encoding-type`, then on `encoding-version`. + +**Legacy convention (anndata ≤ 0.7.x):** + +- Many objects do *not* have `encoding-type`/`encoding-version`. +- Some elements (e.g. CSR/CSC sparse matrices, legacy DataFrames) *do* use `encoding-type`/`encoding-version`. +- Readers typically infer element kinds from: + - known AnnData keys (`X`, `obs`, `var`, …), + - group structure, and/or + - legacy attributes (e.g. the `categories` attribute on categorical columns). 
+ +## AnnData group + +### `encoding-type: anndata`, `encoding-version: 0.1.0` + +An `AnnData` object **MUST** be stored as a Zarr **group** with attributes: + +- `encoding-type: "anndata"` +- `encoding-version: "0.1.0"` + +Required members: + +- `obs` — a [DataFrame](#dataframes) +- `var` — a [DataFrame](#dataframes) + +Optional members (if present, they must satisfy these constraints): + +- `X` — dense array or sparse array; shape `(n_obs, n_var)` +- `layers` — mapping; values dense or sparse arrays; each shape `(n_obs, n_var)` +- `obsm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_obs` +- `varm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_var` +- `obsp` — mapping; values dense or sparse arrays; first two dims `n_obs` +- `varp` — mapping; values dense or sparse arrays; first two dims `n_var` +- `uns` — mapping/dict-like container (recursive) + +## Dense arrays + +### `encoding-type: array`, `encoding-version: 0.2.0` + +- A dense array **MUST** be stored as a Zarr **array**. +- The array **MUST** have attributes: + - `encoding-type: "array"` + - `encoding-version: "0.2.0"` + +> **Legacy note** +> +> In anndata 0.7.x, dense arrays were typically stored as plain Zarr arrays *without* `encoding-type`/`encoding-version`. + +## Sparse arrays (CSR/CSC) + +### `encoding-type: csr_matrix|csc_matrix`, `encoding-version: 0.1.0` + +A sparse matrix **MUST** be stored as a Zarr **group**. + +- Group attributes: + - `encoding-type: "csr_matrix"` **or** `"csc_matrix"` + - `encoding-version: "0.1.0"` + - `shape`: integer array of length 2 (matrix shape) +- Group members (arrays): + - `data` + - `indices` + - `indptr` + +The exact CSR/CSC semantics follow SciPy’s conventions. + +## DataFrames + +DataFrames are stored column-wise: each column is stored as a Zarr array (or group, if the column itself is an encoded element). 
+ + +### DataFrame v0.2.0 + +#### `encoding-type: dataframe`, `encoding-version: 0.2.0` + +A dataframe **MUST** be stored as a Zarr **group**. + +- Group attributes: + - `_index`: string — the key of the array to be used as the row index + - `column-order`: array of strings — original column order + - `encoding-type: "dataframe"` + - `encoding-version: "0.2.0"` +- Group members: + - the index array (named by `_index`) + - one member per column +- All column entries **MUST** have the same length in their first dimension. +- Columns **SHOULD** share chunking along the first dimension. + +Columns are independently encoded: +- simple numeric/bool columns are commonly `encoding-type: array` +- categorical columns are commonly `encoding-type: categorical` + + +### DataFrame v0.1.0 (legacy: anndata 0.7.x) + +#### `encoding-type: dataframe`, `encoding-version: 0.1.0` + +A legacy dataframe is stored as a Zarr **group** where: + +- Group attributes include: + - `_index` + - `column-order` + - `encoding-type: "dataframe"` + - `encoding-version: "0.1.0"` +- Each column is an array. +- Categorical columns are stored as **integer code arrays**, and their category labels are stored in a reserved subgroup named `__categories`. + +**Reserved subgroup:** + +- `__categories/` stores the array of category labels for column ``. + + +### Legacy categorical columns (Series-level) + +In v0.1.0 DataFrames, a categorical column array (e.g. `obs/cell_type`) can be identified by the presence of an attribute: + +- `categories`: an **absolute path string** to the corresponding `__categories/` array. + +(This differs from HDF5, which can store an object reference.) + +## Mappings / dict + +### `encoding-type: dict`, `encoding-version: 0.1.0` + +- A mapping **MUST** be stored as a Zarr **group**. +- Group attributes: + - `encoding-type: "dict"` + - `encoding-version: "0.1.0"` +- Each entry in the group is another element (recursively). 
+
+> **Legacy note**
+>
+> In anndata 0.7.x, groups used as mappings often had **no special attributes**.
+
+## Scalars
+
+### `encoding-version: 0.2.0`
+
+Scalars are stored as **0-dimensional Zarr arrays**.
+
+- Numeric scalars:
+  - `encoding-type: "numeric-scalar"`
+  - `encoding-version: "0.2.0"`
+  - value is numeric (including boolean, ints, floats, complex)
+- String scalars:
+  - `encoding-type: "string"`
+  - `encoding-version: "0.2.0"`
+  - **Zarr requirement:** fixed-length unicode dtype (e.g. `<U8`)
+
+> **Legacy note**
+>
+> In anndata 0.7.x, scalar strings were commonly stored without `encoding-type`/`encoding-version`.
+
+## Categorical arrays
+
+### `encoding-type: categorical`, `encoding-version: 0.2.0`
+
+Categorical arrays are stored as a Zarr **group** with members:
+
+- `codes`: integer array
+  - values are zero-based indices into `categories`
+  - signed integer arrays **MAY** use `-1` to denote missing values
+- `categories`: array of labels
+
+Group attributes:
+
+- `encoding-type: "categorical"`
+- `encoding-version: "0.2.0"`
+- `ordered`: boolean (whether the categories are ordered)
+
+## String arrays
+
+### `encoding-type: string-array`, `encoding-version: 0.2.0`
+
+- String arrays **MUST** be stored as Zarr arrays.
+- Array attributes:
+  - `encoding-type: "string-array"`
+  - `encoding-version: "0.2.0"`
+- **Zarr requirement:** the array **MUST** be stored using `numcodecs.VLenUTF8` for variable-length UTF-8 strings.
+
+## Nullable arrays
+
+These encodings support Pandas nullable integer/boolean/string arrays by storing a `values` array plus a boolean `mask` array.
+
+### `encoding-type: nullable-integer`, `encoding-version: 0.1.0`
+
+- Stored as a Zarr group with arrays:
+  - `values` (integer)
+  - `mask` (boolean)
+
+### `encoding-type: nullable-boolean`, `encoding-version: 0.1.0`
+
+- Stored as a Zarr group with arrays:
+  - `values` (boolean)
+  - `mask` (boolean)
+- `values` and `mask` **MUST** have the same shape.
+ +### `encoding-type: nullable-string-array`, `encoding-version: 0.1.0` + +- Stored as a Zarr group with arrays: + - `values` (string array) + - `mask` (boolean) +- Group attributes: + - `encoding-type: "nullable-string-array"` + - `encoding-version: "0.1.0"` + - optional `na-value`: `"NA"` or `"NaN"` (default `"NA"`) + + +#### Missing value semantics + +For elements supporting a `na-value` attribute: + +- `"NA"`: comparisons propagate missingness (e.g. `"x" == NA` → `NA`) +- `"NaN"`: comparisons yield boolean results (e.g. `"x" == NaN` → `false`) + +Readers should preserve semantics when the runtime model supports it. + +## Awkward arrays (experimental) + +### `encoding-type: awkward-array`, `encoding-version: 0.1.0` + +Ragged arrays are stored by decomposing an Awkward Array into constituent buffers (via `ak.to_buffers`), then storing those buffers as Zarr arrays within a group. + +Group attributes: + +- `encoding-type: "awkward-array"` +- `encoding-version: "0.1.0"` +- `form`: string — serialized Awkward “form” +- `length`: integer — logical length + +Group members: arrays for the buffers (often named like `nodeX-*`). + +> **Experimental** +> +> This encoding is considered experimental in the anndata 0.9.x series and later. + +## Sources + +- AnnData “on-disk format” prose docs (modern, ≥0.8): https://anndata.readthedocs.io/en/stable/fileformat-prose.html +- AnnData 0.7.8 “on-disk format” prose docs (legacy): https://dokk.org/documentation/anndata/0.7.8/fileformat-prose/ From 6a454f0b558e0295f5e52bb8f222295d4bd10456 Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 19:24:31 +0000 Subject: [PATCH 26/62] Enhance export functionality to support awkward-array type and improve export_npy and export_mtx methods for better handling of datasets and chunk processing. 
--- src/h5ad/commands/export.py | 239 +++++++++++++++++------------------- 1 file changed, 112 insertions(+), 127 deletions(-) diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index 1f60458..dbd4277 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -131,6 +131,7 @@ def export_table( "dict": {".json"}, "scalar": {".json"}, "categorical": {".csv"}, + "awkward-array": {".json"}, } # Image extensions for validation @@ -195,108 +196,44 @@ def _check_json_exportable(h5obj: H5Obj, max_elements: int, path: str = "") -> N ) -def export_object( +def export_npy( file: Path, obj: str, out: Path, - columns: Optional[List[str]], - chunk_rows: int, - head: Optional[int], - max_elements: int, - include_attrs: bool, + chunk_elements: int, console: Console, ) -> None: """ - Export an HDF5 object to an appropriate format based on its type. + Export a dense HDF5 dataset to NumPy .npy without loading it all at once. - Auto-detects the object type and validates the output file extension. + Supports both: + - v0.2.0 (modern): Datasets with encoding-type="array" + - v0.1.0 (legacy): Plain datasets without encoding attributes + - Encoded groups: nullable-integer, nullable-boolean, string-array (extracts values) """ - obj = _norm_path(obj) - out_ext = out.suffix.lower() - with h5py.File(file, "r") as f: h5obj = _resolve(f, obj) - info = get_entry_type(h5obj) - obj_type = info["type"] - - # Check if type is exportable - if obj_type not in EXPORTABLE_TYPES: - raise ValueError( - f"Cannot export object of type '{obj_type}'. " - f"Exportable types: {', '.join(sorted(EXPORTABLE_TYPES))}." - ) - - # Check if extension matches the type - valid_exts = TYPE_EXTENSIONS.get(obj_type, set()) - if out_ext not in valid_exts: - ext_list = ", ".join(sorted(valid_exts)) - raise ValueError( - f"Output extension '{out_ext}' does not match object type '{obj_type}'. " - f"Expected: {ext_list}." 
- ) - - # Dispatch to appropriate export function - if obj_type == "dataframe": - # For dataframe, obj must be obs or var - if obj not in ("obs", "var"): - raise ValueError( - f"CSV export for dataframes currently supports only 'obs' or 'var', " - f"not '{obj}'." - ) - export_table( - file=file, - axis=obj, - columns=columns, - out=out, - chunk_rows=chunk_rows, - head=head, - console=console, - ) - elif obj_type == "categorical": - # Categorical is also exported via table if it's a column in obs/var - raise ValueError( - f"Categorical objects should be exported as part of 'obs' or 'var' table. " - f"Use: h5ad export obs " - ) - - elif obj_type in ("dense-matrix", "array"): - if out_ext in IMAGE_EXTENSIONS: - # User wants image output - validate dimensions - _export_image(file=file, obj=obj, out=out, console=console) - else: - _export_npy( - file=file, obj=obj, out=out, chunk_rows=chunk_rows, console=console - ) - - elif obj_type == "sparse-matrix": - _export_mtx(file=file, obj=obj, out=out, console=console) - - elif obj_type in ("dict", "scalar"): - _export_json( - file=file, - obj=obj, - out=out, - max_elements=max_elements, - include_attrs=include_attrs, - console=console, - ) - - -def _export_npy( - file: Path, - obj: str, - out: Path, - chunk_rows: int, - console: Console, -) -> None: - """Export a dense HDF5 dataset to NumPy .npy without loading it all at once.""" - with h5py.File(file, "r") as f: - h5obj = _resolve(f, obj) + # Handle encoded groups that contain array data if isinstance(h5obj, h5py.Group): - raise ValueError("Target is a group; cannot export as .npy.") + enc = _get_encoding_type(h5obj) + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): + # Extract values from nullable array group + if "values" not in h5obj: + raise ValueError( + f"Encoded group '{obj}' is missing 'values' dataset." 
+ ) + ds = h5obj["values"] + has_mask = "mask" in h5obj + console.print(f"[dim]Exporting nullable array values from '{obj}'[/]") + else: + raise ValueError( + f"Target '{obj}' is a group with encoding '{enc}'; cannot export as .npy directly." + ) + else: + ds = h5obj + has_mask = False - ds = h5obj out.parent.mkdir(parents=True, exist_ok=True) mm = np.lib.format.open_memmap(out, mode="w+", dtype=ds.dtype, shape=ds.shape) try: @@ -307,7 +244,7 @@ def _export_npy( if ds.ndim == 1: n = int(ds.shape[0]) - step = max(1, int(chunk_rows)) + step = max(1, int(chunk_elements)) with console.status( f"[magenta]Exporting {obj} to {out}...[/]" ) as status: @@ -321,7 +258,9 @@ def _export_npy( return n0 = int(ds.shape[0]) - step0 = max(1, int(chunk_rows)) + row_elems = int(np.prod(ds.shape[1:])) if ds.ndim > 1 else 1 + # Convert element budget into a row count; fallback to 1 row if rows are larger. + step0 = max(1, int(chunk_elements) // max(1, row_elems)) with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: for start in range(0, n0, step0): end = min(start + step0, n0) @@ -334,8 +273,19 @@ def _export_npy( del mm -def _export_mtx(file: Path, obj: str, out: Path, console: Console) -> None: - """Export a CSR/CSC matrix group (AnnData encoding) to Matrix Market (.mtx).""" +def export_mtx( + file: Path, + obj: str, + out: Optional[Path], + head: Optional[int], + chunk_elements: int, + console: Console, +) -> None: + """Export a CSR/CSC matrix group (AnnData encoding) to Matrix Market (.mtx). + + If out is None or "-", writes to stdout. The head parameter limits output lines. + chunk_elements controls how many nonzero elements are processed per slice. 
+ """ with h5py.File(file, "r") as f: h5obj = _resolve(f, obj) if not isinstance(h5obj, h5py.Group): @@ -370,51 +320,86 @@ def _export_mtx(file: Path, obj: str, out: Path, console: Console) -> None: field = "real" if np.issubdtype(data.dtype, np.floating) else "integer" - out.parent.mkdir(parents=True, exist_ok=True) - + # Load sparse index pointers (1 per major axis row/col); used to slice data/indices. indptr_arr = np.asarray(indptr[...], dtype=np.int64) nnz_ptr = int(indptr_arr[-1]) if indptr_arr.size else 0 nnz_data = int(data.shape[0]) nnz_idx = int(indices.shape[0]) - nnz = min(nnz_ptr, nnz_data, nnz_idx) + nnz_limit = min(nnz_ptr, nnz_data, nnz_idx) + nnz = nnz_limit + elem_step = max(1, int(chunk_elements)) + if head is not None and head > 0: + nnz = min(nnz_limit, head) - with open(out, "w", encoding="utf-8", newline="\n") as fh: - fh.write(f"%%MatrixMarket matrix coordinate {field} general\n") - fh.write("% generated by h5ad-cli\n") - fh.write(f"{n_rows} {n_cols} {nnz}\n") + # Write to stdout when out is None or "-", otherwise open a file on disk. + if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out.parent.mkdir(parents=True, exist_ok=True) + out_fh = open(out, "w", encoding="utf-8", newline="\n") + + use_status = out_fh is not sys.stdout + status_ctx = ( + console.status(f"[magenta]Exporting {obj} to {out}...[/]") + if use_status + else nullcontext() + ) + try: + # Matrix Market header: type, generator line, then shape and nnz. 
+ out_fh.write(f"%%MatrixMarket matrix coordinate {field} general\n") + out_fh.write("% generated by h5ad-cli\n") + out_fh.write(f"{n_rows} {n_cols} {nnz}\n") major = n_rows if enc == "csr_matrix" else n_cols - with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: + max_lines = head if head is not None and head > 0 else None + written = 0 + with status_ctx as status: for major_i in range(major): - start = min(int(indptr_arr[major_i]), nnz) - end = min(int(indptr_arr[major_i + 1]), nnz) + start = min(int(indptr_arr[major_i]), nnz_limit) + end = min(int(indptr_arr[major_i + 1]), nnz_limit) if end <= start: continue - status.update( - f"[magenta]Exporting {obj}: block {major_i+1}/{major}...[/]" - ) - idx = np.asarray(indices[start:end], dtype=np.int64) - vals = np.asarray(data[start:end]) - m = min(len(idx), len(vals)) - if m == 0: - continue - idx = idx[:m] - vals = vals[:m] - for k in range(m): - if enc == "csr_matrix": - r = major_i + 1 - c = int(idx[k]) + 1 - else: - r = int(idx[k]) + 1 - c = major_i + 1 - v = vals[k] - if isinstance(v, np.generic): - v = v.item() - fh.write(f"{r} {c} {v}\n") - console.print(f"[green]Wrote[/] {out}") + if use_status and status: + status.update( + f"[magenta]Exporting {obj}: block {major_i+1}/{major}...[/]" + ) + # Slice the sparse column/row segment for this major index in element chunks. + for chunk_start in range(start, end, elem_step): + chunk_end = min(chunk_start + elem_step, end) + idx = np.asarray(indices[chunk_start:chunk_end], dtype=np.int64) + vals = np.asarray(data[chunk_start:chunk_end]) + m = min(len(idx), len(vals)) + if m == 0: + continue + idx = idx[:m] + vals = vals[:m] + for k in range(m): + if max_lines is not None and written >= max_lines: + break + if enc == "csr_matrix": + r = major_i + 1 + c = int(idx[k]) + 1 + else: + r = int(idx[k]) + 1 + c = major_i + 1 + v = vals[k] + if isinstance(v, np.generic): + v = v.item() + # Matrix Market uses 1-based indices. 
+ out_fh.write(f"{r} {c} {v}\n") + written += 1 + if max_lines is not None and written >= max_lines: + break + if max_lines is not None and written >= max_lines: + break + finally: + if out_fh is not sys.stdout: + out_fh.close() + if out_fh is not sys.stdout: + console.print(f"[green]Wrote[/] {out}") -def _export_json( +def export_json( file: Path, obj: str, out: Path, @@ -500,7 +485,7 @@ def _to_jsonable(h5obj: H5Obj, max_elements: int, include_attrs: bool) -> Any: return d -def _export_image(file: Path, obj: str, out: Path, console: Console) -> None: +def export_image(file: Path, obj: str, out: Path, console: Console) -> None: """Export an image-like dataset (H,W) or (H,W,C) to PNG/JPG/TIFF.""" try: from PIL import Image # type: ignore From 4b5d1bd158666c6e2a2fbfbff1d91f751e051b0d Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 19:24:37 +0000 Subject: [PATCH 27/62] Refactor CLI export commands to improve parameter naming and enhance functionality for exporting various data types --- src/h5ad/cli.py | 99 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 34 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 772fcee..343560c 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -6,7 +6,15 @@ from rich.console import Console import typer -from h5ad.commands import show_info, subset_h5ad +from h5ad.commands import ( + show_info, + subset_h5ad, + export_mtx, + export_npy, + export_json, + export_image, + export_table, +) app = typer.Typer( help="Streaming CLI for huge .h5ad files (info, subset, export, import)." 
@@ -136,7 +144,7 @@ def export_dataframe( 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" ), head: Optional[int] = typer.Option( - None, "--head", "-n", help="Output only the first n rows" + None, "--head", "-n", help="Output only the first n entries" ), ) -> None: """ @@ -147,7 +155,6 @@ def export_dataframe( h5ad export dataframe data.h5ad var --output var.csv --columns gene_id,mean h5ad export dataframe data.h5ad obs --head 100 """ - from h5ad.commands import export_table if entry not in ("obs", "var"): console.print( @@ -179,12 +186,17 @@ def export_array( file: Path = typer.Argument( ..., help="Path to the .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument( - ..., help="Object path to export (e.g., 'obsm/X_pca', 'varm/PCs', 'X')" + entry: str = typer.Argument( + ..., help="Entry path to export (e.g., 'obsm/X_pca', 'varm/PCs', 'X')" ), - out: Path = typer.Argument(..., help="Output .npy file path"), - chunk_rows: int = typer.Option( - 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" + output: Path = typer.Option( + ..., "--output", "-o", help="Output .npy file path", writable=True + ), + chunk_elements: int = typer.Option( + 10_000, + "--chunk", + "-r", + help="Number of elements to read per chunk", ), ) -> None: """ @@ -195,14 +207,13 @@ def export_array( h5ad export array data.h5ad X matrix.npy h5ad export array data.h5ad varm/PCs loadings.npy """ - from h5ad.commands.export import _export_npy try: - _export_npy( + export_npy( file=file, - obj=obj, - out=out, - chunk_rows=chunk_rows, + obj=entry, + out=output, + chunk_elements=chunk_elements, console=console, ) except Exception as e: @@ -215,10 +226,25 @@ def export_sparse( file: Path = typer.Argument( ..., help="Path to the .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument( - ..., help="Object path to export (e.g., 'X', 'layers/counts')" + entry: str = typer.Argument( + ..., help="Entry path to export (e.g., 'X', 
'layers/counts')" + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + writable=True, + help="Output .mtx file path (defaults to stdout)", + ), + head: Optional[int] = typer.Option( + None, "--head", "-n", help="Output only the first n rows" + ), + chunk_elements: int = typer.Option( + 10_000, + "--chunk", + "-r", + help="Number of nonzero elements to process per chunk", ), - out: Path = typer.Argument(..., help="Output .mtx file path"), ) -> None: """ Export a sparse matrix (CSR/CSC) to Matrix Market (.mtx) format. @@ -226,11 +252,18 @@ def export_sparse( Examples: h5ad export sparse data.h5ad X matrix.mtx h5ad export sparse data.h5ad layers/counts counts.mtx + h5ad export sparse data.h5ad X --head 100 """ - from h5ad.commands.export import _export_mtx try: - _export_mtx(file=file, obj=obj, out=out, console=console) + export_mtx( + file=file, + obj=entry, + out=output, + head=head, + chunk_elements=chunk_elements, + console=console, + ) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) @@ -241,8 +274,8 @@ def export_dict( file: Path = typer.Argument( ..., help="Path to the .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument( - ..., help="Object path to export (e.g., 'uns', 'uns/colors')" + entry: str = typer.Argument( + ..., help="Entry path to export (e.g., 'uns', 'uns/colors')" ), out: Path = typer.Argument(..., help="Output .json file path"), max_elements: int = typer.Option( @@ -261,12 +294,11 @@ def export_dict( h5ad export dict data.h5ad uns metadata.json h5ad export dict data.h5ad uns/colors colors.json """ - from h5ad.commands.export import _export_json try: - _export_json( + export_json( file=file, - obj=obj, + obj=entry, out=out, max_elements=max_elements, include_attrs=include_attrs, @@ -282,7 +314,7 @@ def export_image( file: Path = typer.Argument( ..., help="Path to the .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument(..., help="Object path to 
export (2D or 3D array)"), + entry: str = typer.Argument(..., help="Entry path to export (2D or 3D array)"), out: Path = typer.Argument(..., help="Output image file (.png, .jpg, .tiff)"), ) -> None: """ @@ -293,10 +325,9 @@ def export_image( Examples: h5ad export image data.h5ad uns/spatial/image tissue.png """ - from h5ad.commands.export import _export_image try: - _export_image(file=file, obj=obj, out=out, console=console) + export_image(file=file, obj=entry, out=out, console=console) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) @@ -323,8 +354,8 @@ def import_dataframe( file: Path = typer.Argument( ..., help="Path to the source .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument( - ..., help="Object path to create/replace ('obs' or 'var')" + entry: str = typer.Argument( + ..., help="Entry path to create/replace ('obs' or 'var')" ), input_file: Path = typer.Argument( ..., help="Input CSV file", exists=True, readable=True @@ -357,9 +388,9 @@ def import_dataframe( """ from h5ad.commands.import_data import _import_csv - if obj not in ("obs", "var"): + if entry not in ("obs", "var"): console.print( - f"[bold red]Error:[/] Object must be 'obs' or 'var', not '{obj}'.", + f"[bold red]Error:[/] Entry must be 'obs' or 'var', not '{entry}'.", ) raise typer.Exit(code=1) @@ -372,7 +403,7 @@ def import_dataframe( try: target = _get_target_file(file, output, inplace) - _import_csv(target, obj, input_file, index_column, console) + _import_csv(target, entry, input_file, index_column, console) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) @@ -383,8 +414,8 @@ def import_array( file: Path = typer.Argument( ..., help="Path to the source .h5ad file", exists=True, readable=True ), - obj: str = typer.Argument( - ..., help="Object path to create/replace (e.g., 'X', 'obsm/X_pca')" + entry: str = typer.Argument( + ..., help="Entry path to create/replace (e.g., 'X', 'obsm/X_pca')" ), 
input_file: Path = typer.Argument( ..., help="Input .npy file", exists=True, readable=True @@ -422,7 +453,7 @@ def import_array( try: target = _get_target_file(file, output, inplace) - _import_npy(target, obj, input_file, console) + _import_npy(target, entry, input_file, console) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) From 4d4cfde583c1e37c0867a2256daf0aa632cbceff Mon Sep 17 00:00:00 2001 From: Aljes Date: Fri, 23 Jan 2026 19:24:47 +0000 Subject: [PATCH 28/62] Add tests for exporting legacy v0.1.0 dataframe and improve output validation in import tests --- tests/test_export.py | 26 ++++++++++++++++++++++++++ tests/test_import.py | 20 ++++++++++++++------ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/tests/test_export.py b/tests/test_export.py index 730ce95..323167e 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -120,6 +120,32 @@ def test_export_dataframe_obs(self, sample_h5ad_file, temp_dir): text = out.read_text(encoding="utf-8") assert "obs_names" in text + def test_export_legacy_v010_dataframe(self, sample_legacy_v010_h5ad, temp_dir): + """Test exporting a legacy v0.1.0 dataframe with categorical columns.""" + out = temp_dir / "obs_legacy.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_legacy_v010_h5ad), + "obs", + "--output", + str(out), + ], + ) + assert result.exit_code == 0 + assert out.exists() + text = out.read_text(encoding="utf-8") + # Should contain index and columns + assert "obs_names" in text + assert "cell_type" in text + # Should NOT contain __categories (reserved subgroup) + assert "__categories" not in text + # Should contain decoded categorical values, not codes + assert "TypeA" in text + assert "TypeB" in text + class TestExportValidation: def test_wrong_type_for_dataframe(self, sample_h5ad_file, temp_dir): diff --git a/tests/test_import.py b/tests/test_import.py index 736d4e2..f49af84 100644 --- a/tests/test_import.py +++ 
b/tests/test_import.py @@ -1,6 +1,7 @@ """Tests for the import command.""" import json +import re from pathlib import Path import h5py @@ -13,6 +14,11 @@ runner = CliRunner() +def strip_ansi(text: str) -> str: + """Strip ANSI escape codes from text.""" + return re.sub(r"\x1b\[[0-9;]*m", "", text) + + class TestImportDataframe: def test_import_dataframe_obs_inplace(self, sample_h5ad_file, temp_dir): """Test importing CSV into obs with --inplace.""" @@ -40,8 +46,9 @@ def test_import_dataframe_obs_inplace(self, sample_h5ad_file, temp_dir): ], ) assert result.exit_code == 0 - assert "5 rows" in result.output - assert "2 columns" in result.output + output = strip_ansi(result.output) + assert "5 rows" in output + assert "2 columns" in output with h5py.File(sample_h5ad_file, "r") as f: assert "obs" in f @@ -115,7 +122,7 @@ def test_import_dataframe_var(self, sample_h5ad_file, temp_dir): ], ) assert result.exit_code == 0 - assert "4 rows" in result.output + assert "4 rows" in strip_ansi(result.output) def test_import_dataframe_dimension_mismatch(self, sample_h5ad_file, temp_dir): """Test that dimension mismatch is rejected.""" @@ -218,7 +225,7 @@ def test_import_array_obsm(self, sample_h5ad_file, temp_dir): ], ) assert result.exit_code == 0 - assert "5×10" in result.output + assert "5×10" in strip_ansi(result.output) with h5py.File(sample_h5ad_file, "r") as f: assert "obsm/X_pca" in f @@ -355,8 +362,9 @@ def test_import_sparse_X(self, sample_h5ad_file, temp_dir): ], ) assert result.exit_code == 0 - assert "5×4" in result.output - assert "5 non-zero" in result.output + output = strip_ansi(result.output) + assert "5×4" in output + assert "5 non-zero" in output with h5py.File(sample_h5ad_file, "r") as f: assert "X" in f From f380cb611d298568acc347fce5cc197792053fad Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 13:12:35 +0000 Subject: [PATCH 29/62] Refactor error handling in subset and read functions to use more specific exceptions --- src/h5ad/commands/subset.py 
| 4 ++-- src/h5ad/info.py | 6 +++--- src/h5ad/read.py | 12 ++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/h5ad/commands/subset.py b/src/h5ad/commands/subset.py index 2e01d9d..ff20d6b 100644 --- a/src/h5ad/commands/subset.py +++ b/src/h5ad/commands/subset.py @@ -457,7 +457,7 @@ def subset_h5ad( ) if obs_names_ds is None: console.print("[bold red]Error:[/] Could not find obs names") - raise RuntimeError("Could not find obs names") + raise KeyError("Could not find obs names") obs_idx, missing_obs = indices_from_name_set(obs_names_ds, obs_keep) if missing_obs: @@ -476,7 +476,7 @@ def subset_h5ad( ) if var_names_ds is None: console.print("[bold red]Error:[/] Could not find var names") - raise RuntimeError("Could not find var names") + raise KeyError("Could not find var names") var_idx, missing_var = indices_from_name_set(var_names_ds, var_keep) if missing_var: diff --git a/src/h5ad/info.py b/src/h5ad/info.py index 7abb3e9..6144b07 100644 --- a/src/h5ad/info.py +++ b/src/h5ad/info.py @@ -210,7 +210,7 @@ def axis_len(file: h5py.File, axis: str) -> int: ValueError: If axis is not 'obs' or 'var' KeyError: If axis or index dataset not found in file TypeError: If axis is not a group or index is not a dataset - RuntimeError: If axis length cannot be determined + ValueError: If axis length cannot be determined """ # Check if the specified axis exists in the file if axis not in file: @@ -245,7 +245,7 @@ def axis_len(file: h5py.File, axis: str) -> int: raise TypeError(f"Index '{index_name}' in '{axis}' is not a dataset.") if dataset.shape: return int(dataset.shape[0]) - raise RuntimeError( + raise ValueError( f"Cannot determine length of '{axis}': index dataset has no shape." 
) @@ -265,7 +265,7 @@ def get_axis_group(file: h5py.File, axis: str) -> Tuple[h5py.Group, int, str]: ValueError: If axis is not 'obs' or 'var' KeyError: If axis or index dataset not found in file TypeError: If axis is not a group or index is not a dataset - RuntimeError: If axis length cannot be determined + ValueError: If axis length cannot be determined """ if axis not in ("obs", "var"): raise ValueError("axis must be 'obs' or 'var'.") diff --git a/src/h5ad/read.py b/src/h5ad/read.py index 36f2e58..78fec0e 100644 --- a/src/h5ad/read.py +++ b/src/h5ad/read.py @@ -73,11 +73,11 @@ def read_categorical_column( if col_name in cats_grp: cats = cats_grp[col_name][...] else: - raise RuntimeError( + raise KeyError( f"Cannot find categories for legacy column {col.name}" ) else: - raise RuntimeError( + raise KeyError( f"Cannot find categories for legacy column {col.name}" ) cats = decode_str_array(cats) @@ -88,7 +88,7 @@ def read_categorical_column( codes = np.asarray(codes, dtype=np.int64) return [cats[c] if 0 <= c < len(cats) else "" for c in codes] - raise RuntimeError(f"Unsupported categorical column type: {type(col)}") + raise TypeError(f"Unsupported categorical column type: {type(col)}") def col_chunk_as_strings( @@ -116,7 +116,7 @@ def col_chunk_as_strings( List[str]: Column values as strings for the specified slice """ if col_name not in group: - raise RuntimeError(f"Column {col_name!r} not found in group {group.name}") + raise KeyError(f"Column {col_name!r} not found in group {group.name}") col = group[col_name] @@ -150,10 +150,10 @@ def col_chunk_as_strings( # Apply mask: masked values become empty string return ["" if m else str(v) for v, m in zip(values, mask)] - raise RuntimeError( + raise ValueError( f"Unsupported group encoding {enc!r} for column {col_name!r}" ) - raise RuntimeError( + raise TypeError( f"Unsupported column type for {col_name!r} in group {group.name}" ) From 95b6553155587465db4437eadc543c664a84d022 Mon Sep 17 00:00:00 2001 From: Aljes Date: 
Mon, 26 Jan 2026 13:48:01 +0000 Subject: [PATCH 30/62] Refactor CLI options for chunk processing: update parameter names and default values for clarity --- src/h5ad/cli.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 343560c..0c6190d 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -98,7 +98,7 @@ def subset( readable=True, ), chunk_rows: int = typer.Option( - 1024, "--chunk-rows", "-r", help="Row chunk size for dense matrices" + 1024, "--chunk", "-C", help="Row chunk size for dense matrices" ), ) -> None: """Subset an h5ad by obs and/or var names.""" @@ -141,7 +141,7 @@ def export_dataframe( help="Comma separated column names to include", ), chunk_rows: int = typer.Option( - 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" + 10_000, "--chunk", "-C", help="Number of rows to read per chunk" ), head: Optional[int] = typer.Option( None, "--head", "-n", help="Output only the first n entries" @@ -193,9 +193,9 @@ def export_array( ..., "--output", "-o", help="Output .npy file path", writable=True ), chunk_elements: int = typer.Option( - 10_000, + 100_000, "--chunk", - "-r", + "-C", help="Number of elements to read per chunk", ), ) -> None: @@ -237,13 +237,13 @@ def export_sparse( help="Output .mtx file path (defaults to stdout)", ), head: Optional[int] = typer.Option( - None, "--head", "-n", help="Output only the first n rows" + None, "--head", "-n", help="Output only the first n entries of mtx file" ), chunk_elements: int = typer.Option( - 10_000, + 1_000, "--chunk", - "-r", - help="Number of nonzero elements to process per chunk", + "-C", + help="Number of rows/columns (depends on compression format) to process per chunk", ), ) -> None: """ @@ -262,6 +262,7 @@ def export_sparse( out=output, head=head, chunk_elements=chunk_elements, + memory_mb=memory_mb, console=console, ) except Exception as e: From 1695f25e37cf2e239479697b65db9c93d2528283 Mon Sep 17 00:00:00 2001 
From: Aljes Date: Mon, 26 Jan 2026 13:55:11 +0000 Subject: [PATCH 31/62] Refactor export_mtx function: update chunk_elements description, improve error handling, and enhance output formatting --- src/h5ad/commands/export.py | 89 ++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 35 deletions(-) diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index dbd4277..4fc94b2 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -284,7 +284,7 @@ def export_mtx( """Export a CSR/CSC matrix group (AnnData encoding) to Matrix Market (.mtx). If out is None or "-", writes to stdout. The head parameter limits output lines. - chunk_elements controls how many nonzero elements are processed per slice. + chunk_elements controls how many rows/columns are processed per slice. """ with h5py.File(file, "r") as f: h5obj = _resolve(f, obj) @@ -307,13 +307,13 @@ def export_mtx( or not isinstance(indices, h5py.Dataset) or not isinstance(indptr, h5py.Dataset) ): - raise RuntimeError( + raise ValueError( "Sparse matrix group must contain datasets: data, indices, indptr" ) shape = h5obj.attrs.get("shape", None) if shape is None: - raise RuntimeError( + raise ValueError( "Sparse matrix group is missing required 'shape' attribute." ) n_rows, n_cols = (int(shape[0]), int(shape[1])) @@ -325,11 +325,19 @@ def export_mtx( nnz_ptr = int(indptr_arr[-1]) if indptr_arr.size else 0 nnz_data = int(data.shape[0]) nnz_idx = int(indices.shape[0]) - nnz_limit = min(nnz_ptr, nnz_data, nnz_idx) - nnz = nnz_limit - elem_step = max(1, int(chunk_elements)) + + # Check consistency of sparse data + if not (nnz_ptr == nnz_data == nnz_idx): + raise ValueError( + f"Sparse matrix data inconsistency: indptr implies {nnz_ptr} nonzeros, " + f"but data has {nnz_data} and indices has {nnz_idx}." 
+ ) + + # Determine number of nonzero entries to write + nnz = nnz_data + major_step = max(1, int(chunk_elements)) if head is not None and head > 0: - nnz = min(nnz_limit, head) + nnz = min(nnz_data, head) # Write to stdout when out is None or "-", otherwise open a file on disk. if out is None or str(out) == "-": @@ -348,46 +356,57 @@ def export_mtx( # Matrix Market header: type, generator line, then shape and nnz. out_fh.write(f"%%MatrixMarket matrix coordinate {field} general\n") out_fh.write("% generated by h5ad-cli\n") + if head is not None and head > 0: + out_fh.write( + f"% output limited to first {nnz}/{nnz_data} nonzero entries\n" + ) out_fh.write(f"{n_rows} {n_cols} {nnz}\n") + # Iterate over major axis (rows for CSR, cols for CSC) major = n_rows if enc == "csr_matrix" else n_cols max_lines = head if head is not None and head > 0 else None written = 0 with status_ctx as status: - for major_i in range(major): - start = min(int(indptr_arr[major_i]), nnz_limit) - end = min(int(indptr_arr[major_i + 1]), nnz_limit) - if end <= start: - continue + for major_start in range(0, major, major_step): + major_end = min(major_start + major_step, major) if use_status and status: status.update( - f"[magenta]Exporting {obj}: block {major_i+1}/{major}...[/]" + f"[magenta]Exporting {obj}: {major_start+1}-{major_end} of {major}...[/]" ) - # Slice the sparse column/row segment for this major index in element chunks. 
- for chunk_start in range(start, end, elem_step): - chunk_end = min(chunk_start + elem_step, end) - idx = np.asarray(indices[chunk_start:chunk_end], dtype=np.int64) - vals = np.asarray(data[chunk_start:chunk_end]) + for major_i in range(major_start, major_end): + start = min(int(indptr_arr[major_i]), nnz_data) + end = min(int(indptr_arr[major_i + 1]), nnz_data) + if end <= start: + continue + idx = np.asarray(indices[start:end], dtype=np.int64) + vals = np.asarray(data[start:end]) m = min(len(idx), len(vals)) if m == 0: - continue + raise ValueError("Sparse matrix chunk has zero length.") + if max_lines is not None: + remaining = max_lines - written + if remaining <= 0: + break + if m > remaining: + m = remaining idx = idx[:m] vals = vals[:m] - for k in range(m): - if max_lines is not None and written >= max_lines: - break - if enc == "csr_matrix": - r = major_i + 1 - c = int(idx[k]) + 1 - else: - r = int(idx[k]) + 1 - c = major_i + 1 - v = vals[k] - if isinstance(v, np.generic): - v = v.item() - # Matrix Market uses 1-based indices. - out_fh.write(f"{r} {c} {v}\n") - written += 1 + idx_list = idx.tolist() + vals_list = vals.tolist() + if enc == "csr_matrix": + r = major_i + 1 + lines = [ + f"{r} {c + 1} {v}\n" + for c, v in zip(idx_list, vals_list) + ] + else: + c = major_i + 1 + lines = [ + f"{r + 1} {c} {v}\n" + for r, v in zip(idx_list, vals_list) + ] + out_fh.write("".join(lines)) + written += m if max_lines is not None and written >= max_lines: break if max_lines is not None and written >= max_lines: @@ -490,7 +509,7 @@ def export_image(file: Path, obj: str, out: Path, console: Console) -> None: try: from PIL import Image # type: ignore except Exception as e: # pragma: no cover - raise RuntimeError( + raise ImportError( "Pillow is required for image export. 
Install with: pip install h5ad[images]" ) from e From 559b40d5c148d7e9363d62fa0a0edd8de01957a9 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 14:59:32 +0000 Subject: [PATCH 32/62] Enhance export functions: add detailed docstrings for export_npy and export_mtx, including argument descriptions and error handling --- src/h5ad/commands/export.py | 146 +++++++++++++++++++++++++----------- 1 file changed, 102 insertions(+), 44 deletions(-) diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index 4fc94b2..2d6a7b3 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -210,6 +210,16 @@ def export_npy( - v0.2.0 (modern): Datasets with encoding-type="array" - v0.1.0 (legacy): Plain datasets without encoding attributes - Encoded groups: nullable-integer, nullable-boolean, string-array (extracts values) + + Args: + file: Path to the .h5ad file + obj: HDF5 path to the dataset or encoded group + out: Output .npy file path + chunk_elements: Number of elements to read per chunk + console: Rich console for status output + + Raises: + ValueError: If the target object is not exportable as .npy """ with h5py.File(file, "r") as f: h5obj = _resolve(f, obj) @@ -279,12 +289,26 @@ def export_mtx( out: Optional[Path], head: Optional[int], chunk_elements: int, + in_memory: bool, console: Console, ) -> None: """Export a CSR/CSC matrix group (AnnData encoding) to Matrix Market (.mtx). If out is None or "-", writes to stdout. The head parameter limits output lines. - chunk_elements controls how many rows/columns are processed per slice. + chunk_elements controls how many rows/columns are processed per slice when + streaming. Use in_memory for small matrices to load everything at once. 
+ + Args: + file: Path to the .h5ad file + obj: HDF5 path to the matrix group + out: Output .mtx file path (or None for stdout) + head: Output only the first n nonzero entries + chunk_elements: Number of rows/columns to process per chunk + in_memory: Load the entire sparse matrix into memory before exporting + console: Rich console for status output + + Raises: + ValueError: If the target object is not a valid CSR/CSC matrix group. """ with h5py.File(file, "r") as f: h5obj = _resolve(f, obj) @@ -362,55 +386,89 @@ def export_mtx( ) out_fh.write(f"{n_rows} {n_cols} {nnz}\n") - # Iterate over major axis (rows for CSR, cols for CSC) - major = n_rows if enc == "csr_matrix" else n_cols - max_lines = head if head is not None and head > 0 else None - written = 0 - with status_ctx as status: - for major_start in range(0, major, major_step): - major_end = min(major_start + major_step, major) + if in_memory: + with status_ctx as status: if use_status and status: status.update( - f"[magenta]Exporting {obj}: {major_start+1}-{major_end} of {major}...[/]" + f"[magenta]Loading entire matrix {obj} into memory...[/]" + ) + data_arr = np.asarray(data[...]) + indices_arr = np.asarray(indices[...], dtype=np.int64) + counts = np.diff(indptr_arr) + if int(counts.sum()) != nnz_data: + raise ValueError( + "Sparse matrix indptr does not match data/indices length." 
) - for major_i in range(major_start, major_end): - start = min(int(indptr_arr[major_i]), nnz_data) - end = min(int(indptr_arr[major_i + 1]), nnz_data) - if end <= start: - continue - idx = np.asarray(indices[start:end], dtype=np.int64) - vals = np.asarray(data[start:end]) - m = min(len(idx), len(vals)) - if m == 0: - raise ValueError("Sparse matrix chunk has zero length.") - if max_lines is not None: - remaining = max_lines - written - if remaining <= 0: + + if enc == "csr_matrix": + major_idx = np.repeat(np.arange(n_rows, dtype=np.int64), counts) + row_idx = major_idx + col_idx = indices_arr + else: + major_idx = np.repeat(np.arange(n_cols, dtype=np.int64), counts) + row_idx = indices_arr + col_idx = major_idx + + if head is not None and head > 0: + row_idx = row_idx[:nnz] + col_idx = col_idx[:nnz] + data_arr = data_arr[:nnz] + + data_fmt = "%.18g" if field == "real" else "%d" + coords = np.column_stack((row_idx + 1, col_idx + 1, data_arr)) + if use_status and status: + status.update(f"[magenta]Saving {nnz} entries to {out}...[/]") + np.savetxt(out_fh, coords, fmt=["%d", "%d", data_fmt], newline="\n") + else: + # Iterate over major axis (rows for CSR, cols for CSC) + major = n_rows if enc == "csr_matrix" else n_cols + max_lines = head if head is not None and head > 0 else None + written = 0 + with status_ctx as status: + for major_start in range(0, major, major_step): + major_end = min(major_start + major_step, major) + if use_status and status: + status.update( + f"[magenta]Exporting {obj}: {major_start+1}-{major_end} of {major}...[/]" + ) + for major_i in range(major_start, major_end): + start = min(int(indptr_arr[major_i]), nnz_data) + end = min(int(indptr_arr[major_i + 1]), nnz_data) + if end <= start: + continue + idx = np.asarray(indices[start:end], dtype=np.int64) + vals = np.asarray(data[start:end]) + m = min(len(idx), len(vals)) + if m == 0: + raise ValueError("Sparse matrix chunk has zero length.") + if max_lines is not None: + remaining = max_lines - 
written + if remaining <= 0: + break + if m > remaining: + m = remaining + idx = idx[:m] + vals = vals[:m] + idx_list = idx.tolist() + vals_list = vals.tolist() + if enc == "csr_matrix": + r = major_i + 1 + lines = [ + f"{r} {c + 1} {v}\n" + for c, v in zip(idx_list, vals_list) + ] + else: + c = major_i + 1 + lines = [ + f"{r + 1} {c} {v}\n" + for r, v in zip(idx_list, vals_list) + ] + out_fh.write("".join(lines)) + written += m + if max_lines is not None and written >= max_lines: break - if m > remaining: - m = remaining - idx = idx[:m] - vals = vals[:m] - idx_list = idx.tolist() - vals_list = vals.tolist() - if enc == "csr_matrix": - r = major_i + 1 - lines = [ - f"{r} {c + 1} {v}\n" - for c, v in zip(idx_list, vals_list) - ] - else: - c = major_i + 1 - lines = [ - f"{r + 1} {c} {v}\n" - for r, v in zip(idx_list, vals_list) - ] - out_fh.write("".join(lines)) - written += m if max_lines is not None and written >= max_lines: break - if max_lines is not None and written >= max_lines: - break finally: if out_fh is not sys.stdout: out_fh.close() From 36e5e8b6be4b6461ecb3fbac10bdb63cb084cf26 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 14:59:38 +0000 Subject: [PATCH 33/62] Add in-memory option to export_sparse command for improved performance --- src/h5ad/cli.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 0c6190d..f53b3af 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -245,6 +245,12 @@ def export_sparse( "-C", help="Number of rows/columns (depends on compression format) to process per chunk", ), + in_memory: bool = typer.Option( + False, + "--in-memory", + "-m", + help="Load the entire sparse matrix into memory before exporting (may be faster for small matrices)", + ), ) -> None: """ Export a sparse matrix (CSR/CSC) to Matrix Market (.mtx) format. 
@@ -262,7 +268,7 @@ def export_sparse( out=output, head=head, chunk_elements=chunk_elements, - memory_mb=memory_mb, + in_memory=in_memory, console=console, ) except Exception as e: From 878421d69430d8315b78b31d20670ded96ccb3e9 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 15:11:45 +0000 Subject: [PATCH 34/62] Refactor export_dict and export_json functions: update output parameter handling and improve file writing logic --- src/h5ad/cli.py | 8 +++++--- src/h5ad/commands/export.py | 18 ++++++++++++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index f53b3af..5a462ac 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -284,9 +284,11 @@ def export_dict( entry: str = typer.Argument( ..., help="Entry path to export (e.g., 'uns', 'uns/colors')" ), - out: Path = typer.Argument(..., help="Output .json file path"), + output: Optional[Path] = typer.Option( + None, "--output", "-o", help="Output .json file path" + ), max_elements: int = typer.Option( - 1_000_000, + 100_000, "--max-elements", help="Maximum array elements for JSON export", ), @@ -306,7 +308,7 @@ def export_dict( export_json( file=file, obj=entry, - out=out, + out=output, max_elements=max_elements, include_attrs=include_attrs, console=console, diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index 2d6a7b3..ae6f73f 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -494,10 +494,20 @@ def export_json( payload = _to_jsonable( h5obj, max_elements=max_elements, include_attrs=include_attrs ) - out.parent.mkdir(parents=True, exist_ok=True) - with open(out, "w", encoding="utf-8") as fh: - json.dump(payload, fh, indent=2, ensure_ascii=False, sort_keys=True) - console.print(f"[green]Wrote[/] {out}") + # Write to stdout when out is None or "-", otherwise open a file on disk. 
+ if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out.parent.mkdir(parents=True, exist_ok=True) + out_fh = open(out, "w", encoding="utf-8") + try: + json.dump(payload, out_fh, indent=2, ensure_ascii=False, sort_keys=True) + out_fh.write("\n") + finally: + if out_fh is not sys.stdout: + out_fh.close() + if out_fh is not sys.stdout: + console.print(f"[green]Wrote[/] {out}") def _attrs_to_jsonable( From 47fc2833ed0f5ed7462f1dda67baf70758d2bebc Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 15:22:29 +0000 Subject: [PATCH 35/62] Refactor export_image function: update output parameter to use Option, enhance docstring with argument descriptions and error handling --- src/h5ad/cli.py | 9 ++++++--- src/h5ad/commands/export.py | 23 +++++++++++++++-------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 5a462ac..d947270 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -12,10 +12,11 @@ export_mtx, export_npy, export_json, - export_image, export_table, ) +from h5ad.commands import export_image as export_image_cmd + app = typer.Typer( help="Streaming CLI for huge .h5ad files (info, subset, export, import)." ) @@ -324,7 +325,9 @@ def export_image( ..., help="Path to the .h5ad file", exists=True, readable=True ), entry: str = typer.Argument(..., help="Entry path to export (2D or 3D array)"), - out: Path = typer.Argument(..., help="Output image file (.png, .jpg, .tiff)"), + output: Optional[Path] = typer.Option( + None, "--output", "-o", help="Output image file (.png, .jpg, .tiff)" + ), ) -> None: """ Export an image-like array to PNG/JPG/TIFF format. 
@@ -336,7 +339,7 @@ def export_image( """ try: - export_image(file=file, obj=entry, out=out, console=console) + export_image_cmd(file=file, obj=entry, out=output, console=console) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index ae6f73f..06c7f0f 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -10,6 +10,7 @@ import h5py import numpy as np from rich.console import Console +from PIL import Image from h5ad.read import col_chunk_as_strings, decode_str_array from h5ad.info import get_axis_group, get_entry_type @@ -573,20 +574,24 @@ def _to_jsonable(h5obj: H5Obj, max_elements: int, include_attrs: bool) -> Any: def export_image(file: Path, obj: str, out: Path, console: Console) -> None: - """Export an image-like dataset (H,W) or (H,W,C) to PNG/JPG/TIFF.""" - try: - from PIL import Image # type: ignore - except Exception as e: # pragma: no cover - raise ImportError( - "Pillow is required for image export. Install with: pip install h5ad[images]" - ) from e - + """ + Export an image-like dataset (H,W) or (H,W,C) to PNG/JPG/TIFF. + Args: + file: Path to the .h5ad file + obj: HDF5 path to the dataset + out: Output image file path + console: Rich console for status output + Raises: + ValueError: If the target object is not a valid image array. 
+ """ + # Load dataset with h5py.File(file, "r") as f: h5obj = _resolve(f, obj) if not isinstance(h5obj, h5py.Dataset): raise ValueError("Image export requires a dataset.") arr = np.asarray(h5obj[...]) + # Validate shape if arr.ndim not in (2, 3): raise ValueError(f"Expected 2D or 3D image array; got shape {arr.shape}.") if arr.ndim == 3 and arr.shape[2] not in (1, 3, 4): @@ -609,9 +614,11 @@ def export_image(file: Path, obj: str, out: Path, console: Console) -> None: else: raise ValueError(f"Unsupported image dtype: {arr.dtype}") + # If single-channel 3D, convert to 2D if arr.ndim == 3 and arr.shape[2] == 1: arr = arr[:, :, 0] + # Save image img = Image.fromarray(arr) out.parent.mkdir(parents=True, exist_ok=True) img.save(out) From 595d81bf1f5f56ead652f3cf599294db7de68175 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:31:44 +0000 Subject: [PATCH 36/62] HUGE REFACTOR: Add format-specific import/export helpers for various data types --- src/h5ad/formats/__init__.py | 1 + src/h5ad/formats/array.py | 99 +++++++++++++ src/h5ad/formats/common.py | 67 +++++++++ src/h5ad/formats/dataframe.py | 169 ++++++++++++++++++++++ src/h5ad/formats/image.py | 47 ++++++ src/h5ad/formats/json_data.py | 155 ++++++++++++++++++++ src/h5ad/formats/sparse.py | 262 ++++++++++++++++++++++++++++++++++ src/h5ad/formats/validate.py | 97 +++++++++++++ 8 files changed, 897 insertions(+) create mode 100644 src/h5ad/formats/__init__.py create mode 100644 src/h5ad/formats/array.py create mode 100644 src/h5ad/formats/common.py create mode 100644 src/h5ad/formats/dataframe.py create mode 100644 src/h5ad/formats/image.py create mode 100644 src/h5ad/formats/json_data.py create mode 100644 src/h5ad/formats/sparse.py create mode 100644 src/h5ad/formats/validate.py diff --git a/src/h5ad/formats/__init__.py b/src/h5ad/formats/__init__.py new file mode 100644 index 0000000..18b9721 --- /dev/null +++ b/src/h5ad/formats/__init__.py @@ -0,0 +1 @@ +"""Format-specific import/export helpers.""" diff 
--git a/src/h5ad/formats/array.py b/src/h5ad/formats/array.py new file mode 100644 index 0000000..1dd21ac --- /dev/null +++ b/src/h5ad/formats/array.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import numpy as np +from rich.console import Console + +from h5ad.formats.common import _get_encoding_type, _resolve +from h5ad.formats.validate import validate_dimensions +from h5ad.storage import create_dataset, is_dataset, is_group +from h5ad.util.path import norm_path + + +def export_npy( + root: Any, + obj: str, + out: Path, + chunk_elements: int, + console: Console, +) -> None: + h5obj = _resolve(root, obj) + + if is_group(h5obj): + enc = _get_encoding_type(h5obj) + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): + if "values" not in h5obj: + raise ValueError(f"Encoded group '{obj}' is missing 'values' dataset.") + ds = h5obj["values"] + console.print(f"[dim]Exporting nullable array values from '{obj}'[/]") + else: + raise ValueError( + f"Target '{obj}' is a group with encoding '{enc}'; cannot export as .npy directly." + ) + elif is_dataset(h5obj): + ds = h5obj + else: + raise ValueError("Target is not an array-like object.") + + out.parent.mkdir(parents=True, exist_ok=True) + mm = np.lib.format.open_memmap(out, mode="w+", dtype=ds.dtype, shape=ds.shape) + try: + if ds.shape == (): + mm[...] 
= ds[()] + console.print(f"[green]Wrote[/] {out}") + return + + if ds.ndim == 1: + n = int(ds.shape[0]) + step = max(1, int(chunk_elements)) + with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: + for start in range(0, n, step): + end = min(start + step, n) + status.update( + f"[magenta]Exporting {obj}: {start}-{end} of {n}...[/]" + ) + mm[start:end] = ds[start:end] + console.print(f"[green]Wrote[/] {out}") + return + + n0 = int(ds.shape[0]) + row_elems = int(np.prod(ds.shape[1:])) if ds.ndim > 1 else 1 + step0 = max(1, int(chunk_elements) // max(1, row_elems)) + with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: + for start in range(0, n0, step0): + end = min(start + step0, n0) + status.update( + f"[magenta]Exporting {obj}: {start}-{end} of {n0}...[/]" + ) + mm[start:end, ...] = ds[start:end, ...] + console.print(f"[green]Wrote[/] {out}") + finally: + del mm + + +def import_npy( + root: Any, + obj: str, + input_file: Path, + console: Console, +) -> None: + obj = norm_path(obj) + arr = np.load(input_file) + + validate_dimensions(root, obj, arr.shape, console) + + parts = obj.split("/") + parent = root + for part in parts[:-1]: + parent = parent[part] if part in parent else parent.create_group(part) + name = parts[-1] + + if name in parent: + del parent[name] + + create_dataset(parent, name, data=arr) + + shape_str = "×".join(str(d) for d in arr.shape) + console.print(f"[green]Imported[/] {shape_str} array into '{obj}'") diff --git a/src/h5ad/formats/common.py b/src/h5ad/formats/common.py new file mode 100644 index 0000000..6282eb5 --- /dev/null +++ b/src/h5ad/formats/common.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from typing import Any, Dict + +import numpy as np + +from h5ad.storage import is_dataset, is_group +from h5ad.util.path import norm_path + + +TYPE_EXTENSIONS = { + "dataframe": {".csv"}, + "sparse-matrix": {".mtx"}, + "dense-matrix": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}, + 
"array": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}, + "dict": {".json"}, + "scalar": {".json"}, + "categorical": {".csv"}, + "awkward-array": {".json"}, +} + +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tif", ".tiff"} + +EXPORTABLE_TYPES = set(TYPE_EXTENSIONS.keys()) + + +def _get_encoding_type(group: Any) -> str: + enc = group.attrs.get("encoding-type", "") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + return str(enc) + + +def _resolve(root: Any, obj: str) -> Any: + obj = norm_path(obj) + if obj not in root: + raise KeyError(f"'{obj}' not found in the file.") + return root[obj] + + +def _check_json_exportable(h5obj: Any, max_elements: int, path: str = "") -> None: + if is_dataset(h5obj): + if h5obj.shape == (): + return + n = int(np.prod(h5obj.shape)) if h5obj.shape else 0 + if n > max_elements: + obj_name = getattr(h5obj, "name", "") + raise ValueError( + f"Cannot export to JSON: '{path or obj_name}' has {n} elements " + f"(max {max_elements}). Use --max-elements to increase limit." + ) + return + + if is_group(h5obj): + enc = _get_encoding_type(h5obj) + if enc in ("csr_matrix", "csc_matrix"): + obj_name = getattr(h5obj, "name", "") + raise ValueError( + f"Cannot export to JSON: '{path or obj_name}' is a sparse matrix. " + "Export it as .mtx instead." 
+ ) + + for key in h5obj.keys(): + child = h5obj[key] + child_path = f"{path}/{key}" if path else key + if is_group(child) or is_dataset(child): + _check_json_exportable(child, max_elements=max_elements, path=child_path) diff --git a/src/h5ad/formats/dataframe.py b/src/h5ad/formats/dataframe.py new file mode 100644 index 0000000..f767c4c --- /dev/null +++ b/src/h5ad/formats/dataframe.py @@ -0,0 +1,169 @@ +from __future__ import annotations + +import csv +import sys +from contextlib import nullcontext +from pathlib import Path +from typing import Any, List, Optional, Tuple + +import numpy as np +from rich.console import Console + +from h5ad.core.info import get_axis_group +from h5ad.core.read import col_chunk_as_strings +from h5ad.formats.validate import validate_dimensions +from h5ad.storage import create_dataset, is_zarr_group + + +def export_dataframe( + root: Any, + axis: str, + columns: Optional[List[str]], + out: Optional[Path], + chunk_rows: int, + head: Optional[int], + console: Console, +) -> None: + group, n_rows, index_name = get_axis_group(root, axis) + + reserved_keys = {"_index", "__categories"} + + if columns: + col_names = list(columns) + else: + col_names = [ + k for k in group.keys() if k not in reserved_keys and k != index_name + ] + if index_name and index_name not in col_names: + col_names.insert(0, index_name) + + if isinstance(index_name, bytes): + index_name = index_name.decode("utf-8") + + if index_name not in col_names: + col_names.insert(0, index_name) + else: + col_names = [index_name] + [c for c in col_names if c != index_name] + + if head is not None and head > 0: + n_rows = min(n_rows, head) + + if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out_fh = open(out, "w", newline="", encoding="utf-8") + writer = csv.writer(out_fh) + + try: + writer.writerow(col_names) + cat_cache = {} + + use_status = out_fh is not sys.stdout + status_ctx = ( + console.status(f"[magenta]Exporting {axis} table to {out}...[/]") + if 
use_status + else nullcontext() + ) + + with status_ctx as status: + for start in range(0, n_rows, chunk_rows): + end = min(start + chunk_rows, n_rows) + if use_status and status: + status.update( + f"[magenta]Exporting rows {start}-{end} of {n_rows}...[/]" + ) + cols_data: List[List[str]] = [] + for col in col_names: + cols_data.append( + col_chunk_as_strings(group, col, start, end, cat_cache) + ) + for row_idx in range(end - start): + row = [ + cols_data[col_idx][row_idx] + for col_idx in range(len(col_names)) + ] + writer.writerow(row) + finally: + if out_fh is not sys.stdout: + out_fh.close() + + +def _read_csv( + input_file: Path, + index_column: Optional[str], +) -> Tuple[List[dict], List[str], List[str], str]: + with open(input_file, "r", encoding="utf-8", newline="") as f: + reader = csv.DictReader(f) + if reader.fieldnames is None: + raise ValueError("CSV file has no header.") + fieldnames = list(reader.fieldnames) + + if index_column: + if index_column not in fieldnames: + raise ValueError( + f"Index column '{index_column}' not found in CSV. " + f"Available columns: {', '.join(fieldnames)}" + ) + idx_col = index_column + else: + idx_col = fieldnames[0] + + rows = list(reader) + + index_values = [row[idx_col] for row in rows] + data_columns = [c for c in fieldnames if c != idx_col] + + return rows, data_columns, index_values, idx_col + + +def import_dataframe( + root: Any, + obj: str, + input_file: Path, + index_column: Optional[str], + console: Console, +) -> None: + if obj not in ("obs", "var"): + raise ValueError( + f"CSV import is only supported for 'obs' or 'var', not '{obj}'." 
+ ) + + rows, data_columns, index_values, _ = _read_csv(input_file, index_column) + n_rows = len(rows) + + validate_dimensions(root, obj, (n_rows,), console) + + if obj in root: + del root[obj] + + group = root.create_group(obj) + index_name = "obs_names" if obj == "obs" else "var_names" + group.attrs["_index"] = index_name + group.attrs["encoding-type"] = "dataframe" + group.attrs["encoding-version"] = "0.2.0" + + if is_zarr_group(group): + group.attrs["column-order"] = list(data_columns) + else: + group.attrs["column-order"] = np.array(data_columns, dtype="S") + + create_dataset(group, index_name, data=np.array(index_values, dtype="S")) + + for col in data_columns: + values = [row[col] for row in rows] + try: + arr = np.array(values, dtype=np.float64) + create_dataset(group, col, data=arr) + except (ValueError, TypeError): + try: + arr = np.array(values, dtype=np.int64) + create_dataset(group, col, data=arr) + except (ValueError, TypeError): + arr = np.array(values, dtype="S") + ds = create_dataset(group, col, data=arr) + ds.attrs["encoding-type"] = "string-array" + ds.attrs["encoding-version"] = "0.2.0" + + console.print( + f"[green]Imported[/] {n_rows} rows × {len(data_columns)} columns into '{obj}'" + ) diff --git a/src/h5ad/formats/image.py b/src/h5ad/formats/image.py new file mode 100644 index 0000000..fe5d2ce --- /dev/null +++ b/src/h5ad/formats/image.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import numpy as np +from PIL import Image +from rich.console import Console + +from h5ad.formats.common import _resolve +from h5ad.storage import is_dataset + + +def export_image(root: Any, obj: str, out: Path, console: Console) -> None: + h5obj = _resolve(root, obj) + if not is_dataset(h5obj): + raise ValueError("Image export requires a dataset.") + arr = np.asarray(h5obj[...]) + + if arr.ndim not in (2, 3): + raise ValueError(f"Expected 2D or 3D image array; got shape {arr.shape}.") + if arr.ndim == 
3 and arr.shape[2] not in (1, 3, 4): + raise ValueError( + f"Expected last dimension (channels) to be 1, 3, or 4; got {arr.shape}." + ) + + if np.issubdtype(arr.dtype, np.floating): + amax = float(np.nanmax(arr)) if arr.size else 0.0 + if amax <= 1.0: + arr = np.clip(arr, 0.0, 1.0) * 255.0 + else: + arr = np.clip(arr, 0.0, 255.0) + arr = arr.astype(np.uint8) + elif np.issubdtype(arr.dtype, np.integer): + arr = np.clip(arr, 0, 255).astype(np.uint8) + elif arr.dtype == np.bool_: + arr = arr.astype(np.uint8) * 255 + else: + raise ValueError(f"Unsupported image dtype: {arr.dtype}") + + if arr.ndim == 3 and arr.shape[2] == 1: + arr = arr[:, :, 0] + + img = Image.fromarray(arr) + out.parent.mkdir(parents=True, exist_ok=True) + img.save(out) + console.print(f"[green]Wrote[/] {out}") diff --git a/src/h5ad/formats/json_data.py b/src/h5ad/formats/json_data.py new file mode 100644 index 0000000..c983677 --- /dev/null +++ b/src/h5ad/formats/json_data.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any, Dict + +import numpy as np +from rich.console import Console + +from h5ad.core.read import decode_str_array +from h5ad.formats.common import _check_json_exportable, _resolve +from h5ad.storage import create_dataset, is_dataset, is_group +from h5ad.util.path import norm_path + + +def export_json( + root: Any, + obj: str, + out: Path | None, + max_elements: int, + include_attrs: bool, + console: Console, +) -> None: + h5obj = _resolve(root, obj) + _check_json_exportable(h5obj, max_elements=max_elements) + + payload = _to_jsonable( + h5obj, max_elements=max_elements, include_attrs=include_attrs + ) + if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out.parent.mkdir(parents=True, exist_ok=True) + out_fh = open(out, "w", encoding="utf-8") + try: + json.dump(payload, out_fh, indent=2, ensure_ascii=False, sort_keys=True) + out_fh.write("\n") + finally: + if out_fh is not sys.stdout: + 
out_fh.close() + if out_fh is not sys.stdout: + console.print(f"[green]Wrote[/] {out}") + + +def _attrs_to_jsonable(attrs: Any, max_elements: int) -> Dict[str, Any]: + out: Dict[str, Any] = {} + for k in attrs.keys(): + v = attrs.get(k) + out[str(k)] = _pyify(v, max_elements=max_elements) + return out + + +def _pyify(value: Any, max_elements: int) -> Any: + if isinstance(value, bytes): + try: + return value.decode("utf-8") + except Exception: + return value.decode("utf-8", errors="replace") + if isinstance(value, np.generic): + return value.item() + if isinstance(value, np.ndarray): + if value.size > max_elements: + raise ValueError( + f"Refusing to convert array of size {value.size} (> {max_elements}) to JSON." + ) + if np.issubdtype(value.dtype, np.bytes_) or value.dtype.kind == "O": + value = decode_str_array(value) + return value.tolist() + return value + + +def _dataset_to_jsonable(ds: Any, max_elements: int) -> Any: + if ds.shape == (): + v = ds[()] + return _pyify(v, max_elements=max_elements) + n = int(np.prod(ds.shape)) if ds.shape else 0 + if n > max_elements: + ds_name = getattr(ds, "name", "") + raise ValueError( + f"Refusing to convert dataset {ds_name!r} with {n} elements (> {max_elements}) to JSON." 
+ ) + arr = np.asarray(ds[...]) + return _pyify(arr, max_elements=max_elements) + + +def _to_jsonable(h5obj: Any, max_elements: int, include_attrs: bool) -> Any: + if is_dataset(h5obj): + return _dataset_to_jsonable(h5obj, max_elements=max_elements) + + d: Dict[str, Any] = {} + if include_attrs and len(h5obj.attrs): + d["__attrs__"] = _attrs_to_jsonable(h5obj.attrs, max_elements=max_elements) + + for key in h5obj.keys(): + child = h5obj[key] + if is_group(child) or is_dataset(child): + d[str(key)] = _to_jsonable( + child, + max_elements=max_elements, + include_attrs=include_attrs, + ) + return d + + +def import_json( + root: Any, + obj: str, + input_file: Path, + console: Console, +) -> None: + obj = norm_path(obj) + with open(input_file, "r", encoding="utf-8") as fh: + payload = json.load(fh) + + parts = obj.split("/") + parent = root + for part in parts[:-1]: + parent = parent[part] if part in parent else parent.create_group(part) + name = parts[-1] + + if name in parent: + del parent[name] + + _write_json_to_group(parent, name, payload) + + console.print(f"[green]Imported[/] JSON data into '{obj}'") + + +def _write_json_to_group(parent: Any, name: str, value: Any) -> None: + if isinstance(value, dict): + group = parent.create_group(name) + for k, v in value.items(): + _write_json_to_group(group, k, v) + elif isinstance(value, list): + try: + arr = np.array(value) + if arr.dtype.kind in ("U", "O"): + arr = np.array(value, dtype="S") + create_dataset(parent, name, data=arr) + except (ValueError, TypeError): + create_dataset(parent, name, data=json.dumps(value).encode("utf-8")) + elif isinstance(value, str): + create_dataset(parent, name, data=np.array([value], dtype="S")) + elif isinstance(value, bool): + create_dataset(parent, name, data=np.array(value, dtype=bool)) + elif isinstance(value, int): + create_dataset(parent, name, data=np.array(value, dtype=np.int64)) + elif isinstance(value, float): + create_dataset(parent, name, data=np.array(value, 
dtype=np.float64)) + elif value is None: + ds = create_dataset(parent, name, data=np.array([], dtype="S")) + ds.attrs["_is_none"] = True + else: + raise ValueError(f"Cannot convert JSON value of type {type(value).__name__}") diff --git a/src/h5ad/formats/sparse.py b/src/h5ad/formats/sparse.py new file mode 100644 index 0000000..4045ce5 --- /dev/null +++ b/src/h5ad/formats/sparse.py @@ -0,0 +1,262 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any, List, Tuple +import sys +from contextlib import nullcontext + +import numpy as np +from rich.console import Console + +from h5ad.formats.common import _get_encoding_type, _resolve +from h5ad.formats.validate import validate_dimensions +from h5ad.storage import create_dataset, is_dataset, is_group, is_zarr_group +from h5ad.util.path import norm_path + + +def _read_mtx( + input_file: Path, +) -> Tuple[List[Tuple[int, int, float]], Tuple[int, int], int]: + with open(input_file, "r", encoding="utf-8") as fh: + header = fh.readline() + if not header.startswith("%%MatrixMarket"): + raise ValueError("Invalid MTX file: missing MatrixMarket header.") + + parts = header.lower().split() + field = "real" + for p in parts: + if p in ("real", "integer", "complex", "pattern"): + field = p + break + + line = fh.readline() + while line.startswith("%"): + line = fh.readline() + + dims = line.split() + n_rows, n_cols, nnz = int(dims[0]), int(dims[1]), int(dims[2]) + + entries = [] + for _ in range(nnz): + parts = fh.readline().split() + r, c = int(parts[0]) - 1, int(parts[1]) - 1 + if field == "pattern": + v = 1.0 + else: + v = float(parts[2]) + entries.append((r, c, v)) + + return entries, (n_rows, n_cols), nnz + + +def _create_csr_from_entries( + entries: List[Tuple[int, int, float]], shape: Tuple[int, int] +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + n_rows, _ = shape + entries.sort(key=lambda x: (x[0], x[1])) + + data = np.array([e[2] for e in entries], dtype=np.float32) + indices = 
np.array([e[1] for e in entries], dtype=np.int32) + + indptr = np.zeros(n_rows + 1, dtype=np.int32) + for r, _, _ in entries: + indptr[r + 1] += 1 + indptr = np.cumsum(indptr) + + return data, indices, indptr + + +def export_mtx( + root: Any, + obj: str, + out: Path | None, + head: int | None, + chunk_elements: int, + in_memory: bool, + console: Console, +) -> None: + h5obj = _resolve(root, obj) + if not is_group(h5obj): + raise ValueError("MTX export requires a CSR/CSC matrix group (not a dataset).") + + enc = _get_encoding_type(h5obj) + if enc not in ("csr_matrix", "csc_matrix"): + raise ValueError( + f"Target group encoding-type is {enc!r}; expected 'csr_matrix' or 'csc_matrix'." + ) + + data = h5obj.get("data") + indices = h5obj.get("indices") + indptr = h5obj.get("indptr") + if not (is_dataset(data) and is_dataset(indices) and is_dataset(indptr)): + raise ValueError( + "Sparse matrix group must contain datasets: data, indices, indptr" + ) + + shape = h5obj.attrs.get("shape", None) + if shape is None: + raise ValueError("Sparse matrix group is missing required 'shape' attribute.") + n_rows, n_cols = (int(shape[0]), int(shape[1])) + + field = "real" if np.issubdtype(data.dtype, np.floating) else "integer" + + indptr_arr = np.asarray(indptr[...], dtype=np.int64) + nnz_ptr = int(indptr_arr[-1]) if indptr_arr.size else 0 + nnz_data = int(data.shape[0]) + nnz_idx = int(indices.shape[0]) + + if not (nnz_ptr == nnz_data == nnz_idx): + raise ValueError( + f"Sparse matrix data inconsistency: indptr implies {nnz_ptr} nonzeros, " + f"but data has {nnz_data} and indices has {nnz_idx}." 
+ ) + + nnz = nnz_data + major_step = max(1, int(chunk_elements)) + if head is not None and head > 0: + nnz = min(nnz_data, head) + + if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out.parent.mkdir(parents=True, exist_ok=True) + out_fh = open(out, "w", encoding="utf-8", newline="\n") + + use_status = out_fh is not sys.stdout + status_ctx = ( + console.status(f"[magenta]Exporting {obj} to {out}...[/]") + if use_status + else nullcontext() + ) + try: + out_fh.write(f"%%MatrixMarket matrix coordinate {field} general\n") + out_fh.write("% generated by h5ad-cli\n") + if head is not None and head > 0: + out_fh.write(f"% output limited to first {nnz}/{nnz_data} nonzero entries\n") + out_fh.write(f"{n_rows} {n_cols} {nnz}\n") + + if in_memory: + with status_ctx as status: + if use_status and status: + status.update( + f"[magenta]Loading entire matrix {obj} into memory...[/]" + ) + data_arr = np.asarray(data[...]) + indices_arr = np.asarray(indices[...], dtype=np.int64) + counts = np.diff(indptr_arr) + if int(counts.sum()) != nnz_data: + raise ValueError( + "Sparse matrix indptr does not match data/indices length." 
+ ) + + if enc == "csr_matrix": + major_idx = np.repeat(np.arange(n_rows, dtype=np.int64), counts) + row_idx = major_idx + col_idx = indices_arr + else: + major_idx = np.repeat(np.arange(n_cols, dtype=np.int64), counts) + row_idx = indices_arr + col_idx = major_idx + + if head is not None and head > 0: + row_idx = row_idx[:nnz] + col_idx = col_idx[:nnz] + data_arr = data_arr[:nnz] + + data_fmt = "%.18g" if field == "real" else "%d" + coords = np.column_stack((row_idx + 1, col_idx + 1, data_arr)) + if use_status and status: + status.update(f"[magenta]Saving {nnz} entries to {out}...[/]") + np.savetxt(out_fh, coords, fmt=["%d", "%d", data_fmt], newline="\n") + else: + major = n_rows if enc == "csr_matrix" else n_cols + max_lines = head if head is not None and head > 0 else None + written = 0 + with status_ctx as status: + for major_start in range(0, major, major_step): + major_end = min(major_start + major_step, major) + if use_status and status: + status.update( + f"[magenta]Exporting {obj}: {major_start+1}-{major_end} of {major}...[/]" + ) + for major_i in range(major_start, major_end): + start = min(int(indptr_arr[major_i]), nnz_data) + end = min(int(indptr_arr[major_i + 1]), nnz_data) + if end <= start: + continue + idx = np.asarray(indices[start:end], dtype=np.int64) + vals = np.asarray(data[start:end]) + m = min(len(idx), len(vals)) + if m == 0: + raise ValueError("Sparse matrix chunk has zero length.") + if max_lines is not None: + remaining = max_lines - written + if remaining <= 0: + break + if m > remaining: + m = remaining + idx = idx[:m] + vals = vals[:m] + idx_list = idx.tolist() + vals_list = vals.tolist() + if enc == "csr_matrix": + r = major_i + 1 + lines = [ + f"{r} {c + 1} {v}\n" + for c, v in zip(idx_list, vals_list) + ] + else: + c = major_i + 1 + lines = [ + f"{r + 1} {c} {v}\n" + for r, v in zip(idx_list, vals_list) + ] + out_fh.write("".join(lines)) + written += m + if max_lines is not None and written >= max_lines: + break + if max_lines is 
not None and written >= max_lines: + break + finally: + if out_fh is not sys.stdout: + out_fh.close() + if out_fh is not sys.stdout: + console.print(f"[green]Wrote[/] {out}") + + +def import_mtx( + root: Any, + obj: str, + input_file: Path, + console: Console, +) -> None: + obj = norm_path(obj) + entries, shape, nnz = _read_mtx(input_file) + data, indices, indptr = _create_csr_from_entries(entries, shape) + + validate_dimensions(root, obj, shape, console) + + parts = obj.split("/") + parent = root + for part in parts[:-1]: + parent = parent[part] if part in parent else parent.create_group(part) + name = parts[-1] + + if name in parent: + del parent[name] + + group = parent.create_group(name) + group.attrs["encoding-type"] = "csr_matrix" + group.attrs["encoding-version"] = "0.1.0" + if is_zarr_group(group): + group.attrs["shape"] = list(shape) + else: + group.attrs["shape"] = np.array(shape, dtype=np.int64) + + create_dataset(group, "data", data=data) + create_dataset(group, "indices", data=indices) + create_dataset(group, "indptr", data=indptr) + + console.print( + f"[green]Imported[/] {shape[0]}×{shape[1]} sparse matrix ({nnz} non-zero) into '{obj}'" + ) diff --git a/src/h5ad/formats/validate.py b/src/h5ad/formats/validate.py new file mode 100644 index 0000000..194192b --- /dev/null +++ b/src/h5ad/formats/validate.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +from typing import Optional, Tuple, Any + +from rich.console import Console + +from h5ad.core.info import axis_len +from h5ad.util.path import norm_path + + +OBS_AXIS_PREFIXES = ("obs", "obsm/", "obsp/") +VAR_AXIS_PREFIXES = ("var", "varm/", "varp/") +MATRIX_PREFIXES = ("X", "layers/") + + +def _get_axis_length(root: Any, axis: str) -> Optional[int]: + try: + return axis_len(root, axis) + except Exception: + return None + + +def validate_dimensions( + root: Any, + obj_path: str, + data_shape: Tuple[int, ...], + console: Console, +) -> None: + obj_path = norm_path(obj_path) + n_obs = 
_get_axis_length(root, "obs") + n_var = _get_axis_length(root, "var") + + if obj_path == "obs": + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"Row count mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + return + if obj_path == "var": + if n_var is not None and data_shape[0] != n_var: + raise ValueError( + f"Row count mismatch: input has {data_shape[0]} rows, " + f"but var has {n_var} features." + ) + return + + for prefix in MATRIX_PREFIXES: + if obj_path == prefix or obj_path.startswith(prefix + "/") or obj_path.startswith(prefix): + if obj_path == "X" or obj_path.startswith("layers/"): + if len(data_shape) < 2: + raise ValueError( + f"Matrix data requires 2D shape, got {len(data_shape)}D." + ) + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + if n_var is not None and data_shape[1] != n_var: + raise ValueError( + f"Second dimension mismatch: input has {data_shape[1]} columns, " + f"but var has {n_var} features." + ) + return + + for prefix in OBS_AXIS_PREFIXES: + if obj_path.startswith(prefix) and obj_path != "obs": + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + if obj_path.startswith("obsp/") and len(data_shape) >= 2: + if data_shape[1] != n_obs: + raise ValueError( + "obsp matrix must be square (n_obs × n_obs): " + f"got {data_shape[0]}×{data_shape[1]}, expected {n_obs}×{n_obs}." + ) + return + + for prefix in VAR_AXIS_PREFIXES: + if obj_path.startswith(prefix) and obj_path != "var": + if n_var is not None and data_shape[0] != n_var: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but var has {n_var} features." 
+ ) + if obj_path.startswith("varp/") and len(data_shape) >= 2: + if data_shape[1] != n_var: + raise ValueError( + "varp matrix must be square (n_var × n_var): " + f"got {data_shape[0]}×{data_shape[1]}, expected {n_var}×{n_var}." + ) + return + + console.print(f"[dim]Note: No dimension validation for path '{obj_path}'[/]") From 6d8903bd5dfa75eb77537f00df9975fe6cf93521 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:31:57 +0000 Subject: [PATCH 37/62] HUGE REFACTOR: Add core functionality for handling .h5ad and .zarr stores with subset operations --- src/h5ad/core/__init__.py | 1 + src/h5ad/core/info.py | 221 ++++++++++++++++++ src/h5ad/core/read.py | 112 +++++++++ src/h5ad/core/subset.py | 464 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 798 insertions(+) create mode 100644 src/h5ad/core/__init__.py create mode 100644 src/h5ad/core/info.py create mode 100644 src/h5ad/core/read.py create mode 100644 src/h5ad/core/subset.py diff --git a/src/h5ad/core/__init__.py b/src/h5ad/core/__init__.py new file mode 100644 index 0000000..9224273 --- /dev/null +++ b/src/h5ad/core/__init__.py @@ -0,0 +1 @@ +"""Core logic shared by CLI commands and format handlers.""" diff --git a/src/h5ad/core/info.py b/src/h5ad/core/info.py new file mode 100644 index 0000000..8db8a14 --- /dev/null +++ b/src/h5ad/core/info.py @@ -0,0 +1,221 @@ +from __future__ import annotations + +from typing import Optional, Tuple, Dict, Any, Union + +import numpy as np + +from h5ad.storage import is_dataset, is_group, is_hdf5_dataset + + +def _decode_attr(value: Any) -> Any: + if isinstance(value, bytes): + return value.decode("utf-8") + return value + + +def get_entry_type(entry: Any) -> Dict[str, Any]: + """ + Determine the type/format of an object for export guidance. 
+ + Supports both: + - v0.2.0 (modern): Objects with encoding-type/encoding-version attributes + - v0.1.0 (legacy): Objects without encoding attributes, inferred from structure + """ + result: Dict[str, Any] = { + "type": "unknown", + "export_as": None, + "encoding": None, + "shape": None, + "dtype": None, + "details": "", + "version": None, + } + + enc = _decode_attr(entry.attrs.get("encoding-type", b"")) + result["encoding"] = enc if enc else None + + enc_ver = _decode_attr(entry.attrs.get("encoding-version", b"")) + result["version"] = enc_ver if enc_ver else None + + if is_dataset(entry): + result["shape"] = entry.shape + result["dtype"] = str(entry.dtype) + + if "categories" in entry.attrs: + result["type"] = "categorical" + result["export_as"] = "csv" + result["version"] = result["version"] or "0.1.0" + n_cats = "?" + if is_hdf5_dataset(entry): + try: + cats_ref = entry.attrs["categories"] + cats_ds = entry.file[cats_ref] + n_cats = cats_ds.shape[0] + except Exception: + n_cats = "?" 
+ result["details"] = ( + f"Legacy categorical [{entry.shape[0]} values, {n_cats} categories]" + ) + return result + + if entry.shape == (): + result["type"] = "scalar" + result["export_as"] = "json" + result["details"] = f"Scalar value ({entry.dtype})" + return result + + if entry.ndim == 1: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"1D array [{entry.shape[0]}] ({entry.dtype})" + elif entry.ndim == 2: + result["type"] = "dense-matrix" + result["export_as"] = "npy" + result["details"] = ( + f"Dense matrix {entry.shape[0]}×{entry.shape[1]} ({entry.dtype})" + ) + elif entry.ndim == 3: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"3D array {entry.shape} ({entry.dtype})" + else: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"ND array {entry.shape} ({entry.dtype})" + return result + + if is_group(entry): + if enc in ("csr_matrix", "csc_matrix"): + shape = entry.attrs.get("shape", None) + shape_str = f"{shape[0]}×{shape[1]}" if shape is not None else "?" + result["type"] = "sparse-matrix" + result["export_as"] = "mtx" + result["details"] = ( + f"Sparse {enc.replace('_matrix', '').upper()} matrix {shape_str}" + ) + return result + + if enc == "categorical": + codes = entry.get("codes") + cats = entry.get("categories") + n_codes = codes.shape[0] if codes is not None else "?" + n_cats = cats.shape[0] if cats is not None else "?" 
+ result["type"] = "categorical" + result["export_as"] = "csv" + result["details"] = f"Categorical [{n_codes} values, {n_cats} categories]" + return result + + if ( + enc == "dataframe" + or "_index" in entry.attrs + or "obs_names" in entry + or "var_names" in entry + ): + if enc == "dataframe": + df_version = result["version"] or "0.2.0" + else: + df_version = "0.1.0" + result["version"] = df_version + + has_legacy_cats = "__categories" in entry + n_cols = len( + [k for k in entry.keys() if k not in ("_index", "__categories")] + ) + + result["type"] = "dataframe" + result["export_as"] = "csv" + if has_legacy_cats: + result["details"] = f"DataFrame with {n_cols} columns (legacy v0.1.0)" + else: + result["details"] = f"DataFrame with {n_cols} columns" + return result + + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"Encoded array ({enc})" + return result + + if enc == "string-array": + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = "Encoded string array" + return result + + if enc == "awkward-array": + length = entry.attrs.get("length", "?") + result["type"] = "awkward-array" + result["export_as"] = "json" + result["details"] = f"Awkward array (length={length})" + return result + + n_keys = len(list(entry.keys())) + result["type"] = "dict" + result["export_as"] = "json" + result["details"] = f"Group with {n_keys} keys" + return result + + return result + + +def format_type_info(info: Dict[str, Any]) -> str: + type_colors = { + "dataframe": "green", + "sparse-matrix": "magenta", + "dense-matrix": "blue", + "array": "blue", + "dict": "yellow", + "categorical": "green", + "scalar": "white", + "unknown": "red", + } + + color = type_colors.get(info["type"], "white") + return f"[{color}]<{info['type']}>[/]" + + +def axis_len(file: Any, axis: str) -> int: + if axis not in file: + raise KeyError(f"'{axis}' not found in the file.") + + 
group = file[axis] + if not is_group(group): + raise TypeError(f"'{axis}' is not a group.") + + index_name = group.attrs.get("_index", None) + if index_name is None: + if axis == "obs": + index_name = "obs_names" + elif axis == "var": + index_name = "var_names" + else: + raise ValueError(f"Invalid axis '{axis}'. Must be 'obs' or 'var'.") + + index_name = _decode_attr(index_name) + + if index_name not in group: + raise KeyError(f"Index dataset '{index_name}' not found in '{axis}' group.") + + dataset = group[index_name] + if not is_dataset(dataset): + raise TypeError(f"Index '{index_name}' in '{axis}' is not a dataset.") + if dataset.shape: + return int(dataset.shape[0]) + raise ValueError( + f"Cannot determine length of '{axis}': index dataset has no shape." + ) + + +def get_axis_group(file: Any, axis: str) -> Tuple[Any, int, str]: + if axis not in ("obs", "var"): + raise ValueError("axis must be 'obs' or 'var'.") + + n = axis_len(file, axis) + group = file[axis] + + index_name = group.attrs.get("_index", None) + if index_name is None: + index_name = "obs_names" if axis == "obs" else "var_names" + index_name = _decode_attr(index_name) + + return group, n, index_name diff --git a/src/h5ad/core/read.py b/src/h5ad/core/read.py new file mode 100644 index 0000000..b81ee1f --- /dev/null +++ b/src/h5ad/core/read.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +from typing import List, Dict, Any + +import h5py +import numpy as np + +from h5ad.storage import is_group, is_dataset, is_hdf5_dataset + + +def decode_str_array(array: np.ndarray) -> np.ndarray: + if np.issubdtype(array.dtype, np.bytes_): + return array.astype("U") + if array.dtype.kind == "O": + return array.astype(str) + return array.astype(str) + + +def read_categorical_column( + col: Any, + start: int, + end: int, + cache: Dict[int, np.ndarray], + parent_group: Any | None = None, +) -> List[str]: + key = id(col) + + if is_group(col): + if key not in cache: + cats = col["categories"][...] 
+ cats = decode_str_array(cats) + cache[key] = np.asarray(cats, dtype=str) + cats = cache[key] + + codes_ds = col["codes"] + codes = codes_ds[start:end] + codes = np.asarray(codes, dtype=np.int64) + return [cats[c] if 0 <= c < len(cats) else "" for c in codes] + + if is_dataset(col): + if key not in cache: + cats_ref = col.attrs.get("categories", None) + if cats_ref is not None and is_hdf5_dataset(col): + cats_ds = col.file[cats_ref] + cats = cats_ds[...] + elif parent_group is not None and "__categories" in parent_group: + col_name = col.name.split("/")[-1] + cats_grp = parent_group["__categories"] + if col_name in cats_grp: + cats = cats_grp[col_name][...] + else: + raise KeyError( + f"Cannot find categories for legacy column {col.name}" + ) + else: + raise KeyError( + f"Cannot find categories for legacy column {col.name}" + ) + cats = decode_str_array(cats) + cache[key] = np.asarray(cats, dtype=str) + cats = cache[key] + + codes = col[start:end] + codes = np.asarray(codes, dtype=np.int64) + return [cats[c] if 0 <= c < len(cats) else "" for c in codes] + + raise TypeError(f"Unsupported categorical column type: {type(col)}") + + +def col_chunk_as_strings( + group: Any, + col_name: str, + start: int, + end: int, + cat_cache: Dict[int, np.ndarray], +) -> List[str]: + if col_name not in group: + raise RuntimeError(f"Column {col_name!r} not found in group {group.name}") + + col = group[col_name] + + if is_dataset(col): + if "categories" in col.attrs: + return read_categorical_column(col, start, end, cat_cache, group) + + chunk = col[start:end] + if chunk.ndim != 1: + chunk = chunk.reshape(-1) + chunk = decode_str_array(np.asarray(chunk)) + return chunk.tolist() + + if is_group(col): + enc = col.attrs.get("encoding-type", b"") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + + if enc == "categorical": + return read_categorical_column(col, start, end, cat_cache) + + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): + values = 
col["values"][start:end] + mask = col["mask"][start:end] + values = decode_str_array(np.asarray(values)) + return ["" if m else str(v) for v, m in zip(values, mask)] + + raise ValueError( + f"Unsupported group encoding {enc!r} for column {col_name!r}" + ) + + raise TypeError( + f"Unsupported column type for {col_name!r} in group {group.name}" + ) diff --git a/src/h5ad/core/subset.py b/src/h5ad/core/subset.py new file mode 100644 index 0000000..ee254cd --- /dev/null +++ b/src/h5ad/core/subset.py @@ -0,0 +1,464 @@ +"""Subset operations for .h5ad and .zarr stores.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Optional, Set, Tuple, List, Dict, Any + +import numpy as np +from rich.console import Console +from rich.progress import ( + Progress, + SpinnerColumn, + TextColumn, + BarColumn, + TaskProgressColumn, + TimeElapsedColumn, +) + +from h5ad.core.read import decode_str_array +from h5ad.storage import ( + create_dataset, + copy_attrs, + copy_tree, + dataset_create_kwargs, + is_dataset, + is_group, + is_zarr_group, + is_zarr_array, + open_store, +) + + +def _target_backend(dst_group: Any) -> str: + return "zarr" if is_zarr_group(dst_group) else "hdf5" + + +def _ensure_group(parent: Any, name: str) -> Any: + return parent[name] if name in parent else parent.create_group(name) + + +def _group_get(parent: Any, key: str) -> Any | None: + return parent[key] if key in parent else None + + +def _decode_attr(value: Any) -> Any: + if isinstance(value, bytes): + return value.decode("utf-8") + return value + + +def _read_name_file(path: Path) -> Set[str]: + names: Set[str] = set() + with open(path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + names.add(line) + return names + + +def indices_from_name_set( + names_ds: Any, + keep: Set[str], + *, + chunk_size: int = 200_000, +) -> Tuple[np.ndarray, Set[str]]: + if names_ds.ndim != 1: + flat_len = int(np.prod(names_ds.shape)) + else: + flat_len = 
names_ds.shape[0] + + remaining = set(keep) + found_indices: List[int] = [] + + for start in range(0, flat_len, chunk_size): + end = min(start + chunk_size, flat_len) + chunk = names_ds[start:end] + chunk = decode_str_array(np.asarray(chunk)).astype(str) + + for i, name in enumerate(chunk): + if name in remaining: + found_indices.append(start + i) + remaining.remove(name) + + if not remaining: + break + + return np.asarray(found_indices, dtype=np.int64), remaining + + +def subset_axis_group( + src: Any, + dst: Any, + indices: Optional[np.ndarray], +) -> None: + copy_attrs(src.attrs, dst.attrs, target_backend=_target_backend(dst)) + target_backend = _target_backend(dst) + + for key in src.keys(): + obj = src[key] + + if is_dataset(obj): + if indices is None: + copy_tree(obj, dst, key) + else: + if is_zarr_array(obj): + if obj.ndim == 1: + data = obj.oindex[indices] + else: + selection = (indices,) + (slice(None),) * (obj.ndim - 1) + data = obj.oindex[selection] + else: + data = obj[indices, ...] + ds = create_dataset( + dst, + key, + data=data, + **dataset_create_kwargs(obj, target_backend=target_backend), + ) + copy_attrs(obj.attrs, ds.attrs, target_backend=target_backend) + elif is_group(obj): + enc = obj.attrs.get("encoding-type", b"") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + + if enc == "categorical": + gdst = dst.create_group(key) + copy_attrs(obj.attrs, gdst.attrs, target_backend=target_backend) + copy_tree(obj["categories"], gdst, "categories") + + codes = obj["codes"] + if indices is None: + copy_tree(codes, gdst, "codes") + else: + codes_sub = codes[indices, ...] 
+ ds = create_dataset( + gdst, + "codes", + data=codes_sub, + **dataset_create_kwargs(codes, target_backend=target_backend), + ) + copy_attrs(codes.attrs, ds.attrs, target_backend=target_backend) + else: + copy_tree(obj, dst, key) + + +def subset_dense_matrix( + src: Any, + dst_parent: Any, + name: str, + obs_idx: Optional[np.ndarray], + var_idx: Optional[np.ndarray], + *, + chunk_rows: int = 1024, +) -> None: + if src.ndim != 2: + copy_tree(src, dst_parent, name) + return + + n_obs, n_var = src.shape + out_obs = len(obs_idx) if obs_idx is not None else n_obs + out_var = len(var_idx) if var_idx is not None else n_var + + target_backend = _target_backend(dst_parent) + kw = dataset_create_kwargs(src, target_backend=target_backend) + chunks = kw.get("chunks") + if isinstance(chunks, (tuple, list)) and len(chunks) >= 2: + kw["chunks"] = (min(int(chunks[0]), out_obs), min(int(chunks[1]), out_var)) + + dst = create_dataset( + dst_parent, + name, + shape=(out_obs, out_var), + dtype=src.dtype, + **kw, + ) + copy_attrs(src.attrs, dst.attrs, target_backend=_target_backend(dst_parent)) + + for out_start in range(0, out_obs, chunk_rows): + out_end = min(out_start + chunk_rows, out_obs) + + if obs_idx is None: + block = src[out_start:out_end, :] + else: + rows = obs_idx[out_start:out_end] + block = src[rows, :] + + if var_idx is not None: + block = block[:, var_idx] + + dst[out_start:out_end, :] = block + + +def subset_sparse_matrix_group( + src: Any, + dst_parent: Any, + name: str, + obs_idx: Optional[np.ndarray], + var_idx: Optional[np.ndarray], +) -> None: + enc = src.attrs.get("encoding-type", b"") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + + if enc not in ("csr_matrix", "csc_matrix"): + raise ValueError(f"Unsupported sparse encoding type: {enc}") + + data = np.asarray(src["data"][...]) + indices = np.asarray(src["indices"][...], dtype=np.int64) + indptr = np.asarray(src["indptr"][...], dtype=np.int64) + shape = src.attrs.get("shape", None) + if shape is 
None: + raise ValueError("Sparse matrix group missing 'shape' attribute.") + n_rows, n_cols = int(shape[0]), int(shape[1]) + + if enc == "csr_matrix": + row_idx = obs_idx if obs_idx is not None else np.arange(n_rows, dtype=np.int64) + col_idx = var_idx if var_idx is not None else np.arange(n_cols, dtype=np.int64) + + new_data = [] + new_indices = [] + new_indptr = [0] + + for r in row_idx: + start = indptr[r] + end = indptr[r + 1] + row_cols = indices[start:end] + row_data = data[start:end] + + if var_idx is not None: + col_mask = np.isin(row_cols, col_idx) + row_cols = row_cols[col_mask] + row_data = row_data[col_mask] + + if var_idx is not None: + col_map = {c: i for i, c in enumerate(col_idx)} + row_cols = np.array([col_map[c] for c in row_cols], dtype=np.int64) + + new_indices.extend(row_cols.tolist()) + new_data.extend(row_data.tolist()) + new_indptr.append(len(new_indices)) + + new_shape = (len(row_idx), len(col_idx)) + else: + row_idx = obs_idx if obs_idx is not None else np.arange(n_rows, dtype=np.int64) + col_idx = var_idx if var_idx is not None else np.arange(n_cols, dtype=np.int64) + + new_data = [] + new_indices = [] + new_indptr = [0] + + for c in col_idx: + start = indptr[c] + end = indptr[c + 1] + col_rows = indices[start:end] + col_data = data[start:end] + + if obs_idx is not None: + row_mask = np.isin(col_rows, row_idx) + col_rows = col_rows[row_mask] + col_data = col_data[row_mask] + + if obs_idx is not None: + row_map = {r: i for i, r in enumerate(row_idx)} + col_rows = np.array([row_map[r] for r in col_rows], dtype=np.int64) + + new_indices.extend(col_rows.tolist()) + new_data.extend(col_data.tolist()) + new_indptr.append(len(new_indices)) + + new_shape = (len(row_idx), len(col_idx)) + + group = dst_parent.create_group(name) + group.attrs["encoding-type"] = enc + group.attrs["encoding-version"] = "0.1.0" + if is_zarr_group(group): + group.attrs["shape"] = list(new_shape) + else: + group.attrs["shape"] = np.array(new_shape, dtype=np.int64) + + 
create_dataset(group, "data", data=np.array(new_data, dtype=data.dtype)) + create_dataset(group, "indices", data=np.array(new_indices, dtype=indices.dtype)) + create_dataset(group, "indptr", data=np.array(new_indptr, dtype=indptr.dtype)) + + +def subset_h5ad( + file: Path, + output: Path, + obs_file: Optional[Path], + var_file: Optional[Path], + *, + chunk_rows: int = 1024, + console: Console, +) -> None: + obs_keep: Optional[Set[str]] = None + if obs_file is not None: + obs_keep = _read_name_file(obs_file) + console.print(f"[cyan]Found {len(obs_keep)} obs names to keep[/]") + + var_keep: Optional[Set[str]] = None + if var_file is not None: + var_keep = _read_name_file(var_file) + console.print(f"[cyan]Found {len(var_keep)} var names to keep[/]") + + if obs_keep is None and var_keep is None: + raise ValueError("At least one of --obs or --var must be provided.") + + with console.status("[magenta]Opening files...[/]"): + with open_store(file, "r") as src_store, open_store(output, "w") as dst_store: + src = src_store.root + dst = dst_store.root + + obs_idx = None + if obs_keep is not None: + console.print("[cyan]Matching obs names...[/]") + obs_group = src["obs"] + obs_index = _decode_attr(obs_group.attrs.get("_index", "obs_names")) + obs_names_ds = _group_get(obs_group, "obs_names") or _group_get( + obs_group, obs_index + ) + if obs_names_ds is None: + raise KeyError("Could not find obs names") + + obs_idx, missing_obs = indices_from_name_set(obs_names_ds, obs_keep) + if missing_obs: + console.print( + f"[yellow]Warning: {len(missing_obs)} obs names not found in file[/]" + ) + console.print( + f"[green]Selected {len(obs_idx)} obs (of {obs_names_ds.shape[0]})[/]" + ) + + var_idx = None + if var_keep is not None: + console.print("[cyan]Matching var names...[/]") + var_group = src["var"] + var_index = _decode_attr(var_group.attrs.get("_index", "var_names")) + var_names_ds = _group_get(var_group, "var_names") or _group_get( + var_group, var_index + ) + if var_names_ds is 
None: + raise KeyError("Could not find var names") + + var_idx, missing_var = indices_from_name_set(var_names_ds, var_keep) + if missing_var: + console.print( + f"[yellow]Warning: {len(missing_var)} var names not found in file[/]" + ) + console.print( + f"[green]Selected {len(var_idx)} var (of {var_names_ds.shape[0]})[/]" + ) + + tasks: List[str] = [] + if "obs" in src: + tasks.append("obs") + if "var" in src: + tasks.append("var") + if "X" in src: + tasks.append("X") + if "layers" in src: + tasks.extend([f"layer:{k}" for k in src["layers"].keys()]) + if "obsm" in src: + tasks.extend([f"obsm:{k}" for k in src["obsm"].keys()]) + if "varm" in src: + tasks.extend([f"varm:{k}" for k in src["varm"].keys()]) + if "obsp" in src: + tasks.extend([f"obsp:{k}" for k in src["obsp"].keys()]) + if "varp" in src: + tasks.extend([f"varp:{k}" for k in src["varp"].keys()]) + if "uns" in src: + tasks.append("uns") + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TimeElapsedColumn(), + console=console, + ) as progress: + task_id = progress.add_task("[cyan]Subsetting...", total=len(tasks)) + + for task in tasks: + if task == "obs": + obs_dst = dst.create_group("obs") + subset_axis_group(src["obs"], obs_dst, obs_idx) + elif task == "var": + var_dst = dst.create_group("var") + subset_axis_group(src["var"], var_dst, var_idx) + elif task == "X": + X = src["X"] + if is_dataset(X): + subset_dense_matrix( + X, dst, "X", obs_idx, var_idx, chunk_rows=chunk_rows + ) + elif is_group(X): + subset_sparse_matrix_group(X, dst, "X", obs_idx, var_idx) + else: + copy_tree(X, dst, "X") + elif task.startswith("layer:"): + key = task.split(":", 1)[1] + layer_src = src["layers"][key] + if is_dataset(layer_src): + layers_dst = _ensure_group(dst, "layers") + subset_dense_matrix( + layer_src, + layers_dst, + key, + obs_idx, + var_idx, + chunk_rows=chunk_rows, + ) + elif is_group(layer_src): + layers_dst = _ensure_group(dst, 
"layers") + subset_sparse_matrix_group( + layer_src, layers_dst, key, obs_idx, var_idx + ) + elif task.startswith("obsm:"): + key = task.split(":", 1)[1] + obsm_dst = _ensure_group(dst, "obsm") + subset_dense_matrix( + src["obsm"][key], + obsm_dst, + key, + obs_idx, + None, + chunk_rows=chunk_rows, + ) + elif task.startswith("varm:"): + key = task.split(":", 1)[1] + varm_dst = _ensure_group(dst, "varm") + subset_dense_matrix( + src["varm"][key], + varm_dst, + key, + var_idx, + None, + chunk_rows=chunk_rows, + ) + elif task.startswith("obsp:"): + key = task.split(":", 1)[1] + obsp_dst = _ensure_group(dst, "obsp") + subset_dense_matrix( + src["obsp"][key], + obsp_dst, + key, + obs_idx, + obs_idx, + chunk_rows=chunk_rows, + ) + elif task.startswith("varp:"): + key = task.split(":", 1)[1] + varp_dst = _ensure_group(dst, "varp") + subset_dense_matrix( + src["varp"][key], + varp_dst, + key, + var_idx, + var_idx, + chunk_rows=chunk_rows, + ) + elif task == "uns": + copy_tree(src["uns"], dst, "uns") + progress.advance(task_id) From 4f6d2e58ebdc309ec857389a75ad80a6bd5eae32 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:32:13 +0000 Subject: [PATCH 38/62] HUGE REFACTOR: Refactor h5ad command modules: update imports and streamline subset operations - Modified `info.py` to utilize `open_store` for file handling and updated type checks to use new utility functions. - Enhanced `_show_types_tree` and `_show_object_info` functions for better clarity and functionality. - Removed redundant code in `subset.py` by consolidating functions and improving the structure for handling dense and sparse matrices. - Updated the `subset_h5ad` function to improve the process of subsetting observations and variables, including better handling of missing names. 
--- src/h5ad/commands/export.py | 630 +++------------------------- src/h5ad/commands/import_data.py | 470 ++------------------- src/h5ad/commands/info.py | 28 +- src/h5ad/commands/subset.py | 699 +------------------------------ 4 files changed, 131 insertions(+), 1696 deletions(-) diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py index 06c7f0f..22221a7 100644 --- a/src/h5ad/commands/export.py +++ b/src/h5ad/commands/export.py @@ -1,27 +1,19 @@ from __future__ import annotations -import csv -import json -import sys -from contextlib import nullcontext from pathlib import Path -from typing import Any, Dict, List, Optional, Union, cast +from typing import List, Optional -import h5py -import numpy as np from rich.console import Console -from PIL import Image -from h5ad.read import col_chunk_as_strings, decode_str_array -from h5ad.info import get_axis_group, get_entry_type +from h5ad.formats.array import export_npy as export_npy_format +from h5ad.formats.common import EXPORTABLE_TYPES, IMAGE_EXTENSIONS, TYPE_EXTENSIONS +from h5ad.formats.dataframe import export_dataframe +from h5ad.formats.image import export_image as export_image_format +from h5ad.formats.json_data import export_json as export_json_format +from h5ad.formats.sparse import export_mtx as export_mtx_format +from h5ad.storage import open_store -H5Obj = Union[h5py.Group, h5py.Dataset] - - -# ============================================================================ -# DATAFRAME EXPORT (CSV) -# ============================================================================ def export_table( file: Path, axis: str, @@ -31,171 +23,17 @@ def export_table( head: Optional[int], console: Console, ) -> None: - """ - Export a dataframe (obs or var) to CSV format. 
- - Args: - file: Path to the .h5ad file - axis: Axis to read from ('obs' or 'var') - columns: List of column names to include in the output table - out: Output file path (defaults to stdout if None) - chunk_rows: Number of rows to read per chunk - head: Output only the first n rows - console: Rich console for status output - - Supports both v0.2.0 (modern) and v0.1.0 (legacy) dataframe formats. - """ - with h5py.File(file, "r") as f: - group, n_rows, index_name = get_axis_group(f, axis) - - # Reserved keys to exclude from column list - # __categories is used in v0.1.0 for storing categorical labels - reserved_keys = {"_index", "__categories"} - - # Determine columns to read - if columns: - col_names = list(columns) - else: - col_names = [ - k for k in group.keys() if k not in reserved_keys and k != index_name - ] - # Add index name if not already present - if index_name and index_name not in col_names: - col_names.insert(0, index_name) - - if isinstance(index_name, bytes): - index_name = index_name.decode("utf-8") - - if index_name not in col_names: - col_names.insert(0, index_name) - else: - col_names = [index_name] + [c for c in col_names if c != index_name] - - # Limit rows if head option is specified - if head is not None and head > 0: - n_rows = min(n_rows, head) - - # Open writer - if out is None or str(out) == "-": - out_fh = sys.stdout - else: - out_fh = open(out, "w", newline="", encoding="utf-8") - writer = csv.writer(out_fh) - - # Write data in chunks - try: - writer.writerow(col_names) - cat_cache: Dict[int, np.ndarray] = {} - - # Use status spinner only when writing to file (not stdout) - use_status = out_fh is not sys.stdout - status_ctx = ( - console.status(f"[magenta]Exporting {axis} table to {out}...[/]") - if use_status - else nullcontext() - ) - - with status_ctx as status: - for start in range(0, n_rows, chunk_rows): - end = min(start + chunk_rows, n_rows) - if use_status and status: - status.update( - f"[magenta]Exporting rows {start}-{end} of 
{n_rows}...[/]" - ) - cols_data: List[List[str]] = [] - # Read each column for the current chunk - for col in col_names: - cols_data.append( - col_chunk_as_strings(group, col, start, end, cat_cache) - ) - # Write rows - for row_idx in range(end - start): - row = [ - cols_data[col_idx][row_idx] - for col_idx in range(len(col_names)) - ] - writer.writerow(row) - finally: - if out_fh is not sys.stdout: - out_fh.close() - - -# ============================================================================ -# TYPE DETECTION AND VALIDATION -# ============================================================================ -# Map object types to valid output extensions -TYPE_EXTENSIONS = { - "dataframe": {".csv"}, - "sparse-matrix": {".mtx"}, - "dense-matrix": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}, - "array": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}, - "dict": {".json"}, - "scalar": {".json"}, - "categorical": {".csv"}, - "awkward-array": {".json"}, -} - -# Image extensions for validation -IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tif", ".tiff"} - -# Known exportable types -EXPORTABLE_TYPES = set(TYPE_EXTENSIONS.keys()) - - -def _norm_path(p: str) -> str: - p = p.strip() - if not p: - raise ValueError("Object path must be non-empty.") - return p.lstrip("/") - - -def _get_encoding_type(group: h5py.Group) -> str: - enc = group.attrs.get("encoding-type", "") - if isinstance(enc, bytes): - enc = enc.decode("utf-8") - return str(enc) - - -def _resolve(file: h5py.File, obj: str) -> H5Obj: - obj = _norm_path(obj) - if obj not in file: - raise KeyError(f"'{obj}' not found in the file.") - return cast(H5Obj, file[obj]) - - -def _check_json_exportable(h5obj: H5Obj, max_elements: int, path: str = "") -> None: - """ - Recursively check if a group/dataset can be exported to JSON. - Raises ValueError if it contains non-exportable structures. 
- """ - if isinstance(h5obj, h5py.Dataset): - if h5obj.shape == (): - return # scalar is fine - n = int(np.prod(h5obj.shape)) if h5obj.shape else 0 - if n > max_elements: - raise ValueError( - f"Cannot export to JSON: '{path or h5obj.name}' has {n} elements " - f"(max {max_elements}). Use --max-elements to increase limit." - ) - return - - # It's a Group - check encoding - enc = _get_encoding_type(h5obj) - if enc in ("csr_matrix", "csc_matrix"): - raise ValueError( - f"Cannot export to JSON: '{path or h5obj.name}' is a sparse matrix. " - f"Export it as .mtx instead." + with open_store(file, "r") as store: + export_dataframe( + store.root, + axis=axis, + columns=columns, + out=out, + chunk_rows=chunk_rows, + head=head, + console=console, ) - # Check children recursively - for key in h5obj.keys(): - child = h5obj[key] - child_path = f"{path}/{key}" if path else key - if isinstance(child, (h5py.Group, h5py.Dataset)): - _check_json_exportable( - cast(H5Obj, child), max_elements=max_elements, path=child_path - ) - def export_npy( file: Path, @@ -204,84 +42,14 @@ def export_npy( chunk_elements: int, console: Console, ) -> None: - """ - Export a dense HDF5 dataset to NumPy .npy without loading it all at once. 
- - Supports both: - - v0.2.0 (modern): Datasets with encoding-type="array" - - v0.1.0 (legacy): Plain datasets without encoding attributes - - Encoded groups: nullable-integer, nullable-boolean, string-array (extracts values) - - Args: - file: Path to the .h5ad file - obj: HDF5 path to the dataset or encoded group - out: Output .npy file path - chunk_elements: Number of elements to read per chunk - console: Rich console for status output - - Raises: - ValueError: If the target object is not exportable as .npy - """ - with h5py.File(file, "r") as f: - h5obj = _resolve(f, obj) - - # Handle encoded groups that contain array data - if isinstance(h5obj, h5py.Group): - enc = _get_encoding_type(h5obj) - if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): - # Extract values from nullable array group - if "values" not in h5obj: - raise ValueError( - f"Encoded group '{obj}' is missing 'values' dataset." - ) - ds = h5obj["values"] - has_mask = "mask" in h5obj - console.print(f"[dim]Exporting nullable array values from '{obj}'[/]") - else: - raise ValueError( - f"Target '{obj}' is a group with encoding '{enc}'; cannot export as .npy directly." - ) - else: - ds = h5obj - has_mask = False - - out.parent.mkdir(parents=True, exist_ok=True) - mm = np.lib.format.open_memmap(out, mode="w+", dtype=ds.dtype, shape=ds.shape) - try: - if ds.shape == (): - mm[...] 
= ds[()] - console.print(f"[green]Wrote[/] {out}") - return - - if ds.ndim == 1: - n = int(ds.shape[0]) - step = max(1, int(chunk_elements)) - with console.status( - f"[magenta]Exporting {obj} to {out}...[/]" - ) as status: - for start in range(0, n, step): - end = min(start + step, n) - status.update( - f"[magenta]Exporting {obj}: {start}-{end} of {n}...[/]" - ) - mm[start:end] = ds[start:end] - console.print(f"[green]Wrote[/] {out}") - return - - n0 = int(ds.shape[0]) - row_elems = int(np.prod(ds.shape[1:])) if ds.ndim > 1 else 1 - # Convert element budget into a row count; fallback to 1 row if rows are larger. - step0 = max(1, int(chunk_elements) // max(1, row_elems)) - with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: - for start in range(0, n0, step0): - end = min(start + step0, n0) - status.update( - f"[magenta]Exporting {obj}: {start}-{end} of {n0}...[/]" - ) - mm[start:end, ...] = ds[start:end, ...] - console.print(f"[green]Wrote[/] {out}") - finally: - del mm + with open_store(file, "r") as store: + export_npy_format( + store.root, + obj=obj, + out=out, + chunk_elements=chunk_elements, + console=console, + ) def export_mtx( @@ -293,333 +61,49 @@ def export_mtx( in_memory: bool, console: Console, ) -> None: - """Export a CSR/CSC matrix group (AnnData encoding) to Matrix Market (.mtx). - - If out is None or "-", writes to stdout. The head parameter limits output lines. - chunk_elements controls how many rows/columns are processed per slice when - streaming. Use in_memory for small matrices to load everything at once. 
- - Args: - file: Path to the .h5ad file - obj: HDF5 path to the matrix group - out: Output .mtx file path (or None for stdout) - head: Output only the first n nonzero entries - chunk_elements: Number of rows/columns to process per chunk - in_memory: Load the entire sparse matrix into memory before exporting - console: Rich console for status output - - Raises: - ValueError: If the target object is not a valid CSR/CSC matrix group. - """ - with h5py.File(file, "r") as f: - h5obj = _resolve(f, obj) - if not isinstance(h5obj, h5py.Group): - raise ValueError( - "MTX export requires a CSR/CSC matrix group (not a dataset)." - ) - - enc = _get_encoding_type(h5obj) - if enc not in ("csr_matrix", "csc_matrix"): - raise ValueError( - f"Target group encoding-type is {enc!r}; expected 'csr_matrix' or 'csc_matrix'." - ) - - data = h5obj.get("data") - indices = h5obj.get("indices") - indptr = h5obj.get("indptr") - if ( - not isinstance(data, h5py.Dataset) - or not isinstance(indices, h5py.Dataset) - or not isinstance(indptr, h5py.Dataset) - ): - raise ValueError( - "Sparse matrix group must contain datasets: data, indices, indptr" - ) - - shape = h5obj.attrs.get("shape", None) - if shape is None: - raise ValueError( - "Sparse matrix group is missing required 'shape' attribute." - ) - n_rows, n_cols = (int(shape[0]), int(shape[1])) - - field = "real" if np.issubdtype(data.dtype, np.floating) else "integer" - - # Load sparse index pointers (1 per major axis row/col); used to slice data/indices. - indptr_arr = np.asarray(indptr[...], dtype=np.int64) - nnz_ptr = int(indptr_arr[-1]) if indptr_arr.size else 0 - nnz_data = int(data.shape[0]) - nnz_idx = int(indices.shape[0]) - - # Check consistency of sparse data - if not (nnz_ptr == nnz_data == nnz_idx): - raise ValueError( - f"Sparse matrix data inconsistency: indptr implies {nnz_ptr} nonzeros, " - f"but data has {nnz_data} and indices has {nnz_idx}." 
- ) - - # Determine number of nonzero entries to write - nnz = nnz_data - major_step = max(1, int(chunk_elements)) - if head is not None and head > 0: - nnz = min(nnz_data, head) - - # Write to stdout when out is None or "-", otherwise open a file on disk. - if out is None or str(out) == "-": - out_fh = sys.stdout - else: - out.parent.mkdir(parents=True, exist_ok=True) - out_fh = open(out, "w", encoding="utf-8", newline="\n") - - use_status = out_fh is not sys.stdout - status_ctx = ( - console.status(f"[magenta]Exporting {obj} to {out}...[/]") - if use_status - else nullcontext() + with open_store(file, "r") as store: + export_mtx_format( + store.root, + obj=obj, + out=out, + head=head, + chunk_elements=chunk_elements, + in_memory=in_memory, + console=console, ) - try: - # Matrix Market header: type, generator line, then shape and nnz. - out_fh.write(f"%%MatrixMarket matrix coordinate {field} general\n") - out_fh.write("% generated by h5ad-cli\n") - if head is not None and head > 0: - out_fh.write( - f"% output limited to first {nnz}/{nnz_data} nonzero entries\n" - ) - out_fh.write(f"{n_rows} {n_cols} {nnz}\n") - - if in_memory: - with status_ctx as status: - if use_status and status: - status.update( - f"[magenta]Loading entire matrix {obj} into memory...[/]" - ) - data_arr = np.asarray(data[...]) - indices_arr = np.asarray(indices[...], dtype=np.int64) - counts = np.diff(indptr_arr) - if int(counts.sum()) != nnz_data: - raise ValueError( - "Sparse matrix indptr does not match data/indices length." 
- ) - - if enc == "csr_matrix": - major_idx = np.repeat(np.arange(n_rows, dtype=np.int64), counts) - row_idx = major_idx - col_idx = indices_arr - else: - major_idx = np.repeat(np.arange(n_cols, dtype=np.int64), counts) - row_idx = indices_arr - col_idx = major_idx - - if head is not None and head > 0: - row_idx = row_idx[:nnz] - col_idx = col_idx[:nnz] - data_arr = data_arr[:nnz] - - data_fmt = "%.18g" if field == "real" else "%d" - coords = np.column_stack((row_idx + 1, col_idx + 1, data_arr)) - if use_status and status: - status.update(f"[magenta]Saving {nnz} entries to {out}...[/]") - np.savetxt(out_fh, coords, fmt=["%d", "%d", data_fmt], newline="\n") - else: - # Iterate over major axis (rows for CSR, cols for CSC) - major = n_rows if enc == "csr_matrix" else n_cols - max_lines = head if head is not None and head > 0 else None - written = 0 - with status_ctx as status: - for major_start in range(0, major, major_step): - major_end = min(major_start + major_step, major) - if use_status and status: - status.update( - f"[magenta]Exporting {obj}: {major_start+1}-{major_end} of {major}...[/]" - ) - for major_i in range(major_start, major_end): - start = min(int(indptr_arr[major_i]), nnz_data) - end = min(int(indptr_arr[major_i + 1]), nnz_data) - if end <= start: - continue - idx = np.asarray(indices[start:end], dtype=np.int64) - vals = np.asarray(data[start:end]) - m = min(len(idx), len(vals)) - if m == 0: - raise ValueError("Sparse matrix chunk has zero length.") - if max_lines is not None: - remaining = max_lines - written - if remaining <= 0: - break - if m > remaining: - m = remaining - idx = idx[:m] - vals = vals[:m] - idx_list = idx.tolist() - vals_list = vals.tolist() - if enc == "csr_matrix": - r = major_i + 1 - lines = [ - f"{r} {c + 1} {v}\n" - for c, v in zip(idx_list, vals_list) - ] - else: - c = major_i + 1 - lines = [ - f"{r + 1} {c} {v}\n" - for r, v in zip(idx_list, vals_list) - ] - out_fh.write("".join(lines)) - written += m - if max_lines is not 
None and written >= max_lines: - break - if max_lines is not None and written >= max_lines: - break - finally: - if out_fh is not sys.stdout: - out_fh.close() - if out_fh is not sys.stdout: - console.print(f"[green]Wrote[/] {out}") def export_json( file: Path, obj: str, - out: Path, + out: Optional[Path], max_elements: int, include_attrs: bool, console: Console, ) -> None: - """Export an HDF5 group/dataset to JSON (best-effort, with size limits).""" - with h5py.File(file, "r") as f: - h5obj = _resolve(f, obj) - - # Check if exportable before attempting - _check_json_exportable(h5obj, max_elements=max_elements) - - payload = _to_jsonable( - h5obj, max_elements=max_elements, include_attrs=include_attrs + with open_store(file, "r") as store: + export_json_format( + store.root, + obj=obj, + out=out, + max_elements=max_elements, + include_attrs=include_attrs, + console=console, ) - # Write to stdout when out is None or "-", otherwise open a file on disk. - if out is None or str(out) == "-": - out_fh = sys.stdout - else: - out.parent.mkdir(parents=True, exist_ok=True) - out_fh = open(out, "w", encoding="utf-8") - try: - json.dump(payload, out_fh, indent=2, ensure_ascii=False, sort_keys=True) - out_fh.write("\n") - finally: - if out_fh is not sys.stdout: - out_fh.close() - if out_fh is not sys.stdout: - console.print(f"[green]Wrote[/] {out}") - - -def _attrs_to_jsonable( - attrs: h5py.AttributeManager, max_elements: int -) -> Dict[str, Any]: - out: Dict[str, Any] = {} - for k in attrs.keys(): - v = attrs.get(k) - out[str(k)] = _pyify(v, max_elements=max_elements) - return out - - -def _pyify(value: Any, max_elements: int) -> Any: - if isinstance(value, bytes): - try: - return value.decode("utf-8") - except Exception: - return value.decode("utf-8", errors="replace") - if isinstance(value, np.generic): - return value.item() - if isinstance(value, np.ndarray): - if value.size > max_elements: - raise ValueError( - f"Refusing to convert array of size {value.size} (> 
{max_elements}) to JSON." - ) - if np.issubdtype(value.dtype, np.bytes_) or value.dtype.kind == "O": - value = decode_str_array(value) - return value.tolist() - return value - - -def _dataset_to_jsonable(ds: h5py.Dataset, max_elements: int) -> Any: - if ds.shape == (): - v = ds[()] - return _pyify(v, max_elements=max_elements) - n = int(np.prod(ds.shape)) if ds.shape else 0 - if n > max_elements: - raise ValueError( - f"Refusing to convert dataset {ds.name!r} with {n} elements (> {max_elements}) to JSON." - ) - arr = np.asarray(ds[...]) - return _pyify(arr, max_elements=max_elements) - - -def _to_jsonable(h5obj: H5Obj, max_elements: int, include_attrs: bool) -> Any: - if isinstance(h5obj, h5py.Dataset): - return _dataset_to_jsonable(h5obj, max_elements=max_elements) - - # Group - d: Dict[str, Any] = {} - if include_attrs and len(h5obj.attrs): - d["__attrs__"] = _attrs_to_jsonable(h5obj.attrs, max_elements=max_elements) - - for key in h5obj.keys(): - child = h5obj[key] - if isinstance(child, (h5py.Group, h5py.Dataset)): - d[str(key)] = _to_jsonable( - cast(H5Obj, child), - max_elements=max_elements, - include_attrs=include_attrs, - ) - return d def export_image(file: Path, obj: str, out: Path, console: Console) -> None: - """ - Export an image-like dataset (H,W) or (H,W,C) to PNG/JPG/TIFF. - Args: - file: Path to the .h5ad file - obj: HDF5 path to the dataset - out: Output image file path - console: Rich console for status output - Raises: - ValueError: If the target object is not a valid image array. 
- """ - # Load dataset - with h5py.File(file, "r") as f: - h5obj = _resolve(f, obj) - if not isinstance(h5obj, h5py.Dataset): - raise ValueError("Image export requires a dataset.") - arr = np.asarray(h5obj[...]) - - # Validate shape - if arr.ndim not in (2, 3): - raise ValueError(f"Expected 2D or 3D image array; got shape {arr.shape}.") - if arr.ndim == 3 and arr.shape[2] not in (1, 3, 4): - raise ValueError( - f"Expected last dimension (channels) to be 1, 3, or 4; got {arr.shape}." - ) - - # Convert to uint8 for common image formats - if np.issubdtype(arr.dtype, np.floating): - amax = float(np.nanmax(arr)) if arr.size else 0.0 - if amax <= 1.0: - arr = np.clip(arr, 0.0, 1.0) * 255.0 - else: - arr = np.clip(arr, 0.0, 255.0) - arr = arr.astype(np.uint8) - elif np.issubdtype(arr.dtype, np.integer): - arr = np.clip(arr, 0, 255).astype(np.uint8) - elif arr.dtype == np.bool_: - arr = arr.astype(np.uint8) * 255 - else: - raise ValueError(f"Unsupported image dtype: {arr.dtype}") - - # If single-channel 3D, convert to 2D - if arr.ndim == 3 and arr.shape[2] == 1: - arr = arr[:, :, 0] - - # Save image - img = Image.fromarray(arr) - out.parent.mkdir(parents=True, exist_ok=True) - img.save(out) - console.print(f"[green]Wrote[/] {out}") + with open_store(file, "r") as store: + export_image_format(store.root, obj=obj, out=out, console=console) + + +__all__ = [ + "EXPORTABLE_TYPES", + "IMAGE_EXTENSIONS", + "TYPE_EXTENSIONS", + "export_image", + "export_json", + "export_mtx", + "export_npy", + "export_table", +] diff --git a/src/h5ad/commands/import_data.py b/src/h5ad/commands/import_data.py index c208a9d..dad838a 100644 --- a/src/h5ad/commands/import_data.py +++ b/src/h5ad/commands/import_data.py @@ -1,19 +1,19 @@ -"""Import command for creating/replacing objects in h5ad files.""" +"""Import command helpers for creating/replacing objects in h5ad/zarr stores.""" from __future__ import annotations -import csv -import json -import shutil from pathlib import Path -from typing import 
Any, List, Optional, Tuple, cast +from typing import Optional -import h5py -import numpy as np from rich.console import Console +from h5ad.formats.array import import_npy +from h5ad.formats.dataframe import import_dataframe +from h5ad.formats.json_data import import_json +from h5ad.formats.sparse import import_mtx +from h5ad.storage import copy_path, copy_store_contents, detect_backend, open_store + -# Map file extensions to expected input formats EXTENSION_FORMAT = { ".csv": "csv", ".npy": "npy", @@ -21,229 +21,32 @@ ".json": "json", } -# Define which object paths expect which dimensions -# obs-axis: first dimension must match n_obs -# var-axis: first dimension must match n_var -# matrix: must match (n_obs, n_var) -OBS_AXIS_PREFIXES = ("obs", "obsm/", "obsp/") -VAR_AXIS_PREFIXES = ("var", "varm/", "varp/") -MATRIX_PREFIXES = ("X", "layers/") - - -def _norm_path(p: str) -> str: - p = p.strip() - if not p: - raise ValueError("Object path must be non-empty.") - return p.lstrip("/") - - -def _get_axis_length(file: h5py.File, axis: str) -> Optional[int]: - """Get the length of obs or var axis.""" - if axis not in file: - return None - group = file[axis] - if not isinstance(group, h5py.Group): - return None - index_name = group.attrs.get("_index", None) - if index_name is None: - index_name = "obs_names" if axis == "obs" else "var_names" - if isinstance(index_name, bytes): - index_name = index_name.decode("utf-8") - if index_name not in group: - return None - dataset = group[index_name] - if isinstance(dataset, h5py.Dataset) and dataset.shape: - return int(dataset.shape[0]) - return None - -def _validate_dimensions( - file: h5py.File, - obj_path: str, - data_shape: tuple, +def _prepare_target_path( + file: Path, + output_file: Optional[Path], + inplace: bool, console: Console, -) -> None: - """Validate that data dimensions match the target path requirements.""" - n_obs = _get_axis_length(file, "obs") - n_var = _get_axis_length(file, "var") - - # Check obs/var 
replacement (dataframe) - if obj_path == "obs": - if n_obs is not None and data_shape[0] != n_obs: - raise ValueError( - f"Row count mismatch: input has {data_shape[0]} rows, " - f"but obs has {n_obs} cells." - ) - return - if obj_path == "var": - if n_var is not None and data_shape[0] != n_var: - raise ValueError( - f"Row count mismatch: input has {data_shape[0]} rows, " - f"but var has {n_var} features." - ) - return - - # Check matrix (X, layers/*) - for prefix in MATRIX_PREFIXES: - if ( - obj_path == prefix - or obj_path.startswith(prefix + "/") - or obj_path.startswith(prefix) - ): - if obj_path == "X" or obj_path.startswith("layers/"): - if len(data_shape) < 2: - raise ValueError( - f"Matrix data requires 2D shape, got {len(data_shape)}D." - ) - if n_obs is not None and data_shape[0] != n_obs: - raise ValueError( - f"First dimension mismatch: input has {data_shape[0]} rows, " - f"but obs has {n_obs} cells." - ) - if n_var is not None and data_shape[1] != n_var: - raise ValueError( - f"Second dimension mismatch: input has {data_shape[1]} columns, " - f"but var has {n_var} features." - ) - return - - # Check obs-axis matrices (obsm/*, obsp/*) - for prefix in OBS_AXIS_PREFIXES: - if obj_path.startswith(prefix) and obj_path != "obs": - if n_obs is not None and data_shape[0] != n_obs: - raise ValueError( - f"First dimension mismatch: input has {data_shape[0]} rows, " - f"but obs has {n_obs} cells." - ) - # obsp should be square n_obs x n_obs - if obj_path.startswith("obsp/") and len(data_shape) >= 2: - if data_shape[1] != n_obs: - raise ValueError( - f"obsp matrix must be square (n_obs × n_obs): " - f"got {data_shape[0]}×{data_shape[1]}, expected {n_obs}×{n_obs}." 
- ) - return - - # Check var-axis matrices (varm/*, varp/*) - for prefix in VAR_AXIS_PREFIXES: - if obj_path.startswith(prefix) and obj_path != "var": - if n_var is not None and data_shape[0] != n_var: - raise ValueError( - f"First dimension mismatch: input has {data_shape[0]} rows, " - f"but var has {n_var} features." - ) - # varp should be square n_var x n_var - if obj_path.startswith("varp/") and len(data_shape) >= 2: - if data_shape[1] != n_var: - raise ValueError( - f"varp matrix must be square (n_var × n_var): " - f"got {data_shape[0]}×{data_shape[1]}, expected {n_var}×{n_var}." - ) - return - - # For other paths (like uns/*), no dimension validation - console.print(f"[dim]Note: No dimension validation for path '{obj_path}'[/]") - - -def _read_csv( - input_file: Path, - index_column: Optional[str], -) -> Tuple[List[dict], List[str], List[str], str]: - """ - Read CSV file and return rows, column names, index values, and index column name. - - Returns: - (rows, column_names, index_values, index_column_name) - """ - with open(input_file, "r", encoding="utf-8", newline="") as f: - reader = csv.DictReader(f) - if reader.fieldnames is None: - raise ValueError("CSV file has no header.") - fieldnames = list(reader.fieldnames) - - # Determine index column - if index_column: - if index_column not in fieldnames: - raise ValueError( - f"Index column '{index_column}' not found in CSV. " - f"Available columns: {', '.join(fieldnames)}" - ) - idx_col = index_column - else: - idx_col = fieldnames[0] - - # Read all rows - rows = list(reader) - - index_values = [row[idx_col] for row in rows] - data_columns = [c for c in fieldnames if c != idx_col] - - return rows, data_columns, index_values, idx_col - - -def _read_mtx( - input_file: Path, -) -> Tuple[List[Tuple[int, int, float]], Tuple[int, int], int]: - """ - Read Matrix Market file and return sparse matrix data. 
- - Returns: - (data, indices, indptr, shape, nnz, is_csr) - """ - with open(input_file, "r", encoding="utf-8") as fh: - header = fh.readline() - if not header.startswith("%%MatrixMarket"): - raise ValueError("Invalid MTX file: missing MatrixMarket header.") - - # Parse header for field type - parts = header.lower().split() - field = "real" - for p in parts: - if p in ("real", "integer", "complex", "pattern"): - field = p - break - - # Skip comments - line = fh.readline() - while line.startswith("%"): - line = fh.readline() - - # Read dimensions - dims = line.split() - n_rows, n_cols, nnz = int(dims[0]), int(dims[1]), int(dims[2]) - - # Read entries - entries = [] - for _ in range(nnz): - parts = fh.readline().split() - r, c = int(parts[0]) - 1, int(parts[1]) - 1 - if field == "pattern": - v = 1.0 - else: - v = float(parts[2]) - entries.append((r, c, v)) - - return entries, (n_rows, n_cols), nnz - - -def _create_csr_from_entries( - entries: List[Tuple[int, int, float]], shape: Tuple[int, int] -) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: - """Convert coordinate entries to CSR format.""" - n_rows, _ = shape - # Sort by row, then column - entries.sort(key=lambda x: (x[0], x[1])) +) -> Path: + if inplace: + return file + if output_file is None: + raise ValueError("Output file is required unless --inplace is specified.") - data = np.array([e[2] for e in entries], dtype=np.float32) - indices = np.array([e[1] for e in entries], dtype=np.int32) + src_backend = detect_backend(file) + dst_backend = detect_backend(output_file) - # Build indptr - indptr = np.zeros(n_rows + 1, dtype=np.int32) - for r, _, _ in entries: - indptr[r + 1] += 1 - indptr = np.cumsum(indptr) + if src_backend == dst_backend: + copy_path(file, output_file) + console.print(f"[dim]Copied {file} → {output_file}[/]") + return output_file - return data, indices, indptr + with open_store(file, "r") as src_store, open_store(output_file, "w") as dst_store: + copy_store_contents(src_store.root, 
dst_store.root) + console.print( + f"[dim]Converted {file} ({src_backend}) → {output_file} ({dst_backend})[/]" + ) + return output_file def import_object( @@ -255,30 +58,7 @@ def import_object( index_column: Optional[str], console: Console, ) -> None: - """ - Import data from a file into an h5ad object. - - Args: - file: Path to the source h5ad file - obj: Object path to create/replace (e.g., 'obs', 'obsm/X_pca', 'X') - input_file: Input data file (.csv, .npy, .mtx, .json) - output_file: Path to output h5ad file (None if inplace) - inplace: If True, modify the source file directly - index_column: Column to use as index for obs/var CSV import - console: Console for output - """ - # Determine target file - if inplace: - target_file = file - else: - if output_file is None: - raise ValueError("Output file is required unless --inplace is specified.") - # Copy source to output first - shutil.copy2(file, output_file) - target_file = output_file - console.print(f"[dim]Copied {file} → {output_file}[/]") - - obj = _norm_path(obj) + target_file = _prepare_target_path(file, output_file, inplace, console) ext = input_file.suffix.lower() if ext not in EXTENSION_FORMAT: @@ -289,11 +69,8 @@ def import_object( fmt = EXTENSION_FORMAT[ext] - # Validate index_column is only used for obs/var CSV if index_column and (fmt != "csv" or obj not in ("obs", "var")): - raise ValueError( - "--index-column is only valid for CSV import into 'obs' or 'var'." - ) + raise ValueError("--index-column is only valid for CSV import into 'obs' or 'var'.") if fmt == "csv": _import_csv(target_file, obj, input_file, index_column, console) @@ -312,59 +89,15 @@ def _import_csv( index_column: Optional[str], console: Console, ) -> None: - """Import CSV data into obs or var.""" - if obj not in ("obs", "var"): - raise ValueError( - f"CSV import is only supported for 'obs' or 'var', not '{obj}'." 
- ) - - rows, data_columns, index_values, _ = _read_csv(input_file, index_column) - n_rows = len(rows) - - with h5py.File(file, "a") as f: - # Validate dimensions if the file already has obs/var - _validate_dimensions(f, obj, (n_rows,), console) - - # Delete existing group if present - if obj in f: - del f[obj] - - # Create new group - group = f.create_group(obj) - index_name = "obs_names" if obj == "obs" else "var_names" - group.attrs["_index"] = index_name - group.attrs["encoding-type"] = "dataframe" - group.attrs["encoding-version"] = "0.2.0" - group.attrs["column-order"] = np.array(data_columns, dtype="S") - - # Create index dataset - group.create_dataset( - index_name, - data=np.array(index_values, dtype="S"), + with open_store(file, "a") as store: + import_dataframe( + store.root, + obj=obj, + input_file=input_file, + index_column=index_column, + console=console, ) - # Create column datasets - for col in data_columns: - values = [row[col] for row in rows] - # Try to infer type - try: - arr = np.array(values, dtype=np.float64) - group.create_dataset(col, data=arr) - except (ValueError, TypeError): - try: - arr = np.array(values, dtype=np.int64) - group.create_dataset(col, data=arr) - except (ValueError, TypeError): - # Fallback to string - arr = np.array(values, dtype="S") - ds = group.create_dataset(col, data=arr) - ds.attrs["encoding-type"] = "string-array" - ds.attrs["encoding-version"] = "0.2.0" - - console.print( - f"[green]Imported[/] {n_rows} rows × {len(data_columns)} columns into '{obj}'" - ) - def _import_npy( file: Path, @@ -372,34 +105,8 @@ def _import_npy( input_file: Path, console: Console, ) -> None: - """Import NPY data into a dataset.""" - arr = np.load(input_file) - - with h5py.File(file, "a") as f: - _validate_dimensions(f, obj, arr.shape, console) - - # Handle nested paths - parts = obj.split("/") - parent_path = "/".join(parts[:-1]) - name = parts[-1] - - # Ensure parent groups exist - if parent_path: - if parent_path not in f: - 
f.create_group(parent_path) - parent = cast(h5py.Group, f[parent_path]) - else: - parent = f - - # Delete existing if present - if name in parent: - del parent[name] - - # Create dataset - parent.create_dataset(name, data=arr) - - shape_str = "×".join(str(d) for d in arr.shape) - console.print(f"[green]Imported[/] {shape_str} array into '{obj}'") + with open_store(file, "a") as store: + import_npy(store.root, obj=obj, input_file=input_file, console=console) def _import_mtx( @@ -408,42 +115,8 @@ def _import_mtx( input_file: Path, console: Console, ) -> None: - """Import MTX (Matrix Market) data as CSR sparse matrix.""" - entries, shape, nnz = _read_mtx(input_file) - data, indices, indptr = _create_csr_from_entries(entries, shape) - - with h5py.File(file, "a") as f: - _validate_dimensions(f, obj, shape, console) - - # Handle nested paths - parts = obj.split("/") - parent_path = "/".join(parts[:-1]) - name = parts[-1] - - if parent_path: - if parent_path not in f: - f.create_group(parent_path) - parent = cast(h5py.Group, f[parent_path]) - else: - parent = f - - # Delete existing if present - if name in parent: - del parent[name] - - # Create sparse matrix group - group = parent.create_group(name) - group.attrs["encoding-type"] = "csr_matrix" - group.attrs["encoding-version"] = "0.1.0" - group.attrs["shape"] = np.array(shape, dtype=np.int64) - - group.create_dataset("data", data=data) - group.create_dataset("indices", data=indices) - group.create_dataset("indptr", data=indptr) - - console.print( - f"[green]Imported[/] {shape[0]}×{shape[1]} sparse matrix ({nnz} non-zero) into '{obj}'" - ) + with open_store(file, "a") as store: + import_mtx(store.root, obj=obj, input_file=input_file, console=console) def _import_json( @@ -452,60 +125,5 @@ def _import_json( input_file: Path, console: Console, ) -> None: - """Import JSON data into uns or other dict-like groups.""" - with open(input_file, "r", encoding="utf-8") as fh: - payload = json.load(fh) - - with h5py.File(file, "a") 
as f: - # Handle nested paths - parts = obj.split("/") - parent_path = "/".join(parts[:-1]) - name = parts[-1] - - if parent_path: - if parent_path not in f: - f.create_group(parent_path) - parent = cast(h5py.Group, f[parent_path]) - else: - parent = f - - # Delete existing if present - if name in parent: - del parent[name] - - # Create from JSON - _write_json_to_h5(parent, name, payload) - - console.print(f"[green]Imported[/] JSON data into '{obj}'") - - -def _write_json_to_h5(parent: h5py.Group, name: str, value: Any) -> None: - """Recursively write JSON-like data to HDF5.""" - if isinstance(value, dict): - group = parent.create_group(name) - for k, v in value.items(): - _write_json_to_h5(group, k, v) - elif isinstance(value, list): - # Try to convert to array - try: - arr = np.array(value) - if arr.dtype.kind in ("U", "O"): - arr = np.array(value, dtype="S") - parent.create_dataset(name, data=arr) - except (ValueError, TypeError): - # Fallback: store as JSON string - parent.create_dataset(name, data=json.dumps(value).encode("utf-8")) - elif isinstance(value, str): - parent.create_dataset(name, data=np.array([value], dtype="S")) - elif isinstance(value, bool): - parent.create_dataset(name, data=np.array(value, dtype=bool)) - elif isinstance(value, int): - parent.create_dataset(name, data=np.array(value, dtype=np.int64)) - elif isinstance(value, float): - parent.create_dataset(name, data=np.array(value, dtype=np.float64)) - elif value is None: - # Store None as empty string attribute or special marker - ds = parent.create_dataset(name, data=np.array([], dtype="S")) - ds.attrs["_is_none"] = True - else: - raise ValueError(f"Cannot convert JSON value of type {type(value).__name__}") + with open_store(file, "a") as store: + import_json(store.root, obj=obj, input_file=input_file, console=console) diff --git a/src/h5ad/commands/info.py b/src/h5ad/commands/info.py index 11bd11d..b58b5b0 100644 --- a/src/h5ad/commands/info.py +++ b/src/h5ad/commands/info.py @@ -1,11 
+1,12 @@ from pathlib import Path -from typing import Optional, Union +from typing import Any, Optional -import h5py import rich from rich.console import Console from rich.tree import Tree -from h5ad.info import axis_len, get_entry_type, format_type_info + +from h5ad.core.info import axis_len, format_type_info, get_entry_type +from h5ad.storage import is_dataset, is_group, open_store # Preferred display order for top-level keys KEY_ORDER = ["X", "obs", "var", "obsm", "varm", "layers", "obsp", "varp", "uns"] @@ -33,7 +34,8 @@ def show_info( depth (Optional[int]): Maximum recursion depth for type display (only with show_types=True) entry_path (Optional[str]): Specific entry path to inspect (e.g., 'obsm/X_pca') """ - with h5py.File(file, "r") as f: + with open_store(file, "r") as store: + f = store.root # If a specific path is requested, show detailed info for that object if entry_path: _show_object_info(f, entry_path, console) @@ -47,13 +49,13 @@ def show_info( ) if show_types: - _show_types_tree(f, console, depth=depth) + _show_types_tree(f, console, root_label=str(file), depth=depth) else: # List top-level keys and their sub-keys (original behavior) for key in _sort_keys(list(f.keys())): obj = f[key] # Only process Groups, skip Datasets like X - if isinstance(obj, h5py.Group): + if is_group(obj): sub_keys = [ k for k in obj.keys() if k not in ("_index", "__categories") ] @@ -65,7 +67,7 @@ def show_info( def _show_types_tree( - f: h5py.File, console: Console, depth: Optional[int] = None + f: Any, console: Console, root_label: str, depth: Optional[int] = None ) -> None: """Show a tree view with type information for all entries. 
@@ -75,7 +77,7 @@ def _show_types_tree( - obsm/obsp/varm/varp/layers: 1 level (show matrices) - uns: 2 levels deep """ - tree = Tree(f"[bold]{f.filename}[/]") + tree = Tree(f"[bold]{root_label}[/]") # Define max depth for each top-level group max_depth_map = { @@ -93,14 +95,14 @@ def _show_types_tree( def add_node( parent_tree: Tree, name: str, - obj: Union[h5py.Group, h5py.Dataset], + obj: Any, current_depth: int, max_depth: int, ) -> None: info = get_entry_type(obj) type_str = format_type_info(info) - if isinstance(obj, h5py.Dataset): + if is_dataset(obj): shape_str = f"[dim]{obj.shape}[/]" if obj.shape else "" node_text = f"[bright_white]{name}[/] {shape_str} {type_str}" parent_tree.add(node_text) @@ -123,7 +125,7 @@ def add_node( for key in _sort_keys(list(f.keys())): obj = f[key] # Skip empty groups - if isinstance(obj, h5py.Group): + if is_group(obj): children = [k for k in obj.keys() if k not in ("_index", "__categories")] if not children: continue @@ -135,7 +137,7 @@ def add_node( console.print(tree) -def _show_object_info(f: h5py.File, entry_path: str, console: Console) -> None: +def _show_object_info(f: Any, entry_path: str, console: Console) -> None: """Show detailed info for a specific object path.""" # Normalize path entry_path = entry_path.strip().lstrip("/") @@ -171,7 +173,7 @@ def _show_object_info(f: h5py.File, entry_path: str, console: Console) -> None: console.print(f" [dim]{k}:[/] {v_str}") # If it's a group, show children - if isinstance(entry, h5py.Group): + if is_group(entry): children = [k for k in entry.keys() if k not in ("_index", "__categories")] if children: console.print(f"\n[bold cyan]Children:[/]") diff --git a/src/h5ad/commands/subset.py b/src/h5ad/commands/subset.py index ff20d6b..940ef07 100644 --- a/src/h5ad/commands/subset.py +++ b/src/h5ad/commands/subset.py @@ -1,686 +1,17 @@ -"""Subset operations for .h5ad files.""" - -from __future__ import annotations - -from pathlib import Path -from typing import Optional, Set, Tuple, 
List, Dict, Any - -import h5py -import numpy as np -import typer -from rich.console import Console -from rich.progress import ( - Progress, - SpinnerColumn, - TextColumn, - BarColumn, - TaskProgressColumn, - TimeElapsedColumn, +from h5ad.core.subset import ( + _read_name_file, + indices_from_name_set, + subset_axis_group, + subset_dense_matrix, + subset_h5ad, + subset_sparse_matrix_group, ) -from h5ad.read import decode_str_array - - -def _copy_attrs(src: h5py.AttributeManager, dst: h5py.AttributeManager) -> None: - """ - Copy HDF5 attributes from source to destination. - Args: - src (h5py.AttributeManager): Source attributes - dst (h5py.AttributeManager): Destination attributes - """ - for k, v in src.items(): - dst[k] = v - - -def _ds_create_kwargs(src: h5py.Dataset) -> Dict[str, Any]: - """ - Best-effort carryover of dataset creation properties. - (h5py doesn't expose everything perfectly; this covers the big ones.) - - Args: - src (h5py.Dataset): Source dataset - Returns: - Dict[str, Any]: Dataset creation keyword arguments - """ - kw: Dict[str, Any] = {} - if src.chunks is not None: - kw["chunks"] = src.chunks - if src.compression is not None: - kw["compression"] = src.compression - kw["compression_opts"] = src.compression_opts - kw["shuffle"] = bool(src.shuffle) - kw["fletcher32"] = bool(src.fletcher32) - if src.scaleoffset is not None: - kw["scaleoffset"] = src.scaleoffset - if src.fillvalue is not None: - kw["fillvalue"] = src.fillvalue - return kw - - -def _read_name_file(path: Path) -> Set[str]: - """ - Read one name per line from a file. Blank lines ignored. - """ - names: Set[str] = set() - with open(path, "r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if line: - names.add(line) - return names - - -def indices_from_name_set( - names_ds: h5py.Dataset, - keep: Set[str], - *, - chunk_size: int = 200_000, -) -> Tuple[np.ndarray, Set[str]]: - """ - Returns (indices_sorted, missing_names). - Chunked scan so we don't do names_ds[...] 
for huge datasets. - - Args: - names_ds (h5py.Dataset): Dataset containing names - keep (Set[str]): Set of names to find - chunk_size (int): Number of names to read per chunk - - Returns: - Tuple[np.ndarray, Set[str]]: (Array of found indices, set of missing names) - """ - if names_ds.ndim != 1: - # common h5ad uses 1D obs_names/var_names - flat_len = int(np.prod(names_ds.shape)) - else: - flat_len = names_ds.shape[0] - - remaining = set(keep) # we'll delete as we find - found_indices: List[int] = [] - - for start in range(0, flat_len, chunk_size): - end = min(start + chunk_size, flat_len) - chunk = names_ds[start:end] - chunk = decode_str_array(np.asarray(chunk)).astype(str) - - for i, name in enumerate(chunk): - if name in remaining: - found_indices.append(start + i) - remaining.remove(name) - - if not remaining: - break - - return np.asarray(found_indices, dtype=np.int64), remaining - - -def subset_axis_group( - src: h5py.Group, - dst: h5py.Group, - indices: Optional[np.ndarray], -) -> None: - """ - Subset obs/var group: - - datasets: subset along first axis (obj[indices, ...]) - - categorical groups: copy categories, subset codes - - unknown groups: copy as-is if indices is None; otherwise copy conservatively - - Args: - src (h5py.Group): Source axis group - dst (h5py.Group): Destination axis group - indices (Optional[np.ndarray]): Indices to keep; if None, copy as-is - """ - _copy_attrs(src.attrs, dst.attrs) - - for key in src.keys(): - obj = src[key] - - if isinstance(obj, h5py.Dataset): - if indices is None: - src.copy(key, dst, name=key) - else: - data = obj[indices, ...] 
- ds = dst.create_dataset(key, data=data) - _copy_attrs(obj.attrs, ds.attrs) - - elif isinstance(obj, h5py.Group): - enc = obj.attrs.get("encoding-type", b"") - if isinstance(enc, bytes): - enc = enc.decode("utf-8") - - if enc == "categorical": - gdst = dst.create_group(key) - _copy_attrs(obj.attrs, gdst.attrs) - obj.copy("categories", gdst, name="categories") - - codes = obj["codes"] - if indices is None: - obj.copy("codes", gdst, name="codes") - else: - codes_sub = codes[indices, ...] - ds = gdst.create_dataset("codes", data=codes_sub) - _copy_attrs(codes.attrs, ds.attrs) - else: - # Unknown group type - copy as-is - src.copy(key, dst, name=key) - - -def subset_dense_matrix( - src: h5py.Dataset, - dst_parent: h5py.Group, - name: str, - obs_idx: Optional[np.ndarray], - var_idx: Optional[np.ndarray], - *, - chunk_rows: int = 1024, -) -> None: - """ - Chunked write for dense 2D datasets. - Args: - src (h5py.Dataset): Source dense matrix dataset - dst_parent (h5py.Group): Destination parent group - name (str): Name for the destination dataset - obs_idx (Optional[np.ndarray]): Indices of observations to keep - var_idx (Optional[np.ndarray]): Indices of variables to keep - chunk_rows (int): Number of rows to read per chunk - """ - if src.ndim != 2: - # fallback: copy whole dataset - src.parent.copy(src.name.split("/")[-1], dst_parent, name=name) - return - - n_obs, n_var = src.shape - out_obs = len(obs_idx) if obs_idx is not None else n_obs - out_var = len(var_idx) if var_idx is not None else n_var - - kw = _ds_create_kwargs(src) - # adjust chunks to output shape if possible - if "chunks" in kw and kw["chunks"] is not None: - c0, c1 = kw["chunks"] - kw["chunks"] = (min(c0, out_obs), min(c1, out_var)) - - dst = dst_parent.create_dataset( - name, shape=(out_obs, out_var), dtype=src.dtype, **kw - ) - _copy_attrs(src.attrs, dst.attrs) - - # Write in blocks of output rows - for out_start in range(0, out_obs, chunk_rows): - out_end = min(out_start + chunk_rows, out_obs) - - 
if obs_idx is None: - block = src[out_start:out_end, :] - else: - rows = obs_idx[out_start:out_end] - block = src[rows, :] - - if var_idx is not None: - block = block[:, var_idx] - - dst[out_start:out_end, :] = block - - -def subset_sparse_matrix_group( - src: h5py.Group, - dst_parent: h5py.Group, - name: str, - obs_idx: Optional[np.ndarray], - var_idx: Optional[np.ndarray], -) -> None: - """ - Subset a sparse matrix stored as an h5ad group with datasets: - - data, indices, indptr - Supports both CSR (Compressed Sparse Row) and CSC (Compressed Sparse Column) formats. - - CSR: rows are compressed, efficient for row-wise operations - CSC: columns are compressed, efficient for column-wise operations - - Args: - src (h5py.Group): Source sparse matrix group - dst_parent (h5py.Group): Destination parent group - name (str): Name for the destination group - obs_idx (Optional[np.ndarray]): Indices of observations to keep - var_idx (Optional[np.ndarray]): Indices of variables to keep - """ - data = src["data"] - indices = src["indices"] - indptr = src["indptr"] - - # Determine format - encoding = src.attrs.get("encoding-type", b"") - if isinstance(encoding, bytes): - encoding = encoding.decode("utf-8") - - is_csr = encoding == "csr_matrix" - is_csc = encoding == "csc_matrix" - - if not is_csr and not is_csc: - raise ValueError(f"Unsupported sparse format: {encoding}") - - # Determine shape - shape = src.attrs.get("shape", None) - if shape is None: - # fallback: infer from indptr len and max index - major_dim = indptr.shape[0] - 1 - minor_dim = int(indices[...].max()) + 1 if indices.size else 0 - if is_csr: - n_obs, n_var = major_dim, minor_dim - else: # CSC - n_obs, n_var = minor_dim, major_dim - else: - n_obs, n_var = shape - - # For CSR: major axis = obs (rows), minor axis = var (cols) - # For CSC: major axis = var (cols), minor axis = obs (rows) - if is_csr: - major_idx = obs_idx if obs_idx is not None else np.arange(n_obs, dtype=np.int64) - minor_idx = var_idx - out_obs 
= major_idx.shape[0] - out_var = minor_idx.shape[0] if minor_idx is not None else n_var - else: # CSC - major_idx = var_idx if var_idx is not None else np.arange(n_var, dtype=np.int64) - minor_idx = obs_idx - out_obs = minor_idx.shape[0] if minor_idx is not None else n_obs - out_var = major_idx.shape[0] - - # Build minor axis remap if needed - minor_map = None - out_minor_dim = out_var if is_csr else out_obs - total_minor_dim = n_var if is_csr else n_obs - - if minor_idx is not None: - # array remap is fastest; if dimension is huge and memory matters, use dict instead - minor_map = np.full(total_minor_dim, -1, dtype=np.int64) - minor_map[minor_idx] = np.arange(minor_idx.shape[0], dtype=np.int64) - - # Pass 1: count nnz in output to preallocate - out_counts = np.zeros(len(major_idx), dtype=np.int64) - for i, major_pos in enumerate(major_idx): - s = int(indptr[major_pos]) - e = int(indptr[major_pos + 1]) - if s == e: - continue - minor_indices = indices[s:e] - if minor_map is None: - out_counts[i] = e - s - else: - mask = minor_map[minor_indices] >= 0 - out_counts[i] = mask.sum() - - out_indptr = np.zeros(len(major_idx) + 1, dtype=indptr.dtype) - np.cumsum(out_counts, out=out_indptr[1:]) - out_nnz = int(out_indptr[-1]) - - # Preallocate output arrays - out_data = np.empty(out_nnz, dtype=data.dtype) - out_indices = np.empty(out_nnz, dtype=indices.dtype) - - # Pass 2: fill - cursor = 0 - for i, major_pos in enumerate(major_idx): - s = int(indptr[major_pos]) - e = int(indptr[major_pos + 1]) - if s == e: - continue - - minor_indices = indices[s:e] - vals = data[s:e] - - if minor_map is None: - length = e - s - out_indices[cursor : cursor + length] = minor_indices - out_data[cursor : cursor + length] = vals - cursor += length - else: - mask = minor_map[minor_indices] >= 0 - new_minor = minor_map[minor_indices[mask]] - new_vals = vals[mask] - length = len(new_minor) - out_indices[cursor : cursor + length] = new_minor - out_data[cursor : cursor + length] = new_vals - cursor 
+= length - - # Create dst group - gdst = dst_parent.create_group(name) - _copy_attrs(src.attrs, gdst.attrs) - gdst.attrs["shape"] = (out_obs, out_var) - # Write encoding-type as bytes to match h5ad standard - gdst.attrs["encoding-type"] = ( - encoding.encode("utf-8") if isinstance(encoding, str) else encoding - ) - - # Write datasets (best-effort preserve compression/etc.) - # Adjust chunks to not exceed output size - data_kw = _ds_create_kwargs(data) - if "chunks" in data_kw and data_kw["chunks"] is not None: - data_kw["chunks"] = (min(data_kw["chunks"][0], out_nnz),) - d_data = gdst.create_dataset("data", data=out_data, **data_kw) - _copy_attrs(data.attrs, d_data.attrs) - - indices_kw = _ds_create_kwargs(indices) - if "chunks" in indices_kw and indices_kw["chunks"] is not None: - indices_kw["chunks"] = (min(indices_kw["chunks"][0], out_nnz),) - d_indices = gdst.create_dataset("indices", data=out_indices, **indices_kw) - _copy_attrs(indices.attrs, d_indices.attrs) - - indptr_kw = _ds_create_kwargs(indptr) - if "chunks" in indptr_kw and indptr_kw["chunks"] is not None: - indptr_kw["chunks"] = (min(indptr_kw["chunks"][0], len(out_indptr)),) - d_indptr = gdst.create_dataset("indptr", data=out_indptr, **indptr_kw) - _copy_attrs(indptr.attrs, d_indptr.attrs) - - -def subset_matrix_like( - src_obj: h5py.Dataset | h5py.Group, - dst_parent: h5py.Group, - name: str, - obs_idx: Optional[np.ndarray], - var_idx: Optional[np.ndarray], - *, - chunk_rows: int = 1024, -) -> None: - """ - Dispatch for dense dataset vs sparse (csr/csc) group. 
- Args: - src_obj (h5py.Dataset | h5py.Group): Source dataset or group - dst_parent (h5py.Group): Destination parent group - name (str): Name for the destination dataset/group - obs_idx (Optional[np.ndarray]): Indices of observations to keep - var_idx (Optional[np.ndarray]): Indices of variables to keep - chunk_rows (int): Number of rows to read per chunk when subsetting dense matrices - """ - if isinstance(src_obj, h5py.Dataset): - subset_dense_matrix( - src_obj, dst_parent, name, obs_idx, var_idx, chunk_rows=chunk_rows - ) - return - - # group - enc = src_obj.attrs.get("encoding-type", b"") - if isinstance(enc, bytes): - enc = enc.decode("utf-8") - - if enc in ("csr_matrix", "csc_matrix"): - subset_sparse_matrix_group(src_obj, dst_parent, name, obs_idx, var_idx) - else: - # unknown sparse type -> copy as-is (or raise) - src_obj.file.copy(src_obj, dst_parent, name) - - -def subset_h5ad( - file: Path, - output: Path, - obs_file: Optional[Path], - var_file: Optional[Path], - *, - chunk_rows: int = 1024, - console: Console, -) -> None: - """ - Subset an h5ad file by obs and/or var names. 
- Args: - file (Path): Input h5ad file path - output (Path): Output h5ad file path - obs_file (Optional[Path]): File with obs names to keep (one per line) - var_file (Optional[Path]): File with var names to keep (one per line) - chunk_rows (int): Number of rows to read per chunk when subsetting dense matrices - console (Console): Rich console for output - """ - # ---- Read keep-lists - obs_keep: Optional[Set[str]] = None - if obs_file is not None: - obs_keep = _read_name_file(obs_file) - console.print(f"[cyan]Found {len(obs_keep)} obs names to keep[/]") - - var_keep: Optional[Set[str]] = None - if var_file is not None: - var_keep = _read_name_file(var_file) - console.print(f"[cyan]Found {len(var_keep)} var names to keep[/]") - - if obs_keep is None and var_keep is None: - console.print( - "[bold red]Error:[/] At least one of [cyan]--obs[/] or [cyan]--var[/] must be provided.", - ) - raise typer.Exit(code=1) - - # ---- Open files - with console.status("[magenta]Opening files...[/]"): - src = h5py.File(file, "r") - dst = h5py.File(output, "w") - - try: - # ---- Compute indices - obs_idx = None - if obs_keep is not None: - console.print("[cyan]Matching obs names...[/]") - obs_names_ds = src["obs"].get("obs_names") or src["obs"].get( - src["obs"].attrs.get("_index", "obs_names") - ) - if obs_names_ds is None: - console.print("[bold red]Error:[/] Could not find obs names") - raise KeyError("Could not find obs names") - - obs_idx, missing_obs = indices_from_name_set(obs_names_ds, obs_keep) - if missing_obs: - console.print( - f"[yellow]Warning: {len(missing_obs)} obs names not found in file[/]" - ) - console.print( - f"[green]Selected {len(obs_idx)} obs (of {obs_names_ds.shape[0]})[/]" - ) - - var_idx = None - if var_keep is not None: - console.print("[cyan]Matching var names...[/]") - var_names_ds = src["var"].get("var_names") or src["var"].get( - src["var"].attrs.get("_index", "var_names") - ) - if var_names_ds is None: - console.print("[bold red]Error:[/] Could not 
find var names") - raise KeyError("Could not find var names") - - var_idx, missing_var = indices_from_name_set(var_names_ds, var_keep) - if missing_var: - console.print( - f"[yellow]Warning: {len(missing_var)} var names not found in file[/]" - ) - console.print( - f"[green]Selected {len(var_idx)} var (of {var_names_ds.shape[0]})[/]" - ) - - # ---- Build task list - tasks: List[str] = [] - if "obs" in src: - tasks.append("obs") - if "var" in src: - tasks.append("var") - if "X" in src: - tasks.append("X") - if "layers" in src: - tasks.extend([f"layer:{k}" for k in src["layers"].keys()]) - if "obsm" in src: - tasks.extend([f"obsm:{k}" for k in src["obsm"].keys()]) - if "varm" in src: - tasks.extend([f"varm:{k}" for k in src["varm"].keys()]) - if "obsp" in src: - tasks.extend([f"obsp:{k}" for k in src["obsp"].keys()]) - if "varp" in src: - tasks.extend([f"varp:{k}" for k in src["varp"].keys()]) - if "uns" in src: - tasks.append("uns") - - # ---- Progress bar for all operations - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TaskProgressColumn(), - TimeElapsedColumn(), - console=console, - ) as progress: - task_id = progress.add_task("[cyan]Subsetting...", total=len(tasks)) - processed_top: Set[str] = set() - - # obs - if "obs" in src: - progress.update(task_id, description="[cyan]Subsetting obs...[/]") - obs_dst = dst.create_group("obs") - subset_axis_group(src["obs"], obs_dst, obs_idx) - processed_top.add("obs") - progress.advance(task_id) - - # var - if "var" in src: - progress.update(task_id, description="[cyan]Subsetting var...[/]") - var_dst = dst.create_group("var") - subset_axis_group(src["var"], var_dst, var_idx) - processed_top.add("var") - progress.advance(task_id) - - # X - if "X" in src: - progress.update(task_id, description="[cyan]Subsetting X...[/]") - subset_matrix_like( - src["X"], dst, "X", obs_idx, var_idx, chunk_rows=chunk_rows - ) - processed_top.add("X") - progress.advance(task_id) - - 
# layers - if "layers" in src: - layers_dst = dst.create_group("layers") - processed_top.add("layers") - for lname in src["layers"].keys(): - progress.update( - task_id, description=f"[cyan]Subsetting layer: {lname}...[/]" - ) - subset_matrix_like( - src["layers"][lname], - layers_dst, - lname, - obs_idx, - var_idx, - chunk_rows=chunk_rows, - ) - progress.advance(task_id) - - # obsm - if "obsm" in src: - obsm_dst = dst.create_group("obsm") - processed_top.add("obsm") - for k in src["obsm"].keys(): - if obs_idx is None: - progress.update( - task_id, description=f"[cyan]Copying obsm: {k}...[/]" - ) - src["obsm"].copy(k, obsm_dst, name=k) - else: - progress.update( - task_id, description=f"[cyan]Subsetting obsm: {k}...[/]" - ) - obj = src["obsm"][k] - if isinstance(obj, h5py.Dataset): - data = obj[obs_idx, ...] - obsm_dst.create_dataset(k, data=data) - for ak, av in obj.attrs.items(): - obsm_dst[k].attrs[ak] = av - else: - subset_matrix_like( - obj, obsm_dst, k, obs_idx, None, chunk_rows=chunk_rows - ) - progress.advance(task_id) - - # varm - if "varm" in src: - varm_dst = dst.create_group("varm") - processed_top.add("varm") - for k in src["varm"].keys(): - if var_idx is None: - progress.update( - task_id, description=f"[cyan]Copying varm: {k}...[/]" - ) - src["varm"].copy(k, varm_dst, name=k) - else: - progress.update( - task_id, description=f"[cyan]Subsetting varm: {k}...[/]" - ) - obj = src["varm"][k] - if isinstance(obj, h5py.Dataset): - data = obj[var_idx, ...] 
- varm_dst.create_dataset(k, data=data) - for ak, av in obj.attrs.items(): - varm_dst[k].attrs[ak] = av - else: - subset_matrix_like( - obj, varm_dst, k, var_idx, None, chunk_rows=chunk_rows - ) - progress.advance(task_id) - - # obsp - if "obsp" in src: - obsp_dst = dst.create_group("obsp") - processed_top.add("obsp") - for k in src["obsp"].keys(): - if obs_idx is None: - progress.update( - task_id, description=f"[cyan]Copying obsp: {k}...[/]" - ) - src["obsp"].copy(k, obsp_dst, name=k) - else: - progress.update( - task_id, description=f"[cyan]Subsetting obsp: {k}...[/]" - ) - subset_matrix_like( - src["obsp"][k], - obsp_dst, - k, - obs_idx, - obs_idx, - chunk_rows=chunk_rows, - ) - progress.advance(task_id) - - # varp - if "varp" in src: - varp_dst = dst.create_group("varp") - processed_top.add("varp") - for k in src["varp"].keys(): - if var_idx is None: - progress.update( - task_id, description=f"[cyan]Copying varp: {k}...[/]" - ) - src["varp"].copy(k, varp_dst, name=k) - else: - progress.update( - task_id, description=f"[cyan]Subsetting varp: {k}...[/]" - ) - subset_matrix_like( - src["varp"][k], - varp_dst, - k, - var_idx, - var_idx, - chunk_rows=chunk_rows, - ) - progress.advance(task_id) - - # uns - if "uns" in src: - progress.update(task_id, description="[cyan]Copying uns...[/]") - src.copy("uns", dst) - processed_top.add("uns") - progress.advance(task_id) - - # copy any remaining top-level keys - for key in src.keys(): - if key not in processed_top: - src.copy(key, dst) - - # top-level attrs - for ak, av in src.attrs.items(): - dst.attrs[ak] = av - - console.print(f"[bold green]✓ Successfully created {output}[/]") - - finally: - dst.close() - src.close() +__all__ = [ + "_read_name_file", + "indices_from_name_set", + "subset_axis_group", + "subset_dense_matrix", + "subset_h5ad", + "subset_sparse_matrix_group", +] From 4b09cf50baf81c1991e251fce0f9e48320749f2b Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:32:27 +0000 Subject: [PATCH 39/62] 
HUGE REFACTOR:Add initial implementation of Store class and backend detection for HDF5 and Zarr --- src/h5ad/storage/__init__.py | 267 +++++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 src/h5ad/storage/__init__.py diff --git a/src/h5ad/storage/__init__.py b/src/h5ad/storage/__init__.py new file mode 100644 index 0000000..0b652d5 --- /dev/null +++ b/src/h5ad/storage/__init__.py @@ -0,0 +1,267 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable, Optional, Sequence +import shutil + +import h5py + +try: + import zarr +except Exception: # pragma: no cover - optional dependency + zarr = None + +import numpy as np + + +@dataclass +class Store: + backend: str + root: Any + path: Path + + def close(self) -> None: + if self.backend == "hdf5": + try: + self.root.close() + except Exception: + return + + def __enter__(self) -> "Store": + return self + + def __exit__(self, exc_type, exc, tb) -> None: + self.close() + + +def _require_zarr() -> None: + if zarr is None: # pragma: no cover - optional dependency + raise ImportError( + "zarr is required for .zarr support. 
Install with: uv sync --extra zarr" + ) + + +def is_hdf5_group(obj: Any) -> bool: + return isinstance(obj, (h5py.File, h5py.Group)) + + +def is_hdf5_dataset(obj: Any) -> bool: + return isinstance(obj, h5py.Dataset) + + +def is_zarr_group(obj: Any) -> bool: + return zarr is not None and isinstance(obj, zarr.Group) + + +def is_zarr_array(obj: Any) -> bool: + return zarr is not None and isinstance(obj, zarr.Array) + + +def is_group(obj: Any) -> bool: + return is_hdf5_group(obj) or is_zarr_group(obj) + + +def is_dataset(obj: Any) -> bool: + return is_hdf5_dataset(obj) or is_zarr_array(obj) + + +def is_zarr_path(path: Path) -> bool: + if not path.exists() or not path.is_dir(): + return False + if (path / "zarr.json").exists(): + return True + if (path / ".zgroup").exists() or (path / ".zattrs").exists(): + return True + return False + + +def detect_backend(path: Path) -> str: + if path.exists(): + if path.is_dir(): + if is_zarr_path(path): + return "zarr" + raise ValueError( + f"Path '{path}' is a directory but does not look like a Zarr store." 
+ ) + return "hdf5" + if path.suffix == ".zarr": + return "zarr" + return "hdf5" + + +def open_store(path: Path, mode: str) -> Store: + path = Path(path) + backend = detect_backend(path) + if backend == "zarr": + _require_zarr() + root = zarr.open_group(str(path), mode=mode) + return Store(backend="zarr", root=root, path=path) + root = h5py.File(path, mode) + return Store(backend="hdf5", root=root, path=path) + + +def _normalize_attr_value(value: Any, target_backend: str) -> Any: + if target_backend == "zarr": + if isinstance(value, bytes): + return value.decode("utf-8") + if isinstance(value, (list, tuple)): + return [ + v.decode("utf-8") if isinstance(v, bytes) else v for v in value + ] + if isinstance(value, np.ndarray): + if value.dtype.kind in ("S", "O"): + return [ + v.decode("utf-8") if isinstance(v, bytes) else v + for v in value.tolist() + ] + return value.tolist() + if isinstance(value, np.generic): + return value.item() + return value + + +def copy_attrs(src_attrs: Any, dst_attrs: Any, *, target_backend: str) -> None: + for k, v in src_attrs.items(): + dst_attrs[k] = _normalize_attr_value(v, target_backend) + + +def dataset_create_kwargs(src: Any, *, target_backend: str) -> dict: + kw: dict = {} + chunks = getattr(src, "chunks", None) + if chunks is not None: + kw["chunks"] = chunks + if target_backend == "hdf5" and is_hdf5_dataset(src): + if src.compression is not None: + kw["compression"] = src.compression + kw["compression_opts"] = src.compression_opts + kw["shuffle"] = bool(src.shuffle) + kw["fletcher32"] = bool(src.fletcher32) + if src.scaleoffset is not None: + kw["scaleoffset"] = src.scaleoffset + if src.fillvalue is not None: + kw["fillvalue"] = src.fillvalue + if target_backend == "zarr" and is_zarr_array(src): + src_zarr_format = getattr(getattr(src, "metadata", None), "zarr_format", None) + if src_zarr_format == 3: + compressors = None + try: + compressors = getattr(src, "compressors", None) + except Exception: + compressors = None + if 
compressors is not None: + kw["compressors"] = compressors + else: + try: + compressor = getattr(src, "compressor", None) + except Exception: + compressor = None + if compressor is not None: + kw["compressor"] = compressor + try: + filters = getattr(src, "filters", None) + except Exception: + filters = None + if filters is not None: + kw["filters"] = filters + try: + fill_value = getattr(src, "fill_value", None) + except Exception: + fill_value = None + if fill_value is not None: + kw["fill_value"] = fill_value + return kw + + +def create_dataset( + parent: Any, + name: str, + *, + data: Any = None, + shape: Optional[Sequence[int]] = None, + dtype: Any = None, + **kwargs: Any, +) -> Any: + if is_zarr_group(parent): + zarr_format = getattr(getattr(parent, "metadata", None), "zarr_format", None) + if zarr_format == 3: + kwargs = dict(kwargs) + kwargs.pop("compressor", None) + elif zarr_format == 2 and "compressors" in kwargs and "compressor" not in kwargs: + kwargs = dict(kwargs) + compressors = kwargs.pop("compressors") + if isinstance(compressors, (list, tuple)) and len(compressors) == 1: + kwargs["compressor"] = compressors[0] + if data is not None: + return parent.create_array(name, data=data, **kwargs) + return parent.create_array(name, shape=shape, dtype=dtype, **kwargs) + if data is not None: + return parent.create_dataset(name, data=data, **kwargs) + return parent.create_dataset(name, shape=shape, dtype=dtype, **kwargs) + + +def _chunk_step(shape: Sequence[int], chunks: Optional[Sequence[int]]) -> int: + if chunks is not None and len(chunks) > 0 and chunks[0]: + return int(chunks[0]) + if not shape: + return 1 + return max(1, min(1024, int(shape[0]))) + + +def copy_dataset(src: Any, dst_group: Any, name: str) -> Any: + shape = tuple(src.shape) if getattr(src, "shape", None) is not None else () + target_backend = "zarr" if is_zarr_group(dst_group) else "hdf5" + ds = create_dataset( + dst_group, + name, + shape=shape, + dtype=src.dtype, + 
**dataset_create_kwargs(src, target_backend=target_backend), + ) + copy_attrs(src.attrs, ds.attrs, target_backend=target_backend) + + if shape == (): + ds[()] = src[()] + return ds + + step = _chunk_step(shape, getattr(src, "chunks", None)) + for start in range(0, shape[0], step): + end = min(start + step, shape[0]) + if len(shape) == 1: + ds[start:end] = src[start:end] + else: + ds[start:end, ...] = src[start:end, ...] + return ds + + +def copy_tree(src_obj: Any, dst_group: Any, name: str, *, exclude: Iterable[str] = ()) -> Any: + if is_dataset(src_obj): + return copy_dataset(src_obj, dst_group, name) + if not is_group(src_obj): + raise TypeError(f"Unsupported object type for copy: {type(src_obj)}") + + target_backend = "zarr" if is_zarr_group(dst_group) else "hdf5" + grp = dst_group.create_group(name) + copy_attrs(src_obj.attrs, grp.attrs, target_backend=target_backend) + for key in src_obj.keys(): + if key in exclude: + continue + child = src_obj[key] + copy_tree(child, grp, key, exclude=exclude) + return grp + + +def copy_store_contents(src_root: Any, dst_root: Any) -> None: + for key in src_root.keys(): + copy_tree(src_root[key], dst_root, key) + + +def copy_path(src: Path, dst: Path) -> None: + src = Path(src) + dst = Path(dst) + if is_zarr_path(src): + if dst.exists(): + raise FileExistsError(f"Destination '{dst}' already exists.") + shutil.copytree(src, dst) + return + shutil.copy2(src, dst) From 55558f9fcf472ee2bbc06128ce6ae25ac0ae85dd Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:32:33 +0000 Subject: [PATCH 40/62] HUGE REFACTOR:Add utility functions for path normalization in h5ad modules --- src/h5ad/util/__init__.py | 1 + src/h5ad/util/path.py | 9 +++++++++ 2 files changed, 10 insertions(+) create mode 100644 src/h5ad/util/__init__.py create mode 100644 src/h5ad/util/path.py diff --git a/src/h5ad/util/__init__.py b/src/h5ad/util/__init__.py new file mode 100644 index 0000000..364e184 --- /dev/null +++ b/src/h5ad/util/__init__.py @@ -0,0 
+1 @@ +"""Utility helpers used across h5ad modules.""" diff --git a/src/h5ad/util/path.py b/src/h5ad/util/path.py new file mode 100644 index 0000000..c5c7102 --- /dev/null +++ b/src/h5ad/util/path.py @@ -0,0 +1,9 @@ +from __future__ import annotations + + +def norm_path(path: str) -> str: + """Normalize object paths used inside h5ad/zarr stores.""" + value = path.strip() + if not value: + raise ValueError("Object path must be non-empty.") + return value.lstrip("/") From d40e8490336a3a5ee04deac2e36f4b9175c03052 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:32:42 +0000 Subject: [PATCH 41/62] HUGE REFACTOR: Update CLI to support .zarr stores alongside .h5ad, enhancing file handling and command descriptions --- src/h5ad/cli.py | 132 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 99 insertions(+), 33 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index d947270..3d084a6 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -18,7 +18,7 @@ from h5ad.commands import export_image as export_image_cmd app = typer.Typer( - help="Streaming CLI for huge .h5ad files (info, subset, export, import)." + help="Streaming CLI for huge .h5ad and .zarr files (info, subset, export, import)." 
) # Use stderr for status/progress to keep stdout clean for data output # force_terminal=True ensures Rich output is visible even in non-TTY environments @@ -38,9 +38,11 @@ def info( file: Path = typer.Argument( ..., - help="Path to the .h5ad file", + help="Path to the .h5ad/.zarr store", exists=True, readable=True, + dir_okay=True, + file_okay=True, ), entry: Optional[str] = typer.Argument( None, @@ -82,8 +84,17 @@ def info( # ============================================================================ @app.command() def subset( - file: Path = typer.Argument(..., help="Input .h5ad", exists=True, readable=True), - output: Path = typer.Argument(..., help="Output .h5ad", writable=True), + file: Path = typer.Argument( + ..., + help="Input .h5ad/.zarr", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, + ), + output: Path = typer.Argument( + ..., help="Output .h5ad/.zarr", dir_okay=True, file_okay=True + ), obs: Optional[Path] = typer.Option( None, "--obs", @@ -99,7 +110,12 @@ def subset( readable=True, ), chunk_rows: int = typer.Option( - 1024, "--chunk", "-C", help="Row chunk size for dense matrices" + 1024, + "--chunk", + "-C", + "--chunk-rows", + "-r", + help="Row chunk size for dense matrices", ), ) -> None: """Subset an h5ad by obs and/or var names.""" @@ -129,7 +145,12 @@ def subset( @export_app.command("dataframe") def export_dataframe( file: Path = typer.Argument( - ..., help="Path to the .h5ad file", exists=True, readable=True + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), entry: str = typer.Argument(..., help="Entry path to export ('obs' or 'var')"), output: Path = typer.Option( @@ -142,7 +163,12 @@ def export_dataframe( help="Comma separated column names to include", ), chunk_rows: int = typer.Option( - 10_000, "--chunk", "-C", help="Number of rows to read per chunk" + 10_000, + "--chunk", + "-C", + "--chunk-rows", + "-r", + help="Number of rows to read per chunk", ), 
head: Optional[int] = typer.Option( None, "--head", "-n", help="Output only the first n entries" @@ -185,7 +211,12 @@ def export_dataframe( @export_app.command("array") def export_array( file: Path = typer.Argument( - ..., help="Path to the .h5ad file", exists=True, readable=True + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), entry: str = typer.Argument( ..., help="Entry path to export (e.g., 'obsm/X_pca', 'varm/PCs', 'X')" @@ -225,7 +256,12 @@ def export_array( @export_app.command("sparse") def export_sparse( file: Path = typer.Argument( - ..., help="Path to the .h5ad file", exists=True, readable=True + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), entry: str = typer.Argument( ..., help="Entry path to export (e.g., 'X', 'layers/counts')" @@ -280,11 +316,17 @@ def export_sparse( @export_app.command("dict") def export_dict( file: Path = typer.Argument( - ..., help="Path to the .h5ad file", exists=True, readable=True + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), entry: str = typer.Argument( ..., help="Entry path to export (e.g., 'uns', 'uns/colors')" ), + output_arg: Optional[Path] = typer.Argument(None, help="Output .json file path"), output: Optional[Path] = typer.Option( None, "--output", "-o", help="Output .json file path" ), @@ -306,10 +348,11 @@ def export_dict( """ try: + out_path = output if output is not None else output_arg export_json( file=file, obj=entry, - out=output, + out=out_path, max_elements=max_elements, include_attrs=include_attrs, console=console, @@ -322,7 +365,12 @@ def export_dict( @export_app.command("image") def export_image( file: Path = typer.Argument( - ..., help="Path to the .h5ad file", exists=True, readable=True + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), 
entry: str = typer.Argument(..., help="Entry path to export (2D or 3D array)"), output: Optional[Path] = typer.Option( @@ -349,22 +397,21 @@ def export_image( # IMPORT subcommands # ============================================================================ def _get_target_file(file: Path, output: Optional[Path], inplace: bool) -> Path: - """Determine target file and copy if needed.""" - import shutil + """Determine target path and copy/convert if needed.""" + from h5ad.commands.import_data import _prepare_target_path - if inplace: - return file - if output is None: - raise ValueError("Output file is required unless --inplace is specified.") - shutil.copy2(file, output) - console.print(f"[dim]Copied {file} → {output}[/]") - return output + return _prepare_target_path(file, output, inplace, console) @import_app.command("dataframe") def import_dataframe( file: Path = typer.Argument( - ..., help="Path to the source .h5ad file", exists=True, readable=True + ..., + help="Path to the source .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), entry: str = typer.Argument( ..., help="Entry path to create/replace ('obs' or 'var')" @@ -376,8 +423,9 @@ def import_dataframe( None, "--output", "-o", - help="Output .h5ad file path. Required unless --inplace.", - writable=True, + help="Output .h5ad/.zarr path. Required unless --inplace.", + dir_okay=True, + file_okay=True, ), inplace: bool = typer.Option( False, @@ -424,7 +472,12 @@ def import_dataframe( @import_app.command("array") def import_array( file: Path = typer.Argument( - ..., help="Path to the source .h5ad file", exists=True, readable=True + ..., + help="Path to the source .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), entry: str = typer.Argument( ..., help="Entry path to create/replace (e.g., 'X', 'obsm/X_pca')" @@ -436,8 +489,9 @@ def import_array( None, "--output", "-o", - help="Output .h5ad file path. 
Required unless --inplace.", - writable=True, + help="Output .h5ad/.zarr path. Required unless --inplace.", + dir_okay=True, + file_okay=True, ), inplace: bool = typer.Option( False, @@ -474,7 +528,12 @@ def import_array( @import_app.command("sparse") def import_sparse( file: Path = typer.Argument( - ..., help="Path to the source .h5ad file", exists=True, readable=True + ..., + help="Path to the source .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), obj: str = typer.Argument( ..., help="Object path to create/replace (e.g., 'X', 'layers/counts')" @@ -486,8 +545,9 @@ def import_sparse( None, "--output", "-o", - help="Output .h5ad file path. Required unless --inplace.", - writable=True, + help="Output .h5ad/.zarr path. Required unless --inplace.", + dir_okay=True, + file_okay=True, ), inplace: bool = typer.Option( False, @@ -524,7 +584,12 @@ def import_sparse( @import_app.command("dict") def import_dict( file: Path = typer.Argument( - ..., help="Path to the source .h5ad file", exists=True, readable=True + ..., + help="Path to the source .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, ), obj: str = typer.Argument( ..., help="Object path to create/replace (e.g., 'uns', 'uns/metadata')" @@ -536,8 +601,9 @@ def import_dict( None, "--output", "-o", - help="Output .h5ad file path. Required unless --inplace.", - writable=True, + help="Output .h5ad/.zarr path. 
Required unless --inplace.", + dir_okay=True, + file_okay=True, ), inplace: bool = typer.Option( False, From bfec2b21a768b2656720c328e0d62242a5ba58f9 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:33:01 +0000 Subject: [PATCH 42/62] HUGE REFACTOR: Remove unused functions and imports from info.py and read.py, streamlining codebase --- src/h5ad/info.py | 287 +---------------------------------------------- src/h5ad/read.py | 160 +------------------------- 2 files changed, 4 insertions(+), 443 deletions(-) diff --git a/src/h5ad/info.py b/src/h5ad/info.py index 6144b07..635b03a 100644 --- a/src/h5ad/info.py +++ b/src/h5ad/info.py @@ -1,286 +1,3 @@ -from typing import Optional, Tuple, Dict, Any, Union -import h5py -import numpy as np +from h5ad.core.info import axis_len, format_type_info, get_axis_group, get_entry_type - -def get_entry_type(entry: Union[h5py.Group, h5py.Dataset]) -> Dict[str, Any]: - """ - Determine the type/format of an HDF5 object for export guidance. - - Supports both: - - v0.2.0 (modern): Objects with encoding-type/encoding-version attributes - - v0.1.0 (legacy): Objects without encoding attributes, inferred from structure - - Returns a dict with: - - type: str (e.g., 'dataframe', 'sparse-matrix', 'dense-matrix', 'dict', 'image', 'array', 'scalar') - - export_as: str (suggested export format: csv, mtx, npy, json, image) - - encoding: str (h5ad encoding-type if present) - - shape: tuple or None - - dtype: str or None - - details: str (human-readable description) - - version: str ('0.2.0', '0.1.0', or None for unknown) - """ - result: Dict[str, Any] = { - "type": "unknown", - "export_as": None, - "encoding": None, - "shape": None, - "dtype": None, - "details": "", - "version": None, - } - - # Get encoding-type attribute if present - enc = entry.attrs.get("encoding-type", b"") - if isinstance(enc, bytes): - enc = enc.decode("utf-8") - result["encoding"] = enc if enc else None - - # Get encoding-version if present - enc_ver = 
entry.attrs.get("encoding-version", b"") - if isinstance(enc_ver, bytes): - enc_ver = enc_ver.decode("utf-8") - result["version"] = enc_ver if enc_ver else None - - # Infer the type for Dataset entry - if isinstance(entry, h5py.Dataset): - result["shape"] = entry.shape - result["dtype"] = str(entry.dtype) - - # Check for legacy categorical (v0.1.0): dataset with 'categories' attribute - if "categories" in entry.attrs: - result["type"] = "categorical" - result["export_as"] = "csv" - result["version"] = result["version"] or "0.1.0" - # Try to get category count from referenced dataset - try: - cats_ref = entry.attrs["categories"] - cats_ds = entry.file[cats_ref] - n_cats = cats_ds.shape[0] - except Exception: - n_cats = "?" - result["details"] = ( - f"Legacy categorical [{entry.shape[0]} values, {n_cats} categories]" - ) - return result - - # Scalar - if entry.shape == (): - result["type"] = "scalar" - result["export_as"] = "json" - result["details"] = f"Scalar value ({entry.dtype})" - return result - - # 1D or 2D numeric array -> dense matrix / array - if entry.ndim == 1: - result["type"] = "array" - result["export_as"] = "npy" - result["details"] = f"1D array [{entry.shape[0]}] ({entry.dtype})" - elif entry.ndim == 2: - result["type"] = "dense-matrix" - result["export_as"] = "npy" - result["details"] = ( - f"Dense matrix {entry.shape[0]}×{entry.shape[1]} ({entry.dtype})" - ) - elif entry.ndim == 3: - result["type"] = "array" - result["export_as"] = "npy" - result["details"] = f"3D array {entry.shape} ({entry.dtype})" - else: - result["type"] = "array" - result["export_as"] = "npy" - result["details"] = f"ND array {entry.shape} ({entry.dtype})" - return result - - # It's a Group - if isinstance(entry, h5py.Group): - # Check for sparse matrix (CSR/CSC) - same in both versions - if enc in ("csr_matrix", "csc_matrix"): - shape = entry.attrs.get("shape", None) - shape_str = f"{shape[0]}×{shape[1]}" if shape is not None else "?" 
- result["type"] = "sparse-matrix" - result["export_as"] = "mtx" - result["details"] = ( - f"Sparse {enc.replace('_matrix', '').upper()} matrix {shape_str}" - ) - return result - - # Check for v0.2.0 categorical (Group with codes/categories) - if enc == "categorical": - codes = entry.get("codes") - cats = entry.get("categories") - n_codes = codes.shape[0] if codes is not None else "?" - n_cats = cats.shape[0] if cats is not None else "?" - result["type"] = "categorical" - result["export_as"] = "csv" - result["details"] = f"Categorical [{n_codes} values, {n_cats} categories]" - return result - - # Check for dataframe (obs/var style) - # v0.2.0: has encoding-type="dataframe" - # v0.1.0: has _index attribute or obs_names/var_names dataset - if ( - enc == "dataframe" - or "_index" in entry.attrs - or "obs_names" in entry - or "var_names" in entry - ): - # Detect version - if enc == "dataframe": - df_version = result["version"] or "0.2.0" - else: - df_version = "0.1.0" # No encoding-type, legacy format - result["version"] = df_version - - # Check for __categories subgroup (v0.1.0 legacy) - has_legacy_cats = "__categories" in entry - n_cols = len( - [k for k in entry.keys() if k not in ("_index", "__categories")] - ) - - result["type"] = "dataframe" - result["export_as"] = "csv" - if has_legacy_cats: - result["details"] = f"DataFrame with {n_cols} columns (legacy v0.1.0)" - else: - result["details"] = f"DataFrame with {n_cols} columns" - return result - - # Check for nullable arrays (v0.2.0) - if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): - result["type"] = "array" - result["export_as"] = "npy" - result["details"] = f"Encoded array ({enc})" - return result - - # Check for string-array encoding - if enc == "string-array": - result["type"] = "array" - result["export_as"] = "npy" - result["details"] = "Encoded string array" - return result - - # Check for awkward-array (experimental) - if enc == "awkward-array": - length = 
entry.attrs.get("length", "?") - result["type"] = "awkward-array" - result["export_as"] = "json" - result["details"] = f"Awkward array (length={length})" - return result - - # Generic dict/group (v0.2.0 has encoding-type="dict", v0.1.0 has no attributes) - n_keys = len(list(entry.keys())) - result["type"] = "dict" - result["export_as"] = "json" - result["details"] = f"Group with {n_keys} keys" - return result - - return result - - -def format_type_info(info: Dict[str, Any]) -> str: - """Format type info as a colored string for display.""" - type_colors = { - "dataframe": "green", - "sparse-matrix": "magenta", - "dense-matrix": "blue", - "array": "blue", - "dict": "yellow", - "categorical": "green", - "scalar": "white", - "unknown": "red", - } - - color = type_colors.get(info["type"], "white") - return f"[{color}]<{info['type']}>[/]" - - -def axis_len(file: h5py.File, axis: str) -> int: - """ - Get the length of the specified axis ('obs' or 'var') in the h5ad file. - - Args: - file (h5py.File): Opened h5ad file object - axis (str): Axis name ('obs' or 'var') - - Returns: - int: Length of the axis - - Raises: - ValueError: If axis is not 'obs' or 'var' - KeyError: If axis or index dataset not found in file - TypeError: If axis is not a group or index is not a dataset - ValueError: If axis length cannot be determined - """ - # Check if the specified axis exists in the file - if axis not in file: - raise KeyError(f"'{axis}' not found in the file.") - - # Get the group corresponding to the axis - group = file[axis] - if not isinstance(group, h5py.Group): - raise TypeError(f"'{axis}' is not a group.") - - # Determine the index name for the axis - index_name = group.attrs.get("_index", None) - if index_name is None: - if axis == "obs": - index_name = "obs_names" - elif axis == "var": - index_name = "var_names" - else: - raise ValueError(f"Invalid axis '{axis}'. 
Must be 'obs' or 'var'.") - - # Decode bytes to string if necessary - if isinstance(index_name, bytes): - index_name = index_name.decode("utf-8") - - # Check if the index dataset exists - if index_name not in group: - raise KeyError(f"Index dataset '{index_name}' not found in '{axis}' group.") - - # Return the length of the index dataset - dataset = group[index_name] - if not isinstance(dataset, h5py.Dataset): - raise TypeError(f"Index '{index_name}' in '{axis}' is not a dataset.") - if dataset.shape: - return int(dataset.shape[0]) - raise ValueError( - f"Cannot determine length of '{axis}': index dataset has no shape." - ) - - -def get_axis_group(file: h5py.File, axis: str) -> Tuple[h5py.Group, int, str]: - """ - Get the axis group, its length, and index name. - - Args: - file (h5py.File): Opened h5ad file object - axis (str): Axis name ('obs' or 'var') - - Returns: - Tuple[h5py.Group, int, str]: Axis group, its length, and index name - - Raises: - ValueError: If axis is not 'obs' or 'var' - KeyError: If axis or index dataset not found in file - TypeError: If axis is not a group or index is not a dataset - ValueError: If axis length cannot be determined - """ - if axis not in ("obs", "var"): - raise ValueError("axis must be 'obs' or 'var'.") - - # axis_len will validate existence and get length (raises exceptions if issues) - n = axis_len(file, axis) - - # Get the group (already validated by axis_len) - group = file[axis] - - # Get the index name - index_name = group.attrs.get("_index", None) - if index_name is None: - index_name = "obs_names" if axis == "obs" else "var_names" - if isinstance(index_name, bytes): - index_name = index_name.decode("utf-8") - - return group, n, index_name +__all__ = ["axis_len", "format_type_info", "get_axis_group", "get_entry_type"] diff --git a/src/h5ad/read.py b/src/h5ad/read.py index 78fec0e..63f8c4d 100644 --- a/src/h5ad/read.py +++ b/src/h5ad/read.py @@ -1,159 +1,3 @@ -import numpy as np -import h5py -from typing import List, 
Dict +from h5ad.core.read import col_chunk_as_strings, decode_str_array, read_categorical_column - -def decode_str_array(array: np.ndarray) -> np.ndarray: - """ - Decode a numpy array of bytes or objects to strings. - Args: - array (np.ndarray): Input numpy array of bytes or objects - - Returns: - np.ndarray: Decoded numpy array of strings - """ - if np.issubdtype(array.dtype, np.bytes_): - return array.astype("U") - if array.dtype.kind == "O": - return array.astype(str) - return array.astype(str) - - -def read_categorical_column( - col: h5py.Group | h5py.Dataset, - start: int, - end: int, - cache: Dict[int, np.ndarray], - parent_group: h5py.Group | None = None, -) -> List[str]: - """ - Decode an AnnData 'categorical' column for a slice [start:end]. - - Supports both: - - v0.2.0 (modern): Group with 'codes' and 'categories' datasets - - v0.1.0 (legacy): Dataset with 'categories' attribute referencing __categories/ - - Args: - col: Column group (v0.2.0) or dataset (v0.1.0) - start: Start index of the slice - end: End index of the slice - cache: Cache for decoded categories - parent_group: Parent obs/var group (needed for v0.1.0 to resolve __categories) - - Returns: - List[str]: Decoded categorical values for the specified slice - """ - key = id(col) - - # v0.2.0 format: Group with 'codes' and 'categories' datasets - if isinstance(col, h5py.Group): - if key not in cache: - cats = col["categories"][...] - cats = decode_str_array(cats) - cache[key] = np.asarray(cats, dtype=str) - cats = cache[key] - - codes_ds = col["codes"] - codes = codes_ds[start:end] - codes = np.asarray(codes, dtype=np.int64) - return [cats[c] if 0 <= c < len(cats) else "" for c in codes] - - # v0.1.0 format: Dataset with 'categories' attribute (object reference) - if isinstance(col, h5py.Dataset): - if key not in cache: - cats_ref = col.attrs.get("categories", None) - if cats_ref is not None: - # Dereference the HDF5 object reference - cats_ds = col.file[cats_ref] - cats = cats_ds[...] 
- elif parent_group is not None and "__categories" in parent_group: - # Fallback: look for __categories subgroup - col_name = col.name.split("/")[-1] - cats_grp = parent_group["__categories"] - if col_name in cats_grp: - cats = cats_grp[col_name][...] - else: - raise KeyError( - f"Cannot find categories for legacy column {col.name}" - ) - else: - raise KeyError( - f"Cannot find categories for legacy column {col.name}" - ) - cats = decode_str_array(cats) - cache[key] = np.asarray(cats, dtype=str) - cats = cache[key] - - codes = col[start:end] - codes = np.asarray(codes, dtype=np.int64) - return [cats[c] if 0 <= c < len(cats) else "" for c in codes] - - raise TypeError(f"Unsupported categorical column type: {type(col)}") - - -def col_chunk_as_strings( - group: h5py.Group, - col_name: str, - start: int, - end: int, - cat_cache: Dict[int, np.ndarray], -) -> List[str]: - """ - Read a column from an obs/var group as strings. - - Supports both: - - v0.2.0 (modern): Columns with encoding-type attribute - - v0.1.0 (legacy): Categorical columns with 'categories' attribute referencing __categories - - Args: - group (h5py.Group): The obs/var group - col_name (str): Name of the column to read - start (int): Start index of the slice - end (int): End index of the slice - cat_cache (Dict[int, np.ndarray]): Cache for decoded categorical columns - - Returns: - List[str]: Column values as strings for the specified slice - """ - if col_name not in group: - raise KeyError(f"Column {col_name!r} not found in group {group.name}") - - col = group[col_name] - - # Case 1: Dataset (could be plain array or legacy categorical) - if isinstance(col, h5py.Dataset): - # Check for v0.1.0 legacy categorical (has 'categories' attribute) - if "categories" in col.attrs: - return read_categorical_column(col, start, end, cat_cache, group) - - # Plain dataset (numeric, string, etc.) 
- chunk = col[start:end] - if chunk.ndim != 1: - chunk = chunk.reshape(-1) - chunk = decode_str_array(np.asarray(chunk)) - return chunk.tolist() - - # Case 2: Group (v0.2.0 encoded types like categorical, nullable, etc.) - if isinstance(col, h5py.Group): - enc = col.attrs.get("encoding-type", b"") - if isinstance(enc, bytes): - enc = enc.decode("utf-8") - - if enc == "categorical": - return read_categorical_column(col, start, end, cat_cache) - - # Handle nullable arrays (nullable-integer, nullable-boolean, nullable-string-array) - if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): - values = col["values"][start:end] - mask = col["mask"][start:end] - values = decode_str_array(np.asarray(values)) - # Apply mask: masked values become empty string - return ["" if m else str(v) for v, m in zip(values, mask)] - - raise ValueError( - f"Unsupported group encoding {enc!r} for column {col_name!r}" - ) - - raise TypeError( - f"Unsupported column type for {col_name!r} in group {group.name}" - ) +__all__ = ["col_chunk_as_strings", "decode_str_array", "read_categorical_column"] From dd14d3e4119357bd8b6e33f104793740cc170a29 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:33:12 +0000 Subject: [PATCH 43/62] HUGE REFACTOR: Enhance export tests for Zarr support, adding new test cases and improving output handling --- tests/test_export.py | 88 ++++++++++++++++++++-- tests/test_zarr.py | 170 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 253 insertions(+), 5 deletions(-) create mode 100644 tests/test_zarr.py diff --git a/tests/test_export.py b/tests/test_export.py index 323167e..6a3fad9 100644 --- a/tests/test_export.py +++ b/tests/test_export.py @@ -28,11 +28,46 @@ def _read_mtx(path: Path) -> np.ndarray: return mat +def _read_mtx_header_and_data(path: Path) -> tuple[int, int, int, list[str]]: + with open(path, "r", encoding="utf-8") as fh: + header = fh.readline() + assert header.startswith("%%MatrixMarket") + line = 
fh.readline() + while line.startswith("%"): + line = fh.readline() + n_rows, n_cols, nnz = map(int, line.split()) + data_lines = [line.strip() for line in fh if line.strip()] + return n_rows, n_cols, nnz, data_lines + + class TestExportArray: def test_export_array_dense_X(self, sample_h5ad_file, temp_dir): out = temp_dir / "X.npy" result = runner.invoke( - app, ["export", "array", str(sample_h5ad_file), "X", str(out)] + app, ["export", "array", str(sample_h5ad_file), "X", "--output", str(out)] + ) + assert result.exit_code == 0 + assert out.exists() + + got = np.load(out) + with h5py.File(sample_h5ad_file, "r") as f: + expected = np.asarray(f["X"][...]) + np.testing.assert_allclose(got, expected) + + def test_export_array_chunk(self, sample_h5ad_file, temp_dir): + out = temp_dir / "X_chunk.npy" + result = runner.invoke( + app, + [ + "export", + "array", + str(sample_h5ad_file), + "X", + "--output", + str(out), + "--chunk", + "3", + ], ) assert result.exit_code == 0 assert out.exists() @@ -47,7 +82,15 @@ class TestExportSparse: def test_export_sparse_csr(self, sample_sparse_csr_h5ad, temp_dir): out = temp_dir / "X_csr.mtx" result = runner.invoke( - app, ["export", "sparse", str(sample_sparse_csr_h5ad), "X", str(out)] + app, + [ + "export", + "sparse", + str(sample_sparse_csr_h5ad), + "X", + "--output", + str(out), + ], ) assert result.exit_code == 0 assert out.exists() @@ -64,6 +107,31 @@ def test_export_sparse_csr(self, sample_sparse_csr_h5ad, temp_dir): ) np.testing.assert_allclose(got, expected) + def test_export_sparse_head_limits_entries(self, sample_sparse_csr_h5ad, temp_dir): + out = temp_dir / "X_csr_head.mtx" + result = runner.invoke( + app, + [ + "export", + "sparse", + str(sample_sparse_csr_h5ad), + "X", + "--output", + str(out), + "--head", + "2", + ], + ) + assert result.exit_code == 0 + assert out.exists() + + n_rows, n_cols, nnz, data_lines = _read_mtx_header_and_data(out) + assert (n_rows, n_cols) == (4, 3) + assert nnz == 2 + assert len(data_lines) 
== 2 + assert data_lines[0].startswith("1 1 ") + assert data_lines[1].startswith("1 3 ") + def test_export_sparse_csc(self, temp_dir): # Build a small, consistent CSC matrix group file_path = temp_dir / "test_csc.h5ad" @@ -79,7 +147,9 @@ def test_export_sparse_csc(self, temp_dir): X.create_dataset("indptr", data=indptr) out = temp_dir / "X_csc.mtx" - result = runner.invoke(app, ["export", "sparse", str(file_path), "X", str(out)]) + result = runner.invoke( + app, ["export", "sparse", str(file_path), "X", "--output", str(out)] + ) assert result.exit_code == 0 assert out.exists() @@ -162,7 +232,8 @@ def test_sparse_matrix_array_export(self, sample_sparse_csr_h5ad, temp_dir): """Test that sparse matrix requires sparse export.""" out = temp_dir / "X.npy" result = runner.invoke( - app, ["export", "array", str(sample_sparse_csr_h5ad), "X", str(out)] + app, + ["export", "array", str(sample_sparse_csr_h5ad), "X", "--output", str(out)], ) # Should fail because X is sparse, not dense assert result.exit_code == 1 @@ -172,7 +243,14 @@ def test_nonexistent_object(self, sample_h5ad_file, temp_dir): out = temp_dir / "output.npy" result = runner.invoke( app, - ["export", "array", str(sample_h5ad_file), "nonexistent/path", str(out)], + [ + "export", + "array", + str(sample_h5ad_file), + "nonexistent/path", + "--output", + str(out), + ], ) assert result.exit_code == 1 assert "not found" in result.output.lower() or "error" in result.output.lower() diff --git a/tests/test_zarr.py b/tests/test_zarr.py new file mode 100644 index 0000000..1008d6a --- /dev/null +++ b/tests/test_zarr.py @@ -0,0 +1,170 @@ +"""Tests for zarr auto-detection support (v2 and v3).""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Optional + +import numpy as np +import pytest +from typer.testing import CliRunner +from rich.console import Console + +from h5ad.cli import app +from h5ad.core.subset import subset_h5ad + + +zarr = pytest.importorskip("zarr") + +runner = 
CliRunner() + + +class UnsupportedZarrFormat(Exception): + pass + + +def _open_zarr_group(path: Path, zarr_format: Optional[int]) -> Any: + if zarr_format is None: + return zarr.open_group(path, mode="w") + + last_exc: Exception | None = None + for kw in ("zarr_format", "zarr_version"): + try: + return zarr.open_group(path, mode="w", **{kw: zarr_format}) + except (TypeError, ValueError) as exc: + last_exc = exc + continue + + raise UnsupportedZarrFormat(str(last_exc)) from last_exc + + +def _create_array(group: Any, name: str, data: np.ndarray) -> Any: + data = np.asarray(data) + if hasattr(group, "create_array"): + try: + return group.create_array(name, data=data) + except TypeError: + return group.create_array( + name, data=data, shape=data.shape, dtype=data.dtype + ) + try: + return group.create_dataset(name, data=data, shape=data.shape) + except TypeError: + return group.create_dataset(name, data=data) + + +def _create_zarr_store(path: Path, *, zarr_format: Optional[int]) -> None: + root = _open_zarr_group(path, zarr_format) + + obs = root.create_group("obs") + obs.attrs["_index"] = "obs_names" + obs_names = ["cell_1", "cell_2", "cell_3", "cell_4", "cell_5"] + _create_array(obs, "obs_names", np.array(obs_names, dtype="S")) + _create_array( + obs, + "cell_type", + np.array(["TypeA", "TypeB", "TypeA", "TypeC", "TypeB"], dtype="S"), + ) + + var = root.create_group("var") + var.attrs["_index"] = "var_names" + var_names = ["gene_1", "gene_2", "gene_3", "gene_4"] + _create_array(var, "var_names", np.array(var_names, dtype="S")) + + X = np.array( + [ + [1.0, 0.0, 2.5, 0.0], + [0.0, 3.2, 0.0, 1.1], + [2.1, 0.0, 1.8, 0.0], + [0.0, 4.5, 0.0, 2.3], + [1.5, 0.0, 3.0, 0.0], + ], + dtype=np.float32, + ) + _create_array(root, "X", X) + + uns = root.create_group("uns") + _create_array(uns, "description", np.array(["Test dataset"], dtype="S")) + + +@pytest.fixture(params=[None, 2], ids=["default", "v2"]) +def zarr_format(request) -> Optional[int]: + return request.param + + 
+def _skip_if_unsupported(exc: Exception, zarr_format: Optional[int]) -> None: + if zarr_format == 2: + pytest.skip("zarr v2 not supported by installed zarr") + raise exc + + +def test_info_zarr_auto_detect(temp_dir, zarr_format): + store_path = temp_dir / f"test_{zarr_format or 'default'}.zarr" + try: + _create_zarr_store(store_path, zarr_format=zarr_format) + except UnsupportedZarrFormat as exc: + _skip_if_unsupported(exc, zarr_format) + + result = runner.invoke(app, ["info", str(store_path)]) + output = result.stdout + (result.stderr or "") + assert result.exit_code == 0, output + assert "5 × 4" in output + + +def test_export_dataframe_zarr(temp_dir, zarr_format): + store_path = temp_dir / f"test_{zarr_format or 'default'}.zarr" + try: + _create_zarr_store(store_path, zarr_format=zarr_format) + except UnsupportedZarrFormat as exc: + _skip_if_unsupported(exc, zarr_format) + output = temp_dir / "obs.csv" + + result = runner.invoke( + app, + ["export", "dataframe", str(store_path), "obs", "--output", str(output)], + ) + if result.exit_code != 0: + raise AssertionError( + f"exit_code={result.exit_code} exception={result.exception!r} output={result.output}" + ) + assert output.exists() + assert "obs_names" in output.read_text(encoding="utf-8") + + +def test_export_dict_zarr(temp_dir, zarr_format): + store_path = temp_dir / f"test_{zarr_format or 'default'}.zarr" + try: + _create_zarr_store(store_path, zarr_format=zarr_format) + except UnsupportedZarrFormat as exc: + _skip_if_unsupported(exc, zarr_format) + output = temp_dir / "uns.json" + + result = runner.invoke( + app, ["export", "dict", str(store_path), "uns", str(output)] + ) + assert result.exit_code == 0 + assert output.exists() + + +def test_subset_zarr_output(temp_dir, zarr_format): + store_path = temp_dir / f"test_{zarr_format or 'default'}.zarr" + try: + _create_zarr_store(store_path, zarr_format=zarr_format) + except UnsupportedZarrFormat as exc: + _skip_if_unsupported(exc, zarr_format) + obs_file = 
temp_dir / "obs.txt" + obs_file.write_text("cell_1\ncell_3\n") + output = temp_dir / "subset.zarr" + + console = Console() + subset_h5ad( + file=store_path, + output=output, + obs_file=obs_file, + var_file=None, + chunk_rows=1024, + console=console, + ) + root = zarr.open_group(output, mode="r") + assert root["obs"]["obs_names"].shape[0] == 2 + assert root["X"].shape == (2, 4) From eee67ae5713e6ba8b56192af3c92f334e6222b26 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:33:19 +0000 Subject: [PATCH 44/62] HUGE REFACTOR: Rename job from 'test' to 'tests' and enhance test matrix for additional modules, improving test organization and coverage reporting --- .github/workflows/tests.yml | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7548bf7..f946c87 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ concurrency: cancel-in-progress: true jobs: - test: + tests: runs-on: ubuntu-latest timeout-minutes: 20 @@ -23,6 +23,21 @@ jobs: fail-fast: false matrix: python-version: ["3.12"] # add "3.13" if you want + module: + - name: cli + tests: tests/test_cli.py + - name: export + tests: tests/test_export.py + - name: import + tests: tests/test_import.py + - name: info-read + tests: tests/test_info_read.py + - name: subset + tests: tests/test_subset.py + - name: zarr + tests: tests/test_zarr.py + + name: tests (${{ matrix.module.name }}) steps: - uses: actions/checkout@v4 @@ -42,29 +57,29 @@ jobs: - name: Run tests with coverage run: | - uv run pytest -v \ + uv run pytest -v ${{ matrix.module.tests }} \ --cov=h5ad \ --cov-report=term-missing \ --cov-report=xml \ --cov-report=html \ - --junitxml=pytest-results.xml + --junitxml=pytest-results-${{ matrix.module.name }}.xml - name: Publish test results summary uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: - files: pytest-results.xml - check_name: 
Test Results + files: pytest-results-${{ matrix.module.name }}.xml + check_name: Test Results (${{ matrix.module.name }}) - name: Upload coverage artifacts uses: actions/upload-artifact@v4 if: always() with: - name: coverage + name: coverage-${{ matrix.module.name }} path: | coverage.xml htmlcov/ - pytest-results.xml + pytest-results-${{ matrix.module.name }}.xml retention-days: 30 - name: Upload coverage to Codecov From a2dc8f71ceaa2c117b97d20da8b415640fabe2f7 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:52:39 +0000 Subject: [PATCH 45/62] Update README to include support for .zarr stores and enhance feature descriptions for clarity --- README.md | 60 +++++++++++++++++++------------------------------------ 1 file changed, 21 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index eecee0f..b2396eb 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,20 @@ # h5ad CLI -A command-line tool for exploring huge `.h5ad` (AnnData) files without loading them fully into memory. Streams data directly from disk for efficient inspection of structure, metadata, and matrices. +A command-line tool for exploring huge AnnData stores (`.h5ad` and `.zarr`) without loading them fully into memory. Streams data directly from disk for efficient inspection of structure, metadata, and matrices. 
## Features -- **`info`** – Show file structure and dimensions (`n_obs × n_var`) -- **`table`** – Export obs/var metadata to CSV with chunked streaming -- **`subset`** – Filter h5ad files by cell/gene names (supports dense and sparse CSR/CSC matrices) -- Memory-efficient chunked processing for large files -- Rich terminal output with colors and progress bars +- Streaming access to very large `.h5ad` and `.zarr` stores +- Auto-detects `.h5ad` files vs `.zarr` directories +- Chunked processing for dense and sparse matrices (CSR/CSC) +- Rich terminal output with progress indicators ## Installation +Using [uv](https://docs.astral.sh/uv/) (recommended): ```bash +git clone https://github.com/cellgeni/h5ad-cli.git +cd h5ad-cli uv sync ``` @@ -21,45 +23,25 @@ For development and testing: uv sync --extra dev ``` -See [docs/TESTING.md](docs/TESTING.md) for testing documentation. - -## Usage -Invoke any subcommand via `uv run h5ad ...`: - +Alternative with pip: ```bash -uv run h5ad --help +git clone https://github.com/cellgeni/h5ad-cli.git +cd h5ad-cli +pip install . ``` -#### Examples - -**Inspect overall structure and axis sizes:** +For development and testing with pip: ```bash -uv run h5ad info data.h5ad +pip install -e ".[dev]" ``` -**Export full obs metadata to CSV:** -```bash -uv run h5ad table data.h5ad --axis obs --out obs_metadata.csv -``` - -**Export selected obs columns to stdout:** -```bash -uv run h5ad table data.h5ad --axis obs --cols cell_type,donor -``` - -**Export var metadata with custom chunk size:** -```bash -uv run h5ad table data.h5ad --axis var --chunk-rows 5000 --out var_metadata.csv -``` +See [docs/TESTING.md](docs/TESTING.md) for testing documentation. -**Subset by cell names:** -```bash -uv run h5ad subset input.h5ad output.h5ad --obs cells.txt -``` +## Commands (Overview) -**Subset by both cells and genes:** -```bash -uv run h5ad subset input.h5ad output.h5ad --obs cells.txt --var genes.txt -``` +Run help at any level (e.g. 
`uv run h5ad --help`, `uv run h5ad export --help`). -All commands stream from disk, so even multi-GB `.h5ad` files remain responsive. +- `info` – read-only inspection of store layout, shapes, and type hints; supports drilling into paths like `obsm/X_pca` or `uns`. +- `subset` – stream and write a filtered copy based on obs/var name lists, preserving dense and sparse matrix encodings. +- `export` – extract data from a store; subcommands: `dataframe` (obs/var to CSV), `array` (dense to `.npy`), `sparse` (CSR/CSC to `.mtx`), `dict` (JSON), `image` (PNG). +- `import` – write new data into a store; subcommands: `dataframe` (CSV → obs/var), `array` (`.npy`), `sparse` (`.mtx`), `dict` (JSON). \ No newline at end of file From b4c58a65b2e9e7374a629e530317784f1ae71c12 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:55:14 +0000 Subject: [PATCH 46/62] Renamed docs --- docs/{h5ad_elements_spec.md => ELEMENTS_h5ad.md} | 0 docs/{zarr_elements_spec.md => ELEMENTS_zarr.md} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename docs/{h5ad_elements_spec.md => ELEMENTS_h5ad.md} (100%) rename docs/{zarr_elements_spec.md => ELEMENTS_zarr.md} (100%) diff --git a/docs/h5ad_elements_spec.md b/docs/ELEMENTS_h5ad.md similarity index 100% rename from docs/h5ad_elements_spec.md rename to docs/ELEMENTS_h5ad.md diff --git a/docs/zarr_elements_spec.md b/docs/ELEMENTS_zarr.md similarity index 100% rename from docs/zarr_elements_spec.md rename to docs/ELEMENTS_zarr.md From d418e4d8d450d4438d093b601dcaa90922d92245 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:56:05 +0000 Subject: [PATCH 47/62] Update README to add tutorial reference and ensure proper formatting --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b2396eb..ff7a474 100644 --- a/README.md +++ b/README.md @@ -44,4 +44,6 @@ Run help at any level (e.g. `uv run h5ad --help`, `uv run h5ad export --help`). 
- `info` – read-only inspection of store layout, shapes, and type hints; supports drilling into paths like `obsm/X_pca` or `uns`. - `subset` – stream and write a filtered copy based on obs/var name lists, preserving dense and sparse matrix encodings. - `export` – extract data from a store; subcommands: `dataframe` (obs/var to CSV), `array` (dense to `.npy`), `sparse` (CSR/CSC to `.mtx`), `dict` (JSON), `image` (PNG). -- `import` – write new data into a store; subcommands: `dataframe` (CSV → obs/var), `array` (`.npy`), `sparse` (`.mtx`), `dict` (JSON). \ No newline at end of file +- `import` – write new data into a store; subcommands: `dataframe` (CSV → obs/var), `array` (`.npy`), `sparse` (`.mtx`), `dict` (JSON). + +See [docs/GET_STARTED.md](docs/GET_STARTED.md) for a short tutorial. \ No newline at end of file From 333925f1691b7ff38aed4bbd2a505596606c912f Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 17:56:13 +0000 Subject: [PATCH 48/62] Update dependencies in pyproject.toml: remove obsolete images section and ensure correct versions for pillow and zarr --- pyproject.toml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3df76b2..281812b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,8 +7,10 @@ requires-python = ">=3.12" dependencies = [ "h5py>=3.15.1", "numpy>=2.3.5", + "pillow>=12.1.0", "rich>=14.2.0", "typer>=0.20.0", + "zarr>=3.1.5", ] [project.optional-dependencies] @@ -16,9 +18,6 @@ dev = [ "pytest>=8.3.4", "pytest-cov>=6.0.0", ] -images = [ - "pillow>=10.0.0", -] [build-system] requires = ["uv_build>=0.8.0,<0.9.0"] From bc7711fc0b43a70e7aa6269c0e9f805095f45117 Mon Sep 17 00:00:00 2001 From: Aljes Date: Mon, 26 Jan 2026 18:48:22 +0000 Subject: [PATCH 49/62] Add GET_STARTED.md for initial setup and usage instructions --- docs/GET_STARTED.md | 146 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 docs/GET_STARTED.md diff --git 
a/docs/GET_STARTED.md b/docs/GET_STARTED.md new file mode 100644 index 0000000..61488fb --- /dev/null +++ b/docs/GET_STARTED.md @@ -0,0 +1,146 @@ +# Get Started + +This short walkthrough shows the basic workflow: inspect a store, export metadata, and write a subset. + +## 1 Install + +Using uv (recommended): +```bash +git clone https://github.com/cellgeni/h5ad-cli.git +cd h5ad-cli +uv sync +``` + +With pip: +```bash +git clone https://github.com/cellgeni/h5ad-cli.git +cd h5ad-cli +pip install . +``` + +Additionally, it might be useful to install `csvkit` for inspecting exported CSV files: +```bash +# with uv +uv pip install csvkit + +# with pip +pip install csvkit +``` + +## 2 Inspect a files with `info` command + +Let's load an example `.h5ad` file: +```bash +wget -O visium.h5ad https://exampledata.scverse.org/squidpy/figshare/visium_hne_adata.h5ad +``` + +Now run `info` to see the file structure: +```bash +uv run h5ad info visium.h5ad +``` +``` +An object with n_obs × n_var: 2688 × 18078 + obs: array_col, array_row, cluster, in_tissue, leiden, log1p_n_genes_by_counts, log1p_total_counts, log1p_total_counts_mt, n_counts, n_genes_by_counts, pct_counts_in_top_100_genes, pct_counts_in_top_200_genes, pct_counts_in_top_500_genes, +pct_counts_in_top_50_genes, pct_counts_mt, total_counts, total_counts_mt + var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts, pct_dropout_by_counts, total_counts, variances, variances_norm + obsm: X_pca, X_umap, spatial + varm: PCs + obsp: connectivities, distances + uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap + raw: X, var +``` + +To inspect a specific entry: +```bash +uv run h5ad info visium.h5ad obsm/X_pca +``` +``` +Path: obsm/X_pca +Type: dense-matrix +Shape: (2688, 50) +Dtype: float32 +Details: Dense matrix 2688×50 (float32) +``` + +## 3 Export entries +View the first 
few lines of the `obs` dataframe: + +```bash +uv run h5ad export dataframe visium.h5ad obs --head 10 +``` +```csv +_index,array_col,array_row,cluster,in_tissue,leiden,log1p_n_genes_by_counts,log1p_total_counts,log1p_total_counts_mt,n_counts,n_genes_by_counts,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,pct_counts_in_top_50_genes,pct_counts_mt,total_counts,total_counts_mt +AAACAAGTATCTCCCA-1,102,50,Cortex_2,1,Cortex_3,8.502891406705377,9.869983,8.257904,19340.0,4928,43.13340227507756,49.21406411582213,60.449844881075485,38.42812823164426,19.943123,19340.0,3857.0 +AAACAATCTACTAGCA-1,43,3,Cortex_5,1,Pyramidal_layer_dentate_gyrus,8.145839612936841,9.528867,8.091933,13750.0,3448,55.14181818181818,60.95272727272727,70.57454545454546,50.516363636363636,23.76,13750.0,3267.0 +AAACACCAATAACTGC-1,19,59,Thalamus_2,1,Hypothalamus_1,8.70334075304372,10.395467,8.499233,32710.0,6022,47.071232039131765,54.56435340874351,65.0871293182513,40.48303271170896,15.010699,32710.0,4910.0 +AAACAGAGCGACTCCT-1,94,14,Cortex_5,1,Pyramidal_layer_dentate_gyrus,8.369157112588834,9.674704,8.092851,15909.0,4311,45.81054748884279,52.07744044251681,62.97693129675027,40.95794833113332,20.554403,15909.0,3270.0 +AAACCGGGTAGGTACC-1,28,42,Thalamus_2,1,Hypothalamus_1,8.663542087751374,10.369013,8.808967,31856.0,5787,45.887744851833254,52.98216976393771,64.24849321948768,40.287543947764945,21.01017,31856.0,6693.0 +AAACCGTTCGTCCAGG-1,42,52,Hypothalamus_2,1,Pyramidal_layer,8.682538124003075,10.337314,8.559678,30862.0,5898,43.79171797031949,51.18592443781998,62.65634113148856,37.80053139783553,16.901043,30862.0,5216.0 +AAACCTCATGAAGTTG-1,19,37,Thalamus_2,1,Hypothalamus_1,9.027858802380862,11.007419,8.849371,60319.0,8331,34.28770370861586,42.45594257199224,55.48997828213332,27.803842901904872,11.553574,60319.0,6969.0 
+AAACGAAGAACATACC-1,64,6,Cortex_4,1,Hypothalamus_2,8.84246002419529,10.578089,8.855521,39264.0,6921,37.99663814180929,44.75346373268134,56.6320293398533,32.95639771801141,17.858597,39264.0,7012.0 +AAACGAGACGGTTGAT-1,79,35,Fiber_tract,1,Cortex_5,8.80941494391005,10.458923,8.351847,34853.0,6696,39.947780678851174,47.52818982583996,58.838550483459095,33.7245000430379,12.156773,34853.0,4237.0 +AAACGGTTGCGAACTG-1,59,67,Lateral_ventricle,1,Striatum,8.718663567048953,10.254004,8.416489,28395.0,6115,41.67635147032928,49.20232435287903,60.556435992252155,35.562599049128366,15.918295,28395.0,4520.0 +``` + +Export cell metadata to a CSV file: +```bash +uv run h5ad export dataframe visium.h5ad obs --output cells.csv +wc -l cells.csv # 2689 cells.csv +``` + +## 4 Subset by names + +Let's get all cluster names from `cells.csv`: +```bash +awk -F ',' 'NR>1{print $4}' cells.csv | sort | uniq -c +``` +``` +284 Cortex_1 +257 Cortex_2 +244 Cortex_3 +164 Cortex_4 +129 Cortex_5 +226 Fiber_tract +222 Hippocampus +208 Hypothalamus_1 +133 Hypothalamus_2 +105 Lateral_ventricle +42 Pyramidal_layer +68 Pyramidal_layer_dentate_gyrus +153 Striatum +261 Thalamus_1 +192 Thalamus_2 +``` + +To get all obs names in "Cortex_2", you can use `csvsql` from `csvkit`: +```bash +csvsql -d ',' -I --query "SELECT _index FROM cells WHERE cluster='Cortex_2'" cells.csv > barcodes.txt +sed -i '1d' barcodes.txt # remove header +wc -l barcodes.txt # 257 barcodes.txt +``` + +Now you can use this list to create a subset `.h5ad` file: +```bash +uv run h5ad subset visium.h5ad cortex2.h5ad --obs barcodes.txt +``` + +Check the result: +```bash +uv run h5ad info cortex2.h5ad +``` + +## Import or replace data +You can also import new data into an existing store. For example, let's replace the `obs` dataframe with a modified version. 
First, leave only first 5 columns in `cells.csv`: +```bash +cut -d ',' -f 1-5 cells.csv > cells1to5.csv +``` + +Now import it back into `cortex2.h5ad`: +```bash +uv run h5ad import dataframe visium.h5ad obs cells1to5.csv +``` + +Check the updated `obs` structure: +```bash +uv run h5ad info visium.h5ad obs +``` \ No newline at end of file From e28d5d36ff046a182d314996adb51a7e0953a92a Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 11:10:10 +0000 Subject: [PATCH 50/62] Rename option '--types' to '--tree' in info command for clarity and update help text accordingly --- src/h5ad/cli.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 3d084a6..250aa55 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -48,32 +48,32 @@ def info( None, help="Entry path to inspect (e.g., 'obsm/X_pca', 'X', 'uns')", ), - types: bool = typer.Option( + tree: bool = typer.Option( False, - "--types", + "--tree", "-t", - help="Show detailed type information for all entries", + help="Show a tree of all entries", ), depth: int = typer.Option( None, "--depth", "-d", - help="Maximum recursion depth for type display (only with --types)", + help="Maximum recursion depth for tree display (only with --tree)", ), ) -> None: """ Show high-level information about the .h5ad file. - Use --types to see type information for each entry. + Use --tree to see a tree of all entries. Use --entry to inspect a specific entry in detail. 
Examples: h5ad info data.h5ad - h5ad info --types data.h5ad + h5ad info --tree data.h5ad h5ad info obsm/X_pca data.h5ad """ try: - show_info(file, console, show_types=types, depth=depth, entry_path=entry) + show_info(file, console, show_types=tree, depth=depth, entry_path=entry) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) From 06740184b90b6f90ed5ff62331b9b27fe7e4fdb8 Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 11:54:25 +0000 Subject: [PATCH 51/62] Add support for copying HDF5 groups in copy_tree function --- src/h5ad/storage/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/h5ad/storage/__init__.py b/src/h5ad/storage/__init__.py index 0b652d5..43d876d 100644 --- a/src/h5ad/storage/__init__.py +++ b/src/h5ad/storage/__init__.py @@ -235,6 +235,10 @@ def copy_dataset(src: Any, dst_group: Any, name: str) -> Any: def copy_tree(src_obj: Any, dst_group: Any, name: str, *, exclude: Iterable[str] = ()) -> Any: + if is_hdf5_group(dst_group) and (is_hdf5_group(src_obj) or is_hdf5_dataset(src_obj)): + if not exclude: + dst_group.copy(src_obj, dst_group, name) + return dst_group[name] if is_dataset(src_obj): return copy_dataset(src_obj, dst_group, name) if not is_group(src_obj): From e66004748b1c044cf6edf0dde2dc3f43e40269eb Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 11:54:30 +0000 Subject: [PATCH 52/62] Implement subset_matrix_entry function for handling dense and sparse matrix subsetting --- src/h5ad/core/subset.py | 95 ++++++++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 30 deletions(-) diff --git a/src/h5ad/core/subset.py b/src/h5ad/core/subset.py index ee254cd..d6bccc9 100644 --- a/src/h5ad/core/subset.py +++ b/src/h5ad/core/subset.py @@ -283,6 +283,34 @@ def subset_sparse_matrix_group( create_dataset(group, "indptr", data=np.array(new_indptr, dtype=indptr.dtype)) +def subset_matrix_entry( + obj: Any, + dst_parent: Any, + name: str, + obs_idx: 
Optional[np.ndarray], + var_idx: Optional[np.ndarray], + *, + chunk_rows: int, + entry_label: str, +) -> None: + if is_dataset(obj): + subset_dense_matrix( + obj, dst_parent, name, obs_idx, var_idx, chunk_rows=chunk_rows + ) + return + + if is_group(obj): + enc = obj.attrs.get("encoding-type", b"") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + if enc in ("csr_matrix", "csc_matrix"): + subset_sparse_matrix_group(obj, dst_parent, name, obs_idx, var_idx) + return + raise ValueError(f"Unsupported {entry_label} encoding type: {enc}") + + raise ValueError(f"Unsupported {entry_label} object type") + + def subset_h5ad( file: Path, output: Path, @@ -371,16 +399,15 @@ def subset_h5ad( tasks.append("uns") with Progress( - SpinnerColumn(), + SpinnerColumn(finished_text="[green]✓[/]"), TextColumn("[progress.description]{task.description}"), - BarColumn(), - TaskProgressColumn(), - TimeElapsedColumn(), console=console, + transient=False, ) as progress: - task_id = progress.add_task("[cyan]Subsetting...", total=len(tasks)) - for task in tasks: + task_id = progress.add_task( + f"[cyan]Subsetting {task}...[/]", total=None + ) if task == "obs": obs_dst = dst.create_group("obs") subset_axis_group(src["obs"], obs_dst, obs_idx) @@ -400,65 +427,73 @@ def subset_h5ad( elif task.startswith("layer:"): key = task.split(":", 1)[1] layer_src = src["layers"][key] - if is_dataset(layer_src): - layers_dst = _ensure_group(dst, "layers") - subset_dense_matrix( - layer_src, - layers_dst, - key, - obs_idx, - var_idx, - chunk_rows=chunk_rows, - ) - elif is_group(layer_src): - layers_dst = _ensure_group(dst, "layers") - subset_sparse_matrix_group( - layer_src, layers_dst, key, obs_idx, var_idx - ) + layers_dst = _ensure_group(dst, "layers") + subset_matrix_entry( + layer_src, + layers_dst, + key, + obs_idx, + var_idx, + chunk_rows=chunk_rows, + entry_label=f"layer:{key}", + ) elif task.startswith("obsm:"): key = task.split(":", 1)[1] obsm_dst = _ensure_group(dst, "obsm") - 
subset_dense_matrix( - src["obsm"][key], + obsm_obj = src["obsm"][key] + subset_matrix_entry( + obsm_obj, obsm_dst, key, obs_idx, None, chunk_rows=chunk_rows, + entry_label=f"obsm:{key}", ) elif task.startswith("varm:"): key = task.split(":", 1)[1] varm_dst = _ensure_group(dst, "varm") - subset_dense_matrix( - src["varm"][key], + varm_obj = src["varm"][key] + subset_matrix_entry( + varm_obj, varm_dst, key, var_idx, None, chunk_rows=chunk_rows, + entry_label=f"varm:{key}", ) elif task.startswith("obsp:"): key = task.split(":", 1)[1] obsp_dst = _ensure_group(dst, "obsp") - subset_dense_matrix( - src["obsp"][key], + obsp_obj = src["obsp"][key] + subset_matrix_entry( + obsp_obj, obsp_dst, key, obs_idx, obs_idx, chunk_rows=chunk_rows, + entry_label=f"obsp:{key}", ) elif task.startswith("varp:"): key = task.split(":", 1)[1] varp_dst = _ensure_group(dst, "varp") - subset_dense_matrix( - src["varp"][key], + varp_obj = src["varp"][key] + subset_matrix_entry( + varp_obj, varp_dst, key, var_idx, var_idx, chunk_rows=chunk_rows, + entry_label=f"varp:{key}", ) elif task == "uns": copy_tree(src["uns"], dst, "uns") - progress.advance(task_id) + progress.update( + task_id, + description=f"[green]Subsetting {task}[/]", + completed=1, + total=1, + ) From 949f37ea8b89d43781f217d385ac89f215f0472c Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 11:54:35 +0000 Subject: [PATCH 53/62] Rename --types flag to --tree in info command tests for clarity --- tests/test_cli.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 2b3bd90..50cc137 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -33,9 +33,9 @@ def test_info_function_direct(self, sample_h5ad_file): # Should not raise exception show_info(sample_h5ad_file, console) - def test_info_types_flag(self, sample_h5ad_file): - """Test info command with --types flag.""" - result = runner.invoke(app, ["info", "--types", str(sample_h5ad_file)]) + def 
test_info_tree_flag(self, sample_h5ad_file): + """Test info command with --tree flag.""" + result = runner.invoke(app, ["info", "--tree", str(sample_h5ad_file)]) assert result.exit_code == 0 # Should show type annotations in angle brackets # Output may go to stdout or stderr depending on console config @@ -43,7 +43,7 @@ def test_info_types_flag(self, sample_h5ad_file): assert "<" in output assert ">" in output - def test_info_types_short_flag(self, sample_h5ad_file): + def test_info_tree_short_flag(self, sample_h5ad_file): """Test info command with -t short flag.""" result = runner.invoke(app, ["info", "-t", str(sample_h5ad_file)]) assert result.exit_code == 0 @@ -53,7 +53,7 @@ def test_info_types_short_flag(self, sample_h5ad_file): def test_info_depth_flag(self, sample_h5ad_file): """Test info command with --depth flag.""" result = runner.invoke( - app, ["info", "--types", "--depth", "1", str(sample_h5ad_file)] + app, ["info", "--tree", "--depth", "1", str(sample_h5ad_file)] ) assert result.exit_code == 0 output = result.stdout + (result.stderr or "") From 3056ed4f16c1f2743e3b85428cd29d92c71fb25b Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 11:54:42 +0000 Subject: [PATCH 54/62] Add tests for subsetting H5AD files with sparse matrices and variable-length strings --- tests/test_subset.py | 177 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/tests/test_subset.py b/tests/test_subset.py index 2aa9264..c2db94d 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -414,3 +414,180 @@ def test_subset_h5ad_sparse_csc(self, sample_sparse_csc_h5ad, temp_dir): if isinstance(encoding, bytes): encoding = encoding.decode("utf-8") assert encoding == "csc_matrix" + + def test_subset_h5ad_obsp_sparse_group(self, temp_dir): + """Test subsetting obsp sparse matrix groups.""" + file_path = temp_dir / "obsp_sparse.h5ad" + with h5py.File(file_path, "w") as f: + obs = f.create_group("obs") + obs.attrs["_index"] = 
"obs_names" + obs_names = ["cell_1", "cell_2", "cell_3", "cell_4"] + obs.create_dataset("obs_names", data=np.array(obs_names, dtype="S")) + + var = f.create_group("var") + var.attrs["_index"] = "var_names" + var_names = ["gene_1", "gene_2"] + var.create_dataset("var_names", data=np.array(var_names, dtype="S")) + + f.create_dataset("X", data=np.zeros((4, 2), dtype=np.float32)) + + obsp = f.create_group("obsp") + conn = obsp.create_group("connectivities") + conn.attrs["encoding-type"] = "csr_matrix" + conn.attrs["encoding-version"] = "0.1.0" + conn.attrs["shape"] = np.array([4, 4], dtype=np.int64) + conn.create_dataset("data", data=np.array([1.0, 2.0, 3.0, 4.0])) + conn.create_dataset("indices", data=np.array([0, 1, 2, 3], dtype=np.int64)) + conn.create_dataset("indptr", data=np.array([0, 1, 2, 3, 4], dtype=np.int64)) + + obs_file = temp_dir / "obs_names.txt" + obs_file.write_text("cell_1\ncell_3\n") + + output = temp_dir / "subset.h5ad" + console = Console(stderr=True) + + subset_h5ad( + file=file_path, + output=output, + obs_file=obs_file, + var_file=None, + chunk_rows=1024, + console=console, + ) + + with h5py.File(output, "r") as f: + conn = f["obsp"]["connectivities"] + encoding = conn.attrs["encoding-type"] + if isinstance(encoding, bytes): + encoding = encoding.decode("utf-8") + assert encoding == "csr_matrix" + assert tuple(conn.attrs["shape"]) == (2, 2) + assert conn["indptr"].shape[0] == 3 + + def test_subset_h5ad_uns_vlen_strings(self, temp_dir): + """Test copying uns datasets with variable-length strings.""" + file_path = temp_dir / "uns_strings.h5ad" + with h5py.File(file_path, "w") as f: + obs = f.create_group("obs") + obs.attrs["_index"] = "obs_names" + obs.create_dataset( + "obs_names", data=np.array(["cell_1", "cell_2"], dtype="S") + ) + + var = f.create_group("var") + var.attrs["_index"] = "var_names" + var.create_dataset( + "var_names", data=np.array(["gene_1", "gene_2"], dtype="S") + ) + + f.create_dataset("X", data=np.zeros((2, 2), 
dtype=np.float32)) + + uns = f.create_group("uns") + vlen = h5py.string_dtype(encoding="utf-8") + uns.create_dataset("labels", data=np.array(["a", "b", "c"]), dtype=vlen) + meta = uns.create_group("meta") + meta.create_dataset("method", data="test", dtype=vlen) + + obs_file = temp_dir / "obs_names.txt" + obs_file.write_text("cell_1\n") + + output = temp_dir / "subset.h5ad" + console = Console(stderr=True) + + subset_h5ad( + file=file_path, + output=output, + obs_file=obs_file, + var_file=None, + chunk_rows=1024, + console=console, + ) + + with h5py.File(output, "r") as f: + labels = [ + v.decode("utf-8") if isinstance(v, bytes) else v + for v in f["uns"]["labels"][...] + ] + assert labels == ["a", "b", "c"] + method = f["uns"]["meta"]["method"][()] + if isinstance(method, bytes): + method = method.decode("utf-8") + assert method == "test" + + def test_subset_h5ad_sparse_entries(self, temp_dir): + """Test sparse matrices in layers, obsm, varm, obsp, and varp.""" + file_path = temp_dir / "sparse_entries.h5ad" + + def _csr_group(parent, name, shape): + group = parent.create_group(name) + group.attrs["encoding-type"] = "csr_matrix" + group.attrs["encoding-version"] = "0.1.0" + group.attrs["shape"] = np.array(shape, dtype=np.int64) + n_rows, n_cols = shape + data = [] + indices = [] + indptr = [0] + for r in range(n_rows): + c = r % n_cols + data.append(float(r + 1)) + indices.append(c) + indptr.append(len(indices)) + group.create_dataset("data", data=np.array(data, dtype=np.float32)) + group.create_dataset("indices", data=np.array(indices, dtype=np.int64)) + group.create_dataset("indptr", data=np.array(indptr, dtype=np.int64)) + return group + + with h5py.File(file_path, "w") as f: + obs = f.create_group("obs") + obs.attrs["_index"] = "obs_names" + obs.create_dataset( + "obs_names", data=np.array(["cell_1", "cell_2", "cell_3", "cell_4"], dtype="S") + ) + + var = f.create_group("var") + var.attrs["_index"] = "var_names" + var.create_dataset( + "var_names", 
data=np.array(["gene_1", "gene_2", "gene_3"], dtype="S") + ) + + f.create_dataset("X", data=np.zeros((4, 3), dtype=np.float32)) + + layers = f.create_group("layers") + _csr_group(layers, "counts", (4, 3)) + + obsm = f.create_group("obsm") + _csr_group(obsm, "pca", (4, 2)) + + varm = f.create_group("varm") + _csr_group(varm, "pca", (3, 2)) + + obsp = f.create_group("obsp") + _csr_group(obsp, "connectivities", (4, 4)) + + varp = f.create_group("varp") + _csr_group(varp, "correlations", (3, 3)) + + obs_file = temp_dir / "obs_names.txt" + obs_file.write_text("cell_1\ncell_3\n") + + var_file = temp_dir / "var_names.txt" + var_file.write_text("gene_1\ngene_3\n") + + output = temp_dir / "subset.h5ad" + console = Console(stderr=True) + + subset_h5ad( + file=file_path, + output=output, + obs_file=obs_file, + var_file=var_file, + chunk_rows=1024, + console=console, + ) + + with h5py.File(output, "r") as f: + assert tuple(f["layers"]["counts"].attrs["shape"]) == (2, 2) + assert tuple(f["obsm"]["pca"].attrs["shape"]) == (2, 2) + assert tuple(f["varm"]["pca"].attrs["shape"]) == (2, 2) + assert tuple(f["obsp"]["connectivities"].attrs["shape"]) == (2, 2) + assert tuple(f["varp"]["correlations"].attrs["shape"]) == (2, 2) From 3047c7a514a93f61a9b46387f660c7aeb06060af Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 12:08:52 +0000 Subject: [PATCH 55/62] Exclude 'obs_names' and 'var_names' from keys in group processing for improved data handling --- src/h5ad/commands/info.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/h5ad/commands/info.py b/src/h5ad/commands/info.py index b58b5b0..76b56da 100644 --- a/src/h5ad/commands/info.py +++ b/src/h5ad/commands/info.py @@ -57,7 +57,9 @@ def show_info( # Only process Groups, skip Datasets like X if is_group(obj): sub_keys = [ - k for k in obj.keys() if k not in ("_index", "__categories") + k + for k in obj.keys() + if k not in ("_index", "__categories", "obs_names", "var_names") ] if 
sub_keys and key != "X": rich.print( @@ -126,7 +128,11 @@ def add_node( obj = f[key] # Skip empty groups if is_group(obj): - children = [k for k in obj.keys() if k not in ("_index", "__categories")] + children = [ + k + for k in obj.keys() + if k not in ("_index", "__categories", "obs_names", "var_names") + ] if not children: continue max_depth = ( @@ -174,7 +180,11 @@ def _show_object_info(f: Any, entry_path: str, console: Console) -> None: # If it's a group, show children if is_group(entry): - children = [k for k in entry.keys() if k not in ("_index", "__categories")] + children = [ + k + for k in entry.keys() + if k not in ("_index", "__categories", "obs_names", "var_names") + ] if children: console.print(f"\n[bold cyan]Children:[/]") for child_name in sorted(children): From 361a2aedaf6c41fc3ec190131d75d112107ba0e0 Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 12:08:56 +0000 Subject: [PATCH 56/62] Update GET_STARTED.md to include additional output examples for `info` command and clarify import options --- docs/GET_STARTED.md | 49 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/docs/GET_STARTED.md b/docs/GET_STARTED.md index 61488fb..a4024ac 100644 --- a/docs/GET_STARTED.md +++ b/docs/GET_STARTED.md @@ -128,6 +128,17 @@ Check the result: ```bash uv run h5ad info cortex2.h5ad ``` +``` +An object with n_obs × n_var: 257 × 18078 + obs: array_col, array_row, cluster, in_tissue, leiden, log1p_n_genes_by_counts, log1p_total_counts, log1p_total_counts_mt, n_counts, n_genes_by_counts, +pct_counts_in_top_100_genes, pct_counts_in_top_200_genes, pct_counts_in_top_500_genes, pct_counts_in_top_50_genes, pct_counts_mt, total_counts, total_counts_mt + var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts, +pct_dropout_by_counts, total_counts, variances, variances_norm + obsm: X_pca, X_umap, 
spatial + varm: PCs + obsp: connectivities, distances + uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap +``` ## Import or replace data You can also import new data into an existing store. For example, let's replace the `obs` dataframe with a modified version. First, leave only first 5 columns in `cells.csv`: @@ -135,12 +146,44 @@ You can also import new data into an existing store. For example, let's replace cut -d ',' -f 1-5 cells.csv > cells1to5.csv ``` -Now import it back into `cortex2.h5ad`: +Now import it back into `cortex2.h5ad` with the `_index` column as index: +```bash +uv run h5ad import dataframe visium.h5ad obs cells1to5.csv --index-column _index --output visium_obs1to5.h5ad +``` + +Check the updated `obs` structure: +```bash +uv run h5ad info visium_obs1to5.h5ad +``` +``` +An object with n_obs × n_var: 2688 × 18078 + obs: array_col, array_row, cluster, in_tissue + var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts, +pct_dropout_by_counts, total_counts, variances, variances_norm + obsm: X_pca, X_umap, spatial + varm: PCs + obsp: connectivities, distances + uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap + raw: X, var +``` + +You can also import the data into existing file: ```bash -uv run h5ad import dataframe visium.h5ad obs cells1to5.csv +uv run h5ad import dataframe visium.h5ad obs cells1to5.csv --index-column _index --inplace ``` Check the updated `obs` structure: ```bash -uv run h5ad info visium.h5ad obs +uv run h5ad info visium.h5ad +``` +``` +An object with n_obs × n_var: 2688 × 18078 + obs: array_col, array_row, cluster, in_tissue + var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts, +pct_dropout_by_counts, 
total_counts, variances, variances_norm + obsm: X_pca, X_umap, spatial + varm: PCs + obsp: connectivities, distances + uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap + raw: X, var ``` \ No newline at end of file From eeb34d952f3bfdca0d52ac8e7b71c2745ebb98de Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 12:20:08 +0000 Subject: [PATCH 57/62] Refactor subset command to require output path or use --inplace option for file modification --- src/h5ad/cli.py | 22 ++++++++++++++++++++-- src/h5ad/core/subset.py | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index 250aa55..66bbd22 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -92,8 +92,18 @@ def subset( dir_okay=True, file_okay=True, ), - output: Path = typer.Argument( - ..., help="Output .h5ad/.zarr", dir_okay=True, file_okay=True + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output .h5ad/.zarr path. Required unless --inplace.", + dir_okay=True, + file_okay=True, + ), + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify source file directly.", ), obs: Optional[Path] = typer.Option( None, @@ -125,6 +135,13 @@ def subset( ) raise typer.Exit(code=1) + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. 
" + "Use --output/-o or --inplace.", + ) + raise typer.Exit(code=1) + try: subset_h5ad( file=file, @@ -133,6 +150,7 @@ def subset( var_file=var, chunk_rows=chunk_rows, console=console, + inplace=inplace, ) except Exception as e: console.print(f"[bold red]Error:[/] {e}") diff --git a/src/h5ad/core/subset.py b/src/h5ad/core/subset.py index d6bccc9..d9e7829 100644 --- a/src/h5ad/core/subset.py +++ b/src/h5ad/core/subset.py @@ -3,6 +3,7 @@ from __future__ import annotations from pathlib import Path +import shutil from typing import Optional, Set, Tuple, List, Dict, Any import numpy as np @@ -22,6 +23,7 @@ copy_attrs, copy_tree, dataset_create_kwargs, + detect_backend, is_dataset, is_group, is_zarr_group, @@ -313,12 +315,13 @@ def subset_matrix_entry( def subset_h5ad( file: Path, - output: Path, + output: Optional[Path], obs_file: Optional[Path], var_file: Optional[Path], *, chunk_rows: int = 1024, console: Console, + inplace: bool = False, ) -> None: obs_keep: Optional[Set[str]] = None if obs_file is not None: @@ -333,8 +336,24 @@ def subset_h5ad( if obs_keep is None and var_keep is None: raise ValueError("At least one of --obs or --var must be provided.") + if not inplace and output is None: + raise ValueError("Output file is required unless --inplace is specified.") + + if inplace: + src_backend = detect_backend(file) + if src_backend == "zarr": + base_name = file.stem if file.suffix else file.name + tmp_path = file.with_name(f"{base_name}.subset-tmp.zarr") + else: + tmp_path = file.with_name(f"{file.name}.subset-tmp") + if tmp_path.exists(): + raise FileExistsError(f"Temporary path already exists: {tmp_path}") + dst_path = tmp_path + else: + dst_path = output + with console.status("[magenta]Opening files...[/]"): - with open_store(file, "r") as src_store, open_store(output, "w") as dst_store: + with open_store(file, "r") as src_store, open_store(dst_path, "w") as dst_store: src = src_store.root dst = dst_store.root @@ -497,3 +516,14 @@ def subset_h5ad( completed=1, 
total=1, ) + + if inplace: + if file.exists(): + if file.is_dir(): + shutil.rmtree(file) + else: + file.unlink() + if dst_path.is_dir(): + shutil.move(str(dst_path), str(file)) + else: + dst_path.replace(file) From cb36c17b6da5a7c00dcf037a09d34d3327a788e9 Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 12:20:12 +0000 Subject: [PATCH 58/62] Update GET_STARTED.md to modify subset command syntax for clarity --- docs/GET_STARTED.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/GET_STARTED.md b/docs/GET_STARTED.md index a4024ac..2ca023c 100644 --- a/docs/GET_STARTED.md +++ b/docs/GET_STARTED.md @@ -121,7 +121,7 @@ wc -l barcodes.txt # 257 barcodes.txt Now you can use this list to create a subset `.h5ad` file: ```bash -uv run h5ad subset visium.h5ad cortex2.h5ad --obs barcodes.txt +uv run h5ad subset visium.h5ad --output cortex2.h5ad --obs barcodes.txt ``` Check the result: From 6ba8336273653472da260387820be45196419d3b Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 12:20:19 +0000 Subject: [PATCH 59/62] Add inplace subsetting test for subset_h5ad function and fix dataset creation --- tests/test_subset.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tests/test_subset.py b/tests/test_subset.py index c2db94d..78c5cf8 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -484,7 +484,7 @@ def test_subset_h5ad_uns_vlen_strings(self, temp_dir): uns = f.create_group("uns") vlen = h5py.string_dtype(encoding="utf-8") - uns.create_dataset("labels", data=np.array(["a", "b", "c"]), dtype=vlen) + uns.create_dataset("labels", data=["a", "b", "c"], dtype=vlen) meta = uns.create_group("meta") meta.create_dataset("method", data="test", dtype=vlen) @@ -514,6 +514,27 @@ def test_subset_h5ad_uns_vlen_strings(self, temp_dir): method = method.decode("utf-8") assert method == "test" + def test_subset_h5ad_inplace(self, sample_h5ad_file, temp_dir): + """Test subsetting with --inplace 
behavior.""" + obs_file = temp_dir / "obs_names.txt" + obs_file.write_text("cell_1\ncell_3\n") + + console = Console(stderr=True) + + subset_h5ad( + file=sample_h5ad_file, + output=None, + obs_file=obs_file, + var_file=None, + chunk_rows=1024, + console=console, + inplace=True, + ) + + with h5py.File(sample_h5ad_file, "r") as f: + assert f["obs"]["obs_names"].shape[0] == 2 + assert f["X"].shape[0] == 2 + def test_subset_h5ad_sparse_entries(self, temp_dir): """Test sparse matrices in layers, obsm, varm, obsp, and varp.""" file_path = temp_dir / "sparse_entries.h5ad" From a4cbb6bfcd3c8eac1696988a4e51024dacd8d943 Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 12:24:28 +0000 Subject: [PATCH 60/62] Refactor subset command tests to use --output flag for output file specification --- .github/workflows/tests.yml | 2 +- tests/test_cli.py | 27 ++++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f946c87..cd6a3bc 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -57,7 +57,7 @@ jobs: - name: Run tests with coverage run: | - uv run pytest -v ${{ matrix.module.tests }} \ + uv run pytest -v -W default ${{ matrix.module.tests }} \ --cov=h5ad \ --cov-report=term-missing \ --cov-report=xml \ diff --git a/tests/test_cli.py b/tests/test_cli.py index 50cc137..9a27d0c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -367,7 +367,15 @@ def test_subset_command_obs(self, sample_h5ad_file, temp_dir): output = temp_dir / "subset.h5ad" result = runner.invoke( - app, ["subset", str(sample_h5ad_file), str(output), "--obs", str(obs_file)] + app, + [ + "subset", + str(sample_h5ad_file), + "--output", + str(output), + "--obs", + str(obs_file), + ], ) assert result.exit_code == 0 assert output.exists() @@ -379,7 +387,15 @@ def test_subset_command_var(self, sample_h5ad_file, temp_dir): output = temp_dir / "subset.h5ad" result = runner.invoke( - app, 
["subset", str(sample_h5ad_file), str(output), "--var", str(var_file)] + app, + [ + "subset", + str(sample_h5ad_file), + "--output", + str(output), + "--var", + str(var_file), + ], ) assert result.exit_code == 0 assert output.exists() @@ -398,6 +414,7 @@ def test_subset_command_both(self, sample_h5ad_file, temp_dir): [ "subset", str(sample_h5ad_file), + "--output", str(output), "--obs", str(obs_file), @@ -411,7 +428,9 @@ def test_subset_command_both(self, sample_h5ad_file, temp_dir): def test_subset_command_no_filters(self, sample_h5ad_file, temp_dir): """Test subset command without any filters (should fail).""" output = temp_dir / "subset.h5ad" - result = runner.invoke(app, ["subset", str(sample_h5ad_file), str(output)]) + result = runner.invoke( + app, ["subset", str(sample_h5ad_file), "--output", str(output)] + ) assert result.exit_code == 1 # Check both stdout and stderr since Console uses stderr=True output_text = result.stdout + result.stderr @@ -428,6 +447,7 @@ def test_subset_command_chunk_rows(self, sample_h5ad_file, temp_dir): [ "subset", str(sample_h5ad_file), + "--output", str(output), "--obs", str(obs_file), @@ -449,6 +469,7 @@ def test_subset_command_sparse(self, sample_sparse_csr_h5ad, temp_dir): [ "subset", str(sample_sparse_csr_h5ad), + "--output", str(output), "--obs", str(obs_file), From 982efd46ab4b6a0bebfd7d663c3a9dbf09d5b6e1 Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 12:24:41 +0000 Subject: [PATCH 61/62] Update uv.lock --- uv.lock | 200 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 199 insertions(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index 71c45c8..266fa80 100644 --- a/uv.lock +++ b/uv.lock @@ -97,15 +97,52 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/48/d9f421cb8da5afaa1a64570d9989e00fb7955e6acddc5a12979f7666ef60/coverage-7.13.1-py3-none-any.whl", hash = "sha256:2016745cb3ba554469d02819d78958b571792bb68e31302610e898f80dd3a573", size = 210722, upload-time = 
"2025-12-28T15:42:54.901Z" }, ] +[[package]] +name = "donfig" +version = "0.8.1.post1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/25/71/80cc718ff6d7abfbabacb1f57aaa42e9c1552bfdd01e64ddd704e4a03638/donfig-0.8.1.post1.tar.gz", hash = "sha256:3bef3413a4c1c601b585e8d297256d0c1470ea012afa6e8461dc28bfb7c23f52", size = 19506, upload-time = "2024-05-23T14:14:31.513Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/d5/c5db1ea3394c6e1732fb3286b3bd878b59507a8f77d32a2cebda7d7b7cd4/donfig-0.8.1.post1-py3-none-any.whl", hash = "sha256:2a3175ce74a06109ff9307d90a230f81215cbac9a751f4d1c6194644b8204f9d", size = 21592, upload-time = "2024-05-23T14:13:55.283Z" }, +] + +[[package]] +name = "google-crc32c" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/03/41/4b9c02f99e4c5fb477122cd5437403b552873f014616ac1d19ac8221a58d/google_crc32c-1.8.0.tar.gz", hash = "sha256:a428e25fb7691024de47fecfbff7ff957214da51eddded0da0ae0e0f03a2cf79", size = 14192, upload-time = "2025-12-16T00:35:25.142Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/5f/7307325b1198b59324c0fa9807cafb551afb65e831699f2ce211ad5c8240/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:4b8286b659c1335172e39563ab0a768b8015e88e08329fa5321f774275fc3113", size = 31300, upload-time = "2025-12-16T00:21:56.723Z" }, + { url = "https://files.pythonhosted.org/packages/21/8e/58c0d5d86e2220e6a37befe7e6a94dd2f6006044b1a33edf1ff6d9f7e319/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:2a3dc3318507de089c5384cc74d54318401410f82aa65b2d9cdde9d297aca7cb", size = 30867, upload-time = "2025-12-16T00:38:31.302Z" }, + { url = 
"https://files.pythonhosted.org/packages/ce/a9/a780cc66f86335a6019f557a8aaca8fbb970728f0efd2430d15ff1beae0e/google_crc32c-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:14f87e04d613dfa218d6135e81b78272c3b904e2a7053b841481b38a7d901411", size = 33364, upload-time = "2025-12-16T00:40:22.96Z" }, + { url = "https://files.pythonhosted.org/packages/21/3f/3457ea803db0198c9aaca2dd373750972ce28a26f00544b6b85088811939/google_crc32c-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cb5c869c2923d56cb0c8e6bcdd73c009c36ae39b652dbe46a05eb4ef0ad01454", size = 33740, upload-time = "2025-12-16T00:40:23.96Z" }, + { url = "https://files.pythonhosted.org/packages/df/c0/87c2073e0c72515bb8733d4eef7b21548e8d189f094b5dad20b0ecaf64f6/google_crc32c-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:3cc0c8912038065eafa603b238abf252e204accab2a704c63b9e14837a854962", size = 34437, upload-time = "2025-12-16T00:35:21.395Z" }, + { url = "https://files.pythonhosted.org/packages/d1/db/000f15b41724589b0e7bc24bc7a8967898d8d3bc8caf64c513d91ef1f6c0/google_crc32c-1.8.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:3ebb04528e83b2634857f43f9bb8ef5b2bbe7f10f140daeb01b58f972d04736b", size = 31297, upload-time = "2025-12-16T00:23:20.709Z" }, + { url = "https://files.pythonhosted.org/packages/d7/0d/8ebed0c39c53a7e838e2a486da8abb0e52de135f1b376ae2f0b160eb4c1a/google_crc32c-1.8.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:450dc98429d3e33ed2926fc99ee81001928d63460f8538f21a5d6060912a8e27", size = 30867, upload-time = "2025-12-16T00:43:14.628Z" }, + { url = "https://files.pythonhosted.org/packages/ce/42/b468aec74a0354b34c8cbf748db20d6e350a68a2b0912e128cabee49806c/google_crc32c-1.8.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3b9776774b24ba76831609ffbabce8cdf6fa2bd5e9df37b594221c7e333a81fa", size = 33344, upload-time = 
"2025-12-16T00:40:24.742Z" }, + { url = "https://files.pythonhosted.org/packages/1c/e8/b33784d6fc77fb5062a8a7854e43e1e618b87d5ddf610a88025e4de6226e/google_crc32c-1.8.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:89c17d53d75562edfff86679244830599ee0a48efc216200691de8b02ab6b2b8", size = 33694, upload-time = "2025-12-16T00:40:25.505Z" }, + { url = "https://files.pythonhosted.org/packages/92/b1/d3cbd4d988afb3d8e4db94ca953df429ed6db7282ed0e700d25e6c7bfc8d/google_crc32c-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:57a50a9035b75643996fbf224d6661e386c7162d1dfdab9bc4ca790947d1007f", size = 34435, upload-time = "2025-12-16T00:35:22.107Z" }, + { url = "https://files.pythonhosted.org/packages/21/88/8ecf3c2b864a490b9e7010c84fd203ec8cf3b280651106a3a74dd1b0ca72/google_crc32c-1.8.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:e6584b12cb06796d285d09e33f63309a09368b9d806a551d8036a4207ea43697", size = 31301, upload-time = "2025-12-16T00:24:48.527Z" }, + { url = "https://files.pythonhosted.org/packages/36/c6/f7ff6c11f5ca215d9f43d3629163727a272eabc356e5c9b2853df2bfe965/google_crc32c-1.8.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:f4b51844ef67d6cf2e9425983274da75f18b1597bb2c998e1c0a0e8d46f8f651", size = 30868, upload-time = "2025-12-16T00:48:12.163Z" }, + { url = "https://files.pythonhosted.org/packages/56/15/c25671c7aad70f8179d858c55a6ae8404902abe0cdcf32a29d581792b491/google_crc32c-1.8.0-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b0d1a7afc6e8e4635564ba8aa5c0548e3173e41b6384d7711a9123165f582de2", size = 33381, upload-time = "2025-12-16T00:40:26.268Z" }, + { url = "https://files.pythonhosted.org/packages/42/fa/f50f51260d7b0ef5d4898af122d8a7ec5a84e2984f676f746445f783705f/google_crc32c-1.8.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8b3f68782f3cbd1bce027e48768293072813469af6a61a86f6bb4977a4380f21", size = 33734, upload-time = 
"2025-12-16T00:40:27.028Z" }, + { url = "https://files.pythonhosted.org/packages/08/a5/7b059810934a09fb3ccb657e0843813c1fee1183d3bc2c8041800374aa2c/google_crc32c-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:d511b3153e7011a27ab6ee6bb3a5404a55b994dc1a7322c0b87b29606d9790e2", size = 34878, upload-time = "2025-12-16T00:35:23.142Z" }, +] + [[package]] name = "h5ad" -version = "0.1.0" +version = "0.2.0" source = { editable = "." } dependencies = [ { name = "h5py" }, { name = "numpy" }, + { name = "pillow" }, { name = "rich" }, { name = "typer" }, + { name = "zarr" }, ] [package.optional-dependencies] @@ -118,10 +155,12 @@ dev = [ requires-dist = [ { name = "h5py", specifier = ">=3.15.1" }, { name = "numpy", specifier = ">=2.3.5" }, + { name = "pillow", specifier = ">=12.1.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.3.4" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=6.0.0" }, { name = "rich", specifier = ">=14.2.0" }, { name = "typer", specifier = ">=0.20.0" }, + { name = "zarr", specifier = ">=3.1.5" }, ] provides-extras = ["dev"] @@ -190,6 +229,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, ] +[[package]] +name = "numcodecs" +version = "0.16.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/44/bd/8a391e7c356366224734efd24da929cc4796fff468bfb179fe1af6548535/numcodecs-0.16.5.tar.gz", hash = "sha256:0d0fb60852f84c0bd9543cc4d2ab9eefd37fc8efcc410acd4777e62a1d300318", size = 6276387, upload-time = "2025-11-21T02:49:48.986Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/75/cc/55420f3641a67f78392dc0bc5d02cb9eb0a9dcebf2848d1ac77253ca61fa/numcodecs-0.16.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:24e675dc8d1550cd976a99479b87d872cb142632c75cc402fea04c08c4898523", size = 1656287, upload-time = "2025-11-21T02:49:25.755Z" }, + { url = "https://files.pythonhosted.org/packages/f5/6c/86644987505dcb90ba6d627d6989c27bafb0699f9fd00187e06d05ea8594/numcodecs-0.16.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:94ddfa4341d1a3ab99989d13b01b5134abb687d3dab2ead54b450aefe4ad5bd6", size = 1148899, upload-time = "2025-11-21T02:49:26.87Z" }, + { url = "https://files.pythonhosted.org/packages/97/1e/98aaddf272552d9fef1f0296a9939d1487914a239e98678f6b20f8b0a5c8/numcodecs-0.16.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b554ab9ecf69de7ca2b6b5e8bc696bd9747559cb4dd5127bd08d7a28bec59c3a", size = 8534814, upload-time = "2025-11-21T02:49:28.547Z" }, + { url = "https://files.pythonhosted.org/packages/fb/53/78c98ef5c8b2b784453487f3e4d6c017b20747c58b470393e230c78d18e8/numcodecs-0.16.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ad1a379a45bd3491deab8ae6548313946744f868c21d5340116977ea3be5b1d6", size = 9173471, upload-time = "2025-11-21T02:49:30.444Z" }, + { url = "https://files.pythonhosted.org/packages/1c/20/2fdec87fc7f8cec950d2b0bea603c12dc9f05b4966dc5924ba5a36a61bf6/numcodecs-0.16.5-cp312-cp312-win_amd64.whl", hash = "sha256:845a9857886ffe4a3172ba1c537ae5bcc01e65068c31cf1fce1a844bd1da050f", size = 801412, upload-time = "2025-11-21T02:49:32.123Z" }, + { url = "https://files.pythonhosted.org/packages/38/38/071ced5a5fd1c85ba0e14ba721b66b053823e5176298c2f707e50bed11d9/numcodecs-0.16.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:25be3a516ab677dad890760d357cfe081a371d9c0a2e9a204562318ac5969de3", size = 1654359, upload-time = "2025-11-21T02:49:33.673Z" }, + { url = 
"https://files.pythonhosted.org/packages/d1/c0/5f84ba7525577c1b9909fc2d06ef11314825fc4ad4378f61d0e4c9883b4a/numcodecs-0.16.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0107e839ef75b854e969cb577e140b1aadb9847893937636582d23a2a4c6ce50", size = 1144237, upload-time = "2025-11-21T02:49:35.294Z" }, + { url = "https://files.pythonhosted.org/packages/0b/00/787ea5f237b8ea7bc67140c99155f9c00b5baf11c49afc5f3bfefa298f95/numcodecs-0.16.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:015a7c859ecc2a06e2a548f64008c0ec3aaecabc26456c2c62f4278d8fc20597", size = 8483064, upload-time = "2025-11-21T02:49:36.454Z" }, + { url = "https://files.pythonhosted.org/packages/c4/e6/d359fdd37498e74d26a167f7a51e54542e642ea47181eb4e643a69a066c3/numcodecs-0.16.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:84230b4b9dad2392f2a84242bd6e3e659ac137b5a1ce3571d6965fca673e0903", size = 9126063, upload-time = "2025-11-21T02:49:38.018Z" }, + { url = "https://files.pythonhosted.org/packages/27/72/6663cc0382ddbb866136c255c837bcb96cc7ce5e83562efec55e1b995941/numcodecs-0.16.5-cp313-cp313-win_amd64.whl", hash = "sha256:5088145502ad1ebf677ec47d00eb6f0fd600658217db3e0c070c321c85d6cf3d", size = 799275, upload-time = "2025-11-21T02:49:39.558Z" }, + { url = "https://files.pythonhosted.org/packages/3c/9e/38e7ca8184c958b51f45d56a4aeceb1134ecde2d8bd157efadc98502cc42/numcodecs-0.16.5-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b05647b8b769e6bc8016e9fd4843c823ce5c9f2337c089fb5c9c4da05e5275de", size = 1654721, upload-time = "2025-11-21T02:49:40.602Z" }, + { url = "https://files.pythonhosted.org/packages/a1/37/260fa42e7b2b08e6e00ad632f8dd620961a60a459426c26cea390f8c68d0/numcodecs-0.16.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3832bd1b5af8bb3e413076b7d93318c8e7d7b68935006b9fa36ca057d1725a8f", size = 1146887, upload-time = "2025-11-21T02:49:41.721Z" }, + { url = 
"https://files.pythonhosted.org/packages/4e/15/e2e1151b5a8b14a15dfd4bb4abccce7fff7580f39bc34092780088835f3a/numcodecs-0.16.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49f7b7d24f103187f53135bed28bb9f0ed6b2e14c604664726487bb6d7c882e1", size = 8476987, upload-time = "2025-11-21T02:49:43.363Z" }, + { url = "https://files.pythonhosted.org/packages/6d/30/16a57fc4d9fb0ba06c600408bd6634f2f1753c54a7a351c99c5e09b51ee2/numcodecs-0.16.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aec9736d81b70f337d89c4070ee3ffeff113f386fd789492fa152d26a15043e4", size = 9102377, upload-time = "2025-11-21T02:49:45.508Z" }, + { url = "https://files.pythonhosted.org/packages/31/a5/a0425af36c20d55a3ea884db4b4efca25a43bea9214ba69ca7932dd997b4/numcodecs-0.16.5-cp314-cp314-win_amd64.whl", hash = "sha256:b16a14303800e9fb88abc39463ab4706c037647ac17e49e297faa5f7d7dbbf1d", size = 819022, upload-time = "2025-11-21T02:49:47.39Z" }, +] + [[package]] name = "numpy" version = "2.3.5" @@ -262,6 +328,75 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, ] +[[package]] +name = "pillow" +version = "12.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d0/02/d52c733a2452ef1ffcc123b68e6606d07276b0e358db70eabad7e40042b7/pillow-12.1.0.tar.gz", hash = "sha256:5c5ae0a06e9ea030ab786b0251b32c7e4ce10e58d983c0d5c56029455180b5b9", size = 46977283, upload-time = "2026-01-02T09:13:29.892Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/31/dc53fe21a2f2996e1b7d92bf671cdb157079385183ef7c1ae08b485db510/pillow-12.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:a332ac4ccb84b6dde65dbace8431f3af08874bf9770719d32a635c4ef411b18b", size = 5262642, upload-time = "2026-01-02T09:11:10.138Z" }, + { url = "https://files.pythonhosted.org/packages/ab/c1/10e45ac9cc79419cedf5121b42dcca5a50ad2b601fa080f58c22fb27626e/pillow-12.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:907bfa8a9cb790748a9aa4513e37c88c59660da3bcfffbd24a7d9e6abf224551", size = 4657464, upload-time = "2026-01-02T09:11:12.319Z" }, + { url = "https://files.pythonhosted.org/packages/ad/26/7b82c0ab7ef40ebede7a97c72d473bda5950f609f8e0c77b04af574a0ddb/pillow-12.1.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:efdc140e7b63b8f739d09a99033aa430accce485ff78e6d311973a67b6bf3208", size = 6234878, upload-time = "2026-01-02T09:11:14.096Z" }, + { url = "https://files.pythonhosted.org/packages/76/25/27abc9792615b5e886ca9411ba6637b675f1b77af3104710ac7353fe5605/pillow-12.1.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bef9768cab184e7ae6e559c032e95ba8d07b3023c289f79a2bd36e8bf85605a5", size = 8044868, upload-time = "2026-01-02T09:11:15.903Z" }, + { url = "https://files.pythonhosted.org/packages/0a/ea/f200a4c36d836100e7bc738fc48cd963d3ba6372ebc8298a889e0cfc3359/pillow-12.1.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:742aea052cf5ab5034a53c3846165bc3ce88d7c38e954120db0ab867ca242661", size = 6349468, upload-time = "2026-01-02T09:11:17.631Z" }, + { url = "https://files.pythonhosted.org/packages/11/8f/48d0b77ab2200374c66d344459b8958c86693be99526450e7aee714e03e4/pillow-12.1.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6dfc2af5b082b635af6e08e0d1f9f1c4e04d17d4e2ca0ef96131e85eda6eb17", size = 7041518, upload-time = "2026-01-02T09:11:19.389Z" }, + { url = "https://files.pythonhosted.org/packages/1d/23/c281182eb986b5d31f0a76d2a2c8cd41722d6fb8ed07521e802f9bba52de/pillow-12.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:609e89d9f90b581c8d16358c9087df76024cf058fa693dd3e1e1620823f39670", size = 6462829, upload-time = "2026-01-02T09:11:21.28Z" }, + { url = "https://files.pythonhosted.org/packages/25/ef/7018273e0faac099d7b00982abdcc39142ae6f3bd9ceb06de09779c4a9d6/pillow-12.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:43b4899cfd091a9693a1278c4982f3e50f7fb7cff5153b05174b4afc9593b616", size = 7166756, upload-time = "2026-01-02T09:11:23.559Z" }, + { url = "https://files.pythonhosted.org/packages/8f/c8/993d4b7ab2e341fe02ceef9576afcf5830cdec640be2ac5bee1820d693d4/pillow-12.1.0-cp312-cp312-win32.whl", hash = "sha256:aa0c9cc0b82b14766a99fbe6084409972266e82f459821cd26997a488a7261a7", size = 6328770, upload-time = "2026-01-02T09:11:25.661Z" }, + { url = "https://files.pythonhosted.org/packages/a7/87/90b358775a3f02765d87655237229ba64a997b87efa8ccaca7dd3e36e7a7/pillow-12.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:d70534cea9e7966169ad29a903b99fc507e932069a881d0965a1a84bb57f6c6d", size = 7033406, upload-time = "2026-01-02T09:11:27.474Z" }, + { url = "https://files.pythonhosted.org/packages/5d/cf/881b457eccacac9e5b2ddd97d5071fb6d668307c57cbf4e3b5278e06e536/pillow-12.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:65b80c1ee7e14a87d6a068dd3b0aea268ffcabfe0498d38661b00c5b4b22e74c", size = 2452612, upload-time = "2026-01-02T09:11:29.309Z" }, + { url = "https://files.pythonhosted.org/packages/dd/c7/2530a4aa28248623e9d7f27316b42e27c32ec410f695929696f2e0e4a778/pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:7b5dd7cbae20285cdb597b10eb5a2c13aa9de6cde9bb64a3c1317427b1db1ae1", size = 4062543, upload-time = "2026-01-02T09:11:31.566Z" }, + { url = "https://files.pythonhosted.org/packages/8f/1f/40b8eae823dc1519b87d53c30ed9ef085506b05281d313031755c1705f73/pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:29a4cef9cb672363926f0470afc516dbf7305a14d8c54f7abbb5c199cd8f8179", size = 4138373, upload-time = "2026-01-02T09:11:33.367Z" }, + { url = 
"https://files.pythonhosted.org/packages/d4/77/6fa60634cf06e52139fd0e89e5bbf055e8166c691c42fb162818b7fda31d/pillow-12.1.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:681088909d7e8fa9e31b9799aaa59ba5234c58e5e4f1951b4c4d1082a2e980e0", size = 3601241, upload-time = "2026-01-02T09:11:35.011Z" }, + { url = "https://files.pythonhosted.org/packages/4f/bf/28ab865de622e14b747f0cd7877510848252d950e43002e224fb1c9ababf/pillow-12.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:983976c2ab753166dc66d36af6e8ec15bb511e4a25856e2227e5f7e00a160587", size = 5262410, upload-time = "2026-01-02T09:11:36.682Z" }, + { url = "https://files.pythonhosted.org/packages/1c/34/583420a1b55e715937a85bd48c5c0991598247a1fd2eb5423188e765ea02/pillow-12.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:db44d5c160a90df2d24a24760bbd37607d53da0b34fb546c4c232af7192298ac", size = 4657312, upload-time = "2026-01-02T09:11:38.535Z" }, + { url = "https://files.pythonhosted.org/packages/1d/fd/f5a0896839762885b3376ff04878f86ab2b097c2f9a9cdccf4eda8ba8dc0/pillow-12.1.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6b7a9d1db5dad90e2991645874f708e87d9a3c370c243c2d7684d28f7e133e6b", size = 6232605, upload-time = "2026-01-02T09:11:40.602Z" }, + { url = "https://files.pythonhosted.org/packages/98/aa/938a09d127ac1e70e6ed467bd03834350b33ef646b31edb7452d5de43792/pillow-12.1.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6258f3260986990ba2fa8a874f8b6e808cf5abb51a94015ca3dc3c68aa4f30ea", size = 8041617, upload-time = "2026-01-02T09:11:42.721Z" }, + { url = "https://files.pythonhosted.org/packages/17/e8/538b24cb426ac0186e03f80f78bc8dc7246c667f58b540bdd57c71c9f79d/pillow-12.1.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e115c15e3bc727b1ca3e641a909f77f8ca72a64fff150f666fcc85e57701c26c", size = 6346509, upload-time = "2026-01-02T09:11:44.955Z" }, + { url = 
"https://files.pythonhosted.org/packages/01/9a/632e58ec89a32738cabfd9ec418f0e9898a2b4719afc581f07c04a05e3c9/pillow-12.1.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6741e6f3074a35e47c77b23a4e4f2d90db3ed905cb1c5e6e0d49bff2045632bc", size = 7038117, upload-time = "2026-01-02T09:11:46.736Z" }, + { url = "https://files.pythonhosted.org/packages/c7/a2/d40308cf86eada842ca1f3ffa45d0ca0df7e4ab33c83f81e73f5eaed136d/pillow-12.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:935b9d1aed48fcfb3f838caac506f38e29621b44ccc4f8a64d575cb1b2a88644", size = 6460151, upload-time = "2026-01-02T09:11:48.625Z" }, + { url = "https://files.pythonhosted.org/packages/f1/88/f5b058ad6453a085c5266660a1417bdad590199da1b32fb4efcff9d33b05/pillow-12.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5fee4c04aad8932da9f8f710af2c1a15a83582cfb884152a9caa79d4efcdbf9c", size = 7164534, upload-time = "2026-01-02T09:11:50.445Z" }, + { url = "https://files.pythonhosted.org/packages/19/ce/c17334caea1db789163b5d855a5735e47995b0b5dc8745e9a3605d5f24c0/pillow-12.1.0-cp313-cp313-win32.whl", hash = "sha256:a786bf667724d84aa29b5db1c61b7bfdde380202aaca12c3461afd6b71743171", size = 6332551, upload-time = "2026-01-02T09:11:52.234Z" }, + { url = "https://files.pythonhosted.org/packages/e5/07/74a9d941fa45c90a0d9465098fe1ec85de3e2afbdc15cc4766622d516056/pillow-12.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:461f9dfdafa394c59cd6d818bdfdbab4028b83b02caadaff0ffd433faf4c9a7a", size = 7040087, upload-time = "2026-01-02T09:11:54.822Z" }, + { url = "https://files.pythonhosted.org/packages/88/09/c99950c075a0e9053d8e880595926302575bc742b1b47fe1bbcc8d388d50/pillow-12.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:9212d6b86917a2300669511ed094a9406888362e085f2431a7da985a6b124f45", size = 2452470, upload-time = "2026-01-02T09:11:56.522Z" }, + { url = 
"https://files.pythonhosted.org/packages/b5/ba/970b7d85ba01f348dee4d65412476321d40ee04dcb51cd3735b9dc94eb58/pillow-12.1.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:00162e9ca6d22b7c3ee8e61faa3c3253cd19b6a37f126cad04f2f88b306f557d", size = 5264816, upload-time = "2026-01-02T09:11:58.227Z" }, + { url = "https://files.pythonhosted.org/packages/10/60/650f2fb55fdba7a510d836202aa52f0baac633e50ab1cf18415d332188fb/pillow-12.1.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7d6daa89a00b58c37cb1747ec9fb7ac3bc5ffd5949f5888657dfddde6d1312e0", size = 4660472, upload-time = "2026-01-02T09:12:00.798Z" }, + { url = "https://files.pythonhosted.org/packages/2b/c0/5273a99478956a099d533c4f46cbaa19fd69d606624f4334b85e50987a08/pillow-12.1.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e2479c7f02f9d505682dc47df8c0ea1fc5e264c4d1629a5d63fe3e2334b89554", size = 6268974, upload-time = "2026-01-02T09:12:02.572Z" }, + { url = "https://files.pythonhosted.org/packages/b4/26/0bf714bc2e73d5267887d47931d53c4ceeceea6978148ed2ab2a4e6463c4/pillow-12.1.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f188d580bd870cda1e15183790d1cc2fa78f666e76077d103edf048eed9c356e", size = 8073070, upload-time = "2026-01-02T09:12:04.75Z" }, + { url = "https://files.pythonhosted.org/packages/43/cf/1ea826200de111a9d65724c54f927f3111dc5ae297f294b370a670c17786/pillow-12.1.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0fde7ec5538ab5095cc02df38ee99b0443ff0e1c847a045554cf5f9af1f4aa82", size = 6380176, upload-time = "2026-01-02T09:12:06.626Z" }, + { url = "https://files.pythonhosted.org/packages/03/e0/7938dd2b2013373fd85d96e0f38d62b7a5a262af21ac274250c7ca7847c9/pillow-12.1.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0ed07dca4a8464bada6139ab38f5382f83e5f111698caf3191cb8dbf27d908b4", size = 7067061, upload-time = "2026-01-02T09:12:08.624Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/ad/a2aa97d37272a929a98437a8c0ac37b3cf012f4f8721e1bd5154699b2518/pillow-12.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f45bd71d1fa5e5749587613037b172e0b3b23159d1c00ef2fc920da6f470e6f0", size = 6491824, upload-time = "2026-01-02T09:12:10.488Z" }, + { url = "https://files.pythonhosted.org/packages/a4/44/80e46611b288d51b115826f136fb3465653c28f491068a72d3da49b54cd4/pillow-12.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:277518bf4fe74aa91489e1b20577473b19ee70fb97c374aa50830b279f25841b", size = 7190911, upload-time = "2026-01-02T09:12:12.772Z" }, + { url = "https://files.pythonhosted.org/packages/86/77/eacc62356b4cf81abe99ff9dbc7402750044aed02cfd6a503f7c6fc11f3e/pillow-12.1.0-cp313-cp313t-win32.whl", hash = "sha256:7315f9137087c4e0ee73a761b163fc9aa3b19f5f606a7fc08d83fd3e4379af65", size = 6336445, upload-time = "2026-01-02T09:12:14.775Z" }, + { url = "https://files.pythonhosted.org/packages/e7/3c/57d81d0b74d218706dafccb87a87ea44262c43eef98eb3b164fd000e0491/pillow-12.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:0ddedfaa8b5f0b4ffbc2fa87b556dc59f6bb4ecb14a53b33f9189713ae8053c0", size = 7045354, upload-time = "2026-01-02T09:12:16.599Z" }, + { url = "https://files.pythonhosted.org/packages/ac/82/8b9b97bba2e3576a340f93b044a3a3a09841170ab4c1eb0d5c93469fd32f/pillow-12.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:80941e6d573197a0c28f394753de529bb436b1ca990ed6e765cf42426abc39f8", size = 2454547, upload-time = "2026-01-02T09:12:18.704Z" }, + { url = "https://files.pythonhosted.org/packages/8c/87/bdf971d8bbcf80a348cc3bacfcb239f5882100fe80534b0ce67a784181d8/pillow-12.1.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:5cb7bc1966d031aec37ddb9dcf15c2da5b2e9f7cc3ca7c54473a20a927e1eb91", size = 4062533, upload-time = "2026-01-02T09:12:20.791Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/4f/5eb37a681c68d605eb7034c004875c81f86ec9ef51f5be4a63eadd58859a/pillow-12.1.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:97e9993d5ed946aba26baf9c1e8cf18adbab584b99f452ee72f7ee8acb882796", size = 4138546, upload-time = "2026-01-02T09:12:23.664Z" }, + { url = "https://files.pythonhosted.org/packages/11/6d/19a95acb2edbace40dcd582d077b991646b7083c41b98da4ed7555b59733/pillow-12.1.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:414b9a78e14ffeb98128863314e62c3f24b8a86081066625700b7985b3f529bd", size = 3601163, upload-time = "2026-01-02T09:12:26.338Z" }, + { url = "https://files.pythonhosted.org/packages/fc/36/2b8138e51cb42e4cc39c3297713455548be855a50558c3ac2beebdc251dd/pillow-12.1.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e6bdb408f7c9dd2a5ff2b14a3b0bb6d4deb29fb9961e6eb3ae2031ae9a5cec13", size = 5266086, upload-time = "2026-01-02T09:12:28.782Z" }, + { url = "https://files.pythonhosted.org/packages/53/4b/649056e4d22e1caa90816bf99cef0884aed607ed38075bd75f091a607a38/pillow-12.1.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3413c2ae377550f5487991d444428f1a8ae92784aac79caa8b1e3b89b175f77e", size = 4657344, upload-time = "2026-01-02T09:12:31.117Z" }, + { url = "https://files.pythonhosted.org/packages/6c/6b/c5742cea0f1ade0cd61485dc3d81f05261fc2276f537fbdc00802de56779/pillow-12.1.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e5dcbe95016e88437ecf33544ba5db21ef1b8dd6e1b434a2cb2a3d605299e643", size = 6232114, upload-time = "2026-01-02T09:12:32.936Z" }, + { url = "https://files.pythonhosted.org/packages/bf/8f/9f521268ce22d63991601aafd3d48d5ff7280a246a1ef62d626d67b44064/pillow-12.1.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d0a7735df32ccbcc98b98a1ac785cc4b19b580be1bdf0aeb5c03223220ea09d5", size = 8042708, upload-time = "2026-01-02T09:12:34.78Z" }, + { url = 
"https://files.pythonhosted.org/packages/1a/eb/257f38542893f021502a1bbe0c2e883c90b5cff26cc33b1584a841a06d30/pillow-12.1.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0c27407a2d1b96774cbc4a7594129cc027339fd800cd081e44497722ea1179de", size = 6347762, upload-time = "2026-01-02T09:12:36.748Z" }, + { url = "https://files.pythonhosted.org/packages/c4/5a/8ba375025701c09b309e8d5163c5a4ce0102fa86bbf8800eb0d7ac87bc51/pillow-12.1.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15c794d74303828eaa957ff8070846d0efe8c630901a1c753fdc63850e19ecd9", size = 7039265, upload-time = "2026-01-02T09:12:39.082Z" }, + { url = "https://files.pythonhosted.org/packages/cf/dc/cf5e4cdb3db533f539e88a7bbf9f190c64ab8a08a9bc7a4ccf55067872e4/pillow-12.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c990547452ee2800d8506c4150280757f88532f3de2a58e3022e9b179107862a", size = 6462341, upload-time = "2026-01-02T09:12:40.946Z" }, + { url = "https://files.pythonhosted.org/packages/d0/47/0291a25ac9550677e22eda48510cfc4fa4b2ef0396448b7fbdc0a6946309/pillow-12.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b63e13dd27da389ed9475b3d28510f0f954bca0041e8e551b2a4eb1eab56a39a", size = 7165395, upload-time = "2026-01-02T09:12:42.706Z" }, + { url = "https://files.pythonhosted.org/packages/4f/4c/e005a59393ec4d9416be06e6b45820403bb946a778e39ecec62f5b2b991e/pillow-12.1.0-cp314-cp314-win32.whl", hash = "sha256:1a949604f73eb07a8adab38c4fe50791f9919344398bdc8ac6b307f755fc7030", size = 6431413, upload-time = "2026-01-02T09:12:44.944Z" }, + { url = "https://files.pythonhosted.org/packages/1c/af/f23697f587ac5f9095d67e31b81c95c0249cd461a9798a061ed6709b09b5/pillow-12.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:4f9f6a650743f0ddee5593ac9e954ba1bdbc5e150bc066586d4f26127853ab94", size = 7176779, upload-time = "2026-01-02T09:12:46.727Z" }, + { url = 
"https://files.pythonhosted.org/packages/b3/36/6a51abf8599232f3e9afbd16d52829376a68909fe14efe29084445db4b73/pillow-12.1.0-cp314-cp314-win_arm64.whl", hash = "sha256:808b99604f7873c800c4840f55ff389936ef1948e4e87645eaf3fccbc8477ac4", size = 2543105, upload-time = "2026-01-02T09:12:49.243Z" }, + { url = "https://files.pythonhosted.org/packages/82/54/2e1dd20c8749ff225080d6ba465a0cab4387f5db0d1c5fb1439e2d99923f/pillow-12.1.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:bc11908616c8a283cf7d664f77411a5ed2a02009b0097ff8abbba5e79128ccf2", size = 5268571, upload-time = "2026-01-02T09:12:51.11Z" }, + { url = "https://files.pythonhosted.org/packages/57/61/571163a5ef86ec0cf30d265ac2a70ae6fc9e28413d1dc94fa37fae6bda89/pillow-12.1.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:896866d2d436563fa2a43a9d72f417874f16b5545955c54a64941e87c1376c61", size = 4660426, upload-time = "2026-01-02T09:12:52.865Z" }, + { url = "https://files.pythonhosted.org/packages/5e/e1/53ee5163f794aef1bf84243f755ee6897a92c708505350dd1923f4afec48/pillow-12.1.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8e178e3e99d3c0ea8fc64b88447f7cac8ccf058af422a6cedc690d0eadd98c51", size = 6269908, upload-time = "2026-01-02T09:12:54.884Z" }, + { url = "https://files.pythonhosted.org/packages/bc/0b/b4b4106ff0ee1afa1dc599fde6ab230417f800279745124f6c50bcffed8e/pillow-12.1.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:079af2fb0c599c2ec144ba2c02766d1b55498e373b3ac64687e43849fbbef5bc", size = 8074733, upload-time = "2026-01-02T09:12:56.802Z" }, + { url = "https://files.pythonhosted.org/packages/19/9f/80b411cbac4a732439e629a26ad3ef11907a8c7fc5377b7602f04f6fe4e7/pillow-12.1.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bdec5e43377761c5dbca620efb69a77f6855c5a379e32ac5b158f54c84212b14", size = 6381431, upload-time = "2026-01-02T09:12:58.823Z" }, + { url = 
"https://files.pythonhosted.org/packages/8f/b7/d65c45db463b66ecb6abc17c6ba6917a911202a07662247e1355ce1789e7/pillow-12.1.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:565c986f4b45c020f5421a4cea13ef294dde9509a8577f29b2fc5edc7587fff8", size = 7068529, upload-time = "2026-01-02T09:13:00.885Z" }, + { url = "https://files.pythonhosted.org/packages/50/96/dfd4cd726b4a45ae6e3c669fc9e49deb2241312605d33aba50499e9d9bd1/pillow-12.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:43aca0a55ce1eefc0aefa6253661cb54571857b1a7b2964bd8a1e3ef4b729924", size = 6492981, upload-time = "2026-01-02T09:13:03.314Z" }, + { url = "https://files.pythonhosted.org/packages/4d/1c/b5dc52cf713ae46033359c5ca920444f18a6359ce1020dd3e9c553ea5bc6/pillow-12.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0deedf2ea233722476b3a81e8cdfbad786f7adbed5d848469fa59fe52396e4ef", size = 7191878, upload-time = "2026-01-02T09:13:05.276Z" }, + { url = "https://files.pythonhosted.org/packages/53/26/c4188248bd5edaf543864fe4834aebe9c9cb4968b6f573ce014cc42d0720/pillow-12.1.0-cp314-cp314t-win32.whl", hash = "sha256:b17fbdbe01c196e7e159aacb889e091f28e61020a8abeac07b68079b6e626988", size = 6438703, upload-time = "2026-01-02T09:13:07.491Z" }, + { url = "https://files.pythonhosted.org/packages/b8/0e/69ed296de8ea05cb03ee139cee600f424ca166e632567b2d66727f08c7ed/pillow-12.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27b9baecb428899db6c0de572d6d305cfaf38ca1596b5c0542a5182e3e74e8c6", size = 7182927, upload-time = "2026-01-02T09:13:09.841Z" }, + { url = "https://files.pythonhosted.org/packages/fc/f5/68334c015eed9b5cff77814258717dec591ded209ab5b6fb70e2ae873d1d/pillow-12.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f61333d817698bdcdd0f9d7793e365ac3d2a21c1f1eb02b32ad6aefb8d8ea831", size = 2545104, upload-time = "2026-01-02T09:13:12.068Z" }, +] + [[package]] name = "pluggy" version = "1.6.0" @@ -310,6 +445,52 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = "2025-09-09T10:57:00.695Z" }, ] +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" }, + { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" }, + { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = 
"2025-09-25T21:32:15.21Z" }, + { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" }, + { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" }, + { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" }, + { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" }, + { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" }, + { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, + { url = 
"https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" }, + { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" }, + { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" }, + { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" }, + { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" }, + { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" }, + { url = 
"https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" }, + { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" }, + { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" }, + { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" }, + { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, + { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, + { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, + { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, + { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, + { url = 
"https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, + { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, + { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, + { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" }, + { url = 
"https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, + { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, + { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, + { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, +] + [[package]] name = "rich" version = "14.2.0" @@ -355,3 +536,20 @@ sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac8 wheels = [ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ] + +[[package]] +name = "zarr" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "donfig" }, + { name = "google-crc32c" }, + { name = "numcodecs" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "typing-extensions" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/fc/76/7fa87f57c112c7b9c82f0a730f8b6f333e792574812872e2cd45ab604199/zarr-3.1.5.tar.gz", hash = "sha256:fbe0c79675a40c996de7ca08e80a1c0a20537bd4a9f43418b6d101395c0bba2b", size = 366825, upload-time = "2025-11-21T14:06:01.492Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/44/15/bb13b4913ef95ad5448490821eee4671d0e67673342e4d4070854e5fe081/zarr-3.1.5-py3-none-any.whl", hash = "sha256:29cd905afb6235b94c09decda4258c888fcb79bb6c862ef7c0b8fe009b5c8563", size = 284067, upload-time = "2025-11-21T14:05:59.235Z" }, +] From e3acef8c51da0358bd7ecc895afebb450a962d81 Mon Sep 17 00:00:00 2001 From: Aljes Date: Tue, 27 Jan 2026 13:17:05 +0000 Subject: [PATCH 62/62] Disable caching in UV setup for consistent test environment --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index cd6a3bc..dde3803 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -50,7 +50,7 @@ jobs: - name: Set up uv uses: astral-sh/setup-uv@v3 with: - enable-cache: true + enable-cache: false - name: Install dependencies (frozen) run: uv sync --extra dev --frozen