diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 7548bf7..dde3803 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -11,7 +11,7 @@ concurrency:
cancel-in-progress: true
jobs:
- test:
+ tests:
runs-on: ubuntu-latest
timeout-minutes: 20
@@ -23,6 +23,21 @@ jobs:
fail-fast: false
matrix:
python-version: ["3.12"] # add "3.13" if you want
+ module:
+ - name: cli
+ tests: tests/test_cli.py
+ - name: export
+ tests: tests/test_export.py
+ - name: import
+ tests: tests/test_import.py
+ - name: info-read
+ tests: tests/test_info_read.py
+ - name: subset
+ tests: tests/test_subset.py
+ - name: zarr
+ tests: tests/test_zarr.py
+
+ name: tests (${{ matrix.module.name }})
steps:
- uses: actions/checkout@v4
@@ -35,36 +50,36 @@ jobs:
- name: Set up uv
uses: astral-sh/setup-uv@v3
with:
- enable-cache: true
+ enable-cache: false
- name: Install dependencies (frozen)
run: uv sync --extra dev --frozen
- name: Run tests with coverage
run: |
- uv run pytest -v \
+ uv run pytest -v -W default ${{ matrix.module.tests }} \
--cov=h5ad \
--cov-report=term-missing \
--cov-report=xml \
--cov-report=html \
- --junitxml=pytest-results.xml
+ --junitxml=pytest-results-${{ matrix.module.name }}.xml
- name: Publish test results summary
uses: EnricoMi/publish-unit-test-result-action@v2
if: always()
with:
- files: pytest-results.xml
- check_name: Test Results
+ files: pytest-results-${{ matrix.module.name }}.xml
+ check_name: Test Results (${{ matrix.module.name }})
- name: Upload coverage artifacts
uses: actions/upload-artifact@v4
if: always()
with:
- name: coverage
+ name: coverage-${{ matrix.module.name }}
path: |
coverage.xml
htmlcov/
- pytest-results.xml
+ pytest-results-${{ matrix.module.name }}.xml
retention-days: 30
- name: Upload coverage to Codecov
diff --git a/README.md b/README.md
index eecee0f..ff7a474 100644
--- a/README.md
+++ b/README.md
@@ -1,18 +1,20 @@
# h5ad CLI
-A command-line tool for exploring huge `.h5ad` (AnnData) files without loading them fully into memory. Streams data directly from disk for efficient inspection of structure, metadata, and matrices.
+A command-line tool for exploring huge AnnData stores (`.h5ad` and `.zarr`) without loading them fully into memory. Streams data directly from disk for efficient inspection of structure, metadata, and matrices.
## Features
-- **`info`** – Show file structure and dimensions (`n_obs × n_var`)
-- **`table`** – Export obs/var metadata to CSV with chunked streaming
-- **`subset`** – Filter h5ad files by cell/gene names (supports dense and sparse CSR/CSC matrices)
-- Memory-efficient chunked processing for large files
-- Rich terminal output with colors and progress bars
+- Streaming access to very large `.h5ad` and `.zarr` stores
+- Auto-detects `.h5ad` files vs `.zarr` directories
+- Chunked processing for dense and sparse matrices (CSR/CSC)
+- Rich terminal output with progress indicators
## Installation
+Using [uv](https://docs.astral.sh/uv/) (recommended):
```bash
+git clone https://github.com/cellgeni/h5ad-cli.git
+cd h5ad-cli
uv sync
```
@@ -21,45 +23,27 @@ For development and testing:
uv sync --extra dev
```
-See [docs/TESTING.md](docs/TESTING.md) for testing documentation.
-
-## Usage
-Invoke any subcommand via `uv run h5ad ...`:
-
-```bash
-uv run h5ad --help
-```
-
-#### Examples
-
-**Inspect overall structure and axis sizes:**
+Alternative with pip:
```bash
-uv run h5ad info data.h5ad
+git clone https://github.com/cellgeni/h5ad-cli.git
+cd h5ad-cli
+pip install .
```
-**Export full obs metadata to CSV:**
+For development and testing with pip:
```bash
-uv run h5ad table data.h5ad --axis obs --out obs_metadata.csv
+pip install -e ".[dev]"
```
-**Export selected obs columns to stdout:**
-```bash
-uv run h5ad table data.h5ad --axis obs --cols cell_type,donor
-```
+See [docs/TESTING.md](docs/TESTING.md) for testing documentation.
-**Export var metadata with custom chunk size:**
-```bash
-uv run h5ad table data.h5ad --axis var --chunk-rows 5000 --out var_metadata.csv
-```
+## Commands (Overview)
-**Subset by cell names:**
-```bash
-uv run h5ad subset input.h5ad output.h5ad --obs cells.txt
-```
+Run help at any level (e.g. `uv run h5ad --help`, `uv run h5ad export --help`).
-**Subset by both cells and genes:**
-```bash
-uv run h5ad subset input.h5ad output.h5ad --obs cells.txt --var genes.txt
-```
+- `info` – read-only inspection of store layout, shapes, and type hints; supports drilling into paths like `obsm/X_pca` or `uns`.
+- `subset` – stream and write a filtered copy based on obs/var name lists, preserving dense and sparse matrix encodings.
+- `export` – extract data from a store; subcommands: `dataframe` (obs/var to CSV), `array` (dense to `.npy`), `sparse` (CSR/CSC to `.mtx`), `dict` (JSON), `image` (PNG).
+- `import` – write new data into a store; subcommands: `dataframe` (CSV → obs/var), `array` (`.npy`), `sparse` (`.mtx`), `dict` (JSON).
-All commands stream from disk, so even multi-GB `.h5ad` files remain responsive.
+See [docs/GET_STARTED.md](docs/GET_STARTED.md) for a short tutorial.
\ No newline at end of file
diff --git a/docs/ELEMENTS_h5ad.md b/docs/ELEMENTS_h5ad.md
new file mode 100644
index 0000000..acb491d
--- /dev/null
+++ b/docs/ELEMENTS_h5ad.md
@@ -0,0 +1,274 @@
+# AnnData on-disk element specifications — HDF5 (`.h5ad`)
+
+This document describes how *elements* are encoded inside an AnnData **HDF5** container (`.h5ad`).
+It is intended to be GitHub-renderable Markdown (no Sphinx/MyST directives).
+
+> **Scope**
+>
+> - “Modern” encoding metadata (`encoding-type`, `encoding-version`) is the convention used by **anndata ≥ 0.8**.
+> - “Legacy” conventions (notably DataFrame categorical handling) are described for **anndata 0.7.x** files, which are still commonly encountered.
+
+## Table of contents
+
+- [Encoding metadata](#encoding-metadata)
+- [AnnData group](#anndata-group)
+- [Dense arrays](#dense-arrays)
+- [Sparse arrays (CSR/CSC)](#sparse-arrays-csrcsc)
+- [DataFrames](#dataframes)
+ - [DataFrame v0.2.0](#dataframe-v020)
+ - [DataFrame v0.1.0 (legacy: anndata 0.7.x)](#dataframe-v010-legacy-anndata-07x)
+ - [Legacy categorical columns (Series-level)](#legacy-categorical-columns-series-level)
+- [Mappings / dict](#mappings--dict)
+- [Scalars](#scalars)
+- [Categorical arrays](#categorical-arrays)
+- [String arrays](#string-arrays)
+- [Nullable arrays](#nullable-arrays)
+ - [Missing value semantics](#missing-value-semantics)
+- [Awkward arrays (experimental)](#awkward-arrays-experimental)
+- [Sources](#sources)
+
+## Encoding metadata
+
+**Modern convention (anndata ≥ 0.8):**
+
+- Any element (HDF5 *group* or *dataset*) that participates in the element-dispatch system:
+ - **MUST** have attribute `encoding-type` (string)
+ - **MUST** have attribute `encoding-version` (string, parseable as a version)
+
+Readers should dispatch first on `encoding-type`, then on `encoding-version`.
+
+**Legacy convention (anndata ≤ 0.7.x):**
+
+- Many objects do *not* have `encoding-type`/`encoding-version`.
+- Some elements (e.g. CSR/CSC sparse matrices, legacy DataFrames) *do* use `encoding-type`/`encoding-version`.
+- Readers typically infer element kinds from:
+ - known AnnData keys (`X`, `obs`, `var`, …),
+ - group structure, and/or
+ - legacy attributes (e.g. the `categories` attribute on categorical columns).
+
+## AnnData group
+
+### `encoding-type: anndata`, `encoding-version: 0.1.0`
+
+An `AnnData` object **MUST** be stored as an HDF5 **group** with attributes:
+
+- `encoding-type: "anndata"`
+- `encoding-version: "0.1.0"`
+
+Required members:
+
+- `obs` — a [DataFrame](#dataframes)
+- `var` — a [DataFrame](#dataframes)
+
+Optional members (if present, they must satisfy these constraints):
+
+- `X` — dense array or sparse array; shape `(n_obs, n_var)`
+- `layers` — mapping; values dense or sparse arrays; each shape `(n_obs, n_var)`
+- `obsm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_obs`
+- `varm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_var`
+- `obsp` — mapping; values dense or sparse arrays; first two dims `n_obs`
+- `varp` — mapping; values dense or sparse arrays; first two dims `n_var`
+- `uns` — mapping/dict-like container (recursive)
+
+## Dense arrays
+
+### `encoding-type: array`, `encoding-version: 0.2.0`
+
+- A dense array **MUST** be an HDF5 **dataset**.
+- The dataset **MUST** have attributes:
+ - `encoding-type: "array"`
+ - `encoding-version: "0.2.0"`
+
+> **Legacy note**
+>
+> In anndata 0.7.x, dense arrays were typically stored as plain datasets *without* `encoding-type`/`encoding-version`.
+
+## Sparse arrays (CSR/CSC)
+
+### `encoding-type: csr_matrix|csc_matrix`, `encoding-version: 0.1.0`
+
+A sparse matrix **MUST** be stored as an HDF5 **group**.
+
+- Group attributes:
+ - `encoding-type: "csr_matrix"` **or** `"csc_matrix"`
+ - `encoding-version: "0.1.0"`
+ - `shape`: integer array of length 2 (matrix shape)
+- Group members (datasets):
+ - `data`
+ - `indices`
+ - `indptr`
+
+The exact CSR/CSC semantics follow SciPy’s conventions.
+
+## DataFrames
+
+DataFrames are stored column-wise: each column is stored as a dataset (or group, if the column itself is an encoded element).
+
+
+### DataFrame v0.2.0
+
+#### `encoding-type: dataframe`, `encoding-version: 0.2.0`
+
+A dataframe **MUST** be stored as an HDF5 **group**.
+
+- Group attributes:
+ - `_index`: string — the key of the dataset to be used as the row index
+ - `column-order`: array of strings — original column order
+ - `encoding-type: "dataframe"`
+ - `encoding-version: "0.2.0"`
+- Group members:
+ - the index dataset (named by `_index`)
+ - one member per column
+- All column entries **MUST** have the same length in their first dimension.
+- Columns **SHOULD** share chunking along the first dimension.
+
+Columns are independently encoded:
+- simple numeric/bool columns are commonly `encoding-type: array`
+- categorical columns are commonly `encoding-type: categorical`
+
+
+### DataFrame v0.1.0 (legacy: anndata 0.7.x)
+
+#### `encoding-type: dataframe`, `encoding-version: 0.1.0`
+
+A legacy dataframe is stored as an HDF5 **group** where:
+
+- Group attributes include:
+ - `_index`
+ - `column-order`
+ - `encoding-type: "dataframe"`
+ - `encoding-version: "0.1.0"`
+- Each column is a dataset.
+- Categorical columns are stored as **integer code datasets**, and their category labels are stored in a reserved subgroup named `__categories`.
+
+**Reserved subgroup:**
+
+- `__categories/<col>` stores the array of category labels for column `<col>`.
+
+
+### Legacy categorical columns (Series-level)
+
+In v0.1.0 DataFrames, a categorical column dataset (e.g. `obs/cell_type`) can be identified by the presence of an attribute:
+
+- `categories`: an **HDF5 object reference** pointing to the corresponding `__categories/` dataset.
+
+## Mappings / dict
+
+### `encoding-type: dict`, `encoding-version: 0.1.0`
+
+- A mapping **MUST** be stored as an HDF5 **group**.
+- Group attributes:
+ - `encoding-type: "dict"`
+ - `encoding-version: "0.1.0"`
+- Each entry in the group is another element (recursively).
+
+> **Legacy note**
+>
+> In anndata 0.7.x, groups used as mappings often had **no special attributes**.
+
+## Scalars
+
+### `encoding-version: 0.2.0`
+
+Scalars are stored as **0-dimensional datasets**.
+
+- Numeric scalars:
+ - `encoding-type: "numeric-scalar"`
+ - `encoding-version: "0.2.0"`
+ - value is numeric (including boolean, ints, floats, complex)
+- String scalars:
+ - `encoding-type: "string"`
+ - `encoding-version: "0.2.0"`
+ - **HDF5 requirement:** variable-length UTF-8 string dtype
+
+> **Legacy note**
+>
+> In anndata 0.7.x, scalar strings were commonly stored as `|O` datasets without `encoding-type`/`encoding-version`.
+
+## Categorical arrays
+
+### `encoding-type: categorical`, `encoding-version: 0.2.0`
+
+Categorical arrays are stored as an HDF5 **group** with members:
+
+- `codes`: integer dataset
+ - values are zero-based indices into `categories`
+ - signed integer arrays **MAY** use `-1` to denote missing values
+- `categories`: array of labels
+
+Group attributes:
+
+- `encoding-type: "categorical"`
+- `encoding-version: "0.2.0"`
+- `ordered`: boolean (whether the categories are ordered)
+
+## String arrays
+
+### `encoding-type: string-array`, `encoding-version: 0.2.0`
+
+- String arrays **MUST** be stored as HDF5 datasets.
+- Dataset attributes:
+ - `encoding-type: "string-array"`
+ - `encoding-version: "0.2.0"`
+- **HDF5 requirement:** variable-length UTF-8 string dtype
+
+## Nullable arrays
+
+These encodings support Pandas nullable integer/boolean/string arrays by storing a `values` array plus a boolean `mask` array.
+
+### `encoding-type: nullable-integer`, `encoding-version: 0.1.0`
+
+- Stored as an HDF5 group with datasets:
+ - `values` (integer)
+ - `mask` (boolean)
+
+### `encoding-type: nullable-boolean`, `encoding-version: 0.1.0`
+
+- Stored as an HDF5 group with datasets:
+ - `values` (boolean)
+ - `mask` (boolean)
+- `values` and `mask` **MUST** have the same shape.
+
+### `encoding-type: nullable-string-array`, `encoding-version: 0.1.0`
+
+- Stored as an HDF5 group with datasets:
+ - `values` (string array)
+ - `mask` (boolean)
+- Group attributes:
+ - `encoding-type: "nullable-string-array"`
+ - `encoding-version: "0.1.0"`
+ - optional `na-value`: `"NA"` or `"NaN"` (default `"NA"`)
+
+
+#### Missing value semantics
+
+For elements supporting a `na-value` attribute:
+
+- `"NA"`: comparisons propagate missingness (e.g. `"x" == NA` → `NA`)
+- `"NaN"`: comparisons yield boolean results (e.g. `"x" == NaN` → `false`)
+
+Readers should preserve semantics when the runtime model supports it.
+
+## Awkward arrays (experimental)
+
+### `encoding-type: awkward-array`, `encoding-version: 0.1.0`
+
+Ragged arrays are stored by decomposing an Awkward Array into constituent buffers (via `ak.to_buffers`), then storing those buffers as datasets within a group.
+
+Group attributes:
+
+- `encoding-type: "awkward-array"`
+- `encoding-version: "0.1.0"`
+- `form`: string — serialized Awkward “form”
+- `length`: integer — logical length
+
+Group members: datasets for the buffers (often named like `nodeX-*`).
+
+> **Experimental**
+>
+> This encoding is considered experimental in the anndata 0.9.x series and later.
+
+## Sources
+
+- AnnData “on-disk format” prose docs (modern, ≥0.8): https://anndata.readthedocs.io/en/stable/fileformat-prose.html
+- AnnData 0.7.8 “on-disk format” prose docs (legacy): https://dokk.org/documentation/anndata/0.7.8/fileformat-prose/
diff --git a/docs/ELEMENTS_zarr.md b/docs/ELEMENTS_zarr.md
new file mode 100644
index 0000000..ce309e6
--- /dev/null
+++ b/docs/ELEMENTS_zarr.md
@@ -0,0 +1,276 @@
+# AnnData on-disk element specifications — Zarr (`.zarr`)
+
+This document describes how *elements* are encoded inside an AnnData **Zarr** container (`.zarr`).
+It is intended to be GitHub-renderable Markdown (no Sphinx/MyST directives).
+
+> **Scope**
+>
+> - “Modern” encoding metadata (`encoding-type`, `encoding-version`) is the convention used by **anndata ≥ 0.8**.
+> - “Legacy” conventions (notably DataFrame categorical handling) are described for **anndata 0.7.x** files, which are still commonly encountered.
+
+## Table of contents
+
+- [Encoding metadata](#encoding-metadata)
+- [AnnData group](#anndata-group)
+- [Dense arrays](#dense-arrays)
+- [Sparse arrays (CSR/CSC)](#sparse-arrays-csrcsc)
+- [DataFrames](#dataframes)
+ - [DataFrame v0.2.0](#dataframe-v020)
+ - [DataFrame v0.1.0 (legacy: anndata 0.7.x)](#dataframe-v010-legacy-anndata-07x)
+ - [Legacy categorical columns (Series-level)](#legacy-categorical-columns-series-level)
+- [Mappings / dict](#mappings--dict)
+- [Scalars](#scalars)
+- [Categorical arrays](#categorical-arrays)
+- [String arrays](#string-arrays)
+- [Nullable arrays](#nullable-arrays)
+ - [Missing value semantics](#missing-value-semantics)
+- [Awkward arrays (experimental)](#awkward-arrays-experimental)
+- [Sources](#sources)
+
+## Encoding metadata
+
+**Modern convention (anndata ≥ 0.8):**
+
+- Any element (Zarr *group* or *array*) that participates in the element-dispatch system:
+ - **MUST** have attribute `encoding-type` (string)
+ - **MUST** have attribute `encoding-version` (string, parseable as a version)
+
+Readers should dispatch first on `encoding-type`, then on `encoding-version`.
+
+**Legacy convention (anndata ≤ 0.7.x):**
+
+- Many objects do *not* have `encoding-type`/`encoding-version`.
+- Some elements (e.g. CSR/CSC sparse matrices, legacy DataFrames) *do* use `encoding-type`/`encoding-version`.
+- Readers typically infer element kinds from:
+ - known AnnData keys (`X`, `obs`, `var`, …),
+ - group structure, and/or
+ - legacy attributes (e.g. the `categories` attribute on categorical columns).
+
+## AnnData group
+
+### `encoding-type: anndata`, `encoding-version: 0.1.0`
+
+An `AnnData` object **MUST** be stored as a Zarr **group** with attributes:
+
+- `encoding-type: "anndata"`
+- `encoding-version: "0.1.0"`
+
+Required members:
+
+- `obs` — a [DataFrame](#dataframes)
+- `var` — a [DataFrame](#dataframes)
+
+Optional members (if present, they must satisfy these constraints):
+
+- `X` — dense array or sparse array; shape `(n_obs, n_var)`
+- `layers` — mapping; values dense or sparse arrays; each shape `(n_obs, n_var)`
+- `obsm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_obs`
+- `varm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_var`
+- `obsp` — mapping; values dense or sparse arrays; first two dims `n_obs`
+- `varp` — mapping; values dense or sparse arrays; first two dims `n_var`
+- `uns` — mapping/dict-like container (recursive)
+
+## Dense arrays
+
+### `encoding-type: array`, `encoding-version: 0.2.0`
+
+- A dense array **MUST** be stored as a Zarr **array**.
+- The array **MUST** have attributes:
+ - `encoding-type: "array"`
+ - `encoding-version: "0.2.0"`
+
+> **Legacy note**
+>
+> In anndata 0.7.x, dense arrays were typically stored as plain Zarr arrays *without* `encoding-type`/`encoding-version`.
+
+## Sparse arrays (CSR/CSC)
+
+### `encoding-type: csr_matrix|csc_matrix`, `encoding-version: 0.1.0`
+
+A sparse matrix **MUST** be stored as a Zarr **group**.
+
+- Group attributes:
+ - `encoding-type: "csr_matrix"` **or** `"csc_matrix"`
+ - `encoding-version: "0.1.0"`
+ - `shape`: integer array of length 2 (matrix shape)
+- Group members (arrays):
+ - `data`
+ - `indices`
+ - `indptr`
+
+The exact CSR/CSC semantics follow SciPy’s conventions.
+
+## DataFrames
+
+DataFrames are stored column-wise: each column is stored as a Zarr array (or group, if the column itself is an encoded element).
+
+
+### DataFrame v0.2.0
+
+#### `encoding-type: dataframe`, `encoding-version: 0.2.0`
+
+A dataframe **MUST** be stored as a Zarr **group**.
+
+- Group attributes:
+ - `_index`: string — the key of the array to be used as the row index
+ - `column-order`: array of strings — original column order
+ - `encoding-type: "dataframe"`
+ - `encoding-version: "0.2.0"`
+- Group members:
+ - the index array (named by `_index`)
+ - one member per column
+- All column entries **MUST** have the same length in their first dimension.
+- Columns **SHOULD** share chunking along the first dimension.
+
+Columns are independently encoded:
+- simple numeric/bool columns are commonly `encoding-type: array`
+- categorical columns are commonly `encoding-type: categorical`
+
+
+### DataFrame v0.1.0 (legacy: anndata 0.7.x)
+
+#### `encoding-type: dataframe`, `encoding-version: 0.1.0`
+
+A legacy dataframe is stored as a Zarr **group** where:
+
+- Group attributes include:
+ - `_index`
+ - `column-order`
+ - `encoding-type: "dataframe"`
+ - `encoding-version: "0.1.0"`
+- Each column is an array.
+- Categorical columns are stored as **integer code arrays**, and their category labels are stored in a reserved subgroup named `__categories`.
+
+**Reserved subgroup:**
+
+- `__categories/<col>` stores the array of category labels for column `<col>`.
+
+
+### Legacy categorical columns (Series-level)
+
+In v0.1.0 DataFrames, a categorical column array (e.g. `obs/cell_type`) can be identified by the presence of an attribute:
+
+- `categories`: an **absolute path string** to the corresponding `__categories/` array.
+
+(This differs from HDF5, which can store an object reference.)
+
+## Mappings / dict
+
+### `encoding-type: dict`, `encoding-version: 0.1.0`
+
+- A mapping **MUST** be stored as a Zarr **group**.
+- Group attributes:
+ - `encoding-type: "dict"`
+ - `encoding-version: "0.1.0"`
+- Each entry in the group is another element (recursively).
+
+> **Legacy note**
+>
+> In anndata 0.7.x, groups used as mappings often had **no special attributes**.
+
+## Scalars
+
+### `encoding-version: 0.2.0`
+
+Scalars are stored as **0-dimensional Zarr arrays**.
+
+- Numeric scalars:
+ - `encoding-type: "numeric-scalar"`
+ - `encoding-version: "0.2.0"`
+ - value is numeric (including boolean, ints, floats, complex)
+- String scalars:
+ - `encoding-type: "string"`
+ - `encoding-version: "0.2.0"`
+  - **Zarr requirement:** fixed-length unicode dtype (e.g. `<U10`)
+
+> **Legacy note**
+>
+> In anndata 0.7.x, scalar strings were commonly stored without `encoding-type`/`encoding-version`.
+
+## Categorical arrays
+
+### `encoding-type: categorical`, `encoding-version: 0.2.0`
+
+Categorical arrays are stored as a Zarr **group** with members:
+
+- `codes`: integer array
+ - values are zero-based indices into `categories`
+ - signed integer arrays **MAY** use `-1` to denote missing values
+- `categories`: array of labels
+
+Group attributes:
+
+- `encoding-type: "categorical"`
+- `encoding-version: "0.2.0"`
+- `ordered`: boolean (whether the categories are ordered)
+
+## String arrays
+
+### `encoding-type: string-array`, `encoding-version: 0.2.0`
+
+- String arrays **MUST** be stored as Zarr arrays.
+- Array attributes:
+ - `encoding-type: "string-array"`
+ - `encoding-version: "0.2.0"`
+- **Zarr requirement:** the array **MUST** be stored using `numcodecs.VLenUTF8` for variable-length UTF-8 strings.
+
+## Nullable arrays
+
+These encodings support Pandas nullable integer/boolean/string arrays by storing a `values` array plus a boolean `mask` array.
+
+### `encoding-type: nullable-integer`, `encoding-version: 0.1.0`
+
+- Stored as a Zarr group with arrays:
+ - `values` (integer)
+ - `mask` (boolean)
+
+### `encoding-type: nullable-boolean`, `encoding-version: 0.1.0`
+
+- Stored as a Zarr group with arrays:
+ - `values` (boolean)
+ - `mask` (boolean)
+- `values` and `mask` **MUST** have the same shape.
+
+### `encoding-type: nullable-string-array`, `encoding-version: 0.1.0`
+
+- Stored as a Zarr group with arrays:
+ - `values` (string array)
+ - `mask` (boolean)
+- Group attributes:
+ - `encoding-type: "nullable-string-array"`
+ - `encoding-version: "0.1.0"`
+ - optional `na-value`: `"NA"` or `"NaN"` (default `"NA"`)
+
+
+#### Missing value semantics
+
+For elements supporting a `na-value` attribute:
+
+- `"NA"`: comparisons propagate missingness (e.g. `"x" == NA` → `NA`)
+- `"NaN"`: comparisons yield boolean results (e.g. `"x" == NaN` → `false`)
+
+Readers should preserve semantics when the runtime model supports it.
+
+## Awkward arrays (experimental)
+
+### `encoding-type: awkward-array`, `encoding-version: 0.1.0`
+
+Ragged arrays are stored by decomposing an Awkward Array into constituent buffers (via `ak.to_buffers`), then storing those buffers as Zarr arrays within a group.
+
+Group attributes:
+
+- `encoding-type: "awkward-array"`
+- `encoding-version: "0.1.0"`
+- `form`: string — serialized Awkward “form”
+- `length`: integer — logical length
+
+Group members: arrays for the buffers (often named like `nodeX-*`).
+
+> **Experimental**
+>
+> This encoding is considered experimental in the anndata 0.9.x series and later.
+
+## Sources
+
+- AnnData “on-disk format” prose docs (modern, ≥0.8): https://anndata.readthedocs.io/en/stable/fileformat-prose.html
+- AnnData 0.7.8 “on-disk format” prose docs (legacy): https://dokk.org/documentation/anndata/0.7.8/fileformat-prose/
diff --git a/docs/GET_STARTED.md b/docs/GET_STARTED.md
new file mode 100644
index 0000000..2ca023c
--- /dev/null
+++ b/docs/GET_STARTED.md
@@ -0,0 +1,189 @@
+# Get Started
+
+This short walkthrough shows the basic workflow: inspect a store, export metadata, and write a subset.
+
+## 1 Install
+
+Using uv (recommended):
+```bash
+git clone https://github.com/cellgeni/h5ad-cli.git
+cd h5ad-cli
+uv sync
+```
+
+With pip:
+```bash
+git clone https://github.com/cellgeni/h5ad-cli.git
+cd h5ad-cli
+pip install .
+```
+
+Additionally, it might be useful to install `csvkit` for inspecting exported CSV files:
+```bash
+# with uv
+uv pip install csvkit
+
+# with pip
+pip install csvkit
+```
+
+## 2 Inspect a file with the `info` command
+
+Let's load an example `.h5ad` file:
+```bash
+wget -O visium.h5ad https://exampledata.scverse.org/squidpy/figshare/visium_hne_adata.h5ad
+```
+
+Now run `info` to see the file structure:
+```bash
+uv run h5ad info visium.h5ad
+```
+```
+An object with n_obs × n_var: 2688 × 18078
+ obs: array_col, array_row, cluster, in_tissue, leiden, log1p_n_genes_by_counts, log1p_total_counts, log1p_total_counts_mt, n_counts, n_genes_by_counts, pct_counts_in_top_100_genes, pct_counts_in_top_200_genes, pct_counts_in_top_500_genes,
+pct_counts_in_top_50_genes, pct_counts_mt, total_counts, total_counts_mt
+ var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts, pct_dropout_by_counts, total_counts, variances, variances_norm
+ obsm: X_pca, X_umap, spatial
+ varm: PCs
+ obsp: connectivities, distances
+ uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap
+ raw: X, var
+```
+
+To inspect a specific entry:
+```bash
+uv run h5ad info visium.h5ad obsm/X_pca
+```
+```
+Path: obsm/X_pca
+Type: dense-matrix
+Shape: (2688, 50)
+Dtype: float32
+Details: Dense matrix 2688×50 (float32)
+```
+
+## 3 Export entries
+View the first few lines of the `obs` dataframe:
+
+```bash
+uv run h5ad export dataframe visium.h5ad obs --head 10
+```
+```csv
+_index,array_col,array_row,cluster,in_tissue,leiden,log1p_n_genes_by_counts,log1p_total_counts,log1p_total_counts_mt,n_counts,n_genes_by_counts,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,pct_counts_in_top_50_genes,pct_counts_mt,total_counts,total_counts_mt
+AAACAAGTATCTCCCA-1,102,50,Cortex_2,1,Cortex_3,8.502891406705377,9.869983,8.257904,19340.0,4928,43.13340227507756,49.21406411582213,60.449844881075485,38.42812823164426,19.943123,19340.0,3857.0
+AAACAATCTACTAGCA-1,43,3,Cortex_5,1,Pyramidal_layer_dentate_gyrus,8.145839612936841,9.528867,8.091933,13750.0,3448,55.14181818181818,60.95272727272727,70.57454545454546,50.516363636363636,23.76,13750.0,3267.0
+AAACACCAATAACTGC-1,19,59,Thalamus_2,1,Hypothalamus_1,8.70334075304372,10.395467,8.499233,32710.0,6022,47.071232039131765,54.56435340874351,65.0871293182513,40.48303271170896,15.010699,32710.0,4910.0
+AAACAGAGCGACTCCT-1,94,14,Cortex_5,1,Pyramidal_layer_dentate_gyrus,8.369157112588834,9.674704,8.092851,15909.0,4311,45.81054748884279,52.07744044251681,62.97693129675027,40.95794833113332,20.554403,15909.0,3270.0
+AAACCGGGTAGGTACC-1,28,42,Thalamus_2,1,Hypothalamus_1,8.663542087751374,10.369013,8.808967,31856.0,5787,45.887744851833254,52.98216976393771,64.24849321948768,40.287543947764945,21.01017,31856.0,6693.0
+AAACCGTTCGTCCAGG-1,42,52,Hypothalamus_2,1,Pyramidal_layer,8.682538124003075,10.337314,8.559678,30862.0,5898,43.79171797031949,51.18592443781998,62.65634113148856,37.80053139783553,16.901043,30862.0,5216.0
+AAACCTCATGAAGTTG-1,19,37,Thalamus_2,1,Hypothalamus_1,9.027858802380862,11.007419,8.849371,60319.0,8331,34.28770370861586,42.45594257199224,55.48997828213332,27.803842901904872,11.553574,60319.0,6969.0
+AAACGAAGAACATACC-1,64,6,Cortex_4,1,Hypothalamus_2,8.84246002419529,10.578089,8.855521,39264.0,6921,37.99663814180929,44.75346373268134,56.6320293398533,32.95639771801141,17.858597,39264.0,7012.0
+AAACGAGACGGTTGAT-1,79,35,Fiber_tract,1,Cortex_5,8.80941494391005,10.458923,8.351847,34853.0,6696,39.947780678851174,47.52818982583996,58.838550483459095,33.7245000430379,12.156773,34853.0,4237.0
+AAACGGTTGCGAACTG-1,59,67,Lateral_ventricle,1,Striatum,8.718663567048953,10.254004,8.416489,28395.0,6115,41.67635147032928,49.20232435287903,60.556435992252155,35.562599049128366,15.918295,28395.0,4520.0
+```
+
+Export cell metadata to a CSV file:
+```bash
+uv run h5ad export dataframe visium.h5ad obs --output cells.csv
+wc -l cells.csv # 2689 cells.csv
+```
+
+## 4 Subset by names
+
+Let's get all cluster names from `cells.csv`:
+```bash
+awk -F ',' 'NR>1{print $4}' cells.csv | sort | uniq -c
+```
+```
+284 Cortex_1
+257 Cortex_2
+244 Cortex_3
+164 Cortex_4
+129 Cortex_5
+226 Fiber_tract
+222 Hippocampus
+208 Hypothalamus_1
+133 Hypothalamus_2
+105 Lateral_ventricle
+42 Pyramidal_layer
+68 Pyramidal_layer_dentate_gyrus
+153 Striatum
+261 Thalamus_1
+192 Thalamus_2
+```
+
+To get all obs names in "Cortex_2", you can use `csvsql` from `csvkit`:
+```bash
+csvsql -d ',' -I --query "SELECT _index FROM cells WHERE cluster='Cortex_2'" cells.csv > barcodes.txt
+sed -i '1d' barcodes.txt # remove header
+wc -l barcodes.txt # 257 barcodes.txt
+```
+
+Now you can use this list to create a subset `.h5ad` file:
+```bash
+uv run h5ad subset visium.h5ad --output cortex2.h5ad --obs barcodes.txt
+```
+
+Check the result:
+```bash
+uv run h5ad info cortex2.h5ad
+```
+```
+An object with n_obs × n_var: 257 × 18078
+ obs: array_col, array_row, cluster, in_tissue, leiden, log1p_n_genes_by_counts, log1p_total_counts, log1p_total_counts_mt, n_counts, n_genes_by_counts,
+pct_counts_in_top_100_genes, pct_counts_in_top_200_genes, pct_counts_in_top_500_genes, pct_counts_in_top_50_genes, pct_counts_mt, total_counts, total_counts_mt
+ var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts,
+pct_dropout_by_counts, total_counts, variances, variances_norm
+ obsm: X_pca, X_umap, spatial
+ varm: PCs
+ obsp: connectivities, distances
+ uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap
+```
+
+## 5 Import or replace data
+You can also import new data into an existing store. For example, let's replace the `obs` dataframe with a modified version. First, keep only the first 5 columns of `cells.csv`:
+```bash
+cut -d ',' -f 1-5 cells.csv > cells1to5.csv
+```
+
+Now import it back into `visium.h5ad` with the `_index` column as index:
+```bash
+uv run h5ad import dataframe visium.h5ad obs cells1to5.csv --index-column _index --output visium_obs1to5.h5ad
+```
+
+Check the updated `obs` structure:
+```bash
+uv run h5ad info visium_obs1to5.h5ad
+```
+```
+An object with n_obs × n_var: 2688 × 18078
+ obs: array_col, array_row, cluster, in_tissue
+ var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts,
+pct_dropout_by_counts, total_counts, variances, variances_norm
+ obsm: X_pca, X_umap, spatial
+ varm: PCs
+ obsp: connectivities, distances
+ uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap
+ raw: X, var
+```
+
+You can also import the data into an existing file in place:
+```bash
+uv run h5ad import dataframe visium.h5ad obs cells1to5.csv --index-column _index --inplace
+```
+
+Check the updated `obs` structure:
+```bash
+uv run h5ad info visium.h5ad
+```
+```
+An object with n_obs × n_var: 2688 × 18078
+ obs: array_col, array_row, cluster, in_tissue
+ var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts,
+pct_dropout_by_counts, total_counts, variances, variances_norm
+ obsm: X_pca, X_umap, spatial
+ varm: PCs
+ obsp: connectivities, distances
+ uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap
+ raw: X, var
+```
\ No newline at end of file
diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md
new file mode 100644
index 0000000..a19a416
--- /dev/null
+++ b/docs/TUTORIAL.md
@@ -0,0 +1,382 @@
+# Tutorial: Using h5ad CLI with csvkit
+
+This tutorial demonstrates how to combine `h5ad` CLI with `csvkit` to explore, analyze, and subset large `.h5ad` files efficiently without loading them into memory.
+
+## Introduction
+
+### h5ad CLI
+A command-line tool for working with AnnData (`.h5ad`) files. It streams data directly from disk, making it perfect for exploring huge single-cell datasets without memory constraints.
+
+**Key features:**
+- `info` - Inspect file structure and dimensions
+- `table` - Export metadata to CSV
+- `subset` - Filter files by cell/gene names
+
+### csvkit
+A suite of command-line tools for working with CSV files. Think of it as `awk`, `sed`, and `grep` but specifically designed for CSV data.
+
+**Key tools we'll use:**
+- `csvcut` - Select specific columns
+- `csvsql` - Execute SQL queries on CSV files
+- `csvgrep` - Filter rows by pattern
+- `csvlook` - Pretty-print CSV in terminal
+
+**Installation:**
+```bash
+pip install csvkit
+```
+
+## 1. Inspect File Structure with `info`
+
+First, let's see what's in our `.h5ad` file:
+
+```bash
+h5ad info dataset.h5ad
+```
+
+**Example output:**
+```
+File: dataset.h5ad
+Dimensions: 50000 obs × 20000 var
+
+Top-level groups:
+ obs/
+ - cell_type
+ - sample_id
+ - donor_id
+ - tissue
+ - n_genes
+ var/
+ - gene_name
+ - highly_variable
+ X (sparse matrix)
+ layers/
+ obsm/
+ uns/
+```
+
+This shows us that we have 50,000 cells with metadata including cell types, samples, and donor information.
+
+## 2. Export Metadata with `table`
+
+### 2.1 Basic Metadata Export
+
+Export all cell metadata (observations) to CSV:
+
+```bash
+h5ad table dataset.h5ad --axis obs --output cell_metadata.csv
+```
+
+Export just specific columns:
+
+```bash
+h5ad table dataset.h5ad --axis obs --columns cell_type,sample_id,donor_id --output cells.csv
+```
+
+Preview the first few rows in a nice table format:
+
+```bash
+h5ad table dataset.h5ad --axis obs --columns cell_type,sample_id,donor_id --head 10 | csvlook
+```
+
+**Example output:**
+```
+| obs_names | cell_type | sample_id | donor_id |
+| ------------------- | ------------ | --------- | -------- |
+| AAACCTGAGAAACCAT-1 | T cell | sample_1 | donor_A |
+| AAACCTGAGACAGACC-1 | B cell | sample_1 | donor_A |
+| AAACCTGAGGCATGGT-1 | NK cell | sample_2 | donor_B |
+| AAACCTGCAAGCCGCT-1 | T cell | sample_2 | donor_B |
+| AAACCTGCACATTAGC-1 | Monocyte | sample_1 | donor_A |
+```
+
+### 2.2 Calculate Statistics with `csvsql`
+
+Now let's analyze the metadata using SQL queries!
+
+**Count cells per cell type:**
+
+```bash
+h5ad table dataset.h5ad --axis obs --columns cell_type | \
+ csvsql --query "SELECT cell_type, COUNT(*) as n_cells FROM stdin GROUP BY cell_type ORDER BY n_cells DESC" | \
+ csvlook
+```
+
+**Example output:**
+```
+| cell_type | n_cells |
+| ------------ | ------- |
+| T cell | 15234 |
+| Monocyte | 12456 |
+| B cell | 8932 |
+| NK cell | 5621 |
+| DC | 3456 |
+| Macrophage | 2301 |
+```
+
+**Count cells per cell type and sample:**
+
+```bash
+h5ad table dataset.h5ad --axis obs --columns cell_type,sample_id | \
+ csvsql --query "SELECT cell_type, sample_id, COUNT(*) as n_cells
+ FROM stdin
+ GROUP BY cell_type, sample_id
+ ORDER BY cell_type, sample_id" | \
+ csvlook
+```
+
+**Example output:**
+```
+| cell_type | sample_id | n_cells |
+| ------------ | --------- | ------- |
+| B cell | sample_1 | 4521 |
+| B cell | sample_2 | 4411 |
+| Monocyte | sample_1 | 6234 |
+| Monocyte | sample_2 | 6222 |
+| T cell | sample_1 | 7645 |
+| T cell | sample_2 | 7589 |
+```
+
+**Calculate average gene count per cell type:**
+
+```bash
+h5ad table dataset.h5ad --axis obs --columns cell_type,n_genes | \
+ csvsql --query "SELECT cell_type,
+ AVG(n_genes) as avg_genes,
+ MIN(n_genes) as min_genes,
+ MAX(n_genes) as max_genes
+ FROM stdin
+ GROUP BY cell_type
+ ORDER BY avg_genes DESC" | \
+ csvlook
+```
+
+**Find samples with low cell counts:**
+
+```bash
+h5ad table dataset.h5ad --axis obs --columns sample_id | \
+ csvsql --query "SELECT sample_id, COUNT(*) as n_cells
+ FROM stdin
+ GROUP BY sample_id
+ HAVING COUNT(*) < 1000
+ ORDER BY n_cells" | \
+ csvlook
+```
+
+## 3. Filter and Subset Data
+
+### 3.1 Extract Cell Names for a Specific Cell Type
+
+Let's say we want to create a subset containing only T cells.
+
+**Step 1: Export metadata and filter for T cells**
+
+```bash
+h5ad table dataset.h5ad --axis obs --columns cell_type --output cell_metadata.csv
+```
+
+**Step 2: Use csvgrep to find T cells and extract their names**
+
+```bash
+csvgrep -c cell_type -m "T cell" cell_metadata.csv | \
+ csvcut -c obs_names | \
+ tail -n +2 > tcell_names.txt
+```
+
+This creates a file `tcell_names.txt` with one cell barcode per line.
+
+**Alternative: Use csvsql for more complex filters**
+
+Get T cells from a specific donor:
+
+```bash
+h5ad table dataset.h5ad --axis obs --columns cell_type,donor_id --output cell_metadata.csv
+
+csvsql --query "SELECT obs_names
+ FROM cell_metadata
+ WHERE cell_type = 'T cell'
+ AND donor_id = 'donor_A'" \
+ cell_metadata.csv | \
+ tail -n +2 > tcell_donor_A.txt
+```
+
+Get cells with high gene counts (>2000 genes):
+
+```bash
+h5ad table dataset.h5ad --axis obs --columns n_genes --output cell_metadata.csv
+
+csvsql --query "SELECT obs_names
+ FROM cell_metadata
+ WHERE n_genes > 2000" \
+ cell_metadata.csv | \
+ tail -n +2 > high_quality_cells.txt
+```
+
+### 3.2 Create the Subset
+
+Now use the filtered cell list to create a new `.h5ad` file:
+
+```bash
+h5ad subset dataset.h5ad tcells_only.h5ad --obs tcell_names.txt
+```
+
+**Verify the subset:**
+
+```bash
+h5ad info tcells_only.h5ad
+```
+
+**Check the cell type distribution:**
+
+```bash
+h5ad table tcells_only.h5ad --axis obs --columns cell_type | \
+ csvsql --query "SELECT cell_type, COUNT(*) as n_cells FROM stdin GROUP BY cell_type" | \
+ csvlook
+```
+
+### 3.3 Advanced: Subset by Both Cells and Genes
+
+Let's create a subset with specific cell types and a curated gene list.
+
+**Step 1: Filter cells (multiple cell types)**
+
+```bash
+h5ad table dataset.h5ad --axis obs --columns cell_type --output cell_metadata.csv
+
+csvsql --query "SELECT obs_names
+ FROM cell_metadata
+ WHERE cell_type IN ('T cell', 'NK cell', 'B cell')" \
+ cell_metadata.csv | \
+ tail -n +2 > lymphocytes.txt
+```
+
+**Step 2: Create a gene list**
+
+You might have a predefined list or extract genes from the file:
+
+```bash
+# Export all genes
+h5ad table dataset.h5ad --axis var --columns gene_name --output genes.csv
+
+# Filter for specific genes (e.g., markers)
+echo "CD3D
+CD3E
+CD4
+CD8A
+CD8B
+CD19
+CD20
+NCAM1" > marker_genes.txt
+```
+
+**Step 3: Create the subset**
+
+```bash
+h5ad subset dataset.h5ad lymphocytes_markers.h5ad \
+ --obs lymphocytes.txt \
+ --var marker_genes.txt
+```
+
+**Verify:**
+
+```bash
+h5ad info lymphocytes_markers.h5ad
+```
+
+## 4. Complete Example Workflow
+
+Here's a complete workflow combining everything:
+
+```bash
+# 1. Inspect the file
+h5ad info large_dataset.h5ad
+
+# 2. Export and analyze metadata
+h5ad table large_dataset.h5ad --axis obs \
+ --columns cell_type,sample_id,donor_id,n_genes \
+ --output all_metadata.csv
+
+# 3. Generate statistics
+echo "Cell type distribution:"
+csvsql --query "SELECT cell_type, COUNT(*) as n_cells
+ FROM all_metadata
+ GROUP BY cell_type
+ ORDER BY n_cells DESC" \
+ all_metadata.csv | csvlook
+
+echo "Sample distribution:"
+csvsql --query "SELECT sample_id, donor_id, COUNT(*) as n_cells
+ FROM all_metadata
+ GROUP BY sample_id, donor_id" \
+ all_metadata.csv | csvlook
+
+# 4. Filter for high-quality T cells from a specific donor
+csvsql --query "SELECT obs_names
+ FROM all_metadata
+ WHERE cell_type = 'T cell'
+ AND donor_id = 'donor_A'
+ AND n_genes > 1500" \
+ all_metadata.csv | \
+ tail -n +2 > selected_cells.txt
+
+echo "Selected $(wc -l < selected_cells.txt) cells"
+
+# 5. Create subset
+h5ad subset large_dataset.h5ad tcells_subset.h5ad --obs selected_cells.txt
+
+# 6. Verify result
+h5ad info tcells_subset.h5ad
+h5ad table tcells_subset.h5ad --axis obs --columns cell_type,donor_id | \
+ csvsql --query "SELECT cell_type, donor_id, COUNT(*) as n_cells FROM stdin GROUP BY cell_type, donor_id" | \
+ csvlook
+```
+
+## Tips and Best Practices
+
+1. **Use `--head` for quick previews** before exporting large files:
+ ```bash
+ h5ad table data.h5ad --axis obs --head 100 | csvlook
+ ```
+
+2. **Pipe directly to csvkit** to avoid creating intermediate files:
+ ```bash
+ h5ad table data.h5ad --axis obs --columns cell_type | csvsql --query "..."
+ ```
+
+3. **Check cell counts** before subsetting:
+ ```bash
+ wc -l selected_cells.txt # Should be > 0!
+ ```
+
+4. **Use csvstat** for quick summary statistics:
+ ```bash
+ h5ad table data.h5ad --axis obs --columns n_genes,n_counts | csvstat
+ ```
+
+5. **Combine with standard Unix tools**:
+ ```bash
+ # Get unique cell types
+ h5ad table data.h5ad --axis obs --columns cell_type | tail -n +2 | sort -u
+
+ # Count samples
+ h5ad table data.h5ad --axis obs --columns sample_id | tail -n +2 | sort | uniq -c
+ ```
+
+## Conclusion
+
+By combining `h5ad` CLI with `csvkit`, you can:
+- ✅ Explore huge datasets without loading them into memory
+- ✅ Perform complex queries and aggregations on metadata
+- ✅ Create filtered subsets based on sophisticated criteria
+- ✅ Work entirely on the command line without Python/R
+
+This workflow is especially powerful for:
+- Initial data exploration
+- Quality control analysis
+- Creating test datasets
+- Preparing data for downstream analysis
+- Batch processing multiple files
+
+For more information:
+- h5ad CLI: [README.md](../README.md)
+- csvkit documentation: https://csvkit.readthedocs.io/
diff --git a/pyproject.toml b/pyproject.toml
index c18faa4..281812b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,8 +7,10 @@ requires-python = ">=3.12"
dependencies = [
"h5py>=3.15.1",
"numpy>=2.3.5",
+ "pillow>=12.1.0",
"rich>=14.2.0",
"typer>=0.20.0",
+ "zarr>=3.1.5",
]
[project.optional-dependencies]
diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py
index bb4749d..66bbd22 100644
--- a/src/h5ad/cli.py
+++ b/src/h5ad/cli.py
@@ -1,104 +1,110 @@
-import sys
-import csv
+"""CLI for h5ad files with export and import subcommands."""
+
from pathlib import Path
-from typing import Optional, Sequence, Tuple, Dict, List
+from typing import Optional, Sequence, List
-import rich
from rich.console import Console
import typer
-import h5py
-import numpy as np
-from h5ad.commands import show_info, export_table, subset_h5ad
+from h5ad.commands import (
+ show_info,
+ subset_h5ad,
+ export_mtx,
+ export_npy,
+ export_json,
+ export_table,
+)
+
+from h5ad.commands import export_image as export_image_cmd
app = typer.Typer(
- help="Streaming CLI for huge .h5ad files (info, table, subset)."
+ help="Streaming CLI for huge .h5ad and .zarr files (info, subset, export, import)."
)
-console = Console(stderr=True)
+# Use stderr for status/progress to keep stdout clean for data output
+# force_terminal=True ensures Rich output is visible even in non-TTY environments
+console = Console(stderr=True, force_terminal=True)
+# Create sub-apps for export and import
+export_app = typer.Typer(help="Export objects from h5ad files.")
+import_app = typer.Typer(help="Import objects into h5ad files.")
+app.add_typer(export_app, name="export")
+app.add_typer(import_app, name="import")
+
+# ============================================================================
+# INFO command
+# ============================================================================
@app.command()
def info(
file: Path = typer.Argument(
...,
- help="Path to the .h5ad file",
+ help="Path to the .h5ad/.zarr store",
exists=True,
readable=True,
- )
+ dir_okay=True,
+ file_okay=True,
+ ),
+ entry: Optional[str] = typer.Argument(
+ None,
+ help="Entry path to inspect (e.g., 'obsm/X_pca', 'X', 'uns')",
+ ),
+ tree: bool = typer.Option(
+ False,
+ "--tree",
+ "-t",
+ help="Show a tree of all entries",
+ ),
+ depth: int = typer.Option(
+ None,
+ "--depth",
+ "-d",
+ help="Maximum recursion depth for tree display (only with --tree)",
+ ),
) -> None:
"""
Show high-level information about the .h5ad file.
- Args:
- file (Path): Path to the .h5ad file
+
+ Use --tree to see a tree of all entries.
+ Use --entry to inspect a specific entry in detail.
+
+ Examples:
+ h5ad info data.h5ad
+ h5ad info --tree data.h5ad
+ h5ad info obsm/X_pca data.h5ad
"""
- show_info(file, console)
+ try:
+ show_info(file, console, show_types=tree, depth=depth, entry_path=entry)
+ except Exception as e:
+ console.print(f"[bold red]Error:[/] {e}")
+ raise typer.Exit(code=1)
+# ============================================================================
+# SUBSET command
+# ============================================================================
@app.command()
-def table(
+def subset(
file: Path = typer.Argument(
...,
- help="Path to the .h5ad file",
+ help="Input .h5ad/.zarr",
exists=True,
readable=True,
+ dir_okay=True,
+ file_okay=True,
),
- axis: str = typer.Option("obs", help="Axis to read from ('obs' or 'var')"),
- columns: Optional[str] = typer.Option(
- None,
- "--columns",
- "-c",
- help="Comma separated column names to include in the output table",
- ),
- out: Optional[Path] = typer.Option(
+ output: Optional[Path] = typer.Option(
None,
"--output",
"-o",
- help="Output file path (defaults to stdout)",
- writable=True,
+ help="Output .h5ad/.zarr path. Required unless --inplace.",
+ dir_okay=True,
+ file_okay=True,
),
- chunk_rows: int = typer.Option(
- 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk"
+ inplace: bool = typer.Option(
+ False,
+ "--inplace",
+ help="Modify source file directly.",
),
- head: Optional[int] = typer.Option(
- None, "--head", "-n", help="Output only the first n rows"
- ),
-) -> None:
- """
- Export a table of the specified axis ('obs' or 'var') to CSV format.
- Args:
- file (Path): Path to the .h5ad file
- axis (str): Axis to read from ('obs' or 'var')
- columns (Optional[str]): Comma separated column names to include in the output table
- out (Optional[Path]): Output file path (defaults to stdout)
- chunk_rows (int): Number of rows to read per chunk
- head (Optional[int]): Output only the first n rows
- """
- # Validate axis parameter
- if axis not in ("obs", "var"):
- console.print(
- f"[bold red]Error:[/] Invalid axis '{axis}'. Must be either 'obs' or 'var'.",
- )
- raise typer.Exit(code=1)
-
- col_list: Optional[List[str]] = None
- if columns:
- col_list = [col.strip() for col in columns.split(",") if col.strip()]
-
- export_table(
- file=file,
- axis=axis,
- columns=col_list,
- out=out,
- chunk_rows=chunk_rows,
- head=head,
- console=console,
- )
-
-
-@app.command()
-def subset(
- file: Path = typer.Argument(..., help="Input .h5ad", exists=True, readable=True),
- output: Path = typer.Argument(..., help="Output .h5ad", writable=True),
obs: Optional[Path] = typer.Option(
None,
"--obs",
@@ -114,7 +120,12 @@ def subset(
readable=True,
),
chunk_rows: int = typer.Option(
- 1024, "--chunk-rows", "-r", help="Row chunk size for dense matrices"
+ 1024,
+ "--chunk",
+ "-C",
+ "--chunk-rows",
+ "-r",
+ help="Row chunk size for dense matrices",
),
) -> None:
"""Subset an h5ad by obs and/or var names."""
@@ -124,6 +135,13 @@ def subset(
)
raise typer.Exit(code=1)
+ if not inplace and output is None:
+ console.print(
+ "[bold red]Error:[/] Output file is required. "
+ "Use --output/-o or --inplace.",
+ )
+ raise typer.Exit(code=1)
+
try:
subset_h5ad(
file=file,
@@ -132,11 +150,508 @@ def subset(
var_file=var,
chunk_rows=chunk_rows,
console=console,
+ inplace=inplace,
)
except Exception as e:
console.print(f"[bold red]Error:[/] {e}")
raise typer.Exit(code=1)
+# ============================================================================
+# EXPORT subcommands
+# ============================================================================
+@export_app.command("dataframe")
+def export_dataframe(
+ file: Path = typer.Argument(
+ ...,
+ help="Path to the .h5ad/.zarr store",
+ exists=True,
+ readable=True,
+ dir_okay=True,
+ file_okay=True,
+ ),
+ entry: str = typer.Argument(..., help="Entry path to export ('obs' or 'var')"),
+ output: Path = typer.Option(
+ None, "--output", "-o", writable=True, help="Output CSV file path"
+ ),
+ columns: Optional[str] = typer.Option(
+ None,
+ "--columns",
+ "-c",
+ help="Comma separated column names to include",
+ ),
+ chunk_rows: int = typer.Option(
+ 10_000,
+ "--chunk",
+ "-C",
+ "--chunk-rows",
+ "-r",
+ help="Number of rows to read per chunk",
+ ),
+ head: Optional[int] = typer.Option(
+ None, "--head", "-n", help="Output only the first n entries"
+ ),
+) -> None:
+ """
+ Export a dataframe (obs or var) to CSV.
+
+ Examples:
+ h5ad export dataframe data.h5ad obs --output obs.csv
+ h5ad export dataframe data.h5ad var --output var.csv --columns gene_id,mean
+ h5ad export dataframe data.h5ad obs --head 100
+ """
+
+ if entry not in ("obs", "var"):
+ console.print(
+ f"[bold red]Error:[/] Dataframe export is only supported for 'obs' or 'var' at this point, not '{entry}'.",
+ )
+ raise typer.Exit(code=1)
+
+ col_list: Optional[List[str]] = None
+ if columns:
+ col_list = [col.strip() for col in columns.split(",") if col.strip()]
+
+ try:
+ export_table(
+ file=file,
+ axis=entry,
+ columns=col_list,
+ out=output,
+ chunk_rows=chunk_rows,
+ head=head,
+ console=console,
+ )
+ except Exception as e:
+ console.print(f"[bold red]Error:[/] {e}")
+ raise typer.Exit(code=1)
+
+
+@export_app.command("array")
+def export_array(
+ file: Path = typer.Argument(
+ ...,
+ help="Path to the .h5ad/.zarr store",
+ exists=True,
+ readable=True,
+ dir_okay=True,
+ file_okay=True,
+ ),
+ entry: str = typer.Argument(
+ ..., help="Entry path to export (e.g., 'obsm/X_pca', 'varm/PCs', 'X')"
+ ),
+ output: Path = typer.Option(
+ ..., "--output", "-o", help="Output .npy file path", writable=True
+ ),
+ chunk_elements: int = typer.Option(
+ 100_000,
+ "--chunk",
+ "-C",
+ help="Number of elements to read per chunk",
+ ),
+) -> None:
+ """
+ Export a dense array or matrix to NumPy .npy format.
+
+ Examples:
+ h5ad export array data.h5ad obsm/X_pca pca.npy
+ h5ad export array data.h5ad X matrix.npy
+ h5ad export array data.h5ad varm/PCs loadings.npy
+ """
+
+ try:
+ export_npy(
+ file=file,
+ obj=entry,
+ out=output,
+ chunk_elements=chunk_elements,
+ console=console,
+ )
+ except Exception as e:
+ console.print(f"[bold red]Error:[/] {e}")
+ raise typer.Exit(code=1)
+
+
+@export_app.command("sparse")
+def export_sparse(
+ file: Path = typer.Argument(
+ ...,
+ help="Path to the .h5ad/.zarr store",
+ exists=True,
+ readable=True,
+ dir_okay=True,
+ file_okay=True,
+ ),
+ entry: str = typer.Argument(
+ ..., help="Entry path to export (e.g., 'X', 'layers/counts')"
+ ),
+ output: Optional[Path] = typer.Option(
+ None,
+ "--output",
+ "-o",
+ writable=True,
+ help="Output .mtx file path (defaults to stdout)",
+ ),
+ head: Optional[int] = typer.Option(
+ None, "--head", "-n", help="Output only the first n entries of mtx file"
+ ),
+ chunk_elements: int = typer.Option(
+ 1_000,
+ "--chunk",
+ "-C",
+ help="Number of rows/columns (depends on compression format) to process per chunk",
+ ),
+ in_memory: bool = typer.Option(
+ False,
+ "--in-memory",
+ "-m",
+ help="Load the entire sparse matrix into memory before exporting (may be faster for small matrices)",
+ ),
+) -> None:
+ """
+ Export a sparse matrix (CSR/CSC) to Matrix Market (.mtx) format.
+
+ Examples:
+ h5ad export sparse data.h5ad X matrix.mtx
+ h5ad export sparse data.h5ad layers/counts counts.mtx
+ h5ad export sparse data.h5ad X --head 100
+ """
+
+ try:
+ export_mtx(
+ file=file,
+ obj=entry,
+ out=output,
+ head=head,
+ chunk_elements=chunk_elements,
+ in_memory=in_memory,
+ console=console,
+ )
+ except Exception as e:
+ console.print(f"[bold red]Error:[/] {e}")
+ raise typer.Exit(code=1)
+
+
+@export_app.command("dict")
+def export_dict(
+ file: Path = typer.Argument(
+ ...,
+ help="Path to the .h5ad/.zarr store",
+ exists=True,
+ readable=True,
+ dir_okay=True,
+ file_okay=True,
+ ),
+ entry: str = typer.Argument(
+ ..., help="Entry path to export (e.g., 'uns', 'uns/colors')"
+ ),
+ output_arg: Optional[Path] = typer.Argument(None, help="Output .json file path"),
+ output: Optional[Path] = typer.Option(
+ None, "--output", "-o", help="Output .json file path"
+ ),
+ max_elements: int = typer.Option(
+ 100_000,
+ "--max-elements",
+ help="Maximum array elements for JSON export",
+ ),
+ include_attrs: bool = typer.Option(
+ False, "--include-attrs", help="Include HDF5 attributes in JSON export"
+ ),
+) -> None:
+ """
+ Export a dict/group or scalar to JSON format.
+
+ Examples:
+ h5ad export dict data.h5ad uns metadata.json
+ h5ad export dict data.h5ad uns/colors colors.json
+ """
+
+ try:
+ out_path = output if output is not None else output_arg
+ export_json(
+ file=file,
+ obj=entry,
+ out=out_path,
+ max_elements=max_elements,
+ include_attrs=include_attrs,
+ console=console,
+ )
+ except Exception as e:
+ console.print(f"[bold red]Error:[/] {e}")
+ raise typer.Exit(code=1)
+
+
+@export_app.command("image")
+def export_image(
+ file: Path = typer.Argument(
+ ...,
+ help="Path to the .h5ad/.zarr store",
+ exists=True,
+ readable=True,
+ dir_okay=True,
+ file_okay=True,
+ ),
+ entry: str = typer.Argument(..., help="Entry path to export (2D or 3D array)"),
+ output: Optional[Path] = typer.Option(
+ None, "--output", "-o", help="Output image file (.png, .jpg, .tiff)"
+ ),
+) -> None:
+ """
+ Export an image-like array to PNG/JPG/TIFF format.
+
+ The array should be 2D (H,W) or 3D (H,W,C) with C in {1,3,4}.
+
+ Examples:
+ h5ad export image data.h5ad uns/spatial/image tissue.png
+ """
+
+ try:
+ export_image_cmd(file=file, obj=entry, out=output, console=console)
+ except Exception as e:
+ console.print(f"[bold red]Error:[/] {e}")
+ raise typer.Exit(code=1)
+
+
+# ============================================================================
+# IMPORT subcommands
+# ============================================================================
+def _get_target_file(file: Path, output: Optional[Path], inplace: bool) -> Path:
+ """Determine target path and copy/convert if needed."""
+ from h5ad.commands.import_data import _prepare_target_path
+
+ return _prepare_target_path(file, output, inplace, console)
+
+
+@import_app.command("dataframe")
+def import_dataframe(
+ file: Path = typer.Argument(
+ ...,
+ help="Path to the source .h5ad/.zarr store",
+ exists=True,
+ readable=True,
+ dir_okay=True,
+ file_okay=True,
+ ),
+ entry: str = typer.Argument(
+ ..., help="Entry path to create/replace ('obs' or 'var')"
+ ),
+ input_file: Path = typer.Argument(
+ ..., help="Input CSV file", exists=True, readable=True
+ ),
+ output: Optional[Path] = typer.Option(
+ None,
+ "--output",
+ "-o",
+ help="Output .h5ad/.zarr path. Required unless --inplace.",
+ dir_okay=True,
+ file_okay=True,
+ ),
+ inplace: bool = typer.Option(
+ False,
+ "--inplace",
+ help="Modify source file directly.",
+ ),
+ index_column: Optional[str] = typer.Option(
+ None,
+ "--index-column",
+ "-i",
+ help="Column to use as index. Defaults to first column.",
+ ),
+) -> None:
+ """
+ Import a CSV file into obs or var.
+
+ Examples:
+ h5ad import dataframe data.h5ad obs cells.csv -o output.h5ad -i cell_id
+ h5ad import dataframe data.h5ad var genes.csv --inplace -i gene_id
+ """
+ from h5ad.commands.import_data import _import_csv
+
+ if entry not in ("obs", "var"):
+ console.print(
+ f"[bold red]Error:[/] Entry must be 'obs' or 'var', not '{entry}'.",
+ )
+ raise typer.Exit(code=1)
+
+ if not inplace and output is None:
+ console.print(
+ "[bold red]Error:[/] Output file is required. "
+ "Use --output/-o or --inplace.",
+ )
+ raise typer.Exit(code=1)
+
+ try:
+ target = _get_target_file(file, output, inplace)
+ _import_csv(target, entry, input_file, index_column, console)
+ except Exception as e:
+ console.print(f"[bold red]Error:[/] {e}")
+ raise typer.Exit(code=1)
+
+
+@import_app.command("array")
+def import_array(
+ file: Path = typer.Argument(
+ ...,
+ help="Path to the source .h5ad/.zarr store",
+ exists=True,
+ readable=True,
+ dir_okay=True,
+ file_okay=True,
+ ),
+ entry: str = typer.Argument(
+ ..., help="Entry path to create/replace (e.g., 'X', 'obsm/X_pca')"
+ ),
+ input_file: Path = typer.Argument(
+ ..., help="Input .npy file", exists=True, readable=True
+ ),
+ output: Optional[Path] = typer.Option(
+ None,
+ "--output",
+ "-o",
+ help="Output .h5ad/.zarr path. Required unless --inplace.",
+ dir_okay=True,
+ file_okay=True,
+ ),
+ inplace: bool = typer.Option(
+ False,
+ "--inplace",
+ help="Modify source file directly.",
+ ),
+) -> None:
+ """
+ Import a NumPy .npy file as a dense array.
+
+ Dimensions are validated against existing obs/var.
+
+ Examples:
+ h5ad import array data.h5ad obsm/X_pca pca.npy -o output.h5ad
+ h5ad import array data.h5ad X matrix.npy --inplace
+ """
+ from h5ad.commands.import_data import _import_npy
+
+ if not inplace and output is None:
+ console.print(
+ "[bold red]Error:[/] Output file is required. "
+ "Use --output/-o or --inplace.",
+ )
+ raise typer.Exit(code=1)
+
+ try:
+ target = _get_target_file(file, output, inplace)
+ _import_npy(target, entry, input_file, console)
+ except Exception as e:
+ console.print(f"[bold red]Error:[/] {e}")
+ raise typer.Exit(code=1)
+
+
+@import_app.command("sparse")
+def import_sparse(
+ file: Path = typer.Argument(
+ ...,
+ help="Path to the source .h5ad/.zarr store",
+ exists=True,
+ readable=True,
+ dir_okay=True,
+ file_okay=True,
+ ),
+ obj: str = typer.Argument(
+ ..., help="Object path to create/replace (e.g., 'X', 'layers/counts')"
+ ),
+ input_file: Path = typer.Argument(
+ ..., help="Input .mtx file", exists=True, readable=True
+ ),
+ output: Optional[Path] = typer.Option(
+ None,
+ "--output",
+ "-o",
+ help="Output .h5ad/.zarr path. Required unless --inplace.",
+ dir_okay=True,
+ file_okay=True,
+ ),
+ inplace: bool = typer.Option(
+ False,
+ "--inplace",
+ help="Modify source file directly.",
+ ),
+) -> None:
+ """
+ Import a Matrix Market (.mtx) file as a CSR sparse matrix.
+
+ Dimensions are validated against existing obs/var.
+
+ Examples:
+ h5ad import sparse data.h5ad X matrix.mtx -o output.h5ad
+ h5ad import sparse data.h5ad layers/counts counts.mtx --inplace
+ """
+ from h5ad.commands.import_data import _import_mtx
+
+ if not inplace and output is None:
+ console.print(
+ "[bold red]Error:[/] Output file is required. "
+ "Use --output/-o or --inplace.",
+ )
+ raise typer.Exit(code=1)
+
+ try:
+ target = _get_target_file(file, output, inplace)
+ _import_mtx(target, obj, input_file, console)
+ except Exception as e:
+ console.print(f"[bold red]Error:[/] {e}")
+ raise typer.Exit(code=1)
+
+
+@import_app.command("dict")
+def import_dict(
+ file: Path = typer.Argument(
+ ...,
+ help="Path to the source .h5ad/.zarr store",
+ exists=True,
+ readable=True,
+ dir_okay=True,
+ file_okay=True,
+ ),
+ obj: str = typer.Argument(
+ ..., help="Object path to create/replace (e.g., 'uns', 'uns/metadata')"
+ ),
+ input_file: Path = typer.Argument(
+ ..., help="Input .json file", exists=True, readable=True
+ ),
+ output: Optional[Path] = typer.Option(
+ None,
+ "--output",
+ "-o",
+ help="Output .h5ad/.zarr path. Required unless --inplace.",
+ dir_okay=True,
+ file_okay=True,
+ ),
+ inplace: bool = typer.Option(
+ False,
+ "--inplace",
+ help="Modify source file directly.",
+ ),
+) -> None:
+ """
+ Import a JSON file into uns or other dict-like groups.
+
+ Examples:
+ h5ad import dict data.h5ad uns/metadata config.json -o output.h5ad
+ h5ad import dict data.h5ad uns settings.json --inplace
+ """
+ from h5ad.commands.import_data import _import_json
+
+ if not inplace and output is None:
+ console.print(
+ "[bold red]Error:[/] Output file is required. "
+ "Use --output/-o or --inplace.",
+ )
+ raise typer.Exit(code=1)
+
+ try:
+ target = _get_target_file(file, output, inplace)
+ _import_json(target, obj, input_file, console)
+ except Exception as e:
+ console.print(f"[bold red]Error:[/] {e}")
+ raise typer.Exit(code=1)
+
+
def main(argv: Optional[Sequence[str]] = None) -> None:
app(standalone_mode=True)
diff --git a/src/h5ad/commands/__init__.py b/src/h5ad/commands/__init__.py
index e681fea..70d960f 100644
--- a/src/h5ad/commands/__init__.py
+++ b/src/h5ad/commands/__init__.py
@@ -1,3 +1,4 @@
from h5ad.commands.info import show_info
-from h5ad.commands.table import export_table
from h5ad.commands.subset import subset_h5ad
+from h5ad.commands.export import export_table, export_image, export_json, export_mtx, export_npy
+from h5ad.commands.import_data import import_object
diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py
new file mode 100644
index 0000000..22221a7
--- /dev/null
+++ b/src/h5ad/commands/export.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import List, Optional
+
+from rich.console import Console
+
+from h5ad.formats.array import export_npy as export_npy_format
+from h5ad.formats.common import EXPORTABLE_TYPES, IMAGE_EXTENSIONS, TYPE_EXTENSIONS
+from h5ad.formats.dataframe import export_dataframe
+from h5ad.formats.image import export_image as export_image_format
+from h5ad.formats.json_data import export_json as export_json_format
+from h5ad.formats.sparse import export_mtx as export_mtx_format
+from h5ad.storage import open_store
+
+
+def export_table(
+ file: Path,
+ axis: str,
+ columns: Optional[List[str]],
+ out: Optional[Path],
+ chunk_rows: int,
+ head: Optional[int],
+ console: Console,
+) -> None:
+ with open_store(file, "r") as store:
+ export_dataframe(
+ store.root,
+ axis=axis,
+ columns=columns,
+ out=out,
+ chunk_rows=chunk_rows,
+ head=head,
+ console=console,
+ )
+
+
+def export_npy(
+ file: Path,
+ obj: str,
+ out: Path,
+ chunk_elements: int,
+ console: Console,
+) -> None:
+ with open_store(file, "r") as store:
+ export_npy_format(
+ store.root,
+ obj=obj,
+ out=out,
+ chunk_elements=chunk_elements,
+ console=console,
+ )
+
+
+def export_mtx(
+ file: Path,
+ obj: str,
+ out: Optional[Path],
+ head: Optional[int],
+ chunk_elements: int,
+ in_memory: bool,
+ console: Console,
+) -> None:
+ with open_store(file, "r") as store:
+ export_mtx_format(
+ store.root,
+ obj=obj,
+ out=out,
+ head=head,
+ chunk_elements=chunk_elements,
+ in_memory=in_memory,
+ console=console,
+ )
+
+
+def export_json(
+ file: Path,
+ obj: str,
+ out: Optional[Path],
+ max_elements: int,
+ include_attrs: bool,
+ console: Console,
+) -> None:
+ with open_store(file, "r") as store:
+ export_json_format(
+ store.root,
+ obj=obj,
+ out=out,
+ max_elements=max_elements,
+ include_attrs=include_attrs,
+ console=console,
+ )
+
+
+def export_image(file: Path, obj: str, out: Path, console: Console) -> None:
+ with open_store(file, "r") as store:
+ export_image_format(store.root, obj=obj, out=out, console=console)
+
+
+__all__ = [
+ "EXPORTABLE_TYPES",
+ "IMAGE_EXTENSIONS",
+ "TYPE_EXTENSIONS",
+ "export_image",
+ "export_json",
+ "export_mtx",
+ "export_npy",
+ "export_table",
+]
diff --git a/src/h5ad/commands/import_data.py b/src/h5ad/commands/import_data.py
new file mode 100644
index 0000000..dad838a
--- /dev/null
+++ b/src/h5ad/commands/import_data.py
@@ -0,0 +1,129 @@
+"""Import command helpers for creating/replacing objects in h5ad/zarr stores."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Optional
+
+from rich.console import Console
+
+from h5ad.formats.array import import_npy
+from h5ad.formats.dataframe import import_dataframe
+from h5ad.formats.json_data import import_json
+from h5ad.formats.sparse import import_mtx
+from h5ad.storage import copy_path, copy_store_contents, detect_backend, open_store
+
+
+EXTENSION_FORMAT = {
+ ".csv": "csv",
+ ".npy": "npy",
+ ".mtx": "mtx",
+ ".json": "json",
+}
+
+
+def _prepare_target_path(
+ file: Path,
+ output_file: Optional[Path],
+ inplace: bool,
+ console: Console,
+) -> Path:
+ if inplace:
+ return file
+ if output_file is None:
+ raise ValueError("Output file is required unless --inplace is specified.")
+
+ src_backend = detect_backend(file)
+ dst_backend = detect_backend(output_file)
+
+ if src_backend == dst_backend:
+ copy_path(file, output_file)
+ console.print(f"[dim]Copied {file} → {output_file}[/]")
+ return output_file
+
+ with open_store(file, "r") as src_store, open_store(output_file, "w") as dst_store:
+ copy_store_contents(src_store.root, dst_store.root)
+ console.print(
+ f"[dim]Converted {file} ({src_backend}) → {output_file} ({dst_backend})[/]"
+ )
+ return output_file
+
+
+def import_object(
+ file: Path,
+ obj: str,
+ input_file: Path,
+ output_file: Optional[Path],
+ inplace: bool,
+ index_column: Optional[str],
+ console: Console,
+) -> None:
+ target_file = _prepare_target_path(file, output_file, inplace, console)
+ ext = input_file.suffix.lower()
+
+ if ext not in EXTENSION_FORMAT:
+ raise ValueError(
+ f"Unsupported input file extension '{ext}'. "
+ f"Supported: {', '.join(sorted(EXTENSION_FORMAT.keys()))}"
+ )
+
+ fmt = EXTENSION_FORMAT[ext]
+
+ if index_column and (fmt != "csv" or obj not in ("obs", "var")):
+ raise ValueError("--index-column is only valid for CSV import into 'obs' or 'var'.")
+
+ if fmt == "csv":
+ _import_csv(target_file, obj, input_file, index_column, console)
+ elif fmt == "npy":
+ _import_npy(target_file, obj, input_file, console)
+ elif fmt == "mtx":
+ _import_mtx(target_file, obj, input_file, console)
+ elif fmt == "json":
+ _import_json(target_file, obj, input_file, console)
+
+
+def _import_csv(
+ file: Path,
+ obj: str,
+ input_file: Path,
+ index_column: Optional[str],
+ console: Console,
+) -> None:
+ with open_store(file, "a") as store:
+ import_dataframe(
+ store.root,
+ obj=obj,
+ input_file=input_file,
+ index_column=index_column,
+ console=console,
+ )
+
+
+def _import_npy(
+ file: Path,
+ obj: str,
+ input_file: Path,
+ console: Console,
+) -> None:
+ with open_store(file, "a") as store:
+ import_npy(store.root, obj=obj, input_file=input_file, console=console)
+
+
+def _import_mtx(
+ file: Path,
+ obj: str,
+ input_file: Path,
+ console: Console,
+) -> None:
+ with open_store(file, "a") as store:
+ import_mtx(store.root, obj=obj, input_file=input_file, console=console)
+
+
+def _import_json(
+ file: Path,
+ obj: str,
+ input_file: Path,
+ console: Console,
+) -> None:
+ with open_store(file, "a") as store:
+ import_json(store.root, obj=obj, input_file=input_file, console=console)
diff --git a/src/h5ad/commands/info.py b/src/h5ad/commands/info.py
index 95c3c72..76b56da 100644
--- a/src/h5ad/commands/info.py
+++ b/src/h5ad/commands/info.py
@@ -1,32 +1,194 @@
from pathlib import Path
+from typing import Any, Optional
-import h5py
import rich
from rich.console import Console
-from h5ad.info import axis_len
+from rich.tree import Tree
+from h5ad.core.info import axis_len, format_type_info, get_entry_type
+from h5ad.storage import is_dataset, is_group, open_store
-def show_info(file: Path, console: Console) -> None:
+# Preferred display order for top-level keys
+KEY_ORDER = ["X", "obs", "var", "obsm", "varm", "layers", "obsp", "varp", "uns"]
+
+
+def _sort_keys(keys: list) -> list:
+ """Sort keys according to KEY_ORDER, with unknown keys at the end."""
+ order_map = {k: i for i, k in enumerate(KEY_ORDER)}
+ return sorted(keys, key=lambda k: (order_map.get(k, len(KEY_ORDER)), k))
+
+
+def show_info(
+ file: Path,
+ console: Console,
+ show_types: bool = False,
+ depth: Optional[int] = None,
+ entry_path: Optional[str] = None,
+) -> None:
"""
Show high-level information about the .h5ad file.
Args:
file (Path): Path to the .h5ad file
console (Console): Rich console for output
+ show_types (bool): Show detailed type information for each entry
+ depth (Optional[int]): Maximum recursion depth for type display (only with show_types=True)
+ entry_path (Optional[str]): Specific entry path to inspect (e.g., 'obsm/X_pca')
"""
- with h5py.File(file, "r") as f:
+ with open_store(file, "r") as store:
+ f = store.root
+ # If a specific path is requested, show detailed info for that object
+ if entry_path:
+ _show_object_info(f, entry_path, console)
+ return
+
# Get n_obs and n_var
n_obs = axis_len(f, "obs")
n_var = axis_len(f, "var")
rich.print(
f"[bold cyan]An object with n_obs × n_var: {n_obs if n_obs is not None else '?'} × {n_var if n_var is not None else '?'}[/]"
)
- # List top-level keys and their sub-keys
- for key, obj in sorted(f.items(), key=lambda x: len(x[0])):
- # Only process Groups, skip Datasets like X
- if isinstance(obj, h5py.Group):
- sub_keys = [k for k in obj.keys() if k != "_index"]
- if sub_keys and key != "X":
- rich.print(
- f"\t[bold yellow]{key}:[/]\t"
- + ", ".join(f"[bright_white]{sub}[/]" for sub in sub_keys)
+
+ if show_types:
+ _show_types_tree(f, console, root_label=str(file), depth=depth)
+ else:
+ # List top-level keys and their sub-keys (original behavior)
+ for key in _sort_keys(list(f.keys())):
+ obj = f[key]
+ # Only process Groups, skip Datasets like X
+ if is_group(obj):
+ sub_keys = [
+ k
+ for k in obj.keys()
+ if k not in ("_index", "__categories", "obs_names", "var_names")
+ ]
+ if sub_keys and key != "X":
+ rich.print(
+ f"\t[bold yellow]{key}:[/]\t"
+ + ", ".join(f"[bright_white]{sub}[/]" for sub in sub_keys)
+ )
+
+
+def _show_types_tree(
+ f: Any, console: Console, root_label: str, depth: Optional[int] = None
+) -> None:
+ """Show a tree view with type information for all entries.
+
+ Recursion depth by group:
+ - obs/var: top level only (no children)
+ - X: top level only
+ - obsm/obsp/varm/varp/layers: 1 level (show matrices)
+ - uns: 2 levels deep
+ """
+ tree = Tree(f"[bold]{root_label}[/]")
+
+ # Define max depth for each top-level group
+ max_depth_map = {
+ "obs": 0,
+ "var": 0,
+ "X": 0,
+ "obsm": 1,
+ "obsp": 1,
+ "varm": 1,
+ "varp": 1,
+ "layers": 1,
+ "uns": 2,
+ }
+
+ def add_node(
+ parent_tree: Tree,
+ name: str,
+ obj: Any,
+ current_depth: int,
+ max_depth: int,
+ ) -> None:
+ info = get_entry_type(obj)
+ type_str = format_type_info(info)
+
+ if is_dataset(obj):
+ shape_str = f"[dim]{obj.shape}[/]" if obj.shape else ""
+ node_text = f"[bright_white]{name}[/] {shape_str} {type_str}"
+ parent_tree.add(node_text)
+ else:
+ # Group
+ node_text = f"[bold yellow]{name}/[/] {type_str}"
+ subtree = parent_tree.add(node_text)
+
+ # Recurse only if within allowed depth
+ if current_depth < max_depth:
+ for child_name in sorted(obj.keys()):
+ if child_name in ("_index", "__categories"):
+ continue
+ child_obj = obj[child_name]
+ add_node(
+ subtree, child_name, child_obj, current_depth + 1, max_depth
)
+
+ # Add top-level items in preferred order
+ for key in _sort_keys(list(f.keys())):
+ obj = f[key]
+ # Skip empty groups
+ if is_group(obj):
+ children = [
+ k
+ for k in obj.keys()
+ if k not in ("_index", "__categories", "obs_names", "var_names")
+ ]
+ if not children:
+ continue
+ max_depth = (
+ depth if depth is not None else max_depth_map.get(key, 1)
+ ) # default to 1 level for unknown groups
+ add_node(tree, key, obj, current_depth=0, max_depth=max_depth)
+
+ console.print(tree)
+
+
+def _show_object_info(f: Any, entry_path: str, console: Console) -> None:
+ """Show detailed info for a specific object path."""
+ # Normalize path
+ entry_path = entry_path.strip().lstrip("/")
+
+ if entry_path not in f:
+ console.print(f"[bold red]Error:[/] '{entry_path}' not found in the file.")
+ return
+
+ entry = f[entry_path]
+ info = get_entry_type(entry)
+
+ console.print(f"\n[bold cyan]Path:[/] {entry_path}")
+ console.print(f"[bold cyan]Type:[/] {info['type']}")
+
+ if info["encoding"]:
+ console.print(f"[bold cyan]Encoding:[/] {info['encoding']}")
+
+ if info["shape"]:
+ console.print(f"[bold cyan]Shape:[/] {info['shape']}")
+
+ if info["dtype"]:
+ console.print(f"[bold cyan]Dtype:[/] {info['dtype']}")
+
+ console.print(f"[bold cyan]Details:[/] {info['details']}")
+
+ # Show attributes if any
+ if entry.attrs:
+ console.print(f"\n[bold cyan]Attributes:[/]")
+ for k, v in entry.attrs.items():
+ v_str = v.decode("utf-8") if isinstance(v, bytes) else str(v)
+ if len(v_str) > 80:
+ v_str = v_str[:77] + "..."
+ console.print(f" [dim]{k}:[/] {v_str}")
+
+ # If it's a group, show children
+ if is_group(entry):
+ children = [
+ k
+ for k in entry.keys()
+ if k not in ("_index", "__categories", "obs_names", "var_names")
+ ]
+ if children:
+ console.print(f"\n[bold cyan]Children:[/]")
+ for child_name in sorted(children):
+ child_entry = entry[child_name]
+ child_info = get_entry_type(child_entry)
+ type_str = format_type_info(child_info)
+ console.print(f" [bright_white]{child_name}[/] {type_str}")
diff --git a/src/h5ad/commands/subset.py b/src/h5ad/commands/subset.py
index 2e01d9d..940ef07 100644
--- a/src/h5ad/commands/subset.py
+++ b/src/h5ad/commands/subset.py
@@ -1,686 +1,17 @@
-"""Subset operations for .h5ad files."""
-
-from __future__ import annotations
-
-from pathlib import Path
-from typing import Optional, Set, Tuple, List, Dict, Any
-
-import h5py
-import numpy as np
-import typer
-from rich.console import Console
-from rich.progress import (
- Progress,
- SpinnerColumn,
- TextColumn,
- BarColumn,
- TaskProgressColumn,
- TimeElapsedColumn,
+from h5ad.core.subset import (
+ _read_name_file,
+ indices_from_name_set,
+ subset_axis_group,
+ subset_dense_matrix,
+ subset_h5ad,
+ subset_sparse_matrix_group,
)
-from h5ad.read import decode_str_array
-
-
-def _copy_attrs(src: h5py.AttributeManager, dst: h5py.AttributeManager) -> None:
- """
- Copy HDF5 attributes from source to destination.
- Args:
- src (h5py.AttributeManager): Source attributes
- dst (h5py.AttributeManager): Destination attributes
- """
- for k, v in src.items():
- dst[k] = v
-
-
-def _ds_create_kwargs(src: h5py.Dataset) -> Dict[str, Any]:
- """
- Best-effort carryover of dataset creation properties.
- (h5py doesn't expose everything perfectly; this covers the big ones.)
-
- Args:
- src (h5py.Dataset): Source dataset
- Returns:
- Dict[str, Any]: Dataset creation keyword arguments
- """
- kw: Dict[str, Any] = {}
- if src.chunks is not None:
- kw["chunks"] = src.chunks
- if src.compression is not None:
- kw["compression"] = src.compression
- kw["compression_opts"] = src.compression_opts
- kw["shuffle"] = bool(src.shuffle)
- kw["fletcher32"] = bool(src.fletcher32)
- if src.scaleoffset is not None:
- kw["scaleoffset"] = src.scaleoffset
- if src.fillvalue is not None:
- kw["fillvalue"] = src.fillvalue
- return kw
-
-
-def _read_name_file(path: Path) -> Set[str]:
- """
- Read one name per line from a file. Blank lines ignored.
- """
- names: Set[str] = set()
- with open(path, "r", encoding="utf-8") as f:
- for line in f:
- line = line.strip()
- if line:
- names.add(line)
- return names
-
-
-def indices_from_name_set(
- names_ds: h5py.Dataset,
- keep: Set[str],
- *,
- chunk_size: int = 200_000,
-) -> Tuple[np.ndarray, Set[str]]:
- """
- Returns (indices_sorted, missing_names).
- Chunked scan so we don't do names_ds[...] for huge datasets.
-
- Args:
- names_ds (h5py.Dataset): Dataset containing names
- keep (Set[str]): Set of names to find
- chunk_size (int): Number of names to read per chunk
-
- Returns:
- Tuple[np.ndarray, Set[str]]: (Array of found indices, set of missing names)
- """
- if names_ds.ndim != 1:
- # common h5ad uses 1D obs_names/var_names
- flat_len = int(np.prod(names_ds.shape))
- else:
- flat_len = names_ds.shape[0]
-
- remaining = set(keep) # we'll delete as we find
- found_indices: List[int] = []
-
- for start in range(0, flat_len, chunk_size):
- end = min(start + chunk_size, flat_len)
- chunk = names_ds[start:end]
- chunk = decode_str_array(np.asarray(chunk)).astype(str)
-
- for i, name in enumerate(chunk):
- if name in remaining:
- found_indices.append(start + i)
- remaining.remove(name)
-
- if not remaining:
- break
-
- return np.asarray(found_indices, dtype=np.int64), remaining
-
-
-def subset_axis_group(
- src: h5py.Group,
- dst: h5py.Group,
- indices: Optional[np.ndarray],
-) -> None:
- """
- Subset obs/var group:
- - datasets: subset along first axis (obj[indices, ...])
- - categorical groups: copy categories, subset codes
- - unknown groups: copy as-is if indices is None; otherwise copy conservatively
-
- Args:
- src (h5py.Group): Source axis group
- dst (h5py.Group): Destination axis group
- indices (Optional[np.ndarray]): Indices to keep; if None, copy as-is
- """
- _copy_attrs(src.attrs, dst.attrs)
-
- for key in src.keys():
- obj = src[key]
-
- if isinstance(obj, h5py.Dataset):
- if indices is None:
- src.copy(key, dst, name=key)
- else:
- data = obj[indices, ...]
- ds = dst.create_dataset(key, data=data)
- _copy_attrs(obj.attrs, ds.attrs)
-
- elif isinstance(obj, h5py.Group):
- enc = obj.attrs.get("encoding-type", b"")
- if isinstance(enc, bytes):
- enc = enc.decode("utf-8")
-
- if enc == "categorical":
- gdst = dst.create_group(key)
- _copy_attrs(obj.attrs, gdst.attrs)
- obj.copy("categories", gdst, name="categories")
-
- codes = obj["codes"]
- if indices is None:
- obj.copy("codes", gdst, name="codes")
- else:
- codes_sub = codes[indices, ...]
- ds = gdst.create_dataset("codes", data=codes_sub)
- _copy_attrs(codes.attrs, ds.attrs)
- else:
- # Unknown group type - copy as-is
- src.copy(key, dst, name=key)
-
-
-def subset_dense_matrix(
- src: h5py.Dataset,
- dst_parent: h5py.Group,
- name: str,
- obs_idx: Optional[np.ndarray],
- var_idx: Optional[np.ndarray],
- *,
- chunk_rows: int = 1024,
-) -> None:
- """
- Chunked write for dense 2D datasets.
- Args:
- src (h5py.Dataset): Source dense matrix dataset
- dst_parent (h5py.Group): Destination parent group
- name (str): Name for the destination dataset
- obs_idx (Optional[np.ndarray]): Indices of observations to keep
- var_idx (Optional[np.ndarray]): Indices of variables to keep
- chunk_rows (int): Number of rows to read per chunk
- """
- if src.ndim != 2:
- # fallback: copy whole dataset
- src.parent.copy(src.name.split("/")[-1], dst_parent, name=name)
- return
-
- n_obs, n_var = src.shape
- out_obs = len(obs_idx) if obs_idx is not None else n_obs
- out_var = len(var_idx) if var_idx is not None else n_var
-
- kw = _ds_create_kwargs(src)
- # adjust chunks to output shape if possible
- if "chunks" in kw and kw["chunks"] is not None:
- c0, c1 = kw["chunks"]
- kw["chunks"] = (min(c0, out_obs), min(c1, out_var))
-
- dst = dst_parent.create_dataset(
- name, shape=(out_obs, out_var), dtype=src.dtype, **kw
- )
- _copy_attrs(src.attrs, dst.attrs)
-
- # Write in blocks of output rows
- for out_start in range(0, out_obs, chunk_rows):
- out_end = min(out_start + chunk_rows, out_obs)
-
- if obs_idx is None:
- block = src[out_start:out_end, :]
- else:
- rows = obs_idx[out_start:out_end]
- block = src[rows, :]
-
- if var_idx is not None:
- block = block[:, var_idx]
-
- dst[out_start:out_end, :] = block
-
-
-def subset_sparse_matrix_group(
- src: h5py.Group,
- dst_parent: h5py.Group,
- name: str,
- obs_idx: Optional[np.ndarray],
- var_idx: Optional[np.ndarray],
-) -> None:
- """
- Subset a sparse matrix stored as an h5ad group with datasets:
- - data, indices, indptr
- Supports both CSR (Compressed Sparse Row) and CSC (Compressed Sparse Column) formats.
-
- CSR: rows are compressed, efficient for row-wise operations
- CSC: columns are compressed, efficient for column-wise operations
-
- Args:
- src (h5py.Group): Source sparse matrix group
- dst_parent (h5py.Group): Destination parent group
- name (str): Name for the destination group
- obs_idx (Optional[np.ndarray]): Indices of observations to keep
- var_idx (Optional[np.ndarray]): Indices of variables to keep
- """
- data = src["data"]
- indices = src["indices"]
- indptr = src["indptr"]
-
- # Determine format
- encoding = src.attrs.get("encoding-type", b"")
- if isinstance(encoding, bytes):
- encoding = encoding.decode("utf-8")
-
- is_csr = encoding == "csr_matrix"
- is_csc = encoding == "csc_matrix"
-
- if not is_csr and not is_csc:
- raise ValueError(f"Unsupported sparse format: {encoding}")
-
- # Determine shape
- shape = src.attrs.get("shape", None)
- if shape is None:
- # fallback: infer from indptr len and max index
- major_dim = indptr.shape[0] - 1
- minor_dim = int(indices[...].max()) + 1 if indices.size else 0
- if is_csr:
- n_obs, n_var = major_dim, minor_dim
- else: # CSC
- n_obs, n_var = minor_dim, major_dim
- else:
- n_obs, n_var = shape
-
- # For CSR: major axis = obs (rows), minor axis = var (cols)
- # For CSC: major axis = var (cols), minor axis = obs (rows)
- if is_csr:
- major_idx = obs_idx if obs_idx is not None else np.arange(n_obs, dtype=np.int64)
- minor_idx = var_idx
- out_obs = major_idx.shape[0]
- out_var = minor_idx.shape[0] if minor_idx is not None else n_var
- else: # CSC
- major_idx = var_idx if var_idx is not None else np.arange(n_var, dtype=np.int64)
- minor_idx = obs_idx
- out_obs = minor_idx.shape[0] if minor_idx is not None else n_obs
- out_var = major_idx.shape[0]
-
- # Build minor axis remap if needed
- minor_map = None
- out_minor_dim = out_var if is_csr else out_obs
- total_minor_dim = n_var if is_csr else n_obs
-
- if minor_idx is not None:
- # array remap is fastest; if dimension is huge and memory matters, use dict instead
- minor_map = np.full(total_minor_dim, -1, dtype=np.int64)
- minor_map[minor_idx] = np.arange(minor_idx.shape[0], dtype=np.int64)
-
- # Pass 1: count nnz in output to preallocate
- out_counts = np.zeros(len(major_idx), dtype=np.int64)
- for i, major_pos in enumerate(major_idx):
- s = int(indptr[major_pos])
- e = int(indptr[major_pos + 1])
- if s == e:
- continue
- minor_indices = indices[s:e]
- if minor_map is None:
- out_counts[i] = e - s
- else:
- mask = minor_map[minor_indices] >= 0
- out_counts[i] = mask.sum()
-
- out_indptr = np.zeros(len(major_idx) + 1, dtype=indptr.dtype)
- np.cumsum(out_counts, out=out_indptr[1:])
- out_nnz = int(out_indptr[-1])
-
- # Preallocate output arrays
- out_data = np.empty(out_nnz, dtype=data.dtype)
- out_indices = np.empty(out_nnz, dtype=indices.dtype)
-
- # Pass 2: fill
- cursor = 0
- for i, major_pos in enumerate(major_idx):
- s = int(indptr[major_pos])
- e = int(indptr[major_pos + 1])
- if s == e:
- continue
-
- minor_indices = indices[s:e]
- vals = data[s:e]
-
- if minor_map is None:
- length = e - s
- out_indices[cursor : cursor + length] = minor_indices
- out_data[cursor : cursor + length] = vals
- cursor += length
- else:
- mask = minor_map[minor_indices] >= 0
- new_minor = minor_map[minor_indices[mask]]
- new_vals = vals[mask]
- length = len(new_minor)
- out_indices[cursor : cursor + length] = new_minor
- out_data[cursor : cursor + length] = new_vals
- cursor += length
-
- # Create dst group
- gdst = dst_parent.create_group(name)
- _copy_attrs(src.attrs, gdst.attrs)
- gdst.attrs["shape"] = (out_obs, out_var)
- # Write encoding-type as bytes to match h5ad standard
- gdst.attrs["encoding-type"] = (
- encoding.encode("utf-8") if isinstance(encoding, str) else encoding
- )
-
- # Write datasets (best-effort preserve compression/etc.)
- # Adjust chunks to not exceed output size
- data_kw = _ds_create_kwargs(data)
- if "chunks" in data_kw and data_kw["chunks"] is not None:
- data_kw["chunks"] = (min(data_kw["chunks"][0], out_nnz),)
- d_data = gdst.create_dataset("data", data=out_data, **data_kw)
- _copy_attrs(data.attrs, d_data.attrs)
-
- indices_kw = _ds_create_kwargs(indices)
- if "chunks" in indices_kw and indices_kw["chunks"] is not None:
- indices_kw["chunks"] = (min(indices_kw["chunks"][0], out_nnz),)
- d_indices = gdst.create_dataset("indices", data=out_indices, **indices_kw)
- _copy_attrs(indices.attrs, d_indices.attrs)
-
- indptr_kw = _ds_create_kwargs(indptr)
- if "chunks" in indptr_kw and indptr_kw["chunks"] is not None:
- indptr_kw["chunks"] = (min(indptr_kw["chunks"][0], len(out_indptr)),)
- d_indptr = gdst.create_dataset("indptr", data=out_indptr, **indptr_kw)
- _copy_attrs(indptr.attrs, d_indptr.attrs)
-
-
-def subset_matrix_like(
- src_obj: h5py.Dataset | h5py.Group,
- dst_parent: h5py.Group,
- name: str,
- obs_idx: Optional[np.ndarray],
- var_idx: Optional[np.ndarray],
- *,
- chunk_rows: int = 1024,
-) -> None:
- """
- Dispatch for dense dataset vs sparse (csr/csc) group.
- Args:
- src_obj (h5py.Dataset | h5py.Group): Source dataset or group
- dst_parent (h5py.Group): Destination parent group
- name (str): Name for the destination dataset/group
- obs_idx (Optional[np.ndarray]): Indices of observations to keep
- var_idx (Optional[np.ndarray]): Indices of variables to keep
- chunk_rows (int): Number of rows to read per chunk when subsetting dense matrices
- """
- if isinstance(src_obj, h5py.Dataset):
- subset_dense_matrix(
- src_obj, dst_parent, name, obs_idx, var_idx, chunk_rows=chunk_rows
- )
- return
-
- # group
- enc = src_obj.attrs.get("encoding-type", b"")
- if isinstance(enc, bytes):
- enc = enc.decode("utf-8")
-
- if enc in ("csr_matrix", "csc_matrix"):
- subset_sparse_matrix_group(src_obj, dst_parent, name, obs_idx, var_idx)
- else:
- # unknown sparse type -> copy as-is (or raise)
- src_obj.file.copy(src_obj, dst_parent, name)
-
-
-def subset_h5ad(
- file: Path,
- output: Path,
- obs_file: Optional[Path],
- var_file: Optional[Path],
- *,
- chunk_rows: int = 1024,
- console: Console,
-) -> None:
- """
- Subset an h5ad file by obs and/or var names.
- Args:
- file (Path): Input h5ad file path
- output (Path): Output h5ad file path
- obs_file (Optional[Path]): File with obs names to keep (one per line)
- var_file (Optional[Path]): File with var names to keep (one per line)
- chunk_rows (int): Number of rows to read per chunk when subsetting dense matrices
- console (Console): Rich console for output
- """
- # ---- Read keep-lists
- obs_keep: Optional[Set[str]] = None
- if obs_file is not None:
- obs_keep = _read_name_file(obs_file)
- console.print(f"[cyan]Found {len(obs_keep)} obs names to keep[/]")
-
- var_keep: Optional[Set[str]] = None
- if var_file is not None:
- var_keep = _read_name_file(var_file)
- console.print(f"[cyan]Found {len(var_keep)} var names to keep[/]")
-
- if obs_keep is None and var_keep is None:
- console.print(
- "[bold red]Error:[/] At least one of [cyan]--obs[/] or [cyan]--var[/] must be provided.",
- )
- raise typer.Exit(code=1)
-
- # ---- Open files
- with console.status("[magenta]Opening files...[/]"):
- src = h5py.File(file, "r")
- dst = h5py.File(output, "w")
-
- try:
- # ---- Compute indices
- obs_idx = None
- if obs_keep is not None:
- console.print("[cyan]Matching obs names...[/]")
- obs_names_ds = src["obs"].get("obs_names") or src["obs"].get(
- src["obs"].attrs.get("_index", "obs_names")
- )
- if obs_names_ds is None:
- console.print("[bold red]Error:[/] Could not find obs names")
- raise RuntimeError("Could not find obs names")
-
- obs_idx, missing_obs = indices_from_name_set(obs_names_ds, obs_keep)
- if missing_obs:
- console.print(
- f"[yellow]Warning: {len(missing_obs)} obs names not found in file[/]"
- )
- console.print(
- f"[green]Selected {len(obs_idx)} obs (of {obs_names_ds.shape[0]})[/]"
- )
-
- var_idx = None
- if var_keep is not None:
- console.print("[cyan]Matching var names...[/]")
- var_names_ds = src["var"].get("var_names") or src["var"].get(
- src["var"].attrs.get("_index", "var_names")
- )
- if var_names_ds is None:
- console.print("[bold red]Error:[/] Could not find var names")
- raise RuntimeError("Could not find var names")
-
- var_idx, missing_var = indices_from_name_set(var_names_ds, var_keep)
- if missing_var:
- console.print(
- f"[yellow]Warning: {len(missing_var)} var names not found in file[/]"
- )
- console.print(
- f"[green]Selected {len(var_idx)} var (of {var_names_ds.shape[0]})[/]"
- )
-
- # ---- Build task list
- tasks: List[str] = []
- if "obs" in src:
- tasks.append("obs")
- if "var" in src:
- tasks.append("var")
- if "X" in src:
- tasks.append("X")
- if "layers" in src:
- tasks.extend([f"layer:{k}" for k in src["layers"].keys()])
- if "obsm" in src:
- tasks.extend([f"obsm:{k}" for k in src["obsm"].keys()])
- if "varm" in src:
- tasks.extend([f"varm:{k}" for k in src["varm"].keys()])
- if "obsp" in src:
- tasks.extend([f"obsp:{k}" for k in src["obsp"].keys()])
- if "varp" in src:
- tasks.extend([f"varp:{k}" for k in src["varp"].keys()])
- if "uns" in src:
- tasks.append("uns")
-
- # ---- Progress bar for all operations
- with Progress(
- SpinnerColumn(),
- TextColumn("[progress.description]{task.description}"),
- BarColumn(),
- TaskProgressColumn(),
- TimeElapsedColumn(),
- console=console,
- ) as progress:
- task_id = progress.add_task("[cyan]Subsetting...", total=len(tasks))
- processed_top: Set[str] = set()
-
- # obs
- if "obs" in src:
- progress.update(task_id, description="[cyan]Subsetting obs...[/]")
- obs_dst = dst.create_group("obs")
- subset_axis_group(src["obs"], obs_dst, obs_idx)
- processed_top.add("obs")
- progress.advance(task_id)
-
- # var
- if "var" in src:
- progress.update(task_id, description="[cyan]Subsetting var...[/]")
- var_dst = dst.create_group("var")
- subset_axis_group(src["var"], var_dst, var_idx)
- processed_top.add("var")
- progress.advance(task_id)
-
- # X
- if "X" in src:
- progress.update(task_id, description="[cyan]Subsetting X...[/]")
- subset_matrix_like(
- src["X"], dst, "X", obs_idx, var_idx, chunk_rows=chunk_rows
- )
- processed_top.add("X")
- progress.advance(task_id)
-
- # layers
- if "layers" in src:
- layers_dst = dst.create_group("layers")
- processed_top.add("layers")
- for lname in src["layers"].keys():
- progress.update(
- task_id, description=f"[cyan]Subsetting layer: {lname}...[/]"
- )
- subset_matrix_like(
- src["layers"][lname],
- layers_dst,
- lname,
- obs_idx,
- var_idx,
- chunk_rows=chunk_rows,
- )
- progress.advance(task_id)
-
- # obsm
- if "obsm" in src:
- obsm_dst = dst.create_group("obsm")
- processed_top.add("obsm")
- for k in src["obsm"].keys():
- if obs_idx is None:
- progress.update(
- task_id, description=f"[cyan]Copying obsm: {k}...[/]"
- )
- src["obsm"].copy(k, obsm_dst, name=k)
- else:
- progress.update(
- task_id, description=f"[cyan]Subsetting obsm: {k}...[/]"
- )
- obj = src["obsm"][k]
- if isinstance(obj, h5py.Dataset):
- data = obj[obs_idx, ...]
- obsm_dst.create_dataset(k, data=data)
- for ak, av in obj.attrs.items():
- obsm_dst[k].attrs[ak] = av
- else:
- subset_matrix_like(
- obj, obsm_dst, k, obs_idx, None, chunk_rows=chunk_rows
- )
- progress.advance(task_id)
-
- # varm
- if "varm" in src:
- varm_dst = dst.create_group("varm")
- processed_top.add("varm")
- for k in src["varm"].keys():
- if var_idx is None:
- progress.update(
- task_id, description=f"[cyan]Copying varm: {k}...[/]"
- )
- src["varm"].copy(k, varm_dst, name=k)
- else:
- progress.update(
- task_id, description=f"[cyan]Subsetting varm: {k}...[/]"
- )
- obj = src["varm"][k]
- if isinstance(obj, h5py.Dataset):
- data = obj[var_idx, ...]
- varm_dst.create_dataset(k, data=data)
- for ak, av in obj.attrs.items():
- varm_dst[k].attrs[ak] = av
- else:
- subset_matrix_like(
- obj, varm_dst, k, var_idx, None, chunk_rows=chunk_rows
- )
- progress.advance(task_id)
-
- # obsp
- if "obsp" in src:
- obsp_dst = dst.create_group("obsp")
- processed_top.add("obsp")
- for k in src["obsp"].keys():
- if obs_idx is None:
- progress.update(
- task_id, description=f"[cyan]Copying obsp: {k}...[/]"
- )
- src["obsp"].copy(k, obsp_dst, name=k)
- else:
- progress.update(
- task_id, description=f"[cyan]Subsetting obsp: {k}...[/]"
- )
- subset_matrix_like(
- src["obsp"][k],
- obsp_dst,
- k,
- obs_idx,
- obs_idx,
- chunk_rows=chunk_rows,
- )
- progress.advance(task_id)
-
- # varp
- if "varp" in src:
- varp_dst = dst.create_group("varp")
- processed_top.add("varp")
- for k in src["varp"].keys():
- if var_idx is None:
- progress.update(
- task_id, description=f"[cyan]Copying varp: {k}...[/]"
- )
- src["varp"].copy(k, varp_dst, name=k)
- else:
- progress.update(
- task_id, description=f"[cyan]Subsetting varp: {k}...[/]"
- )
- subset_matrix_like(
- src["varp"][k],
- varp_dst,
- k,
- var_idx,
- var_idx,
- chunk_rows=chunk_rows,
- )
- progress.advance(task_id)
-
- # uns
- if "uns" in src:
- progress.update(task_id, description="[cyan]Copying uns...[/]")
- src.copy("uns", dst)
- processed_top.add("uns")
- progress.advance(task_id)
-
- # copy any remaining top-level keys
- for key in src.keys():
- if key not in processed_top:
- src.copy(key, dst)
-
- # top-level attrs
- for ak, av in src.attrs.items():
- dst.attrs[ak] = av
-
- console.print(f"[bold green]✓ Successfully created {output}[/]")
-
- finally:
- dst.close()
- src.close()
+__all__ = [
+ "_read_name_file",
+ "indices_from_name_set",
+ "subset_axis_group",
+ "subset_dense_matrix",
+ "subset_h5ad",
+ "subset_sparse_matrix_group",
+]
diff --git a/src/h5ad/commands/table.py b/src/h5ad/commands/table.py
deleted file mode 100644
index 16b7686..0000000
--- a/src/h5ad/commands/table.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import sys
-import csv
-from pathlib import Path
-from typing import List, Optional, Dict
-
-import h5py
-import numpy as np
-from rich.console import Console
-from h5ad.info import get_axis_group
-from h5ad.read import col_chunk_as_strings
-
-
-def export_table(
- file: Path,
- axis: str,
- columns: Optional[List[str]],
- out: Optional[Path],
- chunk_rows: int,
- head: Optional[int],
- console: Console,
-) -> None:
- """
- Export a table of the specified axis to CSV format.
- Args:
- file (Path): Path to the .h5ad file
- axis (str): Axis to read from ('obs' or 'var')
- columns (Optional[List[str]]): List of column names to include in the output table
- out (Optional[Path]): Output file path (defaults to stdout)
- chunk_rows (int): Number of rows to read per chunk
- head (Optional[int]): Output only the first n rows
- """
- with h5py.File(file, "r") as f:
- group, n_rows, index_name = get_axis_group(f, axis)
-
- # Determine columns to read
- if columns:
- col_names = list(columns)
- else:
- col_names = [k for k in group.keys() if k != "_index" and k != index_name]
- # Add index name if not already present
- if index_name and index_name not in col_names:
- col_names.insert(0, index_name)
-
- if isinstance(index_name, bytes):
- index_name = index_name.decode("utf-8")
-
- if index_name not in col_names:
- col_names.insert(0, index_name)
- else:
- col_names = [index_name] + [c for c in col_names if c != index_name]
-
- # Limit rows if head option is specified
- if head is not None and head > 0:
- n_rows = min(n_rows, head)
-
- # Open writer
- if out is None or str(out) == "-":
- out_fh = sys.stdout
- else:
- out_fh = open(out, "w", newline="", encoding="utf-8")
- writer = csv.writer(out_fh)
-
- # Write data in chunks
- try:
- writer.writerow(col_names)
- cat_cache: Dict[int, np.ndarray] = {}
- with console.status(
- f"[magenta]Exporting {axis} table...[/] to {'stdout' if out_fh is sys.stdout else out}"
- ) as status:
- for start in range(0, n_rows, chunk_rows):
- end = min(start + chunk_rows, n_rows)
- status.update(
- f"[magenta]Exporting rows {start}-{end} of {n_rows}...[/]"
- )
- cols_data: List[List[str]] = []
- # Read each column for the current chunk
- for col in col_names:
- cols_data.append(
- col_chunk_as_strings(group, col, start, end, cat_cache)
- )
- # Write rows
- for row_idx in range(end - start):
- row = [
- cols_data[col_idx][row_idx]
- for col_idx in range(len(col_names))
- ]
- writer.writerow(row)
- finally:
- if out_fh is not sys.stdout:
- out_fh.close()
diff --git a/src/h5ad/core/__init__.py b/src/h5ad/core/__init__.py
new file mode 100644
index 0000000..9224273
--- /dev/null
+++ b/src/h5ad/core/__init__.py
@@ -0,0 +1 @@
+"""Core logic shared by CLI commands and format handlers."""
diff --git a/src/h5ad/core/info.py b/src/h5ad/core/info.py
new file mode 100644
index 0000000..8db8a14
--- /dev/null
+++ b/src/h5ad/core/info.py
@@ -0,0 +1,221 @@
+from __future__ import annotations
+
+from typing import Tuple, Dict, Any
+
+import numpy as np
+
+from h5ad.storage import is_dataset, is_group, is_hdf5_dataset
+
+
+def _decode_attr(value: Any) -> Any:
+ if isinstance(value, bytes):
+ return value.decode("utf-8")
+ return value
+
+
+def get_entry_type(entry: Any) -> Dict[str, Any]:
+ """
+ Determine the type/format of an object for export guidance.
+
+ Supports both:
+ - v0.2.0 (modern): Objects with encoding-type/encoding-version attributes
+ - v0.1.0 (legacy): Objects without encoding attributes, inferred from structure
+ """
+ result: Dict[str, Any] = {
+ "type": "unknown",
+ "export_as": None,
+ "encoding": None,
+ "shape": None,
+ "dtype": None,
+ "details": "",
+ "version": None,
+ }
+
+ enc = _decode_attr(entry.attrs.get("encoding-type", b""))
+ result["encoding"] = enc if enc else None
+
+ enc_ver = _decode_attr(entry.attrs.get("encoding-version", b""))
+ result["version"] = enc_ver if enc_ver else None
+
+ if is_dataset(entry):
+ result["shape"] = entry.shape
+ result["dtype"] = str(entry.dtype)
+
+ if "categories" in entry.attrs:
+ result["type"] = "categorical"
+ result["export_as"] = "csv"
+ result["version"] = result["version"] or "0.1.0"
+ n_cats = "?"
+ if is_hdf5_dataset(entry):
+ try:
+ cats_ref = entry.attrs["categories"]
+ cats_ds = entry.file[cats_ref]
+ n_cats = cats_ds.shape[0]
+ except Exception:
+ n_cats = "?"
+ result["details"] = (
+ f"Legacy categorical [{entry.shape[0]} values, {n_cats} categories]"
+ )
+ return result
+
+ if entry.shape == ():
+ result["type"] = "scalar"
+ result["export_as"] = "json"
+ result["details"] = f"Scalar value ({entry.dtype})"
+ return result
+
+ if entry.ndim == 1:
+ result["type"] = "array"
+ result["export_as"] = "npy"
+ result["details"] = f"1D array [{entry.shape[0]}] ({entry.dtype})"
+ elif entry.ndim == 2:
+ result["type"] = "dense-matrix"
+ result["export_as"] = "npy"
+ result["details"] = (
+ f"Dense matrix {entry.shape[0]}×{entry.shape[1]} ({entry.dtype})"
+ )
+ elif entry.ndim == 3:
+ result["type"] = "array"
+ result["export_as"] = "npy"
+ result["details"] = f"3D array {entry.shape} ({entry.dtype})"
+ else:
+ result["type"] = "array"
+ result["export_as"] = "npy"
+ result["details"] = f"ND array {entry.shape} ({entry.dtype})"
+ return result
+
+ if is_group(entry):
+ if enc in ("csr_matrix", "csc_matrix"):
+ shape = entry.attrs.get("shape", None)
+ shape_str = f"{shape[0]}×{shape[1]}" if shape is not None else "?"
+ result["type"] = "sparse-matrix"
+ result["export_as"] = "mtx"
+ result["details"] = (
+ f"Sparse {enc.replace('_matrix', '').upper()} matrix {shape_str}"
+ )
+ return result
+
+ if enc == "categorical":
+ codes = entry.get("codes")
+ cats = entry.get("categories")
+ n_codes = codes.shape[0] if codes is not None else "?"
+ n_cats = cats.shape[0] if cats is not None else "?"
+ result["type"] = "categorical"
+ result["export_as"] = "csv"
+ result["details"] = f"Categorical [{n_codes} values, {n_cats} categories]"
+ return result
+
+ if (
+ enc == "dataframe"
+ or "_index" in entry.attrs
+ or "obs_names" in entry
+ or "var_names" in entry
+ ):
+ if enc == "dataframe":
+ df_version = result["version"] or "0.2.0"
+ else:
+ df_version = "0.1.0"
+ result["version"] = df_version
+
+ has_legacy_cats = "__categories" in entry
+ n_cols = len(
+ [k for k in entry.keys() if k not in ("_index", "__categories")]
+ )
+
+ result["type"] = "dataframe"
+ result["export_as"] = "csv"
+ if has_legacy_cats:
+ result["details"] = f"DataFrame with {n_cols} columns (legacy v0.1.0)"
+ else:
+ result["details"] = f"DataFrame with {n_cols} columns"
+ return result
+
+ if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"):
+ result["type"] = "array"
+ result["export_as"] = "npy"
+ result["details"] = f"Encoded array ({enc})"
+ return result
+
+ if enc == "string-array":
+ result["type"] = "array"
+ result["export_as"] = "npy"
+ result["details"] = "Encoded string array"
+ return result
+
+ if enc == "awkward-array":
+ length = entry.attrs.get("length", "?")
+ result["type"] = "awkward-array"
+ result["export_as"] = "json"
+ result["details"] = f"Awkward array (length={length})"
+ return result
+
+ n_keys = len(list(entry.keys()))
+ result["type"] = "dict"
+ result["export_as"] = "json"
+ result["details"] = f"Group with {n_keys} keys"
+ return result
+
+ return result
+
+
+def format_type_info(info: Dict[str, Any]) -> str:
+ type_colors = {
+ "dataframe": "green",
+ "sparse-matrix": "magenta",
+ "dense-matrix": "blue",
+ "array": "blue",
+ "dict": "yellow",
+ "categorical": "green",
+ "scalar": "white",
+ "unknown": "red",
+ }
+
+ color = type_colors.get(info["type"], "white")
+ return f"[{color}]<{info['type']}>[/]"
+
+
+def axis_len(file: Any, axis: str) -> int:
+ if axis not in file:
+ raise KeyError(f"'{axis}' not found in the file.")
+
+ group = file[axis]
+ if not is_group(group):
+ raise TypeError(f"'{axis}' is not a group.")
+
+ index_name = group.attrs.get("_index", None)
+ if index_name is None:
+ if axis == "obs":
+ index_name = "obs_names"
+ elif axis == "var":
+ index_name = "var_names"
+ else:
+ raise ValueError(f"Invalid axis '{axis}'. Must be 'obs' or 'var'.")
+
+ index_name = _decode_attr(index_name)
+
+ if index_name not in group:
+ raise KeyError(f"Index dataset '{index_name}' not found in '{axis}' group.")
+
+ dataset = group[index_name]
+ if not is_dataset(dataset):
+ raise TypeError(f"Index '{index_name}' in '{axis}' is not a dataset.")
+ if dataset.shape:
+ return int(dataset.shape[0])
+ raise ValueError(
+ f"Cannot determine length of '{axis}': index dataset has no shape."
+ )
+
+
+def get_axis_group(file: Any, axis: str) -> Tuple[Any, int, str]:
+ if axis not in ("obs", "var"):
+ raise ValueError("axis must be 'obs' or 'var'.")
+
+ n = axis_len(file, axis)
+ group = file[axis]
+
+ index_name = group.attrs.get("_index", None)
+ if index_name is None:
+ index_name = "obs_names" if axis == "obs" else "var_names"
+ index_name = _decode_attr(index_name)
+
+ return group, n, index_name
diff --git a/src/h5ad/core/read.py b/src/h5ad/core/read.py
new file mode 100644
index 0000000..b81ee1f
--- /dev/null
+++ b/src/h5ad/core/read.py
@@ -0,0 +1,112 @@
+from __future__ import annotations
+
+from typing import List, Dict, Any
+
+import h5py
+import numpy as np
+
+from h5ad.storage import is_group, is_dataset, is_hdf5_dataset
+
+
+def decode_str_array(array: np.ndarray) -> np.ndarray:
+ if np.issubdtype(array.dtype, np.bytes_):
+ return array.astype("U")
+ if array.dtype.kind == "O":
+ return array.astype(str)
+ return array.astype(str)
+
+
+def read_categorical_column(
+ col: Any,
+ start: int,
+ end: int,
+ cache: Dict[int, np.ndarray],
+ parent_group: Any | None = None,
+) -> List[str]:
+ key = id(col)
+
+ if is_group(col):
+ if key not in cache:
+ cats = col["categories"][...]
+ cats = decode_str_array(cats)
+ cache[key] = np.asarray(cats, dtype=str)
+ cats = cache[key]
+
+ codes_ds = col["codes"]
+ codes = codes_ds[start:end]
+ codes = np.asarray(codes, dtype=np.int64)
+ return [cats[c] if 0 <= c < len(cats) else "" for c in codes]
+
+ if is_dataset(col):
+ if key not in cache:
+ cats_ref = col.attrs.get("categories", None)
+ if cats_ref is not None and is_hdf5_dataset(col):
+ cats_ds = col.file[cats_ref]
+ cats = cats_ds[...]
+ elif parent_group is not None and "__categories" in parent_group:
+ col_name = col.name.split("/")[-1]
+ cats_grp = parent_group["__categories"]
+ if col_name in cats_grp:
+ cats = cats_grp[col_name][...]
+ else:
+ raise KeyError(
+ f"Cannot find categories for legacy column {col.name}"
+ )
+ else:
+ raise KeyError(
+ f"Cannot find categories for legacy column {col.name}"
+ )
+ cats = decode_str_array(cats)
+ cache[key] = np.asarray(cats, dtype=str)
+ cats = cache[key]
+
+ codes = col[start:end]
+ codes = np.asarray(codes, dtype=np.int64)
+ return [cats[c] if 0 <= c < len(cats) else "" for c in codes]
+
+ raise TypeError(f"Unsupported categorical column type: {type(col)}")
+
+
+def col_chunk_as_strings(
+ group: Any,
+ col_name: str,
+ start: int,
+ end: int,
+ cat_cache: Dict[int, np.ndarray],
+) -> List[str]:
+ if col_name not in group:
+ raise RuntimeError(f"Column {col_name!r} not found in group {group.name}")
+
+ col = group[col_name]
+
+ if is_dataset(col):
+ if "categories" in col.attrs:
+ return read_categorical_column(col, start, end, cat_cache, group)
+
+ chunk = col[start:end]
+ if chunk.ndim != 1:
+ chunk = chunk.reshape(-1)
+ chunk = decode_str_array(np.asarray(chunk))
+ return chunk.tolist()
+
+ if is_group(col):
+ enc = col.attrs.get("encoding-type", b"")
+ if isinstance(enc, bytes):
+ enc = enc.decode("utf-8")
+
+ if enc == "categorical":
+ return read_categorical_column(col, start, end, cat_cache)
+
+ if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"):
+ values = col["values"][start:end]
+ mask = col["mask"][start:end]
+ values = decode_str_array(np.asarray(values))
+ return ["" if m else str(v) for v, m in zip(values, mask)]
+
+ raise ValueError(
+ f"Unsupported group encoding {enc!r} for column {col_name!r}"
+ )
+
+ raise TypeError(
+ f"Unsupported column type for {col_name!r} in group {group.name}"
+ )
diff --git a/src/h5ad/core/subset.py b/src/h5ad/core/subset.py
new file mode 100644
index 0000000..d9e7829
--- /dev/null
+++ b/src/h5ad/core/subset.py
@@ -0,0 +1,529 @@
+"""Subset operations for .h5ad and .zarr stores."""
+
+from __future__ import annotations
+
+from pathlib import Path
+import shutil
+from typing import Optional, Set, Tuple, List, Dict, Any
+
+import numpy as np
+from rich.console import Console
+from rich.progress import (
+ Progress,
+ SpinnerColumn,
+ TextColumn,
+ BarColumn,
+ TaskProgressColumn,
+ TimeElapsedColumn,
+)
+
+from h5ad.core.read import decode_str_array
+from h5ad.storage import (
+ create_dataset,
+ copy_attrs,
+ copy_tree,
+ dataset_create_kwargs,
+ detect_backend,
+ is_dataset,
+ is_group,
+ is_zarr_group,
+ is_zarr_array,
+ open_store,
+)
+
+
+def _target_backend(dst_group: Any) -> str:
+ return "zarr" if is_zarr_group(dst_group) else "hdf5"
+
+
+def _ensure_group(parent: Any, name: str) -> Any:
+ return parent[name] if name in parent else parent.create_group(name)
+
+
+def _group_get(parent: Any, key: str) -> Any | None:
+ return parent[key] if key in parent else None
+
+
+def _decode_attr(value: Any) -> Any:
+ if isinstance(value, bytes):
+ return value.decode("utf-8")
+ return value
+
+
+def _read_name_file(path: Path) -> Set[str]:
+ names: Set[str] = set()
+ with open(path, "r", encoding="utf-8") as f:
+ for line in f:
+ line = line.strip()
+ if line:
+ names.add(line)
+ return names
+
+
+def indices_from_name_set(
+ names_ds: Any,
+ keep: Set[str],
+ *,
+ chunk_size: int = 200_000,
+) -> Tuple[np.ndarray, Set[str]]:
+ if names_ds.ndim != 1:
+ flat_len = int(np.prod(names_ds.shape))
+ else:
+ flat_len = names_ds.shape[0]
+
+ remaining = set(keep)
+ found_indices: List[int] = []
+
+ for start in range(0, flat_len, chunk_size):
+ end = min(start + chunk_size, flat_len)
+ chunk = names_ds[start:end]
+ chunk = decode_str_array(np.asarray(chunk)).astype(str)
+
+ for i, name in enumerate(chunk):
+ if name in remaining:
+ found_indices.append(start + i)
+ remaining.remove(name)
+
+ if not remaining:
+ break
+
+ return np.asarray(found_indices, dtype=np.int64), remaining
+
+
+def subset_axis_group(
+ src: Any,
+ dst: Any,
+ indices: Optional[np.ndarray],
+) -> None:
+ copy_attrs(src.attrs, dst.attrs, target_backend=_target_backend(dst))
+ target_backend = _target_backend(dst)
+
+ for key in src.keys():
+ obj = src[key]
+
+ if is_dataset(obj):
+ if indices is None:
+ copy_tree(obj, dst, key)
+ else:
+ if is_zarr_array(obj):
+ if obj.ndim == 1:
+ data = obj.oindex[indices]
+ else:
+ selection = (indices,) + (slice(None),) * (obj.ndim - 1)
+ data = obj.oindex[selection]
+ else:
+ data = obj[indices, ...]
+ ds = create_dataset(
+ dst,
+ key,
+ data=data,
+ **dataset_create_kwargs(obj, target_backend=target_backend),
+ )
+ copy_attrs(obj.attrs, ds.attrs, target_backend=target_backend)
+ elif is_group(obj):
+ enc = obj.attrs.get("encoding-type", b"")
+ if isinstance(enc, bytes):
+ enc = enc.decode("utf-8")
+
+ if enc == "categorical":
+ gdst = dst.create_group(key)
+ copy_attrs(obj.attrs, gdst.attrs, target_backend=target_backend)
+ copy_tree(obj["categories"], gdst, "categories")
+
+ codes = obj["codes"]
+ if indices is None:
+ copy_tree(codes, gdst, "codes")
+ else:
+ codes_sub = codes[indices, ...]
+ ds = create_dataset(
+ gdst,
+ "codes",
+ data=codes_sub,
+ **dataset_create_kwargs(codes, target_backend=target_backend),
+ )
+ copy_attrs(codes.attrs, ds.attrs, target_backend=target_backend)
+ else:
+ copy_tree(obj, dst, key)
+
+
+def subset_dense_matrix(
+ src: Any,
+ dst_parent: Any,
+ name: str,
+ obs_idx: Optional[np.ndarray],
+ var_idx: Optional[np.ndarray],
+ *,
+ chunk_rows: int = 1024,
+) -> None:
+ if src.ndim != 2:
+ copy_tree(src, dst_parent, name)
+ return
+
+ n_obs, n_var = src.shape
+ out_obs = len(obs_idx) if obs_idx is not None else n_obs
+ out_var = len(var_idx) if var_idx is not None else n_var
+
+ target_backend = _target_backend(dst_parent)
+ kw = dataset_create_kwargs(src, target_backend=target_backend)
+ chunks = kw.get("chunks")
+ if isinstance(chunks, (tuple, list)) and len(chunks) >= 2:
+ kw["chunks"] = (min(int(chunks[0]), out_obs), min(int(chunks[1]), out_var))
+
+ dst = create_dataset(
+ dst_parent,
+ name,
+ shape=(out_obs, out_var),
+ dtype=src.dtype,
+ **kw,
+ )
+ copy_attrs(src.attrs, dst.attrs, target_backend=_target_backend(dst_parent))
+
+ for out_start in range(0, out_obs, chunk_rows):
+ out_end = min(out_start + chunk_rows, out_obs)
+
+ if obs_idx is None:
+ block = src[out_start:out_end, :]
+ else:
+ rows = obs_idx[out_start:out_end]
+ block = src[rows, :]
+
+ if var_idx is not None:
+ block = block[:, var_idx]
+
+ dst[out_start:out_end, :] = block
+
+
+def subset_sparse_matrix_group(
+ src: Any,
+ dst_parent: Any,
+ name: str,
+ obs_idx: Optional[np.ndarray],
+ var_idx: Optional[np.ndarray],
+) -> None:
+ enc = src.attrs.get("encoding-type", b"")
+ if isinstance(enc, bytes):
+ enc = enc.decode("utf-8")
+
+ if enc not in ("csr_matrix", "csc_matrix"):
+ raise ValueError(f"Unsupported sparse encoding type: {enc}")
+
+ data = np.asarray(src["data"][...])
+ indices = np.asarray(src["indices"][...], dtype=np.int64)
+ indptr = np.asarray(src["indptr"][...], dtype=np.int64)
+ shape = src.attrs.get("shape", None)
+ if shape is None:
+ raise ValueError("Sparse matrix group missing 'shape' attribute.")
+ n_rows, n_cols = int(shape[0]), int(shape[1])
+
+ if enc == "csr_matrix":
+ row_idx = obs_idx if obs_idx is not None else np.arange(n_rows, dtype=np.int64)
+ col_idx = var_idx if var_idx is not None else np.arange(n_cols, dtype=np.int64)
+
+ new_data = []
+ new_indices = []
+ new_indptr = [0]
+
+ for r in row_idx:
+ start = indptr[r]
+ end = indptr[r + 1]
+ row_cols = indices[start:end]
+ row_data = data[start:end]
+
+ if var_idx is not None:
+ col_mask = np.isin(row_cols, col_idx)
+ row_cols = row_cols[col_mask]
+ row_data = row_data[col_mask]
+
+ if var_idx is not None:
+ col_map = {c: i for i, c in enumerate(col_idx)}
+ row_cols = np.array([col_map[c] for c in row_cols], dtype=np.int64)
+
+ new_indices.extend(row_cols.tolist())
+ new_data.extend(row_data.tolist())
+ new_indptr.append(len(new_indices))
+
+ new_shape = (len(row_idx), len(col_idx))
+ else:
+ row_idx = obs_idx if obs_idx is not None else np.arange(n_rows, dtype=np.int64)
+ col_idx = var_idx if var_idx is not None else np.arange(n_cols, dtype=np.int64)
+
+ new_data = []
+ new_indices = []
+ new_indptr = [0]
+
+ for c in col_idx:
+ start = indptr[c]
+ end = indptr[c + 1]
+ col_rows = indices[start:end]
+ col_data = data[start:end]
+
+ if obs_idx is not None:
+ row_mask = np.isin(col_rows, row_idx)
+ col_rows = col_rows[row_mask]
+ col_data = col_data[row_mask]
+
+ if obs_idx is not None:
+ row_map = {r: i for i, r in enumerate(row_idx)}
+ col_rows = np.array([row_map[r] for r in col_rows], dtype=np.int64)
+
+ new_indices.extend(col_rows.tolist())
+ new_data.extend(col_data.tolist())
+ new_indptr.append(len(new_indices))
+
+ new_shape = (len(row_idx), len(col_idx))
+
+ group = dst_parent.create_group(name)
+ group.attrs["encoding-type"] = enc
+ group.attrs["encoding-version"] = "0.1.0"
+ if is_zarr_group(group):
+ group.attrs["shape"] = list(new_shape)
+ else:
+ group.attrs["shape"] = np.array(new_shape, dtype=np.int64)
+
+ create_dataset(group, "data", data=np.array(new_data, dtype=data.dtype))
+ create_dataset(group, "indices", data=np.array(new_indices, dtype=indices.dtype))
+ create_dataset(group, "indptr", data=np.array(new_indptr, dtype=indptr.dtype))
+
+
+def subset_matrix_entry(
+ obj: Any,
+ dst_parent: Any,
+ name: str,
+ obs_idx: Optional[np.ndarray],
+ var_idx: Optional[np.ndarray],
+ *,
+ chunk_rows: int,
+ entry_label: str,
+) -> None:
+ if is_dataset(obj):
+ subset_dense_matrix(
+ obj, dst_parent, name, obs_idx, var_idx, chunk_rows=chunk_rows
+ )
+ return
+
+ if is_group(obj):
+ enc = obj.attrs.get("encoding-type", b"")
+ if isinstance(enc, bytes):
+ enc = enc.decode("utf-8")
+ if enc in ("csr_matrix", "csc_matrix"):
+ subset_sparse_matrix_group(obj, dst_parent, name, obs_idx, var_idx)
+ return
+ raise ValueError(f"Unsupported {entry_label} encoding type: {enc}")
+
+ raise ValueError(f"Unsupported {entry_label} object type")
+
+
+def subset_h5ad(
+ file: Path,
+ output: Optional[Path],
+ obs_file: Optional[Path],
+ var_file: Optional[Path],
+ *,
+ chunk_rows: int = 1024,
+ console: Console,
+ inplace: bool = False,
+) -> None:
+ obs_keep: Optional[Set[str]] = None
+ if obs_file is not None:
+ obs_keep = _read_name_file(obs_file)
+ console.print(f"[cyan]Found {len(obs_keep)} obs names to keep[/]")
+
+ var_keep: Optional[Set[str]] = None
+ if var_file is not None:
+ var_keep = _read_name_file(var_file)
+ console.print(f"[cyan]Found {len(var_keep)} var names to keep[/]")
+
+ if obs_keep is None and var_keep is None:
+ raise ValueError("At least one of --obs or --var must be provided.")
+
+ if not inplace and output is None:
+ raise ValueError("Output file is required unless --inplace is specified.")
+
+ if inplace:
+ src_backend = detect_backend(file)
+ if src_backend == "zarr":
+ base_name = file.stem if file.suffix else file.name
+ tmp_path = file.with_name(f"{base_name}.subset-tmp.zarr")
+ else:
+ tmp_path = file.with_name(f"{file.name}.subset-tmp")
+ if tmp_path.exists():
+ raise FileExistsError(f"Temporary path already exists: {tmp_path}")
+ dst_path = tmp_path
+ else:
+ dst_path = output
+
+ with console.status("[magenta]Opening files...[/]"):
+ with open_store(file, "r") as src_store, open_store(dst_path, "w") as dst_store:
+ src = src_store.root
+ dst = dst_store.root
+
+ obs_idx = None
+ if obs_keep is not None:
+ console.print("[cyan]Matching obs names...[/]")
+ obs_group = src["obs"]
+ obs_index = _decode_attr(obs_group.attrs.get("_index", "obs_names"))
+            obs_names_ds = _group_get(obs_group, obs_index) or _group_get(
+                obs_group, "obs_names"
+            )
+ if obs_names_ds is None:
+ raise KeyError("Could not find obs names")
+
+ obs_idx, missing_obs = indices_from_name_set(obs_names_ds, obs_keep)
+ if missing_obs:
+ console.print(
+ f"[yellow]Warning: {len(missing_obs)} obs names not found in file[/]"
+ )
+ console.print(
+ f"[green]Selected {len(obs_idx)} obs (of {obs_names_ds.shape[0]})[/]"
+ )
+
+ var_idx = None
+ if var_keep is not None:
+ console.print("[cyan]Matching var names...[/]")
+ var_group = src["var"]
+ var_index = _decode_attr(var_group.attrs.get("_index", "var_names"))
+            var_names_ds = _group_get(var_group, var_index) or _group_get(
+                var_group, "var_names"
+            )
+ if var_names_ds is None:
+ raise KeyError("Could not find var names")
+
+ var_idx, missing_var = indices_from_name_set(var_names_ds, var_keep)
+ if missing_var:
+ console.print(
+ f"[yellow]Warning: {len(missing_var)} var names not found in file[/]"
+ )
+ console.print(
+ f"[green]Selected {len(var_idx)} var (of {var_names_ds.shape[0]})[/]"
+ )
+
+ tasks: List[str] = []
+ if "obs" in src:
+ tasks.append("obs")
+ if "var" in src:
+ tasks.append("var")
+ if "X" in src:
+ tasks.append("X")
+ if "layers" in src:
+ tasks.extend([f"layer:{k}" for k in src["layers"].keys()])
+ if "obsm" in src:
+ tasks.extend([f"obsm:{k}" for k in src["obsm"].keys()])
+ if "varm" in src:
+ tasks.extend([f"varm:{k}" for k in src["varm"].keys()])
+ if "obsp" in src:
+ tasks.extend([f"obsp:{k}" for k in src["obsp"].keys()])
+ if "varp" in src:
+ tasks.extend([f"varp:{k}" for k in src["varp"].keys()])
+ if "uns" in src:
+ tasks.append("uns")
+
+ with Progress(
+ SpinnerColumn(finished_text="[green]✓[/]"),
+ TextColumn("[progress.description]{task.description}"),
+ console=console,
+ transient=False,
+ ) as progress:
+ for task in tasks:
+ task_id = progress.add_task(
+ f"[cyan]Subsetting {task}...[/]", total=None
+ )
+ if task == "obs":
+ obs_dst = dst.create_group("obs")
+ subset_axis_group(src["obs"], obs_dst, obs_idx)
+ elif task == "var":
+ var_dst = dst.create_group("var")
+ subset_axis_group(src["var"], var_dst, var_idx)
+ elif task == "X":
+ X = src["X"]
+ if is_dataset(X):
+ subset_dense_matrix(
+ X, dst, "X", obs_idx, var_idx, chunk_rows=chunk_rows
+ )
+ elif is_group(X):
+ subset_sparse_matrix_group(X, dst, "X", obs_idx, var_idx)
+ else:
+ copy_tree(X, dst, "X")
+ elif task.startswith("layer:"):
+ key = task.split(":", 1)[1]
+ layer_src = src["layers"][key]
+ layers_dst = _ensure_group(dst, "layers")
+ subset_matrix_entry(
+ layer_src,
+ layers_dst,
+ key,
+ obs_idx,
+ var_idx,
+ chunk_rows=chunk_rows,
+ entry_label=f"layer:{key}",
+ )
+ elif task.startswith("obsm:"):
+ key = task.split(":", 1)[1]
+ obsm_dst = _ensure_group(dst, "obsm")
+ obsm_obj = src["obsm"][key]
+ subset_matrix_entry(
+ obsm_obj,
+ obsm_dst,
+ key,
+ obs_idx,
+ None,
+ chunk_rows=chunk_rows,
+ entry_label=f"obsm:{key}",
+ )
+ elif task.startswith("varm:"):
+ key = task.split(":", 1)[1]
+ varm_dst = _ensure_group(dst, "varm")
+ varm_obj = src["varm"][key]
+ subset_matrix_entry(
+ varm_obj,
+ varm_dst,
+ key,
+ var_idx,
+ None,
+ chunk_rows=chunk_rows,
+ entry_label=f"varm:{key}",
+ )
+ elif task.startswith("obsp:"):
+ key = task.split(":", 1)[1]
+ obsp_dst = _ensure_group(dst, "obsp")
+ obsp_obj = src["obsp"][key]
+ subset_matrix_entry(
+ obsp_obj,
+ obsp_dst,
+ key,
+ obs_idx,
+ obs_idx,
+ chunk_rows=chunk_rows,
+ entry_label=f"obsp:{key}",
+ )
+ elif task.startswith("varp:"):
+ key = task.split(":", 1)[1]
+ varp_dst = _ensure_group(dst, "varp")
+ varp_obj = src["varp"][key]
+ subset_matrix_entry(
+ varp_obj,
+ varp_dst,
+ key,
+ var_idx,
+ var_idx,
+ chunk_rows=chunk_rows,
+ entry_label=f"varp:{key}",
+ )
+ elif task == "uns":
+ copy_tree(src["uns"], dst, "uns")
+ progress.update(
+ task_id,
+ description=f"[green]Subsetting {task}[/]",
+ completed=1,
+ total=1,
+ )
+
+ if inplace:
+ if file.exists():
+ if file.is_dir():
+ shutil.rmtree(file)
+ else:
+ file.unlink()
+ if dst_path.is_dir():
+ shutil.move(str(dst_path), str(file))
+ else:
+ dst_path.replace(file)
diff --git a/src/h5ad/formats/__init__.py b/src/h5ad/formats/__init__.py
new file mode 100644
index 0000000..18b9721
--- /dev/null
+++ b/src/h5ad/formats/__init__.py
@@ -0,0 +1 @@
+"""Format-specific import/export helpers."""
diff --git a/src/h5ad/formats/array.py b/src/h5ad/formats/array.py
new file mode 100644
index 0000000..1dd21ac
--- /dev/null
+++ b/src/h5ad/formats/array.py
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from rich.console import Console
+
+from h5ad.formats.common import _get_encoding_type, _resolve
+from h5ad.formats.validate import validate_dimensions
+from h5ad.storage import create_dataset, is_dataset, is_group
+from h5ad.util.path import norm_path
+
+
+def export_npy(
+ root: Any,
+ obj: str,
+ out: Path,
+ chunk_elements: int,
+ console: Console,
+) -> None:
+ h5obj = _resolve(root, obj)
+
+ if is_group(h5obj):
+ enc = _get_encoding_type(h5obj)
+ if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"):
+ if "values" not in h5obj:
+ raise ValueError(f"Encoded group '{obj}' is missing 'values' dataset.")
+ ds = h5obj["values"]
+ console.print(f"[dim]Exporting nullable array values from '{obj}'[/]")
+ else:
+ raise ValueError(
+ f"Target '{obj}' is a group with encoding '{enc}'; cannot export as .npy directly."
+ )
+ elif is_dataset(h5obj):
+ ds = h5obj
+ else:
+ raise ValueError("Target is not an array-like object.")
+
+ out.parent.mkdir(parents=True, exist_ok=True)
+ mm = np.lib.format.open_memmap(out, mode="w+", dtype=ds.dtype, shape=ds.shape)
+ try:
+ if ds.shape == ():
+ mm[...] = ds[()]
+ console.print(f"[green]Wrote[/] {out}")
+ return
+
+ if ds.ndim == 1:
+ n = int(ds.shape[0])
+ step = max(1, int(chunk_elements))
+ with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status:
+ for start in range(0, n, step):
+ end = min(start + step, n)
+ status.update(
+ f"[magenta]Exporting {obj}: {start}-{end} of {n}...[/]"
+ )
+ mm[start:end] = ds[start:end]
+ console.print(f"[green]Wrote[/] {out}")
+ return
+
+ n0 = int(ds.shape[0])
+ row_elems = int(np.prod(ds.shape[1:])) if ds.ndim > 1 else 1
+ step0 = max(1, int(chunk_elements) // max(1, row_elems))
+ with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status:
+ for start in range(0, n0, step0):
+ end = min(start + step0, n0)
+ status.update(
+ f"[magenta]Exporting {obj}: {start}-{end} of {n0}...[/]"
+ )
+ mm[start:end, ...] = ds[start:end, ...]
+ console.print(f"[green]Wrote[/] {out}")
+ finally:
+ del mm
+
+
+def import_npy(
+ root: Any,
+ obj: str,
+ input_file: Path,
+ console: Console,
+) -> None:
+ obj = norm_path(obj)
+ arr = np.load(input_file)
+
+ validate_dimensions(root, obj, arr.shape, console)
+
+ parts = obj.split("/")
+ parent = root
+ for part in parts[:-1]:
+ parent = parent[part] if part in parent else parent.create_group(part)
+ name = parts[-1]
+
+ if name in parent:
+ del parent[name]
+
+ create_dataset(parent, name, data=arr)
+
+ shape_str = "×".join(str(d) for d in arr.shape)
+ console.print(f"[green]Imported[/] {shape_str} array into '{obj}'")
diff --git a/src/h5ad/formats/common.py b/src/h5ad/formats/common.py
new file mode 100644
index 0000000..6282eb5
--- /dev/null
+++ b/src/h5ad/formats/common.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from typing import Any, Dict
+
+import numpy as np
+
+from h5ad.storage import is_dataset, is_group
+from h5ad.util.path import norm_path
+
+
+TYPE_EXTENSIONS = {
+ "dataframe": {".csv"},
+ "sparse-matrix": {".mtx"},
+ "dense-matrix": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"},
+ "array": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"},
+ "dict": {".json"},
+ "scalar": {".json"},
+ "categorical": {".csv"},
+ "awkward-array": {".json"},
+}
+
+IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tif", ".tiff"}
+
+EXPORTABLE_TYPES = set(TYPE_EXTENSIONS.keys())
+
+
+def _get_encoding_type(group: Any) -> str:
+ enc = group.attrs.get("encoding-type", "")
+ if isinstance(enc, bytes):
+ enc = enc.decode("utf-8")
+ return str(enc)
+
+
+def _resolve(root: Any, obj: str) -> Any:
+ obj = norm_path(obj)
+ if obj not in root:
+ raise KeyError(f"'{obj}' not found in the file.")
+ return root[obj]
+
+
+def _check_json_exportable(h5obj: Any, max_elements: int, path: str = "") -> None:
+ if is_dataset(h5obj):
+ if h5obj.shape == ():
+ return
+ n = int(np.prod(h5obj.shape)) if h5obj.shape else 0
+ if n > max_elements:
+ obj_name = getattr(h5obj, "name", "