diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7548bf7..dde3803 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ concurrency: cancel-in-progress: true jobs: - test: + tests: runs-on: ubuntu-latest timeout-minutes: 20 @@ -23,6 +23,21 @@ jobs: fail-fast: false matrix: python-version: ["3.12"] # add "3.13" if you want + module: + - name: cli + tests: tests/test_cli.py + - name: export + tests: tests/test_export.py + - name: import + tests: tests/test_import.py + - name: info-read + tests: tests/test_info_read.py + - name: subset + tests: tests/test_subset.py + - name: zarr + tests: tests/test_zarr.py + + name: tests (${{ matrix.module.name }}) steps: - uses: actions/checkout@v4 @@ -35,36 +50,36 @@ jobs: - name: Set up uv uses: astral-sh/setup-uv@v3 with: - enable-cache: true + enable-cache: false - name: Install dependencies (frozen) run: uv sync --extra dev --frozen - name: Run tests with coverage run: | - uv run pytest -v \ + uv run pytest -v -W default ${{ matrix.module.tests }} \ --cov=h5ad \ --cov-report=term-missing \ --cov-report=xml \ --cov-report=html \ - --junitxml=pytest-results.xml + --junitxml=pytest-results-${{ matrix.module.name }}.xml - name: Publish test results summary uses: EnricoMi/publish-unit-test-result-action@v2 if: always() with: - files: pytest-results.xml - check_name: Test Results + files: pytest-results-${{ matrix.module.name }}.xml + check_name: Test Results (${{ matrix.module.name }}) - name: Upload coverage artifacts uses: actions/upload-artifact@v4 if: always() with: - name: coverage + name: coverage-${{ matrix.module.name }} path: | coverage.xml htmlcov/ - pytest-results.xml + pytest-results-${{ matrix.module.name }}.xml retention-days: 30 - name: Upload coverage to Codecov diff --git a/README.md b/README.md index eecee0f..ff7a474 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,20 @@ # h5ad CLI -A command-line tool for exploring huge `.h5ad` (AnnData) files 
without loading them fully into memory. Streams data directly from disk for efficient inspection of structure, metadata, and matrices. +A command-line tool for exploring huge AnnData stores (`.h5ad` and `.zarr`) without loading them fully into memory. Streams data directly from disk for efficient inspection of structure, metadata, and matrices. ## Features -- **`info`** – Show file structure and dimensions (`n_obs × n_var`) -- **`table`** – Export obs/var metadata to CSV with chunked streaming -- **`subset`** – Filter h5ad files by cell/gene names (supports dense and sparse CSR/CSC matrices) -- Memory-efficient chunked processing for large files -- Rich terminal output with colors and progress bars +- Streaming access to very large `.h5ad` and `.zarr` stores +- Auto-detects `.h5ad` files vs `.zarr` directories +- Chunked processing for dense and sparse matrices (CSR/CSC) +- Rich terminal output with progress indicators ## Installation +Using [uv](https://docs.astral.sh/uv/) (recommended): ```bash +git clone https://github.com/cellgeni/h5ad-cli.git +cd h5ad-cli uv sync ``` @@ -21,45 +23,27 @@ For development and testing: uv sync --extra dev ``` -See [docs/TESTING.md](docs/TESTING.md) for testing documentation. - -## Usage -Invoke any subcommand via `uv run h5ad ...`: - -```bash -uv run h5ad --help -``` - -#### Examples - -**Inspect overall structure and axis sizes:** +Alternative with pip: ```bash -uv run h5ad info data.h5ad +git clone https://github.com/cellgeni/h5ad-cli.git +cd h5ad-cli +pip install . ``` -**Export full obs metadata to CSV:** +For development and testing with pip: ```bash -uv run h5ad table data.h5ad --axis obs --out obs_metadata.csv +pip install -e ".[dev]" ``` -**Export selected obs columns to stdout:** -```bash -uv run h5ad table data.h5ad --axis obs --cols cell_type,donor -``` +See [docs/TESTING.md](docs/TESTING.md) for testing documentation. 
-**Export var metadata with custom chunk size:** -```bash -uv run h5ad table data.h5ad --axis var --chunk-rows 5000 --out var_metadata.csv -``` +## Commands (Overview) -**Subset by cell names:** -```bash -uv run h5ad subset input.h5ad output.h5ad --obs cells.txt -``` +Run help at any level (e.g. `uv run h5ad --help`, `uv run h5ad export --help`). -**Subset by both cells and genes:** -```bash -uv run h5ad subset input.h5ad output.h5ad --obs cells.txt --var genes.txt -``` +- `info` – read-only inspection of store layout, shapes, and type hints; supports drilling into paths like `obsm/X_pca` or `uns`. +- `subset` – stream and write a filtered copy based on obs/var name lists, preserving dense and sparse matrix encodings. +- `export` – extract data from a store; subcommands: `dataframe` (obs/var to CSV), `array` (dense to `.npy`), `sparse` (CSR/CSC to `.mtx`), `dict` (JSON), `image` (PNG). +- `import` – write new data into a store; subcommands: `dataframe` (CSV → obs/var), `array` (`.npy`), `sparse` (`.mtx`), `dict` (JSON). -All commands stream from disk, so even multi-GB `.h5ad` files remain responsive. +See [docs/GET_STARTED.md](docs/GET_STARTED.md) for a short tutorial. \ No newline at end of file diff --git a/docs/ELEMENTS_h5ad.md b/docs/ELEMENTS_h5ad.md new file mode 100644 index 0000000..acb491d --- /dev/null +++ b/docs/ELEMENTS_h5ad.md @@ -0,0 +1,274 @@ +# AnnData on-disk element specifications — HDF5 (`.h5ad`) + +This document describes how *elements* are encoded inside an AnnData **HDF5** container (`.h5ad`). +It is intended to be GitHub-renderable Markdown (no Sphinx/MyST directives). + +> **Scope** +> +> - “Modern” encoding metadata (`encoding-type`, `encoding-version`) is the convention used by **anndata ≥ 0.8**. +> - “Legacy” conventions (notably DataFrame categorical handling) are described for **anndata 0.7.x** files, which are still commonly encountered. 
+ +## Table of contents + +- [Encoding metadata](#encoding-metadata) +- [AnnData group](#anndata-group) +- [Dense arrays](#dense-arrays) +- [Sparse arrays (CSR/CSC)](#sparse-arrays-csrcsc) +- [DataFrames](#dataframes) + - [DataFrame v0.2.0](#dataframe-v020) + - [DataFrame v0.1.0 (legacy: anndata 0.7.x)](#dataframe-v010-legacy-anndata-07x) + - [Legacy categorical columns (Series-level)](#legacy-categorical-columns-series-level) +- [Mappings / dict](#mappings--dict) +- [Scalars](#scalars) +- [Categorical arrays](#categorical-arrays) +- [String arrays](#string-arrays) +- [Nullable arrays](#nullable-arrays) + - [Missing value semantics](#missing-value-semantics) +- [Awkward arrays (experimental)](#awkward-arrays-experimental) +- [Sources](#sources) + +## Encoding metadata + +**Modern convention (anndata ≥ 0.8):** + +- Any element (HDF5 *group* or *dataset*) that participates in the element-dispatch system: + - **MUST** have attribute `encoding-type` (string) + - **MUST** have attribute `encoding-version` (string, parseable as a version) + +Readers should dispatch first on `encoding-type`, then on `encoding-version`. + +**Legacy convention (anndata ≤ 0.7.x):** + +- Many objects do *not* have `encoding-type`/`encoding-version`. +- Some elements (e.g. CSR/CSC sparse matrices, legacy DataFrames) *do* use `encoding-type`/`encoding-version`. +- Readers typically infer element kinds from: + - known AnnData keys (`X`, `obs`, `var`, …), + - group structure, and/or + - legacy attributes (e.g. the `categories` attribute on categorical columns). 
+ +## AnnData group + +### `encoding-type: anndata`, `encoding-version: 0.1.0` + +An `AnnData` object **MUST** be stored as an HDF5 **group** with attributes: + +- `encoding-type: "anndata"` +- `encoding-version: "0.1.0"` + +Required members: + +- `obs` — a [DataFrame](#dataframes) +- `var` — a [DataFrame](#dataframes) + +Optional members (if present, they must satisfy these constraints): + +- `X` — dense array or sparse array; shape `(n_obs, n_var)` +- `layers` — mapping; values dense or sparse arrays; each shape `(n_obs, n_var)` +- `obsm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_obs` +- `varm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_var` +- `obsp` — mapping; values dense or sparse arrays; first two dims `n_obs` +- `varp` — mapping; values dense or sparse arrays; first two dims `n_var` +- `uns` — mapping/dict-like container (recursive) + +## Dense arrays + +### `encoding-type: array`, `encoding-version: 0.2.0` + +- A dense array **MUST** be an HDF5 **dataset**. +- The dataset **MUST** have attributes: + - `encoding-type: "array"` + - `encoding-version: "0.2.0"` + +> **Legacy note** +> +> In anndata 0.7.x, dense arrays were typically stored as plain datasets *without* `encoding-type`/`encoding-version`. + +## Sparse arrays (CSR/CSC) + +### `encoding-type: csr_matrix|csc_matrix`, `encoding-version: 0.1.0` + +A sparse matrix **MUST** be stored as an HDF5 **group**. + +- Group attributes: + - `encoding-type: "csr_matrix"` **or** `"csc_matrix"` + - `encoding-version: "0.1.0"` + - `shape`: integer array of length 2 (matrix shape) +- Group members (datasets): + - `data` + - `indices` + - `indptr` + +The exact CSR/CSC semantics follow SciPy’s conventions. + +## DataFrames + +DataFrames are stored column-wise: each column is stored as a dataset (or group, if the column itself is an encoded element). 
+
+
+### DataFrame v0.2.0
+
+#### `encoding-type: dataframe`, `encoding-version: 0.2.0`
+
+A dataframe **MUST** be stored as an HDF5 **group**.
+
+- Group attributes:
+  - `_index`: string — the key of the dataset to be used as the row index
+  - `column-order`: array of strings — original column order
+  - `encoding-type: "dataframe"`
+  - `encoding-version: "0.2.0"`
+- Group members:
+  - the index dataset (named by `_index`)
+  - one member per column
+- All column entries **MUST** have the same length in their first dimension.
+- Columns **SHOULD** share chunking along the first dimension.
+
+Columns are independently encoded:
+- simple numeric/bool columns are commonly `encoding-type: array`
+- categorical columns are commonly `encoding-type: categorical`
+
+
+### DataFrame v0.1.0 (legacy: anndata 0.7.x)
+
+#### `encoding-type: dataframe`, `encoding-version: 0.1.0`
+
+A legacy dataframe is stored as an HDF5 **group** where:
+
+- Group attributes include:
+  - `_index`
+  - `column-order`
+  - `encoding-type: "dataframe"`
+  - `encoding-version: "0.1.0"`
+- Each column is a dataset.
+- Categorical columns are stored as **integer code datasets**, and their category labels are stored in a reserved subgroup named `__categories`.
+
+**Reserved subgroup:**
+
+- `__categories/<colname>` stores the array of category labels for column `<colname>`.
+
+
+### Legacy categorical columns (Series-level)
+
+In v0.1.0 DataFrames, a categorical column dataset (e.g. `obs/cell_type`) can be identified by the presence of an attribute:
+
+- `categories`: an **HDF5 object reference** pointing to the corresponding `__categories/<colname>` dataset.
+
+## Mappings / dict
+
+### `encoding-type: dict`, `encoding-version: 0.1.0`
+
+- A mapping **MUST** be stored as an HDF5 **group**.
+- Group attributes:
+  - `encoding-type: "dict"`
+  - `encoding-version: "0.1.0"`
+- Each entry in the group is another element (recursively).
+ +> **Legacy note** +> +> In anndata 0.7.x, groups used as mappings often had **no special attributes**. + +## Scalars + +### `encoding-version: 0.2.0` + +Scalars are stored as **0-dimensional datasets**. + +- Numeric scalars: + - `encoding-type: "numeric-scalar"` + - `encoding-version: "0.2.0"` + - value is numeric (including boolean, ints, floats, complex) +- String scalars: + - `encoding-type: "string"` + - `encoding-version: "0.2.0"` + - **HDF5 requirement:** variable-length UTF-8 string dtype + +> **Legacy note** +> +> In anndata 0.7.x, scalar strings were commonly stored as `|O` datasets without `encoding-type`/`encoding-version`. + +## Categorical arrays + +### `encoding-type: categorical`, `encoding-version: 0.2.0` + +Categorical arrays are stored as an HDF5 **group** with members: + +- `codes`: integer dataset + - values are zero-based indices into `categories` + - signed integer arrays **MAY** use `-1` to denote missing values +- `categories`: array of labels + +Group attributes: + +- `encoding-type: "categorical"` +- `encoding-version: "0.2.0"` +- `ordered`: boolean (whether the categories are ordered) + +## String arrays + +### `encoding-type: string-array`, `encoding-version: 0.2.0` + +- String arrays **MUST** be stored as HDF5 datasets. +- Dataset attributes: + - `encoding-type: "string-array"` + - `encoding-version: "0.2.0"` +- **HDF5 requirement:** variable-length UTF-8 string dtype + +## Nullable arrays + +These encodings support Pandas nullable integer/boolean/string arrays by storing a `values` array plus a boolean `mask` array. + +### `encoding-type: nullable-integer`, `encoding-version: 0.1.0` + +- Stored as an HDF5 group with datasets: + - `values` (integer) + - `mask` (boolean) + +### `encoding-type: nullable-boolean`, `encoding-version: 0.1.0` + +- Stored as an HDF5 group with datasets: + - `values` (boolean) + - `mask` (boolean) +- `values` and `mask` **MUST** have the same shape. 
+ +### `encoding-type: nullable-string-array`, `encoding-version: 0.1.0` + +- Stored as an HDF5 group with datasets: + - `values` (string array) + - `mask` (boolean) +- Group attributes: + - `encoding-type: "nullable-string-array"` + - `encoding-version: "0.1.0"` + - optional `na-value`: `"NA"` or `"NaN"` (default `"NA"`) + + +#### Missing value semantics + +For elements supporting a `na-value` attribute: + +- `"NA"`: comparisons propagate missingness (e.g. `"x" == NA` → `NA`) +- `"NaN"`: comparisons yield boolean results (e.g. `"x" == NaN` → `false`) + +Readers should preserve semantics when the runtime model supports it. + +## Awkward arrays (experimental) + +### `encoding-type: awkward-array`, `encoding-version: 0.1.0` + +Ragged arrays are stored by decomposing an Awkward Array into constituent buffers (via `ak.to_buffers`), then storing those buffers as datasets within a group. + +Group attributes: + +- `encoding-type: "awkward-array"` +- `encoding-version: "0.1.0"` +- `form`: string — serialized Awkward “form” +- `length`: integer — logical length + +Group members: datasets for the buffers (often named like `nodeX-*`). + +> **Experimental** +> +> This encoding is considered experimental in the anndata 0.9.x series and later. + +## Sources + +- AnnData “on-disk format” prose docs (modern, ≥0.8): https://anndata.readthedocs.io/en/stable/fileformat-prose.html +- AnnData 0.7.8 “on-disk format” prose docs (legacy): https://dokk.org/documentation/anndata/0.7.8/fileformat-prose/ diff --git a/docs/ELEMENTS_zarr.md b/docs/ELEMENTS_zarr.md new file mode 100644 index 0000000..ce309e6 --- /dev/null +++ b/docs/ELEMENTS_zarr.md @@ -0,0 +1,276 @@ +# AnnData on-disk element specifications — Zarr (`.zarr`) + +This document describes how *elements* are encoded inside an AnnData **Zarr** container (`.zarr`). +It is intended to be GitHub-renderable Markdown (no Sphinx/MyST directives). 
+ +> **Scope** +> +> - “Modern” encoding metadata (`encoding-type`, `encoding-version`) is the convention used by **anndata ≥ 0.8**. +> - “Legacy” conventions (notably DataFrame categorical handling) are described for **anndata 0.7.x** files, which are still commonly encountered. + +## Table of contents + +- [Encoding metadata](#encoding-metadata) +- [AnnData group](#anndata-group) +- [Dense arrays](#dense-arrays) +- [Sparse arrays (CSR/CSC)](#sparse-arrays-csrcsc) +- [DataFrames](#dataframes) + - [DataFrame v0.2.0](#dataframe-v020) + - [DataFrame v0.1.0 (legacy: anndata 0.7.x)](#dataframe-v010-legacy-anndata-07x) + - [Legacy categorical columns (Series-level)](#legacy-categorical-columns-series-level) +- [Mappings / dict](#mappings--dict) +- [Scalars](#scalars) +- [Categorical arrays](#categorical-arrays) +- [String arrays](#string-arrays) +- [Nullable arrays](#nullable-arrays) + - [Missing value semantics](#missing-value-semantics) +- [Awkward arrays (experimental)](#awkward-arrays-experimental) +- [Sources](#sources) + +## Encoding metadata + +**Modern convention (anndata ≥ 0.8):** + +- Any element (Zarr *group* or *array*) that participates in the element-dispatch system: + - **MUST** have attribute `encoding-type` (string) + - **MUST** have attribute `encoding-version` (string, parseable as a version) + +Readers should dispatch first on `encoding-type`, then on `encoding-version`. + +**Legacy convention (anndata ≤ 0.7.x):** + +- Many objects do *not* have `encoding-type`/`encoding-version`. +- Some elements (e.g. CSR/CSC sparse matrices, legacy DataFrames) *do* use `encoding-type`/`encoding-version`. +- Readers typically infer element kinds from: + - known AnnData keys (`X`, `obs`, `var`, …), + - group structure, and/or + - legacy attributes (e.g. the `categories` attribute on categorical columns). 
+ +## AnnData group + +### `encoding-type: anndata`, `encoding-version: 0.1.0` + +An `AnnData` object **MUST** be stored as a Zarr **group** with attributes: + +- `encoding-type: "anndata"` +- `encoding-version: "0.1.0"` + +Required members: + +- `obs` — a [DataFrame](#dataframes) +- `var` — a [DataFrame](#dataframes) + +Optional members (if present, they must satisfy these constraints): + +- `X` — dense array or sparse array; shape `(n_obs, n_var)` +- `layers` — mapping; values dense or sparse arrays; each shape `(n_obs, n_var)` +- `obsm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_obs` +- `varm` — mapping; values dense arrays, sparse arrays, or dataframes; first dim `n_var` +- `obsp` — mapping; values dense or sparse arrays; first two dims `n_obs` +- `varp` — mapping; values dense or sparse arrays; first two dims `n_var` +- `uns` — mapping/dict-like container (recursive) + +## Dense arrays + +### `encoding-type: array`, `encoding-version: 0.2.0` + +- A dense array **MUST** be stored as a Zarr **array**. +- The array **MUST** have attributes: + - `encoding-type: "array"` + - `encoding-version: "0.2.0"` + +> **Legacy note** +> +> In anndata 0.7.x, dense arrays were typically stored as plain Zarr arrays *without* `encoding-type`/`encoding-version`. + +## Sparse arrays (CSR/CSC) + +### `encoding-type: csr_matrix|csc_matrix`, `encoding-version: 0.1.0` + +A sparse matrix **MUST** be stored as a Zarr **group**. + +- Group attributes: + - `encoding-type: "csr_matrix"` **or** `"csc_matrix"` + - `encoding-version: "0.1.0"` + - `shape`: integer array of length 2 (matrix shape) +- Group members (arrays): + - `data` + - `indices` + - `indptr` + +The exact CSR/CSC semantics follow SciPy’s conventions. + +## DataFrames + +DataFrames are stored column-wise: each column is stored as a Zarr array (or group, if the column itself is an encoded element). 
+
+
+### DataFrame v0.2.0
+
+#### `encoding-type: dataframe`, `encoding-version: 0.2.0`
+
+A dataframe **MUST** be stored as a Zarr **group**.
+
+- Group attributes:
+  - `_index`: string — the key of the array to be used as the row index
+  - `column-order`: array of strings — original column order
+  - `encoding-type: "dataframe"`
+  - `encoding-version: "0.2.0"`
+- Group members:
+  - the index array (named by `_index`)
+  - one member per column
+- All column entries **MUST** have the same length in their first dimension.
+- Columns **SHOULD** share chunking along the first dimension.
+
+Columns are independently encoded:
+- simple numeric/bool columns are commonly `encoding-type: array`
+- categorical columns are commonly `encoding-type: categorical`
+
+
+### DataFrame v0.1.0 (legacy: anndata 0.7.x)
+
+#### `encoding-type: dataframe`, `encoding-version: 0.1.0`
+
+A legacy dataframe is stored as a Zarr **group** where:
+
+- Group attributes include:
+  - `_index`
+  - `column-order`
+  - `encoding-type: "dataframe"`
+  - `encoding-version: "0.1.0"`
+- Each column is an array.
+- Categorical columns are stored as **integer code arrays**, and their category labels are stored in a reserved subgroup named `__categories`.
+
+**Reserved subgroup:**
+
+- `__categories/<colname>` stores the array of category labels for column `<colname>`.
+
+
+### Legacy categorical columns (Series-level)
+
+In v0.1.0 DataFrames, a categorical column array (e.g. `obs/cell_type`) can be identified by the presence of an attribute:
+
+- `categories`: an **absolute path string** to the corresponding `__categories/<colname>` array.
+
+(This differs from HDF5, which can store an object reference.)
+
+## Mappings / dict
+
+### `encoding-type: dict`, `encoding-version: 0.1.0`
+
+- A mapping **MUST** be stored as a Zarr **group**.
+- Group attributes:
+  - `encoding-type: "dict"`
+  - `encoding-version: "0.1.0"`
+- Each entry in the group is another element (recursively).
+
+> **Legacy note**
+>
+> In anndata 0.7.x, groups used as mappings often had **no special attributes**.
+
+## Scalars
+
+### `encoding-version: 0.2.0`
+
+Scalars are stored as **0-dimensional Zarr arrays**.
+
+- Numeric scalars:
+  - `encoding-type: "numeric-scalar"`
+  - `encoding-version: "0.2.0"`
+  - value is numeric (including boolean, ints, floats, complex)
+- String scalars:
+  - `encoding-type: "string"`
+  - `encoding-version: "0.2.0"`
+  - **Zarr requirement:** fixed-length unicode dtype (e.g. `<U10`)
+
+> **Legacy note**
+>
+> In anndata 0.7.x, scalar strings were commonly stored without `encoding-type`/`encoding-version`.
+
+## Categorical arrays
+
+### `encoding-type: categorical`, `encoding-version: 0.2.0`
+
+Categorical arrays are stored as a Zarr **group** with members:
+
+- `codes`: integer array
+  - values are zero-based indices into `categories`
+  - signed integer arrays **MAY** use `-1` to denote missing values
+- `categories`: array of labels
+
+Group attributes:
+
+- `encoding-type: "categorical"`
+- `encoding-version: "0.2.0"`
+- `ordered`: boolean (whether the categories are ordered)
+
+## String arrays
+
+### `encoding-type: string-array`, `encoding-version: 0.2.0`
+
+- String arrays **MUST** be stored as Zarr arrays.
+- Array attributes:
+  - `encoding-type: "string-array"`
+  - `encoding-version: "0.2.0"`
+- **Zarr requirement:** the array **MUST** be stored using `numcodecs.VLenUTF8` for variable-length UTF-8 strings.
+
+## Nullable arrays
+
+These encodings support Pandas nullable integer/boolean/string arrays by storing a `values` array plus a boolean `mask` array.
+
+### `encoding-type: nullable-integer`, `encoding-version: 0.1.0`
+
+- Stored as a Zarr group with arrays:
+  - `values` (integer)
+  - `mask` (boolean)
+
+### `encoding-type: nullable-boolean`, `encoding-version: 0.1.0`
+
+- Stored as a Zarr group with arrays:
+  - `values` (boolean)
+  - `mask` (boolean)
+- `values` and `mask` **MUST** have the same shape.
+ +### `encoding-type: nullable-string-array`, `encoding-version: 0.1.0` + +- Stored as a Zarr group with arrays: + - `values` (string array) + - `mask` (boolean) +- Group attributes: + - `encoding-type: "nullable-string-array"` + - `encoding-version: "0.1.0"` + - optional `na-value`: `"NA"` or `"NaN"` (default `"NA"`) + + +#### Missing value semantics + +For elements supporting a `na-value` attribute: + +- `"NA"`: comparisons propagate missingness (e.g. `"x" == NA` → `NA`) +- `"NaN"`: comparisons yield boolean results (e.g. `"x" == NaN` → `false`) + +Readers should preserve semantics when the runtime model supports it. + +## Awkward arrays (experimental) + +### `encoding-type: awkward-array`, `encoding-version: 0.1.0` + +Ragged arrays are stored by decomposing an Awkward Array into constituent buffers (via `ak.to_buffers`), then storing those buffers as Zarr arrays within a group. + +Group attributes: + +- `encoding-type: "awkward-array"` +- `encoding-version: "0.1.0"` +- `form`: string — serialized Awkward “form” +- `length`: integer — logical length + +Group members: arrays for the buffers (often named like `nodeX-*`). + +> **Experimental** +> +> This encoding is considered experimental in the anndata 0.9.x series and later. + +## Sources + +- AnnData “on-disk format” prose docs (modern, ≥0.8): https://anndata.readthedocs.io/en/stable/fileformat-prose.html +- AnnData 0.7.8 “on-disk format” prose docs (legacy): https://dokk.org/documentation/anndata/0.7.8/fileformat-prose/ diff --git a/docs/GET_STARTED.md b/docs/GET_STARTED.md new file mode 100644 index 0000000..2ca023c --- /dev/null +++ b/docs/GET_STARTED.md @@ -0,0 +1,189 @@ +# Get Started + +This short walkthrough shows the basic workflow: inspect a store, export metadata, and write a subset. 
+
+## 1 Install
+
+Using uv (recommended):
+```bash
+git clone https://github.com/cellgeni/h5ad-cli.git
+cd h5ad-cli
+uv sync
+```
+
+With pip:
+```bash
+git clone https://github.com/cellgeni/h5ad-cli.git
+cd h5ad-cli
+pip install .
+```
+
+Additionally, it might be useful to install `csvkit` for inspecting exported CSV files:
+```bash
+# with uv
+uv pip install csvkit
+
+# with pip
+pip install csvkit
+```
+
+## 2 Inspect a file with the `info` command
+
+Let's load an example `.h5ad` file:
+```bash
+wget -O visium.h5ad https://exampledata.scverse.org/squidpy/figshare/visium_hne_adata.h5ad
+```
+
+Now run `info` to see the file structure:
+```bash
+uv run h5ad info visium.h5ad
+```
+```
+An object with n_obs × n_var: 2688 × 18078
+  obs: array_col, array_row, cluster, in_tissue, leiden, log1p_n_genes_by_counts, log1p_total_counts, log1p_total_counts_mt, n_counts, n_genes_by_counts, pct_counts_in_top_100_genes, pct_counts_in_top_200_genes, pct_counts_in_top_500_genes,
+pct_counts_in_top_50_genes, pct_counts_mt, total_counts, total_counts_mt
+  var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts, pct_dropout_by_counts, total_counts, variances, variances_norm
+  obsm: X_pca, X_umap, spatial
+  varm: PCs
+  obsp: connectivities, distances
+  uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap
+  raw: X, var
+```
+
+To inspect a specific entry:
+```bash
+uv run h5ad info visium.h5ad obsm/X_pca
+```
+```
+Path: obsm/X_pca
+Type: dense-matrix
+Shape: (2688, 50)
+Dtype: float32
+Details: Dense matrix 2688×50 (float32)
+```
+
+## 3 Export entries
+View the first few lines of the `obs` dataframe:
+
+```bash
+uv run h5ad export dataframe visium.h5ad obs --head 10
+```
+```csv
+_index,array_col,array_row,cluster,in_tissue,leiden,log1p_n_genes_by_counts,log1p_total_counts,log1p_total_counts_mt,n_counts,n_genes_by_counts,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,pct_counts_in_top_50_genes,pct_counts_mt,total_counts,total_counts_mt +AAACAAGTATCTCCCA-1,102,50,Cortex_2,1,Cortex_3,8.502891406705377,9.869983,8.257904,19340.0,4928,43.13340227507756,49.21406411582213,60.449844881075485,38.42812823164426,19.943123,19340.0,3857.0 +AAACAATCTACTAGCA-1,43,3,Cortex_5,1,Pyramidal_layer_dentate_gyrus,8.145839612936841,9.528867,8.091933,13750.0,3448,55.14181818181818,60.95272727272727,70.57454545454546,50.516363636363636,23.76,13750.0,3267.0 +AAACACCAATAACTGC-1,19,59,Thalamus_2,1,Hypothalamus_1,8.70334075304372,10.395467,8.499233,32710.0,6022,47.071232039131765,54.56435340874351,65.0871293182513,40.48303271170896,15.010699,32710.0,4910.0 +AAACAGAGCGACTCCT-1,94,14,Cortex_5,1,Pyramidal_layer_dentate_gyrus,8.369157112588834,9.674704,8.092851,15909.0,4311,45.81054748884279,52.07744044251681,62.97693129675027,40.95794833113332,20.554403,15909.0,3270.0 +AAACCGGGTAGGTACC-1,28,42,Thalamus_2,1,Hypothalamus_1,8.663542087751374,10.369013,8.808967,31856.0,5787,45.887744851833254,52.98216976393771,64.24849321948768,40.287543947764945,21.01017,31856.0,6693.0 +AAACCGTTCGTCCAGG-1,42,52,Hypothalamus_2,1,Pyramidal_layer,8.682538124003075,10.337314,8.559678,30862.0,5898,43.79171797031949,51.18592443781998,62.65634113148856,37.80053139783553,16.901043,30862.0,5216.0 +AAACCTCATGAAGTTG-1,19,37,Thalamus_2,1,Hypothalamus_1,9.027858802380862,11.007419,8.849371,60319.0,8331,34.28770370861586,42.45594257199224,55.48997828213332,27.803842901904872,11.553574,60319.0,6969.0 +AAACGAAGAACATACC-1,64,6,Cortex_4,1,Hypothalamus_2,8.84246002419529,10.578089,8.855521,39264.0,6921,37.99663814180929,44.75346373268134,56.6320293398533,32.95639771801141,17.858597,39264.0,7012.0 
+AAACGAGACGGTTGAT-1,79,35,Fiber_tract,1,Cortex_5,8.80941494391005,10.458923,8.351847,34853.0,6696,39.947780678851174,47.52818982583996,58.838550483459095,33.7245000430379,12.156773,34853.0,4237.0 +AAACGGTTGCGAACTG-1,59,67,Lateral_ventricle,1,Striatum,8.718663567048953,10.254004,8.416489,28395.0,6115,41.67635147032928,49.20232435287903,60.556435992252155,35.562599049128366,15.918295,28395.0,4520.0 +``` + +Export cell metadata to a CSV file: +```bash +uv run h5ad export dataframe visium.h5ad obs --output cells.csv +wc -l cells.csv # 2689 cells.csv +``` + +## 4 Subset by names + +Let's get all cluster names from `cells.csv`: +```bash +awk -F ',' 'NR>1{print $4}' cells.csv | sort | uniq -c +``` +``` +284 Cortex_1 +257 Cortex_2 +244 Cortex_3 +164 Cortex_4 +129 Cortex_5 +226 Fiber_tract +222 Hippocampus +208 Hypothalamus_1 +133 Hypothalamus_2 +105 Lateral_ventricle +42 Pyramidal_layer +68 Pyramidal_layer_dentate_gyrus +153 Striatum +261 Thalamus_1 +192 Thalamus_2 +``` + +To get all obs names in "Cortex_2", you can use `csvsql` from `csvkit`: +```bash +csvsql -d ',' -I --query "SELECT _index FROM cells WHERE cluster='Cortex_2'" cells.csv > barcodes.txt +sed -i '1d' barcodes.txt # remove header +wc -l barcodes.txt # 257 barcodes.txt +``` + +Now you can use this list to create a subset `.h5ad` file: +```bash +uv run h5ad subset visium.h5ad --output cortex2.h5ad --obs barcodes.txt +``` + +Check the result: +```bash +uv run h5ad info cortex2.h5ad +``` +``` +An object with n_obs × n_var: 257 × 18078 + obs: array_col, array_row, cluster, in_tissue, leiden, log1p_n_genes_by_counts, log1p_total_counts, log1p_total_counts_mt, n_counts, n_genes_by_counts, +pct_counts_in_top_100_genes, pct_counts_in_top_200_genes, pct_counts_in_top_500_genes, pct_counts_in_top_50_genes, pct_counts_mt, total_counts, total_counts_mt + var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts, 
+pct_dropout_by_counts, total_counts, variances, variances_norm + obsm: X_pca, X_umap, spatial + varm: PCs + obsp: connectivities, distances + uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap +``` + +## Import or replace data +You can also import new data into an existing store. For example, let's replace the `obs` dataframe with a modified version. First, leave only first 5 columns in `cells.csv`: +```bash +cut -d ',' -f 1-5 cells.csv > cells1to5.csv +``` + +Now import it back into `cortex2.h5ad` with the `_index` column as index: +```bash +uv run h5ad import dataframe visium.h5ad obs cells1to5.csv --index-column _index --output visium_obs1to5.h5ad +``` + +Check the updated `obs` structure: +```bash +uv run h5ad info visium_obs1to5.h5ad +``` +``` +An object with n_obs × n_var: 2688 × 18078 + obs: array_col, array_row, cluster, in_tissue + var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts, +pct_dropout_by_counts, total_counts, variances, variances_norm + obsm: X_pca, X_umap, spatial + varm: PCs + obsp: connectivities, distances + uns: cluster_colors, hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap + raw: X, var +``` + +You can also import the data into existing file: +```bash +uv run h5ad import dataframe visium.h5ad obs cells1to5.csv --index-column _index --inplace +``` + +Check the updated `obs` structure: +```bash +uv run h5ad info visium.h5ad +``` +``` +An object with n_obs × n_var: 2688 × 18078 + obs: array_col, array_row, cluster, in_tissue + var: feature_types, gene_ids, genome, highly_variable, highly_variable_rank, log1p_mean_counts, log1p_total_counts, mean_counts, means, mt, n_cells, n_cells_by_counts, +pct_dropout_by_counts, total_counts, variances, variances_norm + obsm: X_pca, X_umap, spatial + varm: PCs + obsp: connectivities, distances + uns: cluster_colors, 
hvg, leiden, leiden_colors, neighbors, pca, rank_genes_groups, spatial, umap + raw: X, var +``` \ No newline at end of file diff --git a/docs/TUTORIAL.md b/docs/TUTORIAL.md new file mode 100644 index 0000000..a19a416 --- /dev/null +++ b/docs/TUTORIAL.md @@ -0,0 +1,382 @@ +# Tutorial: Using h5ad CLI with csvkit + +This tutorial demonstrates how to combine `h5ad` CLI with `csvkit` to explore, analyze, and subset large `.h5ad` files efficiently without loading them into memory. + +## Introduction + +### h5ad CLI +A command-line tool for working with AnnData (`.h5ad`) files. It streams data directly from disk, making it perfect for exploring huge single-cell datasets without memory constraints. + +**Key features:** +- `info` - Inspect file structure and dimensions +- `table` - Export metadata to CSV +- `subset` - Filter files by cell/gene names + +### csvkit +A suite of command-line tools for working with CSV files. Think of it as `awk`, `sed`, and `grep` but specifically designed for CSV data. + +**Key tools we'll use:** +- `csvcut` - Select specific columns +- `csvsql` - Execute SQL queries on CSV files +- `csvgrep` - Filter rows by pattern +- `csvlook` - Pretty-print CSV in terminal + +**Installation:** +```bash +pip install csvkit +``` + +## 1. Inspect File Structure with `info` + +First, let's see what's in our `.h5ad` file: + +```bash +h5ad info dataset.h5ad +``` + +**Example output:** +``` +File: dataset.h5ad +Dimensions: 50000 obs × 20000 var + +Top-level groups: + obs/ + - cell_type + - sample_id + - donor_id + - tissue + - n_genes + var/ + - gene_name + - highly_variable + X (sparse matrix) + layers/ + obsm/ + uns/ +``` + +This shows us that we have 50,000 cells with metadata including cell types, samples, and donor information. + +## 2. 
Export Metadata with `table` + +### 2.1 Basic Metadata Export + +Export all cell metadata (observations) to CSV: + +```bash +h5ad table dataset.h5ad --axis obs --output cell_metadata.csv +``` + +Export just specific columns: + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,sample_id,donor_id --output cells.csv +``` + +Preview the first few rows in a nice table format: + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,sample_id,donor_id --head 10 | csvlook +``` + +**Example output:** +``` +| obs_names | cell_type | sample_id | donor_id | +| ------------------- | ------------ | --------- | -------- | +| AAACCTGAGAAACCAT-1 | T cell | sample_1 | donor_A | +| AAACCTGAGACAGACC-1 | B cell | sample_1 | donor_A | +| AAACCTGAGGCATGGT-1 | NK cell | sample_2 | donor_B | +| AAACCTGCAAGCCGCT-1 | T cell | sample_2 | donor_B | +| AAACCTGCACATTAGC-1 | Monocyte | sample_1 | donor_A | +``` + +### 2.2 Calculate Statistics with `csvsql` + +Now let's analyze the metadata using SQL queries! 
+ +**Count cells per cell type:** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type | \ + csvsql --query "SELECT cell_type, COUNT(*) as n_cells FROM stdin GROUP BY cell_type ORDER BY n_cells DESC" | \ + csvlook +``` + +**Example output:** +``` +| cell_type | n_cells | +| ------------ | ------- | +| T cell | 15234 | +| Monocyte | 12456 | +| B cell | 8932 | +| NK cell | 5621 | +| DC | 3456 | +| Macrophage | 2301 | +``` + +**Count cells per cell type and sample:** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,sample_id | \ + csvsql --query "SELECT cell_type, sample_id, COUNT(*) as n_cells + FROM stdin + GROUP BY cell_type, sample_id + ORDER BY cell_type, sample_id" | \ + csvlook +``` + +**Example output:** +``` +| cell_type | sample_id | n_cells | +| ------------ | --------- | ------- | +| B cell | sample_1 | 4521 | +| B cell | sample_2 | 4411 | +| Monocyte | sample_1 | 6234 | +| Monocyte | sample_2 | 6222 | +| T cell | sample_1 | 7645 | +| T cell | sample_2 | 7589 | +``` + +**Calculate average gene count per cell type:** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,n_genes | \ + csvsql --query "SELECT cell_type, + AVG(n_genes) as avg_genes, + MIN(n_genes) as min_genes, + MAX(n_genes) as max_genes + FROM stdin + GROUP BY cell_type + ORDER BY avg_genes DESC" | \ + csvlook +``` + +**Find samples with low cell counts:** + +```bash +h5ad table dataset.h5ad --axis obs --columns sample_id | \ + csvsql --query "SELECT sample_id, COUNT(*) as n_cells + FROM stdin + GROUP BY sample_id + HAVING COUNT(*) < 1000 + ORDER BY n_cells" | \ + csvlook +``` + +## 3. Filter and Subset Data + +### 3.1 Extract Cell Names for a Specific Cell Type + +Let's say we want to create a subset containing only T cells. 
+ +**Step 1: Export metadata and filter for T cells** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type --output cell_metadata.csv +``` + +**Step 2: Use csvgrep to find T cells and extract their names** + +```bash +csvgrep -c cell_type -m "T cell" cell_metadata.csv | \ + csvcut -c obs_names | \ + tail -n +2 > tcell_names.txt +``` + +This creates a file `tcell_names.txt` with one cell barcode per line. + +**Alternative: Use csvsql for more complex filters** + +Get T cells from a specific donor: + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type,donor_id --output cell_metadata.csv + +csvsql --query "SELECT obs_names + FROM cell_metadata + WHERE cell_type = 'T cell' + AND donor_id = 'donor_A'" \ + cell_metadata.csv | \ + tail -n +2 > tcell_donor_A.txt +``` + +Get cells with high gene counts (>2000 genes): + +```bash +h5ad table dataset.h5ad --axis obs --columns n_genes --output cell_metadata.csv + +csvsql --query "SELECT obs_names + FROM cell_metadata + WHERE n_genes > 2000" \ + cell_metadata.csv | \ + tail -n +2 > high_quality_cells.txt +``` + +### 3.2 Create the Subset + +Now use the filtered cell list to create a new `.h5ad` file: + +```bash +h5ad subset dataset.h5ad tcells_only.h5ad --obs tcell_names.txt +``` + +**Verify the subset:** + +```bash +h5ad info tcells_only.h5ad +``` + +**Check the cell type distribution:** + +```bash +h5ad table tcells_only.h5ad --axis obs --columns cell_type | \ + csvsql --query "SELECT cell_type, COUNT(*) as n_cells FROM stdin GROUP BY cell_type" | \ + csvlook +``` + +### 3.3 Advanced: Subset by Both Cells and Genes + +Let's create a subset with specific cell types and a curated gene list. 
+ +**Step 1: Filter cells (multiple cell types)** + +```bash +h5ad table dataset.h5ad --axis obs --columns cell_type --output cell_metadata.csv + +csvsql --query "SELECT obs_names + FROM cell_metadata + WHERE cell_type IN ('T cell', 'NK cell', 'B cell')" \ + cell_metadata.csv | \ + tail -n +2 > lymphocytes.txt +``` + +**Step 2: Create a gene list** + +You might have a predefined list or extract genes from the file: + +```bash +# Export all genes +h5ad table dataset.h5ad --axis var --columns gene_name --output genes.csv + +# Filter for specific genes (e.g., markers) +echo "CD3D +CD3E +CD4 +CD8A +CD8B +CD19 +CD20 +NCAM1" > marker_genes.txt +``` + +**Step 3: Create the subset** + +```bash +h5ad subset dataset.h5ad lymphocytes_markers.h5ad \ + --obs lymphocytes.txt \ + --var marker_genes.txt +``` + +**Verify:** + +```bash +h5ad info lymphocytes_markers.h5ad +``` + +## 4. Complete Example Workflow + +Here's a complete workflow combining everything: + +```bash +# 1. Inspect the file +h5ad info large_dataset.h5ad + +# 2. Export and analyze metadata +h5ad table large_dataset.h5ad --axis obs \ + --columns cell_type,sample_id,donor_id,n_genes \ + --output all_metadata.csv + +# 3. Generate statistics +echo "Cell type distribution:" +csvsql --query "SELECT cell_type, COUNT(*) as n_cells + FROM all_metadata + GROUP BY cell_type + ORDER BY n_cells DESC" \ + all_metadata.csv | csvlook + +echo "Sample distribution:" +csvsql --query "SELECT sample_id, donor_id, COUNT(*) as n_cells + FROM all_metadata + GROUP BY sample_id, donor_id" \ + all_metadata.csv | csvlook + +# 4. Filter for high-quality T cells from a specific donor +csvsql --query "SELECT obs_names + FROM all_metadata + WHERE cell_type = 'T cell' + AND donor_id = 'donor_A' + AND n_genes > 1500" \ + all_metadata.csv | \ + tail -n +2 > selected_cells.txt + +echo "Selected $(wc -l < selected_cells.txt) cells" + +# 5. Create subset +h5ad subset large_dataset.h5ad tcells_subset.h5ad --obs selected_cells.txt + +# 6. 
Verify result +h5ad info tcells_subset.h5ad +h5ad table tcells_subset.h5ad --axis obs --columns cell_type,donor_id | \ + csvsql --query "SELECT cell_type, donor_id, COUNT(*) as n_cells FROM stdin GROUP BY cell_type, donor_id" | \ + csvlook +``` + +## Tips and Best Practices + +1. **Use `--head` for quick previews** before exporting large files: + ```bash + h5ad table data.h5ad --axis obs --head 100 | csvlook + ``` + +2. **Pipe directly to csvkit** to avoid creating intermediate files: + ```bash + h5ad table data.h5ad --axis obs --columns cell_type | csvsql --query "..." + ``` + +3. **Check cell counts** before subsetting: + ```bash + wc -l selected_cells.txt # Should be > 0! + ``` + +4. **Use csvstat** for quick summary statistics: + ```bash + h5ad table data.h5ad --axis obs --columns n_genes,n_counts | csvstat + ``` + +5. **Combine with standard Unix tools**: + ```bash + # Get unique cell types + h5ad table data.h5ad --axis obs --columns cell_type | tail -n +2 | sort -u + + # Count samples + h5ad table data.h5ad --axis obs --columns sample_id | tail -n +2 | sort | uniq -c + ``` + +## Conclusion + +By combining `h5ad` CLI with `csvkit`, you can: +- ✅ Explore huge datasets without loading them into memory +- ✅ Perform complex queries and aggregations on metadata +- ✅ Create filtered subsets based on sophisticated criteria +- ✅ Work entirely on the command line without Python/R + +This workflow is especially powerful for: +- Initial data exploration +- Quality control analysis +- Creating test datasets +- Preparing data for downstream analysis +- Batch processing multiple files + +For more information: +- h5ad CLI: [README.md](../README.md) +- csvkit documentation: https://csvkit.readthedocs.io/ diff --git a/pyproject.toml b/pyproject.toml index c18faa4..281812b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,8 +7,10 @@ requires-python = ">=3.12" dependencies = [ "h5py>=3.15.1", "numpy>=2.3.5", + "pillow>=12.1.0", "rich>=14.2.0", "typer>=0.20.0", + 
"zarr>=3.1.5", ] [project.optional-dependencies] diff --git a/src/h5ad/cli.py b/src/h5ad/cli.py index bb4749d..66bbd22 100644 --- a/src/h5ad/cli.py +++ b/src/h5ad/cli.py @@ -1,104 +1,110 @@ -import sys -import csv +"""CLI for h5ad files with export and import subcommands.""" + from pathlib import Path -from typing import Optional, Sequence, Tuple, Dict, List +from typing import Optional, Sequence, List -import rich from rich.console import Console import typer -import h5py -import numpy as np -from h5ad.commands import show_info, export_table, subset_h5ad +from h5ad.commands import ( + show_info, + subset_h5ad, + export_mtx, + export_npy, + export_json, + export_table, +) + +from h5ad.commands import export_image as export_image_cmd app = typer.Typer( - help="Streaming CLI for huge .h5ad files (info, table, subset)." + help="Streaming CLI for huge .h5ad and .zarr files (info, subset, export, import)." ) -console = Console(stderr=True) +# Use stderr for status/progress to keep stdout clean for data output +# force_terminal=True ensures Rich output is visible even in non-TTY environments +console = Console(stderr=True, force_terminal=True) +# Create sub-apps for export and import +export_app = typer.Typer(help="Export objects from h5ad files.") +import_app = typer.Typer(help="Import objects into h5ad files.") +app.add_typer(export_app, name="export") +app.add_typer(import_app, name="import") + +# ============================================================================ +# INFO command +# ============================================================================ @app.command() def info( file: Path = typer.Argument( ..., - help="Path to the .h5ad file", + help="Path to the .h5ad/.zarr store", exists=True, readable=True, - ) + dir_okay=True, + file_okay=True, + ), + entry: Optional[str] = typer.Argument( + None, + help="Entry path to inspect (e.g., 'obsm/X_pca', 'X', 'uns')", + ), + tree: bool = typer.Option( + False, + "--tree", + "-t", + help="Show a tree of all 
entries", + ), + depth: int = typer.Option( + None, + "--depth", + "-d", + help="Maximum recursion depth for tree display (only with --tree)", + ), ) -> None: """ Show high-level information about the .h5ad file. - Args: - file (Path): Path to the .h5ad file + + Use --tree to see a tree of all entries. + Use --entry to inspect a specific entry in detail. + + Examples: + h5ad info data.h5ad + h5ad info --tree data.h5ad + h5ad info obsm/X_pca data.h5ad """ - show_info(file, console) + try: + show_info(file, console, show_types=tree, depth=depth, entry_path=entry) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) +# ============================================================================ +# SUBSET command +# ============================================================================ @app.command() -def table( +def subset( file: Path = typer.Argument( ..., - help="Path to the .h5ad file", + help="Input .h5ad/.zarr", exists=True, readable=True, + dir_okay=True, + file_okay=True, ), - axis: str = typer.Option("obs", help="Axis to read from ('obs' or 'var')"), - columns: Optional[str] = typer.Option( - None, - "--columns", - "-c", - help="Comma separated column names to include in the output table", - ), - out: Optional[Path] = typer.Option( + output: Optional[Path] = typer.Option( None, "--output", "-o", - help="Output file path (defaults to stdout)", - writable=True, + help="Output .h5ad/.zarr path. Required unless --inplace.", + dir_okay=True, + file_okay=True, ), - chunk_rows: int = typer.Option( - 10000, "--chunk-rows", "-r", help="Number of rows to read per chunk" + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify source file directly.", ), - head: Optional[int] = typer.Option( - None, "--head", "-n", help="Output only the first n rows" - ), -) -> None: - """ - Export a table of the specified axis ('obs' or 'var') to CSV format. 
- Args: - file (Path): Path to the .h5ad file - axis (str): Axis to read from ('obs' or 'var') - columns (Optional[str]): Comma separated column names to include in the output table - out (Optional[Path]): Output file path (defaults to stdout) - chunk_rows (int): Number of rows to read per chunk - head (Optional[int]): Output only the first n rows - """ - # Validate axis parameter - if axis not in ("obs", "var"): - console.print( - f"[bold red]Error:[/] Invalid axis '{axis}'. Must be either 'obs' or 'var'.", - ) - raise typer.Exit(code=1) - - col_list: Optional[List[str]] = None - if columns: - col_list = [col.strip() for col in columns.split(",") if col.strip()] - - export_table( - file=file, - axis=axis, - columns=col_list, - out=out, - chunk_rows=chunk_rows, - head=head, - console=console, - ) - - -@app.command() -def subset( - file: Path = typer.Argument(..., help="Input .h5ad", exists=True, readable=True), - output: Path = typer.Argument(..., help="Output .h5ad", writable=True), obs: Optional[Path] = typer.Option( None, "--obs", @@ -114,7 +120,12 @@ def subset( readable=True, ), chunk_rows: int = typer.Option( - 1024, "--chunk-rows", "-r", help="Row chunk size for dense matrices" + 1024, + "--chunk", + "-C", + "--chunk-rows", + "-r", + help="Row chunk size for dense matrices", ), ) -> None: """Subset an h5ad by obs and/or var names.""" @@ -124,6 +135,13 @@ def subset( ) raise typer.Exit(code=1) + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. 
" + "Use --output/-o or --inplace.", + ) + raise typer.Exit(code=1) + try: subset_h5ad( file=file, @@ -132,11 +150,508 @@ def subset( var_file=var, chunk_rows=chunk_rows, console=console, + inplace=inplace, ) except Exception as e: console.print(f"[bold red]Error:[/] {e}") raise typer.Exit(code=1) +# ============================================================================ +# EXPORT subcommands +# ============================================================================ +@export_app.command("dataframe") +def export_dataframe( + file: Path = typer.Argument( + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, + ), + entry: str = typer.Argument(..., help="Entry path to export ('obs' or 'var')"), + output: Path = typer.Option( + None, "--output", "-o", writable=True, help="Output CSV file path" + ), + columns: Optional[str] = typer.Option( + None, + "--columns", + "-c", + help="Comma separated column names to include", + ), + chunk_rows: int = typer.Option( + 10_000, + "--chunk", + "-C", + "--chunk-rows", + "-r", + help="Number of rows to read per chunk", + ), + head: Optional[int] = typer.Option( + None, "--head", "-n", help="Output only the first n entries" + ), +) -> None: + """ + Export a dataframe (obs or var) to CSV. 
+ + Examples: + h5ad export dataframe data.h5ad obs --output obs.csv + h5ad export dataframe data.h5ad var --output var.csv --columns gene_id,mean + h5ad export dataframe data.h5ad obs --head 100 + """ + + if entry not in ("obs", "var"): + console.print( + f"[bold red]Error:[/] Dataframe export is only supported for 'obs' or 'var' at this point, not '{entry}'.", + ) + raise typer.Exit(code=1) + + col_list: Optional[List[str]] = None + if columns: + col_list = [col.strip() for col in columns.split(",") if col.strip()] + + try: + export_table( + file=file, + axis=entry, + columns=col_list, + out=output, + chunk_rows=chunk_rows, + head=head, + console=console, + ) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@export_app.command("array") +def export_array( + file: Path = typer.Argument( + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, + ), + entry: str = typer.Argument( + ..., help="Entry path to export (e.g., 'obsm/X_pca', 'varm/PCs', 'X')" + ), + output: Path = typer.Option( + ..., "--output", "-o", help="Output .npy file path", writable=True + ), + chunk_elements: int = typer.Option( + 100_000, + "--chunk", + "-C", + help="Number of elements to read per chunk", + ), +) -> None: + """ + Export a dense array or matrix to NumPy .npy format. 
+ + Examples: + h5ad export array data.h5ad obsm/X_pca pca.npy + h5ad export array data.h5ad X matrix.npy + h5ad export array data.h5ad varm/PCs loadings.npy + """ + + try: + export_npy( + file=file, + obj=entry, + out=output, + chunk_elements=chunk_elements, + console=console, + ) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@export_app.command("sparse") +def export_sparse( + file: Path = typer.Argument( + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, + ), + entry: str = typer.Argument( + ..., help="Entry path to export (e.g., 'X', 'layers/counts')" + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + writable=True, + help="Output .mtx file path (defaults to stdout)", + ), + head: Optional[int] = typer.Option( + None, "--head", "-n", help="Output only the first n entries of mtx file" + ), + chunk_elements: int = typer.Option( + 1_000, + "--chunk", + "-C", + help="Number of rows/columns (depends on compression format) to process per chunk", + ), + in_memory: bool = typer.Option( + False, + "--in-memory", + "-m", + help="Load the entire sparse matrix into memory before exporting (may be faster for small matrices)", + ), +) -> None: + """ + Export a sparse matrix (CSR/CSC) to Matrix Market (.mtx) format. 
+ + Examples: + h5ad export sparse data.h5ad X matrix.mtx + h5ad export sparse data.h5ad layers/counts counts.mtx + h5ad export sparse data.h5ad X --head 100 + """ + + try: + export_mtx( + file=file, + obj=entry, + out=output, + head=head, + chunk_elements=chunk_elements, + in_memory=in_memory, + console=console, + ) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@export_app.command("dict") +def export_dict( + file: Path = typer.Argument( + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, + ), + entry: str = typer.Argument( + ..., help="Entry path to export (e.g., 'uns', 'uns/colors')" + ), + output_arg: Optional[Path] = typer.Argument(None, help="Output .json file path"), + output: Optional[Path] = typer.Option( + None, "--output", "-o", help="Output .json file path" + ), + max_elements: int = typer.Option( + 100_000, + "--max-elements", + help="Maximum array elements for JSON export", + ), + include_attrs: bool = typer.Option( + False, "--include-attrs", help="Include HDF5 attributes in JSON export" + ), +) -> None: + """ + Export a dict/group or scalar to JSON format. 
+ + Examples: + h5ad export dict data.h5ad uns metadata.json + h5ad export dict data.h5ad uns/colors colors.json + """ + + try: + out_path = output if output is not None else output_arg + export_json( + file=file, + obj=entry, + out=out_path, + max_elements=max_elements, + include_attrs=include_attrs, + console=console, + ) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@export_app.command("image") +def export_image( + file: Path = typer.Argument( + ..., + help="Path to the .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, + ), + entry: str = typer.Argument(..., help="Entry path to export (2D or 3D array)"), + output: Optional[Path] = typer.Option( + None, "--output", "-o", help="Output image file (.png, .jpg, .tiff)" + ), +) -> None: + """ + Export an image-like array to PNG/JPG/TIFF format. + + The array should be 2D (H,W) or 3D (H,W,C) with C in {1,3,4}. + + Examples: + h5ad export image data.h5ad uns/spatial/image tissue.png + """ + + try: + export_image_cmd(file=file, obj=entry, out=output, console=console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +# ============================================================================ +# IMPORT subcommands +# ============================================================================ +def _get_target_file(file: Path, output: Optional[Path], inplace: bool) -> Path: + """Determine target path and copy/convert if needed.""" + from h5ad.commands.import_data import _prepare_target_path + + return _prepare_target_path(file, output, inplace, console) + + +@import_app.command("dataframe") +def import_dataframe( + file: Path = typer.Argument( + ..., + help="Path to the source .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, + ), + entry: str = typer.Argument( + ..., help="Entry path to create/replace ('obs' or 'var')" + ), + input_file: Path 
= typer.Argument( + ..., help="Input CSV file", exists=True, readable=True + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output .h5ad/.zarr path. Required unless --inplace.", + dir_okay=True, + file_okay=True, + ), + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify source file directly.", + ), + index_column: Optional[str] = typer.Option( + None, + "--index-column", + "-i", + help="Column to use as index. Defaults to first column.", + ), +) -> None: + """ + Import a CSV file into obs or var. + + Examples: + h5ad import dataframe data.h5ad obs cells.csv -o output.h5ad -i cell_id + h5ad import dataframe data.h5ad var genes.csv --inplace -i gene_id + """ + from h5ad.commands.import_data import _import_csv + + if entry not in ("obs", "var"): + console.print( + f"[bold red]Error:[/] Entry must be 'obs' or 'var', not '{entry}'.", + ) + raise typer.Exit(code=1) + + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. " + "Use --output/-o or --inplace.", + ) + raise typer.Exit(code=1) + + try: + target = _get_target_file(file, output, inplace) + _import_csv(target, entry, input_file, index_column, console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@import_app.command("array") +def import_array( + file: Path = typer.Argument( + ..., + help="Path to the source .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, + ), + entry: str = typer.Argument( + ..., help="Entry path to create/replace (e.g., 'X', 'obsm/X_pca')" + ), + input_file: Path = typer.Argument( + ..., help="Input .npy file", exists=True, readable=True + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output .h5ad/.zarr path. 
Required unless --inplace.", + dir_okay=True, + file_okay=True, + ), + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify source file directly.", + ), +) -> None: + """ + Import a NumPy .npy file as a dense array. + + Dimensions are validated against existing obs/var. + + Examples: + h5ad import array data.h5ad obsm/X_pca pca.npy -o output.h5ad + h5ad import array data.h5ad X matrix.npy --inplace + """ + from h5ad.commands.import_data import _import_npy + + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. " + "Use --output/-o or --inplace.", + ) + raise typer.Exit(code=1) + + try: + target = _get_target_file(file, output, inplace) + _import_npy(target, entry, input_file, console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@import_app.command("sparse") +def import_sparse( + file: Path = typer.Argument( + ..., + help="Path to the source .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, + ), + obj: str = typer.Argument( + ..., help="Object path to create/replace (e.g., 'X', 'layers/counts')" + ), + input_file: Path = typer.Argument( + ..., help="Input .mtx file", exists=True, readable=True + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output .h5ad/.zarr path. Required unless --inplace.", + dir_okay=True, + file_okay=True, + ), + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify source file directly.", + ), +) -> None: + """ + Import a Matrix Market (.mtx) file as a CSR sparse matrix. + + Dimensions are validated against existing obs/var. + + Examples: + h5ad import sparse data.h5ad X matrix.mtx -o output.h5ad + h5ad import sparse data.h5ad layers/counts counts.mtx --inplace + """ + from h5ad.commands.import_data import _import_mtx + + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. 
" + "Use --output/-o or --inplace.", + ) + raise typer.Exit(code=1) + + try: + target = _get_target_file(file, output, inplace) + _import_mtx(target, obj, input_file, console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + +@import_app.command("dict") +def import_dict( + file: Path = typer.Argument( + ..., + help="Path to the source .h5ad/.zarr store", + exists=True, + readable=True, + dir_okay=True, + file_okay=True, + ), + obj: str = typer.Argument( + ..., help="Object path to create/replace (e.g., 'uns', 'uns/metadata')" + ), + input_file: Path = typer.Argument( + ..., help="Input .json file", exists=True, readable=True + ), + output: Optional[Path] = typer.Option( + None, + "--output", + "-o", + help="Output .h5ad/.zarr path. Required unless --inplace.", + dir_okay=True, + file_okay=True, + ), + inplace: bool = typer.Option( + False, + "--inplace", + help="Modify source file directly.", + ), +) -> None: + """ + Import a JSON file into uns or other dict-like groups. + + Examples: + h5ad import dict data.h5ad uns/metadata config.json -o output.h5ad + h5ad import dict data.h5ad uns settings.json --inplace + """ + from h5ad.commands.import_data import _import_json + + if not inplace and output is None: + console.print( + "[bold red]Error:[/] Output file is required. 
" + "Use --output/-o or --inplace.", + ) + raise typer.Exit(code=1) + + try: + target = _get_target_file(file, output, inplace) + _import_json(target, obj, input_file, console) + except Exception as e: + console.print(f"[bold red]Error:[/] {e}") + raise typer.Exit(code=1) + + def main(argv: Optional[Sequence[str]] = None) -> None: app(standalone_mode=True) diff --git a/src/h5ad/commands/__init__.py b/src/h5ad/commands/__init__.py index e681fea..70d960f 100644 --- a/src/h5ad/commands/__init__.py +++ b/src/h5ad/commands/__init__.py @@ -1,3 +1,4 @@ from h5ad.commands.info import show_info -from h5ad.commands.table import export_table from h5ad.commands.subset import subset_h5ad +from h5ad.commands.export import export_table, export_image, export_json, export_mtx, export_npy +from h5ad.commands.import_data import import_object diff --git a/src/h5ad/commands/export.py b/src/h5ad/commands/export.py new file mode 100644 index 0000000..22221a7 --- /dev/null +++ b/src/h5ad/commands/export.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +from pathlib import Path +from typing import List, Optional + +from rich.console import Console + +from h5ad.formats.array import export_npy as export_npy_format +from h5ad.formats.common import EXPORTABLE_TYPES, IMAGE_EXTENSIONS, TYPE_EXTENSIONS +from h5ad.formats.dataframe import export_dataframe +from h5ad.formats.image import export_image as export_image_format +from h5ad.formats.json_data import export_json as export_json_format +from h5ad.formats.sparse import export_mtx as export_mtx_format +from h5ad.storage import open_store + + +def export_table( + file: Path, + axis: str, + columns: Optional[List[str]], + out: Optional[Path], + chunk_rows: int, + head: Optional[int], + console: Console, +) -> None: + with open_store(file, "r") as store: + export_dataframe( + store.root, + axis=axis, + columns=columns, + out=out, + chunk_rows=chunk_rows, + head=head, + console=console, + ) + + +def export_npy( + file: Path, + obj: str, + 
out: Path, + chunk_elements: int, + console: Console, +) -> None: + with open_store(file, "r") as store: + export_npy_format( + store.root, + obj=obj, + out=out, + chunk_elements=chunk_elements, + console=console, + ) + + +def export_mtx( + file: Path, + obj: str, + out: Optional[Path], + head: Optional[int], + chunk_elements: int, + in_memory: bool, + console: Console, +) -> None: + with open_store(file, "r") as store: + export_mtx_format( + store.root, + obj=obj, + out=out, + head=head, + chunk_elements=chunk_elements, + in_memory=in_memory, + console=console, + ) + + +def export_json( + file: Path, + obj: str, + out: Optional[Path], + max_elements: int, + include_attrs: bool, + console: Console, +) -> None: + with open_store(file, "r") as store: + export_json_format( + store.root, + obj=obj, + out=out, + max_elements=max_elements, + include_attrs=include_attrs, + console=console, + ) + + +def export_image(file: Path, obj: str, out: Path, console: Console) -> None: + with open_store(file, "r") as store: + export_image_format(store.root, obj=obj, out=out, console=console) + + +__all__ = [ + "EXPORTABLE_TYPES", + "IMAGE_EXTENSIONS", + "TYPE_EXTENSIONS", + "export_image", + "export_json", + "export_mtx", + "export_npy", + "export_table", +] diff --git a/src/h5ad/commands/import_data.py b/src/h5ad/commands/import_data.py new file mode 100644 index 0000000..dad838a --- /dev/null +++ b/src/h5ad/commands/import_data.py @@ -0,0 +1,129 @@ +"""Import command helpers for creating/replacing objects in h5ad/zarr stores.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Optional + +from rich.console import Console + +from h5ad.formats.array import import_npy +from h5ad.formats.dataframe import import_dataframe +from h5ad.formats.json_data import import_json +from h5ad.formats.sparse import import_mtx +from h5ad.storage import copy_path, copy_store_contents, detect_backend, open_store + + +EXTENSION_FORMAT = { + ".csv": "csv", + ".npy": 
"npy", + ".mtx": "mtx", + ".json": "json", +} + + +def _prepare_target_path( + file: Path, + output_file: Optional[Path], + inplace: bool, + console: Console, +) -> Path: + if inplace: + return file + if output_file is None: + raise ValueError("Output file is required unless --inplace is specified.") + + src_backend = detect_backend(file) + dst_backend = detect_backend(output_file) + + if src_backend == dst_backend: + copy_path(file, output_file) + console.print(f"[dim]Copied {file} → {output_file}[/]") + return output_file + + with open_store(file, "r") as src_store, open_store(output_file, "w") as dst_store: + copy_store_contents(src_store.root, dst_store.root) + console.print( + f"[dim]Converted {file} ({src_backend}) → {output_file} ({dst_backend})[/]" + ) + return output_file + + +def import_object( + file: Path, + obj: str, + input_file: Path, + output_file: Optional[Path], + inplace: bool, + index_column: Optional[str], + console: Console, +) -> None: + target_file = _prepare_target_path(file, output_file, inplace, console) + ext = input_file.suffix.lower() + + if ext not in EXTENSION_FORMAT: + raise ValueError( + f"Unsupported input file extension '{ext}'. 
" + f"Supported: {', '.join(sorted(EXTENSION_FORMAT.keys()))}" + ) + + fmt = EXTENSION_FORMAT[ext] + + if index_column and (fmt != "csv" or obj not in ("obs", "var")): + raise ValueError("--index-column is only valid for CSV import into 'obs' or 'var'.") + + if fmt == "csv": + _import_csv(target_file, obj, input_file, index_column, console) + elif fmt == "npy": + _import_npy(target_file, obj, input_file, console) + elif fmt == "mtx": + _import_mtx(target_file, obj, input_file, console) + elif fmt == "json": + _import_json(target_file, obj, input_file, console) + + +def _import_csv( + file: Path, + obj: str, + input_file: Path, + index_column: Optional[str], + console: Console, +) -> None: + with open_store(file, "a") as store: + import_dataframe( + store.root, + obj=obj, + input_file=input_file, + index_column=index_column, + console=console, + ) + + +def _import_npy( + file: Path, + obj: str, + input_file: Path, + console: Console, +) -> None: + with open_store(file, "a") as store: + import_npy(store.root, obj=obj, input_file=input_file, console=console) + + +def _import_mtx( + file: Path, + obj: str, + input_file: Path, + console: Console, +) -> None: + with open_store(file, "a") as store: + import_mtx(store.root, obj=obj, input_file=input_file, console=console) + + +def _import_json( + file: Path, + obj: str, + input_file: Path, + console: Console, +) -> None: + with open_store(file, "a") as store: + import_json(store.root, obj=obj, input_file=input_file, console=console) diff --git a/src/h5ad/commands/info.py b/src/h5ad/commands/info.py index 95c3c72..76b56da 100644 --- a/src/h5ad/commands/info.py +++ b/src/h5ad/commands/info.py @@ -1,32 +1,194 @@ from pathlib import Path +from typing import Any, Optional -import h5py import rich from rich.console import Console -from h5ad.info import axis_len +from rich.tree import Tree +from h5ad.core.info import axis_len, format_type_info, get_entry_type +from h5ad.storage import is_dataset, is_group, open_store -def 
show_info(file: Path, console: Console) -> None: +# Preferred display order for top-level keys +KEY_ORDER = ["X", "obs", "var", "obsm", "varm", "layers", "obsp", "varp", "uns"] + + +def _sort_keys(keys: list) -> list: + """Sort keys according to KEY_ORDER, with unknown keys at the end.""" + order_map = {k: i for i, k in enumerate(KEY_ORDER)} + return sorted(keys, key=lambda k: (order_map.get(k, len(KEY_ORDER)), k)) + + +def show_info( + file: Path, + console: Console, + show_types: bool = False, + depth: Optional[int] = None, + entry_path: Optional[str] = None, +) -> None: """ Show high-level information about the .h5ad file. Args: file (Path): Path to the .h5ad file console (Console): Rich console for output + show_types (bool): Show detailed type information for each entry + depth (Optional[int]): Maximum recursion depth for type display (only with show_types=True) + entry_path (Optional[str]): Specific entry path to inspect (e.g., 'obsm/X_pca') """ - with h5py.File(file, "r") as f: + with open_store(file, "r") as store: + f = store.root + # If a specific path is requested, show detailed info for that object + if entry_path: + _show_object_info(f, entry_path, console) + return + # Get n_obs and n_var n_obs = axis_len(f, "obs") n_var = axis_len(f, "var") rich.print( f"[bold cyan]An object with n_obs × n_var: {n_obs if n_obs is not None else '?'} × {n_var if n_var is not None else '?'}[/]" ) - # List top-level keys and their sub-keys - for key, obj in sorted(f.items(), key=lambda x: len(x[0])): - # Only process Groups, skip Datasets like X - if isinstance(obj, h5py.Group): - sub_keys = [k for k in obj.keys() if k != "_index"] - if sub_keys and key != "X": - rich.print( - f"\t[bold yellow]{key}:[/]\t" - + ", ".join(f"[bright_white]{sub}[/]" for sub in sub_keys) + + if show_types: + _show_types_tree(f, console, root_label=str(file), depth=depth) + else: + # List top-level keys and their sub-keys (original behavior) + for key in _sort_keys(list(f.keys())): + obj = 
f[key] + # Only process Groups, skip Datasets like X + if is_group(obj): + sub_keys = [ + k + for k in obj.keys() + if k not in ("_index", "__categories", "obs_names", "var_names") + ] + if sub_keys and key != "X": + rich.print( + f"\t[bold yellow]{key}:[/]\t" + + ", ".join(f"[bright_white]{sub}[/]" for sub in sub_keys) + ) + + +def _show_types_tree( + f: Any, console: Console, root_label: str, depth: Optional[int] = None +) -> None: + """Show a tree view with type information for all entries. + + Recursion depth by group: + - obs/var: top level only (no children) + - X: top level only + - obsm/obsp/varm/varp/layers: 1 level (show matrices) + - uns: 2 levels deep + """ + tree = Tree(f"[bold]{root_label}[/]") + + # Define max depth for each top-level group + max_depth_map = { + "obs": 0, + "var": 0, + "X": 0, + "obsm": 1, + "obsp": 1, + "varm": 1, + "varp": 1, + "layers": 1, + "uns": 2, + } + + def add_node( + parent_tree: Tree, + name: str, + obj: Any, + current_depth: int, + max_depth: int, + ) -> None: + info = get_entry_type(obj) + type_str = format_type_info(info) + + if is_dataset(obj): + shape_str = f"[dim]{obj.shape}[/]" if obj.shape else "" + node_text = f"[bright_white]{name}[/] {shape_str} {type_str}" + parent_tree.add(node_text) + else: + # Group + node_text = f"[bold yellow]{name}/[/] {type_str}" + subtree = parent_tree.add(node_text) + + # Recurse only if within allowed depth + if current_depth < max_depth: + for child_name in sorted(obj.keys()): + if child_name in ("_index", "__categories"): + continue + child_obj = obj[child_name] + add_node( + subtree, child_name, child_obj, current_depth + 1, max_depth ) + + # Add top-level items in preferred order + for key in _sort_keys(list(f.keys())): + obj = f[key] + # Skip empty groups + if is_group(obj): + children = [ + k + for k in obj.keys() + if k not in ("_index", "__categories", "obs_names", "var_names") + ] + if not children: + continue + max_depth = ( + depth if depth is not None else 
max_depth_map.get(key, 1) + ) # default to 1 level for unknown groups + add_node(tree, key, obj, current_depth=0, max_depth=max_depth) + + console.print(tree) + + +def _show_object_info(f: Any, entry_path: str, console: Console) -> None: + """Show detailed info for a specific object path.""" + # Normalize path + entry_path = entry_path.strip().lstrip("/") + + if entry_path not in f: + console.print(f"[bold red]Error:[/] '{entry_path}' not found in the file.") + return + + entry = f[entry_path] + info = get_entry_type(entry) + + console.print(f"\n[bold cyan]Path:[/] {entry_path}") + console.print(f"[bold cyan]Type:[/] {info['type']}") + + if info["encoding"]: + console.print(f"[bold cyan]Encoding:[/] {info['encoding']}") + + if info["shape"]: + console.print(f"[bold cyan]Shape:[/] {info['shape']}") + + if info["dtype"]: + console.print(f"[bold cyan]Dtype:[/] {info['dtype']}") + + console.print(f"[bold cyan]Details:[/] {info['details']}") + + # Show attributes if any + if entry.attrs: + console.print(f"\n[bold cyan]Attributes:[/]") + for k, v in entry.attrs.items(): + v_str = v.decode("utf-8") if isinstance(v, bytes) else str(v) + if len(v_str) > 80: + v_str = v_str[:77] + "..." 
+ console.print(f" [dim]{k}:[/] {v_str}") + + # If it's a group, show children + if is_group(entry): + children = [ + k + for k in entry.keys() + if k not in ("_index", "__categories", "obs_names", "var_names") + ] + if children: + console.print(f"\n[bold cyan]Children:[/]") + for child_name in sorted(children): + child_entry = entry[child_name] + child_info = get_entry_type(child_entry) + type_str = format_type_info(child_info) + console.print(f" [bright_white]{child_name}[/] {type_str}") diff --git a/src/h5ad/commands/subset.py b/src/h5ad/commands/subset.py index 2e01d9d..940ef07 100644 --- a/src/h5ad/commands/subset.py +++ b/src/h5ad/commands/subset.py @@ -1,686 +1,17 @@ -"""Subset operations for .h5ad files.""" - -from __future__ import annotations - -from pathlib import Path -from typing import Optional, Set, Tuple, List, Dict, Any - -import h5py -import numpy as np -import typer -from rich.console import Console -from rich.progress import ( - Progress, - SpinnerColumn, - TextColumn, - BarColumn, - TaskProgressColumn, - TimeElapsedColumn, +from h5ad.core.subset import ( + _read_name_file, + indices_from_name_set, + subset_axis_group, + subset_dense_matrix, + subset_h5ad, + subset_sparse_matrix_group, ) -from h5ad.read import decode_str_array - - -def _copy_attrs(src: h5py.AttributeManager, dst: h5py.AttributeManager) -> None: - """ - Copy HDF5 attributes from source to destination. - Args: - src (h5py.AttributeManager): Source attributes - dst (h5py.AttributeManager): Destination attributes - """ - for k, v in src.items(): - dst[k] = v - - -def _ds_create_kwargs(src: h5py.Dataset) -> Dict[str, Any]: - """ - Best-effort carryover of dataset creation properties. - (h5py doesn't expose everything perfectly; this covers the big ones.) 
- - Args: - src (h5py.Dataset): Source dataset - Returns: - Dict[str, Any]: Dataset creation keyword arguments - """ - kw: Dict[str, Any] = {} - if src.chunks is not None: - kw["chunks"] = src.chunks - if src.compression is not None: - kw["compression"] = src.compression - kw["compression_opts"] = src.compression_opts - kw["shuffle"] = bool(src.shuffle) - kw["fletcher32"] = bool(src.fletcher32) - if src.scaleoffset is not None: - kw["scaleoffset"] = src.scaleoffset - if src.fillvalue is not None: - kw["fillvalue"] = src.fillvalue - return kw - - -def _read_name_file(path: Path) -> Set[str]: - """ - Read one name per line from a file. Blank lines ignored. - """ - names: Set[str] = set() - with open(path, "r", encoding="utf-8") as f: - for line in f: - line = line.strip() - if line: - names.add(line) - return names - - -def indices_from_name_set( - names_ds: h5py.Dataset, - keep: Set[str], - *, - chunk_size: int = 200_000, -) -> Tuple[np.ndarray, Set[str]]: - """ - Returns (indices_sorted, missing_names). - Chunked scan so we don't do names_ds[...] for huge datasets. 
- - Args: - names_ds (h5py.Dataset): Dataset containing names - keep (Set[str]): Set of names to find - chunk_size (int): Number of names to read per chunk - - Returns: - Tuple[np.ndarray, Set[str]]: (Array of found indices, set of missing names) - """ - if names_ds.ndim != 1: - # common h5ad uses 1D obs_names/var_names - flat_len = int(np.prod(names_ds.shape)) - else: - flat_len = names_ds.shape[0] - - remaining = set(keep) # we'll delete as we find - found_indices: List[int] = [] - - for start in range(0, flat_len, chunk_size): - end = min(start + chunk_size, flat_len) - chunk = names_ds[start:end] - chunk = decode_str_array(np.asarray(chunk)).astype(str) - - for i, name in enumerate(chunk): - if name in remaining: - found_indices.append(start + i) - remaining.remove(name) - - if not remaining: - break - - return np.asarray(found_indices, dtype=np.int64), remaining - - -def subset_axis_group( - src: h5py.Group, - dst: h5py.Group, - indices: Optional[np.ndarray], -) -> None: - """ - Subset obs/var group: - - datasets: subset along first axis (obj[indices, ...]) - - categorical groups: copy categories, subset codes - - unknown groups: copy as-is if indices is None; otherwise copy conservatively - - Args: - src (h5py.Group): Source axis group - dst (h5py.Group): Destination axis group - indices (Optional[np.ndarray]): Indices to keep; if None, copy as-is - """ - _copy_attrs(src.attrs, dst.attrs) - - for key in src.keys(): - obj = src[key] - - if isinstance(obj, h5py.Dataset): - if indices is None: - src.copy(key, dst, name=key) - else: - data = obj[indices, ...] 
- ds = dst.create_dataset(key, data=data) - _copy_attrs(obj.attrs, ds.attrs) - - elif isinstance(obj, h5py.Group): - enc = obj.attrs.get("encoding-type", b"") - if isinstance(enc, bytes): - enc = enc.decode("utf-8") - - if enc == "categorical": - gdst = dst.create_group(key) - _copy_attrs(obj.attrs, gdst.attrs) - obj.copy("categories", gdst, name="categories") - - codes = obj["codes"] - if indices is None: - obj.copy("codes", gdst, name="codes") - else: - codes_sub = codes[indices, ...] - ds = gdst.create_dataset("codes", data=codes_sub) - _copy_attrs(codes.attrs, ds.attrs) - else: - # Unknown group type - copy as-is - src.copy(key, dst, name=key) - - -def subset_dense_matrix( - src: h5py.Dataset, - dst_parent: h5py.Group, - name: str, - obs_idx: Optional[np.ndarray], - var_idx: Optional[np.ndarray], - *, - chunk_rows: int = 1024, -) -> None: - """ - Chunked write for dense 2D datasets. - Args: - src (h5py.Dataset): Source dense matrix dataset - dst_parent (h5py.Group): Destination parent group - name (str): Name for the destination dataset - obs_idx (Optional[np.ndarray]): Indices of observations to keep - var_idx (Optional[np.ndarray]): Indices of variables to keep - chunk_rows (int): Number of rows to read per chunk - """ - if src.ndim != 2: - # fallback: copy whole dataset - src.parent.copy(src.name.split("/")[-1], dst_parent, name=name) - return - - n_obs, n_var = src.shape - out_obs = len(obs_idx) if obs_idx is not None else n_obs - out_var = len(var_idx) if var_idx is not None else n_var - - kw = _ds_create_kwargs(src) - # adjust chunks to output shape if possible - if "chunks" in kw and kw["chunks"] is not None: - c0, c1 = kw["chunks"] - kw["chunks"] = (min(c0, out_obs), min(c1, out_var)) - - dst = dst_parent.create_dataset( - name, shape=(out_obs, out_var), dtype=src.dtype, **kw - ) - _copy_attrs(src.attrs, dst.attrs) - - # Write in blocks of output rows - for out_start in range(0, out_obs, chunk_rows): - out_end = min(out_start + chunk_rows, out_obs) - - 
if obs_idx is None: - block = src[out_start:out_end, :] - else: - rows = obs_idx[out_start:out_end] - block = src[rows, :] - - if var_idx is not None: - block = block[:, var_idx] - - dst[out_start:out_end, :] = block - - -def subset_sparse_matrix_group( - src: h5py.Group, - dst_parent: h5py.Group, - name: str, - obs_idx: Optional[np.ndarray], - var_idx: Optional[np.ndarray], -) -> None: - """ - Subset a sparse matrix stored as an h5ad group with datasets: - - data, indices, indptr - Supports both CSR (Compressed Sparse Row) and CSC (Compressed Sparse Column) formats. - - CSR: rows are compressed, efficient for row-wise operations - CSC: columns are compressed, efficient for column-wise operations - - Args: - src (h5py.Group): Source sparse matrix group - dst_parent (h5py.Group): Destination parent group - name (str): Name for the destination group - obs_idx (Optional[np.ndarray]): Indices of observations to keep - var_idx (Optional[np.ndarray]): Indices of variables to keep - """ - data = src["data"] - indices = src["indices"] - indptr = src["indptr"] - - # Determine format - encoding = src.attrs.get("encoding-type", b"") - if isinstance(encoding, bytes): - encoding = encoding.decode("utf-8") - - is_csr = encoding == "csr_matrix" - is_csc = encoding == "csc_matrix" - - if not is_csr and not is_csc: - raise ValueError(f"Unsupported sparse format: {encoding}") - - # Determine shape - shape = src.attrs.get("shape", None) - if shape is None: - # fallback: infer from indptr len and max index - major_dim = indptr.shape[0] - 1 - minor_dim = int(indices[...].max()) + 1 if indices.size else 0 - if is_csr: - n_obs, n_var = major_dim, minor_dim - else: # CSC - n_obs, n_var = minor_dim, major_dim - else: - n_obs, n_var = shape - - # For CSR: major axis = obs (rows), minor axis = var (cols) - # For CSC: major axis = var (cols), minor axis = obs (rows) - if is_csr: - major_idx = obs_idx if obs_idx is not None else np.arange(n_obs, dtype=np.int64) - minor_idx = var_idx - out_obs 
= major_idx.shape[0] - out_var = minor_idx.shape[0] if minor_idx is not None else n_var - else: # CSC - major_idx = var_idx if var_idx is not None else np.arange(n_var, dtype=np.int64) - minor_idx = obs_idx - out_obs = minor_idx.shape[0] if minor_idx is not None else n_obs - out_var = major_idx.shape[0] - - # Build minor axis remap if needed - minor_map = None - out_minor_dim = out_var if is_csr else out_obs - total_minor_dim = n_var if is_csr else n_obs - - if minor_idx is not None: - # array remap is fastest; if dimension is huge and memory matters, use dict instead - minor_map = np.full(total_minor_dim, -1, dtype=np.int64) - minor_map[minor_idx] = np.arange(minor_idx.shape[0], dtype=np.int64) - - # Pass 1: count nnz in output to preallocate - out_counts = np.zeros(len(major_idx), dtype=np.int64) - for i, major_pos in enumerate(major_idx): - s = int(indptr[major_pos]) - e = int(indptr[major_pos + 1]) - if s == e: - continue - minor_indices = indices[s:e] - if minor_map is None: - out_counts[i] = e - s - else: - mask = minor_map[minor_indices] >= 0 - out_counts[i] = mask.sum() - - out_indptr = np.zeros(len(major_idx) + 1, dtype=indptr.dtype) - np.cumsum(out_counts, out=out_indptr[1:]) - out_nnz = int(out_indptr[-1]) - - # Preallocate output arrays - out_data = np.empty(out_nnz, dtype=data.dtype) - out_indices = np.empty(out_nnz, dtype=indices.dtype) - - # Pass 2: fill - cursor = 0 - for i, major_pos in enumerate(major_idx): - s = int(indptr[major_pos]) - e = int(indptr[major_pos + 1]) - if s == e: - continue - - minor_indices = indices[s:e] - vals = data[s:e] - - if minor_map is None: - length = e - s - out_indices[cursor : cursor + length] = minor_indices - out_data[cursor : cursor + length] = vals - cursor += length - else: - mask = minor_map[minor_indices] >= 0 - new_minor = minor_map[minor_indices[mask]] - new_vals = vals[mask] - length = len(new_minor) - out_indices[cursor : cursor + length] = new_minor - out_data[cursor : cursor + length] = new_vals - cursor 
+= length - - # Create dst group - gdst = dst_parent.create_group(name) - _copy_attrs(src.attrs, gdst.attrs) - gdst.attrs["shape"] = (out_obs, out_var) - # Write encoding-type as bytes to match h5ad standard - gdst.attrs["encoding-type"] = ( - encoding.encode("utf-8") if isinstance(encoding, str) else encoding - ) - - # Write datasets (best-effort preserve compression/etc.) - # Adjust chunks to not exceed output size - data_kw = _ds_create_kwargs(data) - if "chunks" in data_kw and data_kw["chunks"] is not None: - data_kw["chunks"] = (min(data_kw["chunks"][0], out_nnz),) - d_data = gdst.create_dataset("data", data=out_data, **data_kw) - _copy_attrs(data.attrs, d_data.attrs) - - indices_kw = _ds_create_kwargs(indices) - if "chunks" in indices_kw and indices_kw["chunks"] is not None: - indices_kw["chunks"] = (min(indices_kw["chunks"][0], out_nnz),) - d_indices = gdst.create_dataset("indices", data=out_indices, **indices_kw) - _copy_attrs(indices.attrs, d_indices.attrs) - - indptr_kw = _ds_create_kwargs(indptr) - if "chunks" in indptr_kw and indptr_kw["chunks"] is not None: - indptr_kw["chunks"] = (min(indptr_kw["chunks"][0], len(out_indptr)),) - d_indptr = gdst.create_dataset("indptr", data=out_indptr, **indptr_kw) - _copy_attrs(indptr.attrs, d_indptr.attrs) - - -def subset_matrix_like( - src_obj: h5py.Dataset | h5py.Group, - dst_parent: h5py.Group, - name: str, - obs_idx: Optional[np.ndarray], - var_idx: Optional[np.ndarray], - *, - chunk_rows: int = 1024, -) -> None: - """ - Dispatch for dense dataset vs sparse (csr/csc) group. 
- Args: - src_obj (h5py.Dataset | h5py.Group): Source dataset or group - dst_parent (h5py.Group): Destination parent group - name (str): Name for the destination dataset/group - obs_idx (Optional[np.ndarray]): Indices of observations to keep - var_idx (Optional[np.ndarray]): Indices of variables to keep - chunk_rows (int): Number of rows to read per chunk when subsetting dense matrices - """ - if isinstance(src_obj, h5py.Dataset): - subset_dense_matrix( - src_obj, dst_parent, name, obs_idx, var_idx, chunk_rows=chunk_rows - ) - return - - # group - enc = src_obj.attrs.get("encoding-type", b"") - if isinstance(enc, bytes): - enc = enc.decode("utf-8") - - if enc in ("csr_matrix", "csc_matrix"): - subset_sparse_matrix_group(src_obj, dst_parent, name, obs_idx, var_idx) - else: - # unknown sparse type -> copy as-is (or raise) - src_obj.file.copy(src_obj, dst_parent, name) - - -def subset_h5ad( - file: Path, - output: Path, - obs_file: Optional[Path], - var_file: Optional[Path], - *, - chunk_rows: int = 1024, - console: Console, -) -> None: - """ - Subset an h5ad file by obs and/or var names. 
- Args: - file (Path): Input h5ad file path - output (Path): Output h5ad file path - obs_file (Optional[Path]): File with obs names to keep (one per line) - var_file (Optional[Path]): File with var names to keep (one per line) - chunk_rows (int): Number of rows to read per chunk when subsetting dense matrices - console (Console): Rich console for output - """ - # ---- Read keep-lists - obs_keep: Optional[Set[str]] = None - if obs_file is not None: - obs_keep = _read_name_file(obs_file) - console.print(f"[cyan]Found {len(obs_keep)} obs names to keep[/]") - - var_keep: Optional[Set[str]] = None - if var_file is not None: - var_keep = _read_name_file(var_file) - console.print(f"[cyan]Found {len(var_keep)} var names to keep[/]") - - if obs_keep is None and var_keep is None: - console.print( - "[bold red]Error:[/] At least one of [cyan]--obs[/] or [cyan]--var[/] must be provided.", - ) - raise typer.Exit(code=1) - - # ---- Open files - with console.status("[magenta]Opening files...[/]"): - src = h5py.File(file, "r") - dst = h5py.File(output, "w") - - try: - # ---- Compute indices - obs_idx = None - if obs_keep is not None: - console.print("[cyan]Matching obs names...[/]") - obs_names_ds = src["obs"].get("obs_names") or src["obs"].get( - src["obs"].attrs.get("_index", "obs_names") - ) - if obs_names_ds is None: - console.print("[bold red]Error:[/] Could not find obs names") - raise RuntimeError("Could not find obs names") - - obs_idx, missing_obs = indices_from_name_set(obs_names_ds, obs_keep) - if missing_obs: - console.print( - f"[yellow]Warning: {len(missing_obs)} obs names not found in file[/]" - ) - console.print( - f"[green]Selected {len(obs_idx)} obs (of {obs_names_ds.shape[0]})[/]" - ) - - var_idx = None - if var_keep is not None: - console.print("[cyan]Matching var names...[/]") - var_names_ds = src["var"].get("var_names") or src["var"].get( - src["var"].attrs.get("_index", "var_names") - ) - if var_names_ds is None: - console.print("[bold red]Error:[/] Could 
not find var names") - raise RuntimeError("Could not find var names") - - var_idx, missing_var = indices_from_name_set(var_names_ds, var_keep) - if missing_var: - console.print( - f"[yellow]Warning: {len(missing_var)} var names not found in file[/]" - ) - console.print( - f"[green]Selected {len(var_idx)} var (of {var_names_ds.shape[0]})[/]" - ) - - # ---- Build task list - tasks: List[str] = [] - if "obs" in src: - tasks.append("obs") - if "var" in src: - tasks.append("var") - if "X" in src: - tasks.append("X") - if "layers" in src: - tasks.extend([f"layer:{k}" for k in src["layers"].keys()]) - if "obsm" in src: - tasks.extend([f"obsm:{k}" for k in src["obsm"].keys()]) - if "varm" in src: - tasks.extend([f"varm:{k}" for k in src["varm"].keys()]) - if "obsp" in src: - tasks.extend([f"obsp:{k}" for k in src["obsp"].keys()]) - if "varp" in src: - tasks.extend([f"varp:{k}" for k in src["varp"].keys()]) - if "uns" in src: - tasks.append("uns") - - # ---- Progress bar for all operations - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TaskProgressColumn(), - TimeElapsedColumn(), - console=console, - ) as progress: - task_id = progress.add_task("[cyan]Subsetting...", total=len(tasks)) - processed_top: Set[str] = set() - - # obs - if "obs" in src: - progress.update(task_id, description="[cyan]Subsetting obs...[/]") - obs_dst = dst.create_group("obs") - subset_axis_group(src["obs"], obs_dst, obs_idx) - processed_top.add("obs") - progress.advance(task_id) - - # var - if "var" in src: - progress.update(task_id, description="[cyan]Subsetting var...[/]") - var_dst = dst.create_group("var") - subset_axis_group(src["var"], var_dst, var_idx) - processed_top.add("var") - progress.advance(task_id) - - # X - if "X" in src: - progress.update(task_id, description="[cyan]Subsetting X...[/]") - subset_matrix_like( - src["X"], dst, "X", obs_idx, var_idx, chunk_rows=chunk_rows - ) - processed_top.add("X") - 
progress.advance(task_id) - - # layers - if "layers" in src: - layers_dst = dst.create_group("layers") - processed_top.add("layers") - for lname in src["layers"].keys(): - progress.update( - task_id, description=f"[cyan]Subsetting layer: {lname}...[/]" - ) - subset_matrix_like( - src["layers"][lname], - layers_dst, - lname, - obs_idx, - var_idx, - chunk_rows=chunk_rows, - ) - progress.advance(task_id) - - # obsm - if "obsm" in src: - obsm_dst = dst.create_group("obsm") - processed_top.add("obsm") - for k in src["obsm"].keys(): - if obs_idx is None: - progress.update( - task_id, description=f"[cyan]Copying obsm: {k}...[/]" - ) - src["obsm"].copy(k, obsm_dst, name=k) - else: - progress.update( - task_id, description=f"[cyan]Subsetting obsm: {k}...[/]" - ) - obj = src["obsm"][k] - if isinstance(obj, h5py.Dataset): - data = obj[obs_idx, ...] - obsm_dst.create_dataset(k, data=data) - for ak, av in obj.attrs.items(): - obsm_dst[k].attrs[ak] = av - else: - subset_matrix_like( - obj, obsm_dst, k, obs_idx, None, chunk_rows=chunk_rows - ) - progress.advance(task_id) - - # varm - if "varm" in src: - varm_dst = dst.create_group("varm") - processed_top.add("varm") - for k in src["varm"].keys(): - if var_idx is None: - progress.update( - task_id, description=f"[cyan]Copying varm: {k}...[/]" - ) - src["varm"].copy(k, varm_dst, name=k) - else: - progress.update( - task_id, description=f"[cyan]Subsetting varm: {k}...[/]" - ) - obj = src["varm"][k] - if isinstance(obj, h5py.Dataset): - data = obj[var_idx, ...] 
- varm_dst.create_dataset(k, data=data) - for ak, av in obj.attrs.items(): - varm_dst[k].attrs[ak] = av - else: - subset_matrix_like( - obj, varm_dst, k, var_idx, None, chunk_rows=chunk_rows - ) - progress.advance(task_id) - - # obsp - if "obsp" in src: - obsp_dst = dst.create_group("obsp") - processed_top.add("obsp") - for k in src["obsp"].keys(): - if obs_idx is None: - progress.update( - task_id, description=f"[cyan]Copying obsp: {k}...[/]" - ) - src["obsp"].copy(k, obsp_dst, name=k) - else: - progress.update( - task_id, description=f"[cyan]Subsetting obsp: {k}...[/]" - ) - subset_matrix_like( - src["obsp"][k], - obsp_dst, - k, - obs_idx, - obs_idx, - chunk_rows=chunk_rows, - ) - progress.advance(task_id) - - # varp - if "varp" in src: - varp_dst = dst.create_group("varp") - processed_top.add("varp") - for k in src["varp"].keys(): - if var_idx is None: - progress.update( - task_id, description=f"[cyan]Copying varp: {k}...[/]" - ) - src["varp"].copy(k, varp_dst, name=k) - else: - progress.update( - task_id, description=f"[cyan]Subsetting varp: {k}...[/]" - ) - subset_matrix_like( - src["varp"][k], - varp_dst, - k, - var_idx, - var_idx, - chunk_rows=chunk_rows, - ) - progress.advance(task_id) - - # uns - if "uns" in src: - progress.update(task_id, description="[cyan]Copying uns...[/]") - src.copy("uns", dst) - processed_top.add("uns") - progress.advance(task_id) - - # copy any remaining top-level keys - for key in src.keys(): - if key not in processed_top: - src.copy(key, dst) - - # top-level attrs - for ak, av in src.attrs.items(): - dst.attrs[ak] = av - - console.print(f"[bold green]✓ Successfully created {output}[/]") - - finally: - dst.close() - src.close() +__all__ = [ + "_read_name_file", + "indices_from_name_set", + "subset_axis_group", + "subset_dense_matrix", + "subset_h5ad", + "subset_sparse_matrix_group", +] diff --git a/src/h5ad/commands/table.py b/src/h5ad/commands/table.py deleted file mode 100644 index 16b7686..0000000 --- 
a/src/h5ad/commands/table.py +++ /dev/null @@ -1,90 +0,0 @@ -import sys -import csv -from pathlib import Path -from typing import List, Optional, Dict - -import h5py -import numpy as np -from rich.console import Console -from h5ad.info import get_axis_group -from h5ad.read import col_chunk_as_strings - - -def export_table( - file: Path, - axis: str, - columns: Optional[List[str]], - out: Optional[Path], - chunk_rows: int, - head: Optional[int], - console: Console, -) -> None: - """ - Export a table of the specified axis to CSV format. - Args: - file (Path): Path to the .h5ad file - axis (str): Axis to read from ('obs' or 'var') - columns (Optional[List[str]]): List of column names to include in the output table - out (Optional[Path]): Output file path (defaults to stdout) - chunk_rows (int): Number of rows to read per chunk - head (Optional[int]): Output only the first n rows - """ - with h5py.File(file, "r") as f: - group, n_rows, index_name = get_axis_group(f, axis) - - # Determine columns to read - if columns: - col_names = list(columns) - else: - col_names = [k for k in group.keys() if k != "_index" and k != index_name] - # Add index name if not already present - if index_name and index_name not in col_names: - col_names.insert(0, index_name) - - if isinstance(index_name, bytes): - index_name = index_name.decode("utf-8") - - if index_name not in col_names: - col_names.insert(0, index_name) - else: - col_names = [index_name] + [c for c in col_names if c != index_name] - - # Limit rows if head option is specified - if head is not None and head > 0: - n_rows = min(n_rows, head) - - # Open writer - if out is None or str(out) == "-": - out_fh = sys.stdout - else: - out_fh = open(out, "w", newline="", encoding="utf-8") - writer = csv.writer(out_fh) - - # Write data in chunks - try: - writer.writerow(col_names) - cat_cache: Dict[int, np.ndarray] = {} - with console.status( - f"[magenta]Exporting {axis} table...[/] to {'stdout' if out_fh is sys.stdout else out}" - ) as 
status: - for start in range(0, n_rows, chunk_rows): - end = min(start + chunk_rows, n_rows) - status.update( - f"[magenta]Exporting rows {start}-{end} of {n_rows}...[/]" - ) - cols_data: List[List[str]] = [] - # Read each column for the current chunk - for col in col_names: - cols_data.append( - col_chunk_as_strings(group, col, start, end, cat_cache) - ) - # Write rows - for row_idx in range(end - start): - row = [ - cols_data[col_idx][row_idx] - for col_idx in range(len(col_names)) - ] - writer.writerow(row) - finally: - if out_fh is not sys.stdout: - out_fh.close() diff --git a/src/h5ad/core/__init__.py b/src/h5ad/core/__init__.py new file mode 100644 index 0000000..9224273 --- /dev/null +++ b/src/h5ad/core/__init__.py @@ -0,0 +1 @@ +"""Core logic shared by CLI commands and format handlers.""" diff --git a/src/h5ad/core/info.py b/src/h5ad/core/info.py new file mode 100644 index 0000000..8db8a14 --- /dev/null +++ b/src/h5ad/core/info.py @@ -0,0 +1,221 @@ +from __future__ import annotations + +from typing import Optional, Tuple, Dict, Any, Union + +import numpy as np + +from h5ad.storage import is_dataset, is_group, is_hdf5_dataset + + +def _decode_attr(value: Any) -> Any: + if isinstance(value, bytes): + return value.decode("utf-8") + return value + + +def get_entry_type(entry: Any) -> Dict[str, Any]: + """ + Determine the type/format of an object for export guidance. 
+ + Supports both: + - v0.2.0 (modern): Objects with encoding-type/encoding-version attributes + - v0.1.0 (legacy): Objects without encoding attributes, inferred from structure + """ + result: Dict[str, Any] = { + "type": "unknown", + "export_as": None, + "encoding": None, + "shape": None, + "dtype": None, + "details": "", + "version": None, + } + + enc = _decode_attr(entry.attrs.get("encoding-type", b"")) + result["encoding"] = enc if enc else None + + enc_ver = _decode_attr(entry.attrs.get("encoding-version", b"")) + result["version"] = enc_ver if enc_ver else None + + if is_dataset(entry): + result["shape"] = entry.shape + result["dtype"] = str(entry.dtype) + + if "categories" in entry.attrs: + result["type"] = "categorical" + result["export_as"] = "csv" + result["version"] = result["version"] or "0.1.0" + n_cats = "?" + if is_hdf5_dataset(entry): + try: + cats_ref = entry.attrs["categories"] + cats_ds = entry.file[cats_ref] + n_cats = cats_ds.shape[0] + except Exception: + n_cats = "?" 
+ result["details"] = ( + f"Legacy categorical [{entry.shape[0]} values, {n_cats} categories]" + ) + return result + + if entry.shape == (): + result["type"] = "scalar" + result["export_as"] = "json" + result["details"] = f"Scalar value ({entry.dtype})" + return result + + if entry.ndim == 1: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"1D array [{entry.shape[0]}] ({entry.dtype})" + elif entry.ndim == 2: + result["type"] = "dense-matrix" + result["export_as"] = "npy" + result["details"] = ( + f"Dense matrix {entry.shape[0]}×{entry.shape[1]} ({entry.dtype})" + ) + elif entry.ndim == 3: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"3D array {entry.shape} ({entry.dtype})" + else: + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"ND array {entry.shape} ({entry.dtype})" + return result + + if is_group(entry): + if enc in ("csr_matrix", "csc_matrix"): + shape = entry.attrs.get("shape", None) + shape_str = f"{shape[0]}×{shape[1]}" if shape is not None else "?" + result["type"] = "sparse-matrix" + result["export_as"] = "mtx" + result["details"] = ( + f"Sparse {enc.replace('_matrix', '').upper()} matrix {shape_str}" + ) + return result + + if enc == "categorical": + codes = entry.get("codes") + cats = entry.get("categories") + n_codes = codes.shape[0] if codes is not None else "?" + n_cats = cats.shape[0] if cats is not None else "?" 
+ result["type"] = "categorical" + result["export_as"] = "csv" + result["details"] = f"Categorical [{n_codes} values, {n_cats} categories]" + return result + + if ( + enc == "dataframe" + or "_index" in entry.attrs + or "obs_names" in entry + or "var_names" in entry + ): + if enc == "dataframe": + df_version = result["version"] or "0.2.0" + else: + df_version = "0.1.0" + result["version"] = df_version + + has_legacy_cats = "__categories" in entry + n_cols = len( + [k for k in entry.keys() if k not in ("_index", "__categories")] + ) + + result["type"] = "dataframe" + result["export_as"] = "csv" + if has_legacy_cats: + result["details"] = f"DataFrame with {n_cols} columns (legacy v0.1.0)" + else: + result["details"] = f"DataFrame with {n_cols} columns" + return result + + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = f"Encoded array ({enc})" + return result + + if enc == "string-array": + result["type"] = "array" + result["export_as"] = "npy" + result["details"] = "Encoded string array" + return result + + if enc == "awkward-array": + length = entry.attrs.get("length", "?") + result["type"] = "awkward-array" + result["export_as"] = "json" + result["details"] = f"Awkward array (length={length})" + return result + + n_keys = len(list(entry.keys())) + result["type"] = "dict" + result["export_as"] = "json" + result["details"] = f"Group with {n_keys} keys" + return result + + return result + + +def format_type_info(info: Dict[str, Any]) -> str: + type_colors = { + "dataframe": "green", + "sparse-matrix": "magenta", + "dense-matrix": "blue", + "array": "blue", + "dict": "yellow", + "categorical": "green", + "scalar": "white", + "unknown": "red", + } + + color = type_colors.get(info["type"], "white") + return f"[{color}]<{info['type']}>[/]" + + +def axis_len(file: Any, axis: str) -> int: + if axis not in file: + raise KeyError(f"'{axis}' not found in the file.") + + 
group = file[axis] + if not is_group(group): + raise TypeError(f"'{axis}' is not a group.") + + index_name = group.attrs.get("_index", None) + if index_name is None: + if axis == "obs": + index_name = "obs_names" + elif axis == "var": + index_name = "var_names" + else: + raise ValueError(f"Invalid axis '{axis}'. Must be 'obs' or 'var'.") + + index_name = _decode_attr(index_name) + + if index_name not in group: + raise KeyError(f"Index dataset '{index_name}' not found in '{axis}' group.") + + dataset = group[index_name] + if not is_dataset(dataset): + raise TypeError(f"Index '{index_name}' in '{axis}' is not a dataset.") + if dataset.shape: + return int(dataset.shape[0]) + raise ValueError( + f"Cannot determine length of '{axis}': index dataset has no shape." + ) + + +def get_axis_group(file: Any, axis: str) -> Tuple[Any, int, str]: + if axis not in ("obs", "var"): + raise ValueError("axis must be 'obs' or 'var'.") + + n = axis_len(file, axis) + group = file[axis] + + index_name = group.attrs.get("_index", None) + if index_name is None: + index_name = "obs_names" if axis == "obs" else "var_names" + index_name = _decode_attr(index_name) + + return group, n, index_name diff --git a/src/h5ad/core/read.py b/src/h5ad/core/read.py new file mode 100644 index 0000000..b81ee1f --- /dev/null +++ b/src/h5ad/core/read.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +from typing import List, Dict, Any + +import h5py +import numpy as np + +from h5ad.storage import is_group, is_dataset, is_hdf5_dataset + + +def decode_str_array(array: np.ndarray) -> np.ndarray: + if np.issubdtype(array.dtype, np.bytes_): + return array.astype("U") + if array.dtype.kind == "O": + return array.astype(str) + return array.astype(str) + + +def read_categorical_column( + col: Any, + start: int, + end: int, + cache: Dict[int, np.ndarray], + parent_group: Any | None = None, +) -> List[str]: + key = id(col) + + if is_group(col): + if key not in cache: + cats = col["categories"][...] 
+ cats = decode_str_array(cats) + cache[key] = np.asarray(cats, dtype=str) + cats = cache[key] + + codes_ds = col["codes"] + codes = codes_ds[start:end] + codes = np.asarray(codes, dtype=np.int64) + return [cats[c] if 0 <= c < len(cats) else "" for c in codes] + + if is_dataset(col): + if key not in cache: + cats_ref = col.attrs.get("categories", None) + if cats_ref is not None and is_hdf5_dataset(col): + cats_ds = col.file[cats_ref] + cats = cats_ds[...] + elif parent_group is not None and "__categories" in parent_group: + col_name = col.name.split("/")[-1] + cats_grp = parent_group["__categories"] + if col_name in cats_grp: + cats = cats_grp[col_name][...] + else: + raise KeyError( + f"Cannot find categories for legacy column {col.name}" + ) + else: + raise KeyError( + f"Cannot find categories for legacy column {col.name}" + ) + cats = decode_str_array(cats) + cache[key] = np.asarray(cats, dtype=str) + cats = cache[key] + + codes = col[start:end] + codes = np.asarray(codes, dtype=np.int64) + return [cats[c] if 0 <= c < len(cats) else "" for c in codes] + + raise TypeError(f"Unsupported categorical column type: {type(col)}") + + +def col_chunk_as_strings( + group: Any, + col_name: str, + start: int, + end: int, + cat_cache: Dict[int, np.ndarray], +) -> List[str]: + if col_name not in group: + raise RuntimeError(f"Column {col_name!r} not found in group {group.name}") + + col = group[col_name] + + if is_dataset(col): + if "categories" in col.attrs: + return read_categorical_column(col, start, end, cat_cache, group) + + chunk = col[start:end] + if chunk.ndim != 1: + chunk = chunk.reshape(-1) + chunk = decode_str_array(np.asarray(chunk)) + return chunk.tolist() + + if is_group(col): + enc = col.attrs.get("encoding-type", b"") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + + if enc == "categorical": + return read_categorical_column(col, start, end, cat_cache) + + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): + values = 
col["values"][start:end] + mask = col["mask"][start:end] + values = decode_str_array(np.asarray(values)) + return ["" if m else str(v) for v, m in zip(values, mask)] + + raise ValueError( + f"Unsupported group encoding {enc!r} for column {col_name!r}" + ) + + raise TypeError( + f"Unsupported column type for {col_name!r} in group {group.name}" + ) diff --git a/src/h5ad/core/subset.py b/src/h5ad/core/subset.py new file mode 100644 index 0000000..d9e7829 --- /dev/null +++ b/src/h5ad/core/subset.py @@ -0,0 +1,529 @@ +"""Subset operations for .h5ad and .zarr stores.""" + +from __future__ import annotations + +from pathlib import Path +import shutil +from typing import Optional, Set, Tuple, List, Dict, Any + +import numpy as np +from rich.console import Console +from rich.progress import ( + Progress, + SpinnerColumn, + TextColumn, + BarColumn, + TaskProgressColumn, + TimeElapsedColumn, +) + +from h5ad.core.read import decode_str_array +from h5ad.storage import ( + create_dataset, + copy_attrs, + copy_tree, + dataset_create_kwargs, + detect_backend, + is_dataset, + is_group, + is_zarr_group, + is_zarr_array, + open_store, +) + + +def _target_backend(dst_group: Any) -> str: + return "zarr" if is_zarr_group(dst_group) else "hdf5" + + +def _ensure_group(parent: Any, name: str) -> Any: + return parent[name] if name in parent else parent.create_group(name) + + +def _group_get(parent: Any, key: str) -> Any | None: + return parent[key] if key in parent else None + + +def _decode_attr(value: Any) -> Any: + if isinstance(value, bytes): + return value.decode("utf-8") + return value + + +def _read_name_file(path: Path) -> Set[str]: + names: Set[str] = set() + with open(path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if line: + names.add(line) + return names + + +def indices_from_name_set( + names_ds: Any, + keep: Set[str], + *, + chunk_size: int = 200_000, +) -> Tuple[np.ndarray, Set[str]]: + if names_ds.ndim != 1: + flat_len = 
int(np.prod(names_ds.shape)) + else: + flat_len = names_ds.shape[0] + + remaining = set(keep) + found_indices: List[int] = [] + + for start in range(0, flat_len, chunk_size): + end = min(start + chunk_size, flat_len) + chunk = names_ds[start:end] + chunk = decode_str_array(np.asarray(chunk)).astype(str) + + for i, name in enumerate(chunk): + if name in remaining: + found_indices.append(start + i) + remaining.remove(name) + + if not remaining: + break + + return np.asarray(found_indices, dtype=np.int64), remaining + + +def subset_axis_group( + src: Any, + dst: Any, + indices: Optional[np.ndarray], +) -> None: + copy_attrs(src.attrs, dst.attrs, target_backend=_target_backend(dst)) + target_backend = _target_backend(dst) + + for key in src.keys(): + obj = src[key] + + if is_dataset(obj): + if indices is None: + copy_tree(obj, dst, key) + else: + if is_zarr_array(obj): + if obj.ndim == 1: + data = obj.oindex[indices] + else: + selection = (indices,) + (slice(None),) * (obj.ndim - 1) + data = obj.oindex[selection] + else: + data = obj[indices, ...] + ds = create_dataset( + dst, + key, + data=data, + **dataset_create_kwargs(obj, target_backend=target_backend), + ) + copy_attrs(obj.attrs, ds.attrs, target_backend=target_backend) + elif is_group(obj): + enc = obj.attrs.get("encoding-type", b"") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + + if enc == "categorical": + gdst = dst.create_group(key) + copy_attrs(obj.attrs, gdst.attrs, target_backend=target_backend) + copy_tree(obj["categories"], gdst, "categories") + + codes = obj["codes"] + if indices is None: + copy_tree(codes, gdst, "codes") + else: + codes_sub = codes[indices, ...] 
+ ds = create_dataset( + gdst, + "codes", + data=codes_sub, + **dataset_create_kwargs(codes, target_backend=target_backend), + ) + copy_attrs(codes.attrs, ds.attrs, target_backend=target_backend) + else: + copy_tree(obj, dst, key) + + +def subset_dense_matrix( + src: Any, + dst_parent: Any, + name: str, + obs_idx: Optional[np.ndarray], + var_idx: Optional[np.ndarray], + *, + chunk_rows: int = 1024, +) -> None: + if src.ndim != 2: + copy_tree(src, dst_parent, name) + return + + n_obs, n_var = src.shape + out_obs = len(obs_idx) if obs_idx is not None else n_obs + out_var = len(var_idx) if var_idx is not None else n_var + + target_backend = _target_backend(dst_parent) + kw = dataset_create_kwargs(src, target_backend=target_backend) + chunks = kw.get("chunks") + if isinstance(chunks, (tuple, list)) and len(chunks) >= 2: + kw["chunks"] = (min(int(chunks[0]), out_obs), min(int(chunks[1]), out_var)) + + dst = create_dataset( + dst_parent, + name, + shape=(out_obs, out_var), + dtype=src.dtype, + **kw, + ) + copy_attrs(src.attrs, dst.attrs, target_backend=_target_backend(dst_parent)) + + for out_start in range(0, out_obs, chunk_rows): + out_end = min(out_start + chunk_rows, out_obs) + + if obs_idx is None: + block = src[out_start:out_end, :] + else: + rows = obs_idx[out_start:out_end] + block = src[rows, :] + + if var_idx is not None: + block = block[:, var_idx] + + dst[out_start:out_end, :] = block + + +def subset_sparse_matrix_group( + src: Any, + dst_parent: Any, + name: str, + obs_idx: Optional[np.ndarray], + var_idx: Optional[np.ndarray], +) -> None: + enc = src.attrs.get("encoding-type", b"") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + + if enc not in ("csr_matrix", "csc_matrix"): + raise ValueError(f"Unsupported sparse encoding type: {enc}") + + data = np.asarray(src["data"][...]) + indices = np.asarray(src["indices"][...], dtype=np.int64) + indptr = np.asarray(src["indptr"][...], dtype=np.int64) + shape = src.attrs.get("shape", None) + if shape is 
None: + raise ValueError("Sparse matrix group missing 'shape' attribute.") + n_rows, n_cols = int(shape[0]), int(shape[1]) + + if enc == "csr_matrix": + row_idx = obs_idx if obs_idx is not None else np.arange(n_rows, dtype=np.int64) + col_idx = var_idx if var_idx is not None else np.arange(n_cols, dtype=np.int64) + + new_data = [] + new_indices = [] + new_indptr = [0] + + for r in row_idx: + start = indptr[r] + end = indptr[r + 1] + row_cols = indices[start:end] + row_data = data[start:end] + + if var_idx is not None: + col_mask = np.isin(row_cols, col_idx) + row_cols = row_cols[col_mask] + row_data = row_data[col_mask] + + if var_idx is not None: + col_map = {c: i for i, c in enumerate(col_idx)} + row_cols = np.array([col_map[c] for c in row_cols], dtype=np.int64) + + new_indices.extend(row_cols.tolist()) + new_data.extend(row_data.tolist()) + new_indptr.append(len(new_indices)) + + new_shape = (len(row_idx), len(col_idx)) + else: + row_idx = obs_idx if obs_idx is not None else np.arange(n_rows, dtype=np.int64) + col_idx = var_idx if var_idx is not None else np.arange(n_cols, dtype=np.int64) + + new_data = [] + new_indices = [] + new_indptr = [0] + + for c in col_idx: + start = indptr[c] + end = indptr[c + 1] + col_rows = indices[start:end] + col_data = data[start:end] + + if obs_idx is not None: + row_mask = np.isin(col_rows, row_idx) + col_rows = col_rows[row_mask] + col_data = col_data[row_mask] + + if obs_idx is not None: + row_map = {r: i for i, r in enumerate(row_idx)} + col_rows = np.array([row_map[r] for r in col_rows], dtype=np.int64) + + new_indices.extend(col_rows.tolist()) + new_data.extend(col_data.tolist()) + new_indptr.append(len(new_indices)) + + new_shape = (len(row_idx), len(col_idx)) + + group = dst_parent.create_group(name) + group.attrs["encoding-type"] = enc + group.attrs["encoding-version"] = "0.1.0" + if is_zarr_group(group): + group.attrs["shape"] = list(new_shape) + else: + group.attrs["shape"] = np.array(new_shape, dtype=np.int64) + + 
create_dataset(group, "data", data=np.array(new_data, dtype=data.dtype)) + create_dataset(group, "indices", data=np.array(new_indices, dtype=indices.dtype)) + create_dataset(group, "indptr", data=np.array(new_indptr, dtype=indptr.dtype)) + + +def subset_matrix_entry( + obj: Any, + dst_parent: Any, + name: str, + obs_idx: Optional[np.ndarray], + var_idx: Optional[np.ndarray], + *, + chunk_rows: int, + entry_label: str, +) -> None: + if is_dataset(obj): + subset_dense_matrix( + obj, dst_parent, name, obs_idx, var_idx, chunk_rows=chunk_rows + ) + return + + if is_group(obj): + enc = obj.attrs.get("encoding-type", b"") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + if enc in ("csr_matrix", "csc_matrix"): + subset_sparse_matrix_group(obj, dst_parent, name, obs_idx, var_idx) + return + raise ValueError(f"Unsupported {entry_label} encoding type: {enc}") + + raise ValueError(f"Unsupported {entry_label} object type") + + +def subset_h5ad( + file: Path, + output: Optional[Path], + obs_file: Optional[Path], + var_file: Optional[Path], + *, + chunk_rows: int = 1024, + console: Console, + inplace: bool = False, +) -> None: + obs_keep: Optional[Set[str]] = None + if obs_file is not None: + obs_keep = _read_name_file(obs_file) + console.print(f"[cyan]Found {len(obs_keep)} obs names to keep[/]") + + var_keep: Optional[Set[str]] = None + if var_file is not None: + var_keep = _read_name_file(var_file) + console.print(f"[cyan]Found {len(var_keep)} var names to keep[/]") + + if obs_keep is None and var_keep is None: + raise ValueError("At least one of --obs or --var must be provided.") + + if not inplace and output is None: + raise ValueError("Output file is required unless --inplace is specified.") + + if inplace: + src_backend = detect_backend(file) + if src_backend == "zarr": + base_name = file.stem if file.suffix else file.name + tmp_path = file.with_name(f"{base_name}.subset-tmp.zarr") + else: + tmp_path = file.with_name(f"{file.name}.subset-tmp") + if 
tmp_path.exists(): + raise FileExistsError(f"Temporary path already exists: {tmp_path}") + dst_path = tmp_path + else: + dst_path = output + + with console.status("[magenta]Opening files...[/]"): + with open_store(file, "r") as src_store, open_store(dst_path, "w") as dst_store: + src = src_store.root + dst = dst_store.root + + obs_idx = None + if obs_keep is not None: + console.print("[cyan]Matching obs names...[/]") + obs_group = src["obs"] + obs_index = _decode_attr(obs_group.attrs.get("_index", "obs_names")) + obs_names_ds = _group_get(obs_group, "obs_names") or _group_get( + obs_group, obs_index + ) + if obs_names_ds is None: + raise KeyError("Could not find obs names") + + obs_idx, missing_obs = indices_from_name_set(obs_names_ds, obs_keep) + if missing_obs: + console.print( + f"[yellow]Warning: {len(missing_obs)} obs names not found in file[/]" + ) + console.print( + f"[green]Selected {len(obs_idx)} obs (of {obs_names_ds.shape[0]})[/]" + ) + + var_idx = None + if var_keep is not None: + console.print("[cyan]Matching var names...[/]") + var_group = src["var"] + var_index = _decode_attr(var_group.attrs.get("_index", "var_names")) + var_names_ds = _group_get(var_group, "var_names") or _group_get( + var_group, var_index + ) + if var_names_ds is None: + raise KeyError("Could not find var names") + + var_idx, missing_var = indices_from_name_set(var_names_ds, var_keep) + if missing_var: + console.print( + f"[yellow]Warning: {len(missing_var)} var names not found in file[/]" + ) + console.print( + f"[green]Selected {len(var_idx)} var (of {var_names_ds.shape[0]})[/]" + ) + + tasks: List[str] = [] + if "obs" in src: + tasks.append("obs") + if "var" in src: + tasks.append("var") + if "X" in src: + tasks.append("X") + if "layers" in src: + tasks.extend([f"layer:{k}" for k in src["layers"].keys()]) + if "obsm" in src: + tasks.extend([f"obsm:{k}" for k in src["obsm"].keys()]) + if "varm" in src: + tasks.extend([f"varm:{k}" for k in src["varm"].keys()]) + if "obsp" in src: 
+ tasks.extend([f"obsp:{k}" for k in src["obsp"].keys()]) + if "varp" in src: + tasks.extend([f"varp:{k}" for k in src["varp"].keys()]) + if "uns" in src: + tasks.append("uns") + + with Progress( + SpinnerColumn(finished_text="[green]✓[/]"), + TextColumn("[progress.description]{task.description}"), + console=console, + transient=False, + ) as progress: + for task in tasks: + task_id = progress.add_task( + f"[cyan]Subsetting {task}...[/]", total=None + ) + if task == "obs": + obs_dst = dst.create_group("obs") + subset_axis_group(src["obs"], obs_dst, obs_idx) + elif task == "var": + var_dst = dst.create_group("var") + subset_axis_group(src["var"], var_dst, var_idx) + elif task == "X": + X = src["X"] + if is_dataset(X): + subset_dense_matrix( + X, dst, "X", obs_idx, var_idx, chunk_rows=chunk_rows + ) + elif is_group(X): + subset_sparse_matrix_group(X, dst, "X", obs_idx, var_idx) + else: + copy_tree(X, dst, "X") + elif task.startswith("layer:"): + key = task.split(":", 1)[1] + layer_src = src["layers"][key] + layers_dst = _ensure_group(dst, "layers") + subset_matrix_entry( + layer_src, + layers_dst, + key, + obs_idx, + var_idx, + chunk_rows=chunk_rows, + entry_label=f"layer:{key}", + ) + elif task.startswith("obsm:"): + key = task.split(":", 1)[1] + obsm_dst = _ensure_group(dst, "obsm") + obsm_obj = src["obsm"][key] + subset_matrix_entry( + obsm_obj, + obsm_dst, + key, + obs_idx, + None, + chunk_rows=chunk_rows, + entry_label=f"obsm:{key}", + ) + elif task.startswith("varm:"): + key = task.split(":", 1)[1] + varm_dst = _ensure_group(dst, "varm") + varm_obj = src["varm"][key] + subset_matrix_entry( + varm_obj, + varm_dst, + key, + var_idx, + None, + chunk_rows=chunk_rows, + entry_label=f"varm:{key}", + ) + elif task.startswith("obsp:"): + key = task.split(":", 1)[1] + obsp_dst = _ensure_group(dst, "obsp") + obsp_obj = src["obsp"][key] + subset_matrix_entry( + obsp_obj, + obsp_dst, + key, + obs_idx, + obs_idx, + chunk_rows=chunk_rows, + entry_label=f"obsp:{key}", + ) + 
elif task.startswith("varp:"): + key = task.split(":", 1)[1] + varp_dst = _ensure_group(dst, "varp") + varp_obj = src["varp"][key] + subset_matrix_entry( + varp_obj, + varp_dst, + key, + var_idx, + var_idx, + chunk_rows=chunk_rows, + entry_label=f"varp:{key}", + ) + elif task == "uns": + copy_tree(src["uns"], dst, "uns") + progress.update( + task_id, + description=f"[green]Subsetting {task}[/]", + completed=1, + total=1, + ) + + if inplace: + if file.exists(): + if file.is_dir(): + shutil.rmtree(file) + else: + file.unlink() + if dst_path.is_dir(): + shutil.move(str(dst_path), str(file)) + else: + dst_path.replace(file) diff --git a/src/h5ad/formats/__init__.py b/src/h5ad/formats/__init__.py new file mode 100644 index 0000000..18b9721 --- /dev/null +++ b/src/h5ad/formats/__init__.py @@ -0,0 +1 @@ +"""Format-specific import/export helpers.""" diff --git a/src/h5ad/formats/array.py b/src/h5ad/formats/array.py new file mode 100644 index 0000000..1dd21ac --- /dev/null +++ b/src/h5ad/formats/array.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import numpy as np +from rich.console import Console + +from h5ad.formats.common import _get_encoding_type, _resolve +from h5ad.formats.validate import validate_dimensions +from h5ad.storage import create_dataset, is_dataset, is_group +from h5ad.util.path import norm_path + + +def export_npy( + root: Any, + obj: str, + out: Path, + chunk_elements: int, + console: Console, +) -> None: + h5obj = _resolve(root, obj) + + if is_group(h5obj): + enc = _get_encoding_type(h5obj) + if enc in ("nullable-integer", "nullable-boolean", "nullable-string-array"): + if "values" not in h5obj: + raise ValueError(f"Encoded group '{obj}' is missing 'values' dataset.") + ds = h5obj["values"] + console.print(f"[dim]Exporting nullable array values from '{obj}'[/]") + else: + raise ValueError( + f"Target '{obj}' is a group with encoding '{enc}'; cannot export as .npy directly." 
+ ) + elif is_dataset(h5obj): + ds = h5obj + else: + raise ValueError("Target is not an array-like object.") + + out.parent.mkdir(parents=True, exist_ok=True) + mm = np.lib.format.open_memmap(out, mode="w+", dtype=ds.dtype, shape=ds.shape) + try: + if ds.shape == (): + mm[...] = ds[()] + console.print(f"[green]Wrote[/] {out}") + return + + if ds.ndim == 1: + n = int(ds.shape[0]) + step = max(1, int(chunk_elements)) + with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: + for start in range(0, n, step): + end = min(start + step, n) + status.update( + f"[magenta]Exporting {obj}: {start}-{end} of {n}...[/]" + ) + mm[start:end] = ds[start:end] + console.print(f"[green]Wrote[/] {out}") + return + + n0 = int(ds.shape[0]) + row_elems = int(np.prod(ds.shape[1:])) if ds.ndim > 1 else 1 + step0 = max(1, int(chunk_elements) // max(1, row_elems)) + with console.status(f"[magenta]Exporting {obj} to {out}...[/]") as status: + for start in range(0, n0, step0): + end = min(start + step0, n0) + status.update( + f"[magenta]Exporting {obj}: {start}-{end} of {n0}...[/]" + ) + mm[start:end, ...] = ds[start:end, ...] 
+ console.print(f"[green]Wrote[/] {out}") + finally: + del mm + + +def import_npy( + root: Any, + obj: str, + input_file: Path, + console: Console, +) -> None: + obj = norm_path(obj) + arr = np.load(input_file) + + validate_dimensions(root, obj, arr.shape, console) + + parts = obj.split("/") + parent = root + for part in parts[:-1]: + parent = parent[part] if part in parent else parent.create_group(part) + name = parts[-1] + + if name in parent: + del parent[name] + + create_dataset(parent, name, data=arr) + + shape_str = "×".join(str(d) for d in arr.shape) + console.print(f"[green]Imported[/] {shape_str} array into '{obj}'") diff --git a/src/h5ad/formats/common.py b/src/h5ad/formats/common.py new file mode 100644 index 0000000..6282eb5 --- /dev/null +++ b/src/h5ad/formats/common.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from typing import Any, Dict + +import numpy as np + +from h5ad.storage import is_dataset, is_group +from h5ad.util.path import norm_path + + +TYPE_EXTENSIONS = { + "dataframe": {".csv"}, + "sparse-matrix": {".mtx"}, + "dense-matrix": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}, + "array": {".npy", ".png", ".jpg", ".jpeg", ".tif", ".tiff"}, + "dict": {".json"}, + "scalar": {".json"}, + "categorical": {".csv"}, + "awkward-array": {".json"}, +} + +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tif", ".tiff"} + +EXPORTABLE_TYPES = set(TYPE_EXTENSIONS.keys()) + + +def _get_encoding_type(group: Any) -> str: + enc = group.attrs.get("encoding-type", "") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + return str(enc) + + +def _resolve(root: Any, obj: str) -> Any: + obj = norm_path(obj) + if obj not in root: + raise KeyError(f"'{obj}' not found in the file.") + return root[obj] + + +def _check_json_exportable(h5obj: Any, max_elements: int, path: str = "") -> None: + if is_dataset(h5obj): + if h5obj.shape == (): + return + n = int(np.prod(h5obj.shape)) if h5obj.shape else 0 + if n > max_elements: + obj_name = getattr(h5obj, 
"name", "") + raise ValueError( + f"Cannot export to JSON: '{path or obj_name}' has {n} elements " + f"(max {max_elements}). Use --max-elements to increase limit." + ) + return + + if is_group(h5obj): + enc = _get_encoding_type(h5obj) + if enc in ("csr_matrix", "csc_matrix"): + obj_name = getattr(h5obj, "name", "") + raise ValueError( + f"Cannot export to JSON: '{path or obj_name}' is a sparse matrix. " + "Export it as .mtx instead." + ) + + for key in h5obj.keys(): + child = h5obj[key] + child_path = f"{path}/{key}" if path else key + if is_group(child) or is_dataset(child): + _check_json_exportable(child, max_elements=max_elements, path=child_path) diff --git a/src/h5ad/formats/dataframe.py b/src/h5ad/formats/dataframe.py new file mode 100644 index 0000000..f767c4c --- /dev/null +++ b/src/h5ad/formats/dataframe.py @@ -0,0 +1,169 @@ +from __future__ import annotations + +import csv +import sys +from contextlib import nullcontext +from pathlib import Path +from typing import Any, List, Optional, Tuple + +import numpy as np +from rich.console import Console + +from h5ad.core.info import get_axis_group +from h5ad.core.read import col_chunk_as_strings +from h5ad.formats.validate import validate_dimensions +from h5ad.storage import create_dataset, is_zarr_group + + +def export_dataframe( + root: Any, + axis: str, + columns: Optional[List[str]], + out: Optional[Path], + chunk_rows: int, + head: Optional[int], + console: Console, +) -> None: + group, n_rows, index_name = get_axis_group(root, axis) + + reserved_keys = {"_index", "__categories"} + + if columns: + col_names = list(columns) + else: + col_names = [ + k for k in group.keys() if k not in reserved_keys and k != index_name + ] + if index_name and index_name not in col_names: + col_names.insert(0, index_name) + + if isinstance(index_name, bytes): + index_name = index_name.decode("utf-8") + + if index_name not in col_names: + col_names.insert(0, index_name) + else: + col_names = [index_name] + [c for c in col_names 
if c != index_name] + + if head is not None and head > 0: + n_rows = min(n_rows, head) + + if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out_fh = open(out, "w", newline="", encoding="utf-8") + writer = csv.writer(out_fh) + + try: + writer.writerow(col_names) + cat_cache = {} + + use_status = out_fh is not sys.stdout + status_ctx = ( + console.status(f"[magenta]Exporting {axis} table to {out}...[/]") + if use_status + else nullcontext() + ) + + with status_ctx as status: + for start in range(0, n_rows, chunk_rows): + end = min(start + chunk_rows, n_rows) + if use_status and status: + status.update( + f"[magenta]Exporting rows {start}-{end} of {n_rows}...[/]" + ) + cols_data: List[List[str]] = [] + for col in col_names: + cols_data.append( + col_chunk_as_strings(group, col, start, end, cat_cache) + ) + for row_idx in range(end - start): + row = [ + cols_data[col_idx][row_idx] + for col_idx in range(len(col_names)) + ] + writer.writerow(row) + finally: + if out_fh is not sys.stdout: + out_fh.close() + + +def _read_csv( + input_file: Path, + index_column: Optional[str], +) -> Tuple[List[dict], List[str], List[str], str]: + with open(input_file, "r", encoding="utf-8", newline="") as f: + reader = csv.DictReader(f) + if reader.fieldnames is None: + raise ValueError("CSV file has no header.") + fieldnames = list(reader.fieldnames) + + if index_column: + if index_column not in fieldnames: + raise ValueError( + f"Index column '{index_column}' not found in CSV. 
" + f"Available columns: {', '.join(fieldnames)}" + ) + idx_col = index_column + else: + idx_col = fieldnames[0] + + rows = list(reader) + + index_values = [row[idx_col] for row in rows] + data_columns = [c for c in fieldnames if c != idx_col] + + return rows, data_columns, index_values, idx_col + + +def import_dataframe( + root: Any, + obj: str, + input_file: Path, + index_column: Optional[str], + console: Console, +) -> None: + if obj not in ("obs", "var"): + raise ValueError( + f"CSV import is only supported for 'obs' or 'var', not '{obj}'." + ) + + rows, data_columns, index_values, _ = _read_csv(input_file, index_column) + n_rows = len(rows) + + validate_dimensions(root, obj, (n_rows,), console) + + if obj in root: + del root[obj] + + group = root.create_group(obj) + index_name = "obs_names" if obj == "obs" else "var_names" + group.attrs["_index"] = index_name + group.attrs["encoding-type"] = "dataframe" + group.attrs["encoding-version"] = "0.2.0" + + if is_zarr_group(group): + group.attrs["column-order"] = list(data_columns) + else: + group.attrs["column-order"] = np.array(data_columns, dtype="S") + + create_dataset(group, index_name, data=np.array(index_values, dtype="S")) + + for col in data_columns: + values = [row[col] for row in rows] + try: + arr = np.array(values, dtype=np.float64) + create_dataset(group, col, data=arr) + except (ValueError, TypeError): + try: + arr = np.array(values, dtype=np.int64) + create_dataset(group, col, data=arr) + except (ValueError, TypeError): + arr = np.array(values, dtype="S") + ds = create_dataset(group, col, data=arr) + ds.attrs["encoding-type"] = "string-array" + ds.attrs["encoding-version"] = "0.2.0" + + console.print( + f"[green]Imported[/] {n_rows} rows × {len(data_columns)} columns into '{obj}'" + ) diff --git a/src/h5ad/formats/image.py b/src/h5ad/formats/image.py new file mode 100644 index 0000000..fe5d2ce --- /dev/null +++ b/src/h5ad/formats/image.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from 
pathlib import Path +from typing import Any + +import numpy as np +from PIL import Image +from rich.console import Console + +from h5ad.formats.common import _resolve +from h5ad.storage import is_dataset + + +def export_image(root: Any, obj: str, out: Path, console: Console) -> None: + h5obj = _resolve(root, obj) + if not is_dataset(h5obj): + raise ValueError("Image export requires a dataset.") + arr = np.asarray(h5obj[...]) + + if arr.ndim not in (2, 3): + raise ValueError(f"Expected 2D or 3D image array; got shape {arr.shape}.") + if arr.ndim == 3 and arr.shape[2] not in (1, 3, 4): + raise ValueError( + f"Expected last dimension (channels) to be 1, 3, or 4; got {arr.shape}." + ) + + if np.issubdtype(arr.dtype, np.floating): + amax = float(np.nanmax(arr)) if arr.size else 0.0 + if amax <= 1.0: + arr = np.clip(arr, 0.0, 1.0) * 255.0 + else: + arr = np.clip(arr, 0.0, 255.0) + arr = arr.astype(np.uint8) + elif np.issubdtype(arr.dtype, np.integer): + arr = np.clip(arr, 0, 255).astype(np.uint8) + elif arr.dtype == np.bool_: + arr = arr.astype(np.uint8) * 255 + else: + raise ValueError(f"Unsupported image dtype: {arr.dtype}") + + if arr.ndim == 3 and arr.shape[2] == 1: + arr = arr[:, :, 0] + + img = Image.fromarray(arr) + out.parent.mkdir(parents=True, exist_ok=True) + img.save(out) + console.print(f"[green]Wrote[/] {out}") diff --git a/src/h5ad/formats/json_data.py b/src/h5ad/formats/json_data.py new file mode 100644 index 0000000..c983677 --- /dev/null +++ b/src/h5ad/formats/json_data.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any, Dict + +import numpy as np +from rich.console import Console + +from h5ad.core.read import decode_str_array +from h5ad.formats.common import _check_json_exportable, _resolve +from h5ad.storage import create_dataset, is_dataset, is_group +from h5ad.util.path import norm_path + + +def export_json( + root: Any, + obj: str, + out: Path | None, + max_elements: 
int, + include_attrs: bool, + console: Console, +) -> None: + h5obj = _resolve(root, obj) + _check_json_exportable(h5obj, max_elements=max_elements) + + payload = _to_jsonable( + h5obj, max_elements=max_elements, include_attrs=include_attrs + ) + if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out.parent.mkdir(parents=True, exist_ok=True) + out_fh = open(out, "w", encoding="utf-8") + try: + json.dump(payload, out_fh, indent=2, ensure_ascii=False, sort_keys=True) + out_fh.write("\n") + finally: + if out_fh is not sys.stdout: + out_fh.close() + if out_fh is not sys.stdout: + console.print(f"[green]Wrote[/] {out}") + + +def _attrs_to_jsonable(attrs: Any, max_elements: int) -> Dict[str, Any]: + out: Dict[str, Any] = {} + for k in attrs.keys(): + v = attrs.get(k) + out[str(k)] = _pyify(v, max_elements=max_elements) + return out + + +def _pyify(value: Any, max_elements: int) -> Any: + if isinstance(value, bytes): + try: + return value.decode("utf-8") + except Exception: + return value.decode("utf-8", errors="replace") + if isinstance(value, np.generic): + return value.item() + if isinstance(value, np.ndarray): + if value.size > max_elements: + raise ValueError( + f"Refusing to convert array of size {value.size} (> {max_elements}) to JSON." + ) + if np.issubdtype(value.dtype, np.bytes_) or value.dtype.kind == "O": + value = decode_str_array(value) + return value.tolist() + return value + + +def _dataset_to_jsonable(ds: Any, max_elements: int) -> Any: + if ds.shape == (): + v = ds[()] + return _pyify(v, max_elements=max_elements) + n = int(np.prod(ds.shape)) if ds.shape else 0 + if n > max_elements: + ds_name = getattr(ds, "name", "") + raise ValueError( + f"Refusing to convert dataset {ds_name!r} with {n} elements (> {max_elements}) to JSON." 
+ ) + arr = np.asarray(ds[...]) + return _pyify(arr, max_elements=max_elements) + + +def _to_jsonable(h5obj: Any, max_elements: int, include_attrs: bool) -> Any: + if is_dataset(h5obj): + return _dataset_to_jsonable(h5obj, max_elements=max_elements) + + d: Dict[str, Any] = {} + if include_attrs and len(h5obj.attrs): + d["__attrs__"] = _attrs_to_jsonable(h5obj.attrs, max_elements=max_elements) + + for key in h5obj.keys(): + child = h5obj[key] + if is_group(child) or is_dataset(child): + d[str(key)] = _to_jsonable( + child, + max_elements=max_elements, + include_attrs=include_attrs, + ) + return d + + +def import_json( + root: Any, + obj: str, + input_file: Path, + console: Console, +) -> None: + obj = norm_path(obj) + with open(input_file, "r", encoding="utf-8") as fh: + payload = json.load(fh) + + parts = obj.split("/") + parent = root + for part in parts[:-1]: + parent = parent[part] if part in parent else parent.create_group(part) + name = parts[-1] + + if name in parent: + del parent[name] + + _write_json_to_group(parent, name, payload) + + console.print(f"[green]Imported[/] JSON data into '{obj}'") + + +def _write_json_to_group(parent: Any, name: str, value: Any) -> None: + if isinstance(value, dict): + group = parent.create_group(name) + for k, v in value.items(): + _write_json_to_group(group, k, v) + elif isinstance(value, list): + try: + arr = np.array(value) + if arr.dtype.kind in ("U", "O"): + arr = np.array(value, dtype="S") + create_dataset(parent, name, data=arr) + except (ValueError, TypeError): + create_dataset(parent, name, data=json.dumps(value).encode("utf-8")) + elif isinstance(value, str): + create_dataset(parent, name, data=np.array([value], dtype="S")) + elif isinstance(value, bool): + create_dataset(parent, name, data=np.array(value, dtype=bool)) + elif isinstance(value, int): + create_dataset(parent, name, data=np.array(value, dtype=np.int64)) + elif isinstance(value, float): + create_dataset(parent, name, data=np.array(value, 
dtype=np.float64)) + elif value is None: + ds = create_dataset(parent, name, data=np.array([], dtype="S")) + ds.attrs["_is_none"] = True + else: + raise ValueError(f"Cannot convert JSON value of type {type(value).__name__}") diff --git a/src/h5ad/formats/sparse.py b/src/h5ad/formats/sparse.py new file mode 100644 index 0000000..4045ce5 --- /dev/null +++ b/src/h5ad/formats/sparse.py @@ -0,0 +1,262 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any, List, Tuple +import sys +from contextlib import nullcontext + +import numpy as np +from rich.console import Console + +from h5ad.formats.common import _get_encoding_type, _resolve +from h5ad.formats.validate import validate_dimensions +from h5ad.storage import create_dataset, is_dataset, is_group, is_zarr_group +from h5ad.util.path import norm_path + + +def _read_mtx( + input_file: Path, +) -> Tuple[List[Tuple[int, int, float]], Tuple[int, int], int]: + with open(input_file, "r", encoding="utf-8") as fh: + header = fh.readline() + if not header.startswith("%%MatrixMarket"): + raise ValueError("Invalid MTX file: missing MatrixMarket header.") + + parts = header.lower().split() + field = "real" + for p in parts: + if p in ("real", "integer", "complex", "pattern"): + field = p + break + + line = fh.readline() + while line.startswith("%"): + line = fh.readline() + + dims = line.split() + n_rows, n_cols, nnz = int(dims[0]), int(dims[1]), int(dims[2]) + + entries = [] + for _ in range(nnz): + parts = fh.readline().split() + r, c = int(parts[0]) - 1, int(parts[1]) - 1 + if field == "pattern": + v = 1.0 + else: + v = float(parts[2]) + entries.append((r, c, v)) + + return entries, (n_rows, n_cols), nnz + + +def _create_csr_from_entries( + entries: List[Tuple[int, int, float]], shape: Tuple[int, int] +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + n_rows, _ = shape + entries.sort(key=lambda x: (x[0], x[1])) + + data = np.array([e[2] for e in entries], dtype=np.float32) + indices = 
np.array([e[1] for e in entries], dtype=np.int32) + + indptr = np.zeros(n_rows + 1, dtype=np.int32) + for r, _, _ in entries: + indptr[r + 1] += 1 + indptr = np.cumsum(indptr) + + return data, indices, indptr + + +def export_mtx( + root: Any, + obj: str, + out: Path | None, + head: int | None, + chunk_elements: int, + in_memory: bool, + console: Console, +) -> None: + h5obj = _resolve(root, obj) + if not is_group(h5obj): + raise ValueError("MTX export requires a CSR/CSC matrix group (not a dataset).") + + enc = _get_encoding_type(h5obj) + if enc not in ("csr_matrix", "csc_matrix"): + raise ValueError( + f"Target group encoding-type is {enc!r}; expected 'csr_matrix' or 'csc_matrix'." + ) + + data = h5obj.get("data") + indices = h5obj.get("indices") + indptr = h5obj.get("indptr") + if not (is_dataset(data) and is_dataset(indices) and is_dataset(indptr)): + raise ValueError( + "Sparse matrix group must contain datasets: data, indices, indptr" + ) + + shape = h5obj.attrs.get("shape", None) + if shape is None: + raise ValueError("Sparse matrix group is missing required 'shape' attribute.") + n_rows, n_cols = (int(shape[0]), int(shape[1])) + + field = "real" if np.issubdtype(data.dtype, np.floating) else "integer" + + indptr_arr = np.asarray(indptr[...], dtype=np.int64) + nnz_ptr = int(indptr_arr[-1]) if indptr_arr.size else 0 + nnz_data = int(data.shape[0]) + nnz_idx = int(indices.shape[0]) + + if not (nnz_ptr == nnz_data == nnz_idx): + raise ValueError( + f"Sparse matrix data inconsistency: indptr implies {nnz_ptr} nonzeros, " + f"but data has {nnz_data} and indices has {nnz_idx}." 
+ ) + + nnz = nnz_data + major_step = max(1, int(chunk_elements)) + if head is not None and head > 0: + nnz = min(nnz_data, head) + + if out is None or str(out) == "-": + out_fh = sys.stdout + else: + out.parent.mkdir(parents=True, exist_ok=True) + out_fh = open(out, "w", encoding="utf-8", newline="\n") + + use_status = out_fh is not sys.stdout + status_ctx = ( + console.status(f"[magenta]Exporting {obj} to {out}...[/]") + if use_status + else nullcontext() + ) + try: + out_fh.write(f"%%MatrixMarket matrix coordinate {field} general\n") + out_fh.write("% generated by h5ad-cli\n") + if head is not None and head > 0: + out_fh.write(f"% output limited to first {nnz}/{nnz_data} nonzero entries\n") + out_fh.write(f"{n_rows} {n_cols} {nnz}\n") + + if in_memory: + with status_ctx as status: + if use_status and status: + status.update( + f"[magenta]Loading entire matrix {obj} into memory...[/]" + ) + data_arr = np.asarray(data[...]) + indices_arr = np.asarray(indices[...], dtype=np.int64) + counts = np.diff(indptr_arr) + if int(counts.sum()) != nnz_data: + raise ValueError( + "Sparse matrix indptr does not match data/indices length." 
+ ) + + if enc == "csr_matrix": + major_idx = np.repeat(np.arange(n_rows, dtype=np.int64), counts) + row_idx = major_idx + col_idx = indices_arr + else: + major_idx = np.repeat(np.arange(n_cols, dtype=np.int64), counts) + row_idx = indices_arr + col_idx = major_idx + + if head is not None and head > 0: + row_idx = row_idx[:nnz] + col_idx = col_idx[:nnz] + data_arr = data_arr[:nnz] + + data_fmt = "%.18g" if field == "real" else "%d" + coords = np.column_stack((row_idx + 1, col_idx + 1, data_arr)) + if use_status and status: + status.update(f"[magenta]Saving {nnz} entries to {out}...[/]") + np.savetxt(out_fh, coords, fmt=["%d", "%d", data_fmt], newline="\n") + else: + major = n_rows if enc == "csr_matrix" else n_cols + max_lines = head if head is not None and head > 0 else None + written = 0 + with status_ctx as status: + for major_start in range(0, major, major_step): + major_end = min(major_start + major_step, major) + if use_status and status: + status.update( + f"[magenta]Exporting {obj}: {major_start+1}-{major_end} of {major}...[/]" + ) + for major_i in range(major_start, major_end): + start = min(int(indptr_arr[major_i]), nnz_data) + end = min(int(indptr_arr[major_i + 1]), nnz_data) + if end <= start: + continue + idx = np.asarray(indices[start:end], dtype=np.int64) + vals = np.asarray(data[start:end]) + m = min(len(idx), len(vals)) + if m == 0: + raise ValueError("Sparse matrix chunk has zero length.") + if max_lines is not None: + remaining = max_lines - written + if remaining <= 0: + break + if m > remaining: + m = remaining + idx = idx[:m] + vals = vals[:m] + idx_list = idx.tolist() + vals_list = vals.tolist() + if enc == "csr_matrix": + r = major_i + 1 + lines = [ + f"{r} {c + 1} {v}\n" + for c, v in zip(idx_list, vals_list) + ] + else: + c = major_i + 1 + lines = [ + f"{r + 1} {c} {v}\n" + for r, v in zip(idx_list, vals_list) + ] + out_fh.write("".join(lines)) + written += m + if max_lines is not None and written >= max_lines: + break + if max_lines is 
not None and written >= max_lines: + break + finally: + if out_fh is not sys.stdout: + out_fh.close() + if out_fh is not sys.stdout: + console.print(f"[green]Wrote[/] {out}") + + +def import_mtx( + root: Any, + obj: str, + input_file: Path, + console: Console, +) -> None: + obj = norm_path(obj) + entries, shape, nnz = _read_mtx(input_file) + data, indices, indptr = _create_csr_from_entries(entries, shape) + + validate_dimensions(root, obj, shape, console) + + parts = obj.split("/") + parent = root + for part in parts[:-1]: + parent = parent[part] if part in parent else parent.create_group(part) + name = parts[-1] + + if name in parent: + del parent[name] + + group = parent.create_group(name) + group.attrs["encoding-type"] = "csr_matrix" + group.attrs["encoding-version"] = "0.1.0" + if is_zarr_group(group): + group.attrs["shape"] = list(shape) + else: + group.attrs["shape"] = np.array(shape, dtype=np.int64) + + create_dataset(group, "data", data=data) + create_dataset(group, "indices", data=indices) + create_dataset(group, "indptr", data=indptr) + + console.print( + f"[green]Imported[/] {shape[0]}×{shape[1]} sparse matrix ({nnz} non-zero) into '{obj}'" + ) diff --git a/src/h5ad/formats/validate.py b/src/h5ad/formats/validate.py new file mode 100644 index 0000000..194192b --- /dev/null +++ b/src/h5ad/formats/validate.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +from typing import Optional, Tuple, Any + +from rich.console import Console + +from h5ad.core.info import axis_len +from h5ad.util.path import norm_path + + +OBS_AXIS_PREFIXES = ("obs", "obsm/", "obsp/") +VAR_AXIS_PREFIXES = ("var", "varm/", "varp/") +MATRIX_PREFIXES = ("X", "layers/") + + +def _get_axis_length(root: Any, axis: str) -> Optional[int]: + try: + return axis_len(root, axis) + except Exception: + return None + + +def validate_dimensions( + root: Any, + obj_path: str, + data_shape: Tuple[int, ...], + console: Console, +) -> None: + obj_path = norm_path(obj_path) + n_obs = 
_get_axis_length(root, "obs") + n_var = _get_axis_length(root, "var") + + if obj_path == "obs": + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"Row count mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + return + if obj_path == "var": + if n_var is not None and data_shape[0] != n_var: + raise ValueError( + f"Row count mismatch: input has {data_shape[0]} rows, " + f"but var has {n_var} features." + ) + return + + for prefix in MATRIX_PREFIXES: + if obj_path == prefix or obj_path.startswith(prefix + "/") or obj_path.startswith(prefix): + if obj_path == "X" or obj_path.startswith("layers/"): + if len(data_shape) < 2: + raise ValueError( + f"Matrix data requires 2D shape, got {len(data_shape)}D." + ) + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + if n_var is not None and data_shape[1] != n_var: + raise ValueError( + f"Second dimension mismatch: input has {data_shape[1]} columns, " + f"but var has {n_var} features." + ) + return + + for prefix in OBS_AXIS_PREFIXES: + if obj_path.startswith(prefix) and obj_path != "obs": + if n_obs is not None and data_shape[0] != n_obs: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but obs has {n_obs} cells." + ) + if obj_path.startswith("obsp/") and len(data_shape) >= 2: + if data_shape[1] != n_obs: + raise ValueError( + "obsp matrix must be square (n_obs × n_obs): " + f"got {data_shape[0]}×{data_shape[1]}, expected {n_obs}×{n_obs}." + ) + return + + for prefix in VAR_AXIS_PREFIXES: + if obj_path.startswith(prefix) and obj_path != "var": + if n_var is not None and data_shape[0] != n_var: + raise ValueError( + f"First dimension mismatch: input has {data_shape[0]} rows, " + f"but var has {n_var} features." 
+ ) + if obj_path.startswith("varp/") and len(data_shape) >= 2: + if data_shape[1] != n_var: + raise ValueError( + "varp matrix must be square (n_var × n_var): " + f"got {data_shape[0]}×{data_shape[1]}, expected {n_var}×{n_var}." + ) + return + + console.print(f"[dim]Note: No dimension validation for path '{obj_path}'[/]") diff --git a/src/h5ad/info.py b/src/h5ad/info.py index 3535303..635b03a 100644 --- a/src/h5ad/info.py +++ b/src/h5ad/info.py @@ -1,77 +1,3 @@ -from typing import Optional, Tuple -import h5py +from h5ad.core.info import axis_len, format_type_info, get_axis_group, get_entry_type - -def axis_len(file: h5py.File, axis: str) -> Optional[int]: - """ - Get the length of the specified axis ('obs' or 'var') in the h5ad file. - Args: - file (h5py.File): Opened h5ad file object - axis (str): Axis name ('obs' or 'var') - - Returns: - Optional[int]: Length of the axis, or None if not found - """ - # Check if the specified axis exists in the file - if axis not in file: - return None - - # Get the group corresponding to the axis - group = file[axis] - if not isinstance(group, h5py.Group): - return None - - # Determine the index name for the axis - index_name = group.attrs.get("_index", None) - if index_name is None: - if axis == "obs": - index_name = "obs_names" - elif axis == "var": - index_name = "var_names" - else: - return None - - if isinstance(index_name, bytes): - index_name = index_name.decode("utf-8") - - if index_name not in group: - return None - - # Return the length of the index dataset - dataset = group[index_name] - if not isinstance(dataset, h5py.Dataset): - return None - if dataset.shape: - return int(dataset.shape[0]) - return None - - -def get_axis_group(file: h5py.File, axis: str) -> Tuple[h5py.Group, int, str]: - """ - Get the axis group, its length, and index name. 
- Args: - file (h5py.File): Opened h5ad file object - axis (str): Axis name ('obs' or 'var') - - Returns: - Tuple[h5py.Group, int, str]: Axis group, its length, and index - """ - if axis not in ("obs", "var"): - raise ValueError("axis must be 'obs' or 'var'.") - if axis not in file: - raise KeyError(f"'{axis}' not found in the file.") - - group = file[axis] - if not isinstance(group, h5py.Group): - raise TypeError(f"'{axis}' is not a group.") - - n = axis_len(file, axis) - if n is None: - raise RuntimeError(f"Could not determine length of axis '{axis}'.") - - index_name = group.attrs.get("_index", None) - if index_name is None: - index_name = "obs_names" if axis == "obs" else "var_names" - if isinstance(index_name, bytes): - index_name = index_name.decode("utf-8") - return group, n, index_name +__all__ = ["axis_len", "format_type_info", "get_axis_group", "get_entry_type"] diff --git a/src/h5ad/read.py b/src/h5ad/read.py index 5abec06..63f8c4d 100644 --- a/src/h5ad/read.py +++ b/src/h5ad/read.py @@ -1,82 +1,3 @@ -import numpy as np -import h5py -from typing import List, Dict +from h5ad.core.read import col_chunk_as_strings, decode_str_array, read_categorical_column - -def decode_str_array(array: np.ndarray) -> np.ndarray: - """ - Decode a numpy array of bytes or objects to strings. - Args: - array (np.ndarray): Input numpy array of bytes or objects - - Returns: - np.ndarray: Decoded numpy array of strings - """ - if np.issubdtype(array.dtype, np.bytes_): - return array.astype("U") - if array.dtype.kind == "O": - return array.astype(str) - return array.astype(str) - - -def read_categorical_column( - col_group: h5py.Group, start: int, end: int, cache: Dict[int, np.ndarray] -) -> List[str]: - """ - Decode an AnnData 'categorical' column for a slice [start:end]. 
- Args: - col_group (h5py.Group): Column group containing 'categories' and 'codes' - start (int): Start index of the slice - end (int): End index of the slice - cache (Dict[int, np.ndarray]): Cache for decoded categories - Returns: - List[str]: Decoded categorical values for the specified slice - """ - key = id(col_group) - if key not in cache: - cats = col_group["categories"][...] - cats = decode_str_array(cats) - cache[key] = np.asarray(cats, dtype=str) - cats = cache[key] - - codes_ds = col_group["codes"] - codes = codes_ds[start:end] - codes = np.asarray(codes, dtype=np.int64) - return [cats[c] if 0 <= c < len(cats) else "" for c in codes] - - -def col_chunk_as_strings( - group: h5py.Group, - col_name: str, - start: int, - end: int, - cat_cache: Dict[int, np.ndarray], -) -> List[str]: - """ - Read a column from an obs/var group as strings. - Args: - group (h5py.Group): The obs/var group - col_name (str): Name of the column to read - start (int): Start index of the slice - end (int): End index of the slice - cat_cache (Dict[int, np.ndarray]): Cache for decoded categorical columns - Returns: - List[str]: Column values as strings for the specified slice - """ - if col_name in group and isinstance(group[col_name], h5py.Dataset): - dataset = group[col_name] - chunk = dataset[start:end] - if chunk.ndim != 1: - chunk = chunk.reshape(-1) - chunk = decode_str_array(np.asarray(chunk)) - return chunk.tolist() - - if col_name in group and isinstance(group[col_name], h5py.Group): - col_group = group[col_name] - enc = col_group.attrs.get("encoding-type", b"") - if isinstance(enc, bytes): - enc = enc.decode("utf-8") - if enc == "categorical": - return read_categorical_column(col_group, start, end, cat_cache) - - raise RuntimeError(f"Unsupported column {col_name!r} in group {group.name}") +__all__ = ["col_chunk_as_strings", "decode_str_array", "read_categorical_column"] diff --git a/src/h5ad/storage/__init__.py b/src/h5ad/storage/__init__.py new file mode 100644 index 
0000000..43d876d --- /dev/null +++ b/src/h5ad/storage/__init__.py @@ -0,0 +1,271 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Iterable, Optional, Sequence +import shutil + +import h5py + +try: + import zarr +except Exception: # pragma: no cover - optional dependency + zarr = None + +import numpy as np + + +@dataclass +class Store: + backend: str + root: Any + path: Path + + def close(self) -> None: + if self.backend == "hdf5": + try: + self.root.close() + except Exception: + return + + def __enter__(self) -> "Store": + return self + + def __exit__(self, exc_type, exc, tb) -> None: + self.close() + + +def _require_zarr() -> None: + if zarr is None: # pragma: no cover - optional dependency + raise ImportError( + "zarr is required for .zarr support. Install with: uv sync --extra zarr" + ) + + +def is_hdf5_group(obj: Any) -> bool: + return isinstance(obj, (h5py.File, h5py.Group)) + + +def is_hdf5_dataset(obj: Any) -> bool: + return isinstance(obj, h5py.Dataset) + + +def is_zarr_group(obj: Any) -> bool: + return zarr is not None and isinstance(obj, zarr.Group) + + +def is_zarr_array(obj: Any) -> bool: + return zarr is not None and isinstance(obj, zarr.Array) + + +def is_group(obj: Any) -> bool: + return is_hdf5_group(obj) or is_zarr_group(obj) + + +def is_dataset(obj: Any) -> bool: + return is_hdf5_dataset(obj) or is_zarr_array(obj) + + +def is_zarr_path(path: Path) -> bool: + if not path.exists() or not path.is_dir(): + return False + if (path / "zarr.json").exists(): + return True + if (path / ".zgroup").exists() or (path / ".zattrs").exists(): + return True + return False + + +def detect_backend(path: Path) -> str: + if path.exists(): + if path.is_dir(): + if is_zarr_path(path): + return "zarr" + raise ValueError( + f"Path '{path}' is a directory but does not look like a Zarr store." 
+ ) + return "hdf5" + if path.suffix == ".zarr": + return "zarr" + return "hdf5" + + +def open_store(path: Path, mode: str) -> Store: + path = Path(path) + backend = detect_backend(path) + if backend == "zarr": + _require_zarr() + root = zarr.open_group(str(path), mode=mode) + return Store(backend="zarr", root=root, path=path) + root = h5py.File(path, mode) + return Store(backend="hdf5", root=root, path=path) + + +def _normalize_attr_value(value: Any, target_backend: str) -> Any: + if target_backend == "zarr": + if isinstance(value, bytes): + return value.decode("utf-8") + if isinstance(value, (list, tuple)): + return [ + v.decode("utf-8") if isinstance(v, bytes) else v for v in value + ] + if isinstance(value, np.ndarray): + if value.dtype.kind in ("S", "O"): + return [ + v.decode("utf-8") if isinstance(v, bytes) else v + for v in value.tolist() + ] + return value.tolist() + if isinstance(value, np.generic): + return value.item() + return value + + +def copy_attrs(src_attrs: Any, dst_attrs: Any, *, target_backend: str) -> None: + for k, v in src_attrs.items(): + dst_attrs[k] = _normalize_attr_value(v, target_backend) + + +def dataset_create_kwargs(src: Any, *, target_backend: str) -> dict: + kw: dict = {} + chunks = getattr(src, "chunks", None) + if chunks is not None: + kw["chunks"] = chunks + if target_backend == "hdf5" and is_hdf5_dataset(src): + if src.compression is not None: + kw["compression"] = src.compression + kw["compression_opts"] = src.compression_opts + kw["shuffle"] = bool(src.shuffle) + kw["fletcher32"] = bool(src.fletcher32) + if src.scaleoffset is not None: + kw["scaleoffset"] = src.scaleoffset + if src.fillvalue is not None: + kw["fillvalue"] = src.fillvalue + if target_backend == "zarr" and is_zarr_array(src): + src_zarr_format = getattr(getattr(src, "metadata", None), "zarr_format", None) + if src_zarr_format == 3: + compressors = None + try: + compressors = getattr(src, "compressors", None) + except Exception: + compressors = None + if 
compressors is not None: + kw["compressors"] = compressors + else: + try: + compressor = getattr(src, "compressor", None) + except Exception: + compressor = None + if compressor is not None: + kw["compressor"] = compressor + try: + filters = getattr(src, "filters", None) + except Exception: + filters = None + if filters is not None: + kw["filters"] = filters + try: + fill_value = getattr(src, "fill_value", None) + except Exception: + fill_value = None + if fill_value is not None: + kw["fill_value"] = fill_value + return kw + + +def create_dataset( + parent: Any, + name: str, + *, + data: Any = None, + shape: Optional[Sequence[int]] = None, + dtype: Any = None, + **kwargs: Any, +) -> Any: + if is_zarr_group(parent): + zarr_format = getattr(getattr(parent, "metadata", None), "zarr_format", None) + if zarr_format == 3: + kwargs = dict(kwargs) + kwargs.pop("compressor", None) + elif zarr_format == 2 and "compressors" in kwargs and "compressor" not in kwargs: + kwargs = dict(kwargs) + compressors = kwargs.pop("compressors") + if isinstance(compressors, (list, tuple)) and len(compressors) == 1: + kwargs["compressor"] = compressors[0] + if data is not None: + return parent.create_array(name, data=data, **kwargs) + return parent.create_array(name, shape=shape, dtype=dtype, **kwargs) + if data is not None: + return parent.create_dataset(name, data=data, **kwargs) + return parent.create_dataset(name, shape=shape, dtype=dtype, **kwargs) + + +def _chunk_step(shape: Sequence[int], chunks: Optional[Sequence[int]]) -> int: + if chunks is not None and len(chunks) > 0 and chunks[0]: + return int(chunks[0]) + if not shape: + return 1 + return max(1, min(1024, int(shape[0]))) + + +def copy_dataset(src: Any, dst_group: Any, name: str) -> Any: + shape = tuple(src.shape) if getattr(src, "shape", None) is not None else () + target_backend = "zarr" if is_zarr_group(dst_group) else "hdf5" + ds = create_dataset( + dst_group, + name, + shape=shape, + dtype=src.dtype, + 
**dataset_create_kwargs(src, target_backend=target_backend), + ) + copy_attrs(src.attrs, ds.attrs, target_backend=target_backend) + + if shape == (): + ds[()] = src[()] + return ds + + step = _chunk_step(shape, getattr(src, "chunks", None)) + for start in range(0, shape[0], step): + end = min(start + step, shape[0]) + if len(shape) == 1: + ds[start:end] = src[start:end] + else: + ds[start:end, ...] = src[start:end, ...] + return ds + + +def copy_tree(src_obj: Any, dst_group: Any, name: str, *, exclude: Iterable[str] = ()) -> Any: + if is_hdf5_group(dst_group) and (is_hdf5_group(src_obj) or is_hdf5_dataset(src_obj)): + if not exclude: + dst_group.copy(src_obj, dst_group, name) + return dst_group[name] + if is_dataset(src_obj): + return copy_dataset(src_obj, dst_group, name) + if not is_group(src_obj): + raise TypeError(f"Unsupported object type for copy: {type(src_obj)}") + + target_backend = "zarr" if is_zarr_group(dst_group) else "hdf5" + grp = dst_group.create_group(name) + copy_attrs(src_obj.attrs, grp.attrs, target_backend=target_backend) + for key in src_obj.keys(): + if key in exclude: + continue + child = src_obj[key] + copy_tree(child, grp, key, exclude=exclude) + return grp + + +def copy_store_contents(src_root: Any, dst_root: Any) -> None: + for key in src_root.keys(): + copy_tree(src_root[key], dst_root, key) + + +def copy_path(src: Path, dst: Path) -> None: + src = Path(src) + dst = Path(dst) + if is_zarr_path(src): + if dst.exists(): + raise FileExistsError(f"Destination '{dst}' already exists.") + shutil.copytree(src, dst) + return + shutil.copy2(src, dst) diff --git a/src/h5ad/util/__init__.py b/src/h5ad/util/__init__.py new file mode 100644 index 0000000..364e184 --- /dev/null +++ b/src/h5ad/util/__init__.py @@ -0,0 +1 @@ +"""Utility helpers used across h5ad modules.""" diff --git a/src/h5ad/util/path.py b/src/h5ad/util/path.py new file mode 100644 index 0000000..c5c7102 --- /dev/null +++ b/src/h5ad/util/path.py @@ -0,0 +1,9 @@ +from __future__ 
import annotations + + +def norm_path(path: str) -> str: + """Normalize object paths used inside h5ad/zarr stores.""" + value = path.strip() + if not value: + raise ValueError("Object path must be non-empty.") + return value.lstrip("/") diff --git a/tests/conftest.py b/tests/conftest.py index bff9605..e3b710f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -172,3 +172,53 @@ def sample_categorical_h5ad(temp_dir): f.create_dataset("X", data=X) return file_path + + +@pytest.fixture +def sample_legacy_v010_h5ad(temp_dir): + """Create a sample h5ad file with legacy v0.1.0 categorical columns. + + In v0.1.0, categorical columns are stored as: + - Integer code datasets with a 'categories' attribute (HDF5 object reference) + - Categories stored in __categories/ subgroup + """ + file_path = temp_dir / "test_legacy_v010.h5ad" + + with h5py.File(file_path, "w") as f: + # Create obs with legacy categorical column + obs = f.create_group("obs") + obs.attrs["_index"] = "obs_names" + obs.attrs["encoding-type"] = "dataframe" + obs.attrs["encoding-version"] = "0.1.0" + obs_names = ["cell_1", "cell_2", "cell_3", "cell_4"] + obs.create_dataset("obs_names", data=np.array(obs_names, dtype="S")) + + # Create __categories subgroup (v0.1.0 convention) + categories_group = obs.create_group("__categories") + cell_type_cats = np.array(["TypeA", "TypeB", "TypeC"], dtype="S") + cats_ds = categories_group.create_dataset("cell_type", data=cell_type_cats) + + # Create categorical column as integer codes with reference to categories + codes = np.array([0, 1, 0, 2], dtype=np.int8) + cell_type_ds = obs.create_dataset("cell_type", data=codes) + # Store HDF5 object reference to categories + cell_type_ds.attrs["categories"] = cats_ds.ref + + # Add a regular non-categorical column + obs.create_dataset( + "n_counts", data=np.array([100, 200, 150, 300], dtype=np.int32) + ) + + # Create var + var = f.create_group("var") + var.attrs["_index"] = "var_names" + var.attrs["encoding-type"] = "dataframe" 
+ var.attrs["encoding-version"] = "0.1.0" + var_names = ["gene_1", "gene_2"] + var.create_dataset("var_names", data=np.array(var_names, dtype="S")) + + # Create X matrix (no encoding-type for legacy) + X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]], dtype=np.float32) + f.create_dataset("X", data=X) + + return file_path diff --git a/tests/test_cli.py b/tests/test_cli.py index 1659104..9a27d0c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,7 +6,7 @@ from typer.testing import CliRunner from h5ad.cli import app from h5ad.commands.info import show_info -from h5ad.commands.table import export_table +from h5ad.commands.export import export_table from rich.console import Console @@ -33,16 +33,86 @@ def test_info_function_direct(self, sample_h5ad_file): # Should not raise exception show_info(sample_h5ad_file, console) + def test_info_tree_flag(self, sample_h5ad_file): + """Test info command with --tree flag.""" + result = runner.invoke(app, ["info", "--tree", str(sample_h5ad_file)]) + assert result.exit_code == 0 + # Should show type annotations in angle brackets + # Output may go to stdout or stderr depending on console config + output = result.stdout + (result.stderr or "") + assert "<" in output + assert ">" in output + + def test_info_tree_short_flag(self, sample_h5ad_file): + """Test info command with -t short flag.""" + result = runner.invoke(app, ["info", "-t", str(sample_h5ad_file)]) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "<" in output + + def test_info_depth_flag(self, sample_h5ad_file): + """Test info command with --depth flag.""" + result = runner.invoke( + app, ["info", "--tree", "--depth", "1", str(sample_h5ad_file)] + ) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "<" in output -class TestTableCommand: - """Tests for table command.""" + def test_info_depth_short_flag(self, sample_h5ad_file): + """Test info command with -d short flag.""" + 
result = runner.invoke(app, ["info", "-t", "-d", "2", str(sample_h5ad_file)]) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "<" in output - def test_table_command_obs(self, sample_h5ad_file, temp_dir): - """Test table command for obs axis.""" + def test_info_entry_positional(self, sample_h5ad_file): + """Test info command with entry as positional argument.""" + result = runner.invoke(app, ["info", str(sample_h5ad_file), "X"]) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "Path:" in output + assert "Type:" in output + + def test_info_entry_obs(self, sample_h5ad_file): + """Test info command with obs entry.""" + result = runner.invoke(app, ["info", str(sample_h5ad_file), "obs"]) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "Path:" in output + assert "dataframe" in output + + def test_info_entry_nested_path(self, sample_h5ad_file): + """Test info command with nested object path.""" + result = runner.invoke(app, ["info", str(sample_h5ad_file), "uns/description"]) + assert result.exit_code == 0 + output = result.stdout + (result.stderr or "") + assert "Path:" in output + + def test_info_entry_not_found(self, sample_h5ad_file): + """Test info command with non-existent object path.""" + result = runner.invoke(app, ["info", str(sample_h5ad_file), "nonexistent"]) + assert result.exit_code == 0 # Doesn't exit with error, just shows message + output = result.stdout + (result.stderr or "") + assert "not found" in output + + +class TestExportDataframeCommand: + """Tests for export dataframe command (replaces table command).""" + + def test_export_dataframe_obs(self, sample_h5ad_file, temp_dir): + """Test export dataframe for obs axis.""" output = temp_dir / "obs_table.csv" result = runner.invoke( app, - ["table", str(sample_h5ad_file), "--axis", "obs", "--output", str(output)], + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + 
"--output", + str(output), + ], ) assert result.exit_code == 0 assert output.exists() @@ -54,12 +124,19 @@ def test_table_command_obs(self, sample_h5ad_file, temp_dir): assert len(rows) == 6 # header + 5 rows assert "obs_names" in rows[0] - def test_table_command_var(self, sample_h5ad_file, temp_dir): - """Test table command for var axis.""" + def test_export_dataframe_var(self, sample_h5ad_file, temp_dir): + """Test export dataframe for var axis.""" output = temp_dir / "var_table.csv" result = runner.invoke( app, - ["table", str(sample_h5ad_file), "--axis", "var", "--output", str(output)], + [ + "export", + "dataframe", + str(sample_h5ad_file), + "var", + "--output", + str(output), + ], ) assert result.exit_code == 0 assert output.exists() @@ -69,20 +146,20 @@ def test_table_command_var(self, sample_h5ad_file, temp_dir): rows = list(reader) assert len(rows) == 5 # header + 4 rows - def test_table_command_columns_filter(self, sample_h5ad_file, temp_dir): - """Test table command with column filter.""" + def test_export_dataframe_columns_filter(self, sample_h5ad_file, temp_dir): + """Test export dataframe with column filter.""" output = temp_dir / "table.csv" result = runner.invoke( app, [ - "table", + "export", + "dataframe", str(sample_h5ad_file), - "--axis", "obs", - "--columns", - "obs_names,cell_type", "--output", str(output), + "--columns", + "obs_names,cell_type", ], ) assert result.exit_code == 0 @@ -95,20 +172,43 @@ def test_table_command_columns_filter(self, sample_h5ad_file, temp_dir): assert "cell_type" in header assert "n_counts" not in header - def test_table_command_head(self, sample_h5ad_file, temp_dir): - """Test table command with head limit.""" + def test_export_dataframe_head(self, sample_h5ad_file, temp_dir): + """Test export dataframe with head limit.""" output = temp_dir / "table.csv" result = runner.invoke( app, [ - "table", + "export", + "dataframe", str(sample_h5ad_file), - "--axis", "obs", + "--output", + str(output), "--head", "2", + ], + 
) + assert result.exit_code == 0 + + with open(output, "r") as f: + reader = csv.reader(f) + rows = list(reader) + assert len(rows) == 3 # header + 2 rows + + def test_export_dataframe_head_short_flag(self, sample_h5ad_file, temp_dir): + """Test export dataframe with -n short flag.""" + output = temp_dir / "table.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", "--output", str(output), + "-n", + "3", ], ) assert result.exit_code == 0 @@ -116,17 +216,124 @@ def test_table_command_head(self, sample_h5ad_file, temp_dir): with open(output, "r") as f: reader = csv.reader(f) rows = list(reader) - assert len(rows) == 3 # header + 2 rows + assert len(rows) == 4 # header + 3 rows + + def test_export_dataframe_stdout(self, sample_h5ad_file): + """Test export dataframe to stdout (no --output).""" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + "--head", + "2", + ], + ) + assert result.exit_code == 0 + # Output should go to stdout + assert "obs_names" in result.stdout + assert "cell_" in result.stdout + + def test_export_dataframe_columns_short_flag(self, sample_h5ad_file, temp_dir): + """Test export dataframe with -c short flag for columns.""" + output = temp_dir / "table.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + "-o", + str(output), + "-c", + "obs_names", + ], + ) + assert result.exit_code == 0 + + with open(output, "r") as f: + reader = csv.reader(f) + rows = list(reader) + header = rows[0] + assert len(header) == 1 + assert "obs_names" in header + + def test_export_dataframe_chunk_rows(self, sample_h5ad_file, temp_dir): + """Test export dataframe with custom chunk size.""" + output = temp_dir / "table.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + "--output", + str(output), + "--chunk-rows", + "2", + ], + ) + assert result.exit_code == 0 + assert 
output.exists() + + with open(output, "r") as f: + reader = csv.reader(f) + rows = list(reader) + assert len(rows) == 6 # header + 5 rows + + def test_export_dataframe_combined_options(self, sample_h5ad_file, temp_dir): + """Test export dataframe with multiple options combined.""" + output = temp_dir / "table.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "obs", + "-o", + str(output), + "-c", + "obs_names,cell_type", + "-n", + "3", + "-r", + "1", + ], + ) + assert result.exit_code == 0 + + with open(output, "r") as f: + reader = csv.reader(f) + rows = list(reader) + assert len(rows) == 4 # header + 3 rows + header = rows[0] + assert "obs_names" in header + assert "cell_type" in header + assert "n_counts" not in header - def test_table_command_invalid_axis(self, sample_h5ad_file): - """Test table command with invalid axis.""" + def test_export_dataframe_invalid_axis(self, sample_h5ad_file, temp_dir): + """Test export dataframe with invalid axis.""" + output = temp_dir / "table.csv" result = runner.invoke( - app, ["table", str(sample_h5ad_file), "--axis", "invalid"] + app, + [ + "export", + "dataframe", + str(sample_h5ad_file), + "invalid", + "--output", + str(output), + ], ) assert result.exit_code == 1 # Check both stdout and stderr since Console uses stderr=True - output = result.stdout + result.stderr - assert "Invalid axis" in output + output_text = result.stdout + result.stderr + assert "obs" in output_text or "var" in output_text def test_export_table_function(self, sample_h5ad_file, temp_dir): """Test export_table function directly.""" @@ -160,7 +367,15 @@ def test_subset_command_obs(self, sample_h5ad_file, temp_dir): output = temp_dir / "subset.h5ad" result = runner.invoke( - app, ["subset", str(sample_h5ad_file), str(output), "--obs", str(obs_file)] + app, + [ + "subset", + str(sample_h5ad_file), + "--output", + str(output), + "--obs", + str(obs_file), + ], ) assert result.exit_code == 0 assert 
output.exists() @@ -172,7 +387,15 @@ def test_subset_command_var(self, sample_h5ad_file, temp_dir): output = temp_dir / "subset.h5ad" result = runner.invoke( - app, ["subset", str(sample_h5ad_file), str(output), "--var", str(var_file)] + app, + [ + "subset", + str(sample_h5ad_file), + "--output", + str(output), + "--var", + str(var_file), + ], ) assert result.exit_code == 0 assert output.exists() @@ -191,6 +414,7 @@ def test_subset_command_both(self, sample_h5ad_file, temp_dir): [ "subset", str(sample_h5ad_file), + "--output", str(output), "--obs", str(obs_file), @@ -204,7 +428,9 @@ def test_subset_command_both(self, sample_h5ad_file, temp_dir): def test_subset_command_no_filters(self, sample_h5ad_file, temp_dir): """Test subset command without any filters (should fail).""" output = temp_dir / "subset.h5ad" - result = runner.invoke(app, ["subset", str(sample_h5ad_file), str(output)]) + result = runner.invoke( + app, ["subset", str(sample_h5ad_file), "--output", str(output)] + ) assert result.exit_code == 1 # Check both stdout and stderr since Console uses stderr=True output_text = result.stdout + result.stderr @@ -221,6 +447,7 @@ def test_subset_command_chunk_rows(self, sample_h5ad_file, temp_dir): [ "subset", str(sample_h5ad_file), + "--output", str(output), "--obs", str(obs_file), @@ -242,6 +469,7 @@ def test_subset_command_sparse(self, sample_sparse_csr_h5ad, temp_dir): [ "subset", str(sample_sparse_csr_h5ad), + "--output", str(output), "--obs", str(obs_file), @@ -266,11 +494,25 @@ def test_info_help(self): assert result.exit_code == 0 assert "Show high-level information" in result.stdout - def test_table_help(self): - """Test table command help.""" - result = runner.invoke(app, ["table", "--help"]) + def test_export_help(self): + """Test export command help.""" + result = runner.invoke(app, ["export", "--help"]) + assert result.exit_code == 0 + assert "dataframe" in result.stdout + assert "array" in result.stdout + + def test_export_dataframe_help(self): + 
"""Test export dataframe command help.""" + result = runner.invoke(app, ["export", "dataframe", "--help"]) + assert result.exit_code == 0 + assert "Export a dataframe" in result.stdout + + def test_import_help(self): + """Test import command help.""" + result = runner.invoke(app, ["import", "--help"]) assert result.exit_code == 0 - assert "Export a table" in result.stdout + assert "dataframe" in result.stdout + assert "array" in result.stdout def test_subset_help(self): """Test subset command help.""" diff --git a/tests/test_export.py b/tests/test_export.py new file mode 100644 index 0000000..6a3fad9 --- /dev/null +++ b/tests/test_export.py @@ -0,0 +1,274 @@ +"""Tests for the export command.""" + +import json +from pathlib import Path + +import h5py +import numpy as np +from typer.testing import CliRunner + +from h5ad.cli import app + + +runner = CliRunner() + + +def _read_mtx(path: Path) -> np.ndarray: + with open(path, "r", encoding="utf-8") as fh: + header = fh.readline() + assert header.startswith("%%MatrixMarket") + line = fh.readline() + while line.startswith("%"): + line = fh.readline() + n_rows, n_cols, nnz = map(int, line.split()) + mat = np.zeros((n_rows, n_cols), dtype=np.float32) + for _ in range(nnz): + r, c, v = fh.readline().split() + mat[int(r) - 1, int(c) - 1] = float(v) + return mat + + +def _read_mtx_header_and_data(path: Path) -> tuple[int, int, int, list[str]]: + with open(path, "r", encoding="utf-8") as fh: + header = fh.readline() + assert header.startswith("%%MatrixMarket") + line = fh.readline() + while line.startswith("%"): + line = fh.readline() + n_rows, n_cols, nnz = map(int, line.split()) + data_lines = [line.strip() for line in fh if line.strip()] + return n_rows, n_cols, nnz, data_lines + + +class TestExportArray: + def test_export_array_dense_X(self, sample_h5ad_file, temp_dir): + out = temp_dir / "X.npy" + result = runner.invoke( + app, ["export", "array", str(sample_h5ad_file), "X", "--output", str(out)] + ) + assert 
result.exit_code == 0 + assert out.exists() + + got = np.load(out) + with h5py.File(sample_h5ad_file, "r") as f: + expected = np.asarray(f["X"][...]) + np.testing.assert_allclose(got, expected) + + def test_export_array_chunk(self, sample_h5ad_file, temp_dir): + out = temp_dir / "X_chunk.npy" + result = runner.invoke( + app, + [ + "export", + "array", + str(sample_h5ad_file), + "X", + "--output", + str(out), + "--chunk", + "3", + ], + ) + assert result.exit_code == 0 + assert out.exists() + + got = np.load(out) + with h5py.File(sample_h5ad_file, "r") as f: + expected = np.asarray(f["X"][...]) + np.testing.assert_allclose(got, expected) + + +class TestExportSparse: + def test_export_sparse_csr(self, sample_sparse_csr_h5ad, temp_dir): + out = temp_dir / "X_csr.mtx" + result = runner.invoke( + app, + [ + "export", + "sparse", + str(sample_sparse_csr_h5ad), + "X", + "--output", + str(out), + ], + ) + assert result.exit_code == 0 + assert out.exists() + + got = _read_mtx(out) + expected = np.array( + [ + [1.0, 0.0, 2.0], + [0.0, 0.0, 0.0], + [3.0, 4.0, 0.0], + [0.0, 5.0, 6.0], + ], + dtype=np.float32, + ) + np.testing.assert_allclose(got, expected) + + def test_export_sparse_head_limits_entries(self, sample_sparse_csr_h5ad, temp_dir): + out = temp_dir / "X_csr_head.mtx" + result = runner.invoke( + app, + [ + "export", + "sparse", + str(sample_sparse_csr_h5ad), + "X", + "--output", + str(out), + "--head", + "2", + ], + ) + assert result.exit_code == 0 + assert out.exists() + + n_rows, n_cols, nnz, data_lines = _read_mtx_header_and_data(out) + assert (n_rows, n_cols) == (4, 3) + assert nnz == 2 + assert len(data_lines) == 2 + assert data_lines[0].startswith("1 1 ") + assert data_lines[1].startswith("1 3 ") + + def test_export_sparse_csc(self, temp_dir): + # Build a small, consistent CSC matrix group + file_path = temp_dir / "test_csc.h5ad" + with h5py.File(file_path, "w") as f: + X = f.create_group("X") + X.attrs["encoding-type"] = "csc_matrix" + X.attrs["shape"] = (3, 4) 
+ data = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], dtype=np.float32) + indices = np.array([0, 2, 0, 1, 1, 2], dtype=np.int32) + indptr = np.array([0, 2, 2, 4, 6], dtype=np.int32) + X.create_dataset("data", data=data) + X.create_dataset("indices", data=indices) + X.create_dataset("indptr", data=indptr) + + out = temp_dir / "X_csc.mtx" + result = runner.invoke( + app, ["export", "sparse", str(file_path), "X", "--output", str(out)] + ) + assert result.exit_code == 0 + assert out.exists() + + got = _read_mtx(out) + expected = np.array( + [ + [1.0, 0.0, 3.0, 0.0], + [0.0, 0.0, 4.0, 5.0], + [2.0, 0.0, 0.0, 6.0], + ], + dtype=np.float32, + ) + np.testing.assert_allclose(got, expected) + + +class TestExportDict: + def test_export_dict_uns(self, sample_h5ad_file, temp_dir): + out = temp_dir / "uns.json" + result = runner.invoke( + app, ["export", "dict", str(sample_h5ad_file), "uns", str(out)] + ) + assert result.exit_code == 0 + assert out.exists() + payload = json.loads(out.read_text(encoding="utf-8")) + assert "description" in payload + assert payload["description"] == ["Test dataset"] + + +class TestExportDataframe: + def test_export_dataframe_obs(self, sample_h5ad_file, temp_dir): + out = temp_dir / "obs.csv" + result = runner.invoke( + app, + ["export", "dataframe", str(sample_h5ad_file), "obs", "--output", str(out)], + ) + assert result.exit_code == 0 + assert out.exists() + text = out.read_text(encoding="utf-8") + assert "obs_names" in text + + def test_export_legacy_v010_dataframe(self, sample_legacy_v010_h5ad, temp_dir): + """Test exporting a legacy v0.1.0 dataframe with categorical columns.""" + out = temp_dir / "obs_legacy.csv" + result = runner.invoke( + app, + [ + "export", + "dataframe", + str(sample_legacy_v010_h5ad), + "obs", + "--output", + str(out), + ], + ) + assert result.exit_code == 0 + assert out.exists() + text = out.read_text(encoding="utf-8") + # Should contain index and columns + assert "obs_names" in text + assert "cell_type" in text + # Should 
NOT contain __categories (reserved subgroup) + assert "__categories" not in text + # Should contain decoded categorical values, not codes + assert "TypeA" in text + assert "TypeB" in text + + +class TestExportValidation: + def test_wrong_type_for_dataframe(self, sample_h5ad_file, temp_dir): + """Test that wrong object type is rejected for dataframe export.""" + out = temp_dir / "X.csv" + result = runner.invoke( + app, + ["export", "dataframe", str(sample_h5ad_file), "X", "--output", str(out)], + ) + assert result.exit_code == 1 + assert "obs" in result.output or "var" in result.output + + def test_sparse_matrix_array_export(self, sample_sparse_csr_h5ad, temp_dir): + """Test that sparse matrix requires sparse export.""" + out = temp_dir / "X.npy" + result = runner.invoke( + app, + ["export", "array", str(sample_sparse_csr_h5ad), "X", "--output", str(out)], + ) + # Should fail because X is sparse, not dense + assert result.exit_code == 1 + + def test_nonexistent_object(self, sample_h5ad_file, temp_dir): + """Test that nonexistent object path is rejected.""" + out = temp_dir / "output.npy" + result = runner.invoke( + app, + [ + "export", + "array", + str(sample_h5ad_file), + "nonexistent/path", + "--output", + str(out), + ], + ) + assert result.exit_code == 1 + assert "not found" in result.output.lower() or "error" in result.output.lower() + + def test_export_dict_unknown_type(self, temp_dir): + """Test that unknown/complex types can be exported as dict.""" + file_path = temp_dir / "test_unknown.h5ad" + with h5py.File(file_path, "w") as f: + g = f.create_group("obs") + g.create_dataset("obs_names", data=np.array([b"cell1"])) + g.attrs["_index"] = "obs_names" + # Create a group without known encoding + weird = f.create_group("weird_group") + weird.attrs["encoding-type"] = "some_unknown_encoding" + + out = temp_dir / "weird.json" + result = runner.invoke( + app, ["export", "dict", str(file_path), "weird_group", str(out)] + ) + # Should succeed as it's detected as dict + 
assert result.exit_code == 0 diff --git a/tests/test_import.py b/tests/test_import.py new file mode 100644 index 0000000..f49af84 --- /dev/null +++ b/tests/test_import.py @@ -0,0 +1,555 @@ +"""Tests for the import command.""" + +import json +import re +from pathlib import Path + +import h5py +import numpy as np +from typer.testing import CliRunner + +from h5ad.cli import app + + +runner = CliRunner() + + +def strip_ansi(text: str) -> str: + """Strip ANSI escape codes from text.""" + return re.sub(r"\x1b\[[0-9;]*m", "", text) + + +class TestImportDataframe: + def test_import_dataframe_obs_inplace(self, sample_h5ad_file, temp_dir): + """Test importing CSV into obs with --inplace.""" + csv_file = temp_dir / "new_obs.csv" + csv_file.write_text( + "cell_id,score,label\n" + "cell_1,1.5,A\n" + "cell_2,2.5,B\n" + "cell_3,3.5,A\n" + "cell_4,4.5,C\n" + "cell_5,5.5,B\n" + ) + + result = runner.invoke( + app, + [ + "import", + "dataframe", + str(sample_h5ad_file), + "obs", + str(csv_file), + "--inplace", + "-i", + "cell_id", + ], + ) + assert result.exit_code == 0 + output = strip_ansi(result.output) + assert "5 rows" in output + assert "2 columns" in output + + with h5py.File(sample_h5ad_file, "r") as f: + assert "obs" in f + obs = f["obs"] + assert "score" in obs + assert "label" in obs + + def test_import_dataframe_obs_output(self, sample_h5ad_file, temp_dir): + """Test importing CSV into obs with output file.""" + csv_file = temp_dir / "new_obs.csv" + csv_file.write_text( + "cell_id,score,label\n" + "cell_1,1.5,A\n" + "cell_2,2.5,B\n" + "cell_3,3.5,A\n" + "cell_4,4.5,C\n" + "cell_5,5.5,B\n" + ) + output_file = temp_dir / "output.h5ad" + + result = runner.invoke( + app, + [ + "import", + "dataframe", + str(sample_h5ad_file), + "obs", + str(csv_file), + "-o", + str(output_file), + "-i", + "cell_id", + ], + ) + assert result.exit_code == 0 + assert output_file.exists() + + # Verify output file has the new data + with h5py.File(output_file, "r") as f: + assert "obs" in f + obs 
= f["obs"] + assert "score" in obs + + # Verify source file is unchanged + with h5py.File(sample_h5ad_file, "r") as f: + obs = f["obs"] + assert "score" not in obs + + def test_import_dataframe_var(self, sample_h5ad_file, temp_dir): + """Test importing CSV into var.""" + csv_file = temp_dir / "new_var.csv" + csv_file.write_text( + "gene_id,mean,std\n" + "gene_1,0.1,0.01\n" + "gene_2,0.2,0.02\n" + "gene_3,0.3,0.03\n" + "gene_4,0.4,0.04\n" + ) + + result = runner.invoke( + app, + [ + "import", + "dataframe", + str(sample_h5ad_file), + "var", + str(csv_file), + "--inplace", + "-i", + "gene_id", + ], + ) + assert result.exit_code == 0 + assert "4 rows" in strip_ansi(result.output) + + def test_import_dataframe_dimension_mismatch(self, sample_h5ad_file, temp_dir): + """Test that dimension mismatch is rejected.""" + csv_file = temp_dir / "wrong_obs.csv" + csv_file.write_text("cell_id,score\ncell_1,1.0\ncell_2,2.0\n") + + result = runner.invoke( + app, + [ + "import", + "dataframe", + str(sample_h5ad_file), + "obs", + str(csv_file), + "--inplace", + "-i", + "cell_id", + ], + ) + assert result.exit_code == 1 + assert "mismatch" in result.output.lower() + + def test_import_dataframe_invalid_index_column(self, sample_h5ad_file, temp_dir): + """Test that invalid index column is rejected.""" + csv_file = temp_dir / "obs.csv" + csv_file.write_text("a,b,c\n1,2,3\n") + + result = runner.invoke( + app, + [ + "import", + "dataframe", + str(sample_h5ad_file), + "obs", + str(csv_file), + "--inplace", + "-i", + "nonexistent", + ], + ) + assert result.exit_code == 1 + assert "not found" in result.output.lower() + + def test_import_dataframe_not_obs_var(self, sample_h5ad_file, temp_dir): + """Test that dataframe import is only allowed for obs/var.""" + csv_file = temp_dir / "data.csv" + csv_file.write_text("a,b\n1,2\n") + + result = runner.invoke( + app, + [ + "import", + "dataframe", + str(sample_h5ad_file), + "uns/data", + str(csv_file), + "--inplace", + ], + ) + assert 
result.exit_code == 1 + assert "obs" in result.output or "var" in result.output + + def test_import_dataframe_requires_output_or_inplace( + self, sample_h5ad_file, temp_dir + ): + """Test that either --output or --inplace is required.""" + csv_file = temp_dir / "obs.csv" + csv_file.write_text("a,b\n1,2\n") + + result = runner.invoke( + app, + [ + "import", + "dataframe", + str(sample_h5ad_file), + "obs", + str(csv_file), + ], + ) + assert result.exit_code == 1 + assert "Output file is required" in result.output + + +class TestImportArray: + def test_import_array_obsm(self, sample_h5ad_file, temp_dir): + """Test importing NPY into obsm.""" + npy_file = temp_dir / "pca.npy" + arr = np.random.randn(5, 10).astype(np.float32) + np.save(npy_file, arr) + + result = runner.invoke( + app, + [ + "import", + "array", + str(sample_h5ad_file), + "obsm/X_pca", + str(npy_file), + "--inplace", + ], + ) + assert result.exit_code == 0 + assert "5×10" in strip_ansi(result.output) + + with h5py.File(sample_h5ad_file, "r") as f: + assert "obsm/X_pca" in f + np.testing.assert_allclose(f["obsm/X_pca"][...], arr) + + def test_import_array_varm(self, sample_h5ad_file, temp_dir): + """Test importing NPY into varm.""" + npy_file = temp_dir / "pcs.npy" + arr = np.random.randn(4, 5).astype(np.float32) + np.save(npy_file, arr) + + result = runner.invoke( + app, + [ + "import", + "array", + str(sample_h5ad_file), + "varm/PCs", + str(npy_file), + "--inplace", + ], + ) + assert result.exit_code == 0 + + with h5py.File(sample_h5ad_file, "r") as f: + assert "varm/PCs" in f + + def test_import_array_X(self, sample_h5ad_file, temp_dir): + """Test importing NPY into X.""" + npy_file = temp_dir / "X.npy" + arr = np.random.randn(5, 4).astype(np.float32) + np.save(npy_file, arr) + + result = runner.invoke( + app, + [ + "import", + "array", + str(sample_h5ad_file), + "X", + str(npy_file), + "--inplace", + ], + ) + assert result.exit_code == 0 + + with h5py.File(sample_h5ad_file, "r") as f: + assert "X" in 
f + np.testing.assert_allclose(f["X"][...], arr) + + def test_import_array_dimension_mismatch_obsm(self, sample_h5ad_file, temp_dir): + """Test that obsm dimension mismatch is rejected.""" + npy_file = temp_dir / "bad_pca.npy" + arr = np.random.randn(10, 5).astype(np.float32) + np.save(npy_file, arr) + + result = runner.invoke( + app, + [ + "import", + "array", + str(sample_h5ad_file), + "obsm/X_pca", + str(npy_file), + "--inplace", + ], + ) + assert result.exit_code == 1 + assert "mismatch" in result.output.lower() + + def test_import_array_dimension_mismatch_X(self, sample_h5ad_file, temp_dir): + """Test that X dimension mismatch is rejected.""" + npy_file = temp_dir / "bad_X.npy" + arr = np.random.randn(5, 10).astype(np.float32) + np.save(npy_file, arr) + + result = runner.invoke( + app, + [ + "import", + "array", + str(sample_h5ad_file), + "X", + str(npy_file), + "--inplace", + ], + ) + assert result.exit_code == 1 + assert "mismatch" in result.output.lower() + + def test_import_array_requires_output_or_inplace(self, sample_h5ad_file, temp_dir): + """Test that either --output or --inplace is required.""" + npy_file = temp_dir / "data.npy" + np.save(npy_file, np.array([1, 2, 3])) + + result = runner.invoke( + app, + [ + "import", + "array", + str(sample_h5ad_file), + "obsm/X_pca", + str(npy_file), + ], + ) + assert result.exit_code == 1 + assert "Output file is required" in result.output + + +class TestImportSparse: + def test_import_sparse_X(self, sample_h5ad_file, temp_dir): + """Test importing MTX into X.""" + mtx_file = temp_dir / "X.mtx" + mtx_file.write_text( + "%%MatrixMarket matrix coordinate real general\n" + "% test matrix\n" + "5 4 5\n" + "1 1 1.0\n" + "2 2 2.0\n" + "3 3 3.0\n" + "4 4 4.0\n" + "5 1 5.0\n" + ) + + result = runner.invoke( + app, + [ + "import", + "sparse", + str(sample_h5ad_file), + "X", + str(mtx_file), + "--inplace", + ], + ) + assert result.exit_code == 0 + output = strip_ansi(result.output) + assert "5×4" in output + assert "5 
non-zero" in output + + with h5py.File(sample_h5ad_file, "r") as f: + assert "X" in f + X = f["X"] + enc = X.attrs.get("encoding-type") + if isinstance(enc, bytes): + enc = enc.decode("utf-8") + assert enc == "csr_matrix" + + def test_import_sparse_layer(self, sample_h5ad_file, temp_dir): + """Test importing MTX into layers.""" + mtx_file = temp_dir / "layer.mtx" + mtx_file.write_text( + "%%MatrixMarket matrix coordinate real general\n" + "5 4 3\n" + "1 1 1.0\n" + "3 2 2.0\n" + "5 4 3.0\n" + ) + + result = runner.invoke( + app, + [ + "import", + "sparse", + str(sample_h5ad_file), + "layers/counts", + str(mtx_file), + "--inplace", + ], + ) + assert result.exit_code == 0 + + with h5py.File(sample_h5ad_file, "r") as f: + assert "layers/counts" in f + + def test_import_sparse_dimension_mismatch(self, sample_h5ad_file, temp_dir): + """Test that MTX dimension mismatch is rejected.""" + mtx_file = temp_dir / "bad.mtx" + mtx_file.write_text( + "%%MatrixMarket matrix coordinate real general\n" "10 4 1\n" "1 1 1.0\n" + ) + + result = runner.invoke( + app, + [ + "import", + "sparse", + str(sample_h5ad_file), + "X", + str(mtx_file), + "--inplace", + ], + ) + assert result.exit_code == 1 + assert "mismatch" in result.output.lower() + + def test_import_sparse_requires_output_or_inplace(self, sample_h5ad_file, temp_dir): + """Test that either --output or --inplace is required.""" + mtx_file = temp_dir / "data.mtx" + mtx_file.write_text( + "%%MatrixMarket matrix coordinate real general\n" "5 4 1\n" "1 1 1.0\n" + ) + + result = runner.invoke( + app, + [ + "import", + "sparse", + str(sample_h5ad_file), + "X", + str(mtx_file), + ], + ) + assert result.exit_code == 1 + assert "Output file is required" in result.output + + +class TestImportDict: + def test_import_dict_uns(self, sample_h5ad_file, temp_dir): + """Test importing JSON into uns.""" + json_file = temp_dir / "metadata.json" + json_file.write_text( + json.dumps( + { + "version": "1.0", + "colors": ["red", "green", "blue"], + 
"n_pcs": 50, + } + ) + ) + + result = runner.invoke( + app, + [ + "import", + "dict", + str(sample_h5ad_file), + "uns/metadata", + str(json_file), + "--inplace", + ], + ) + assert result.exit_code == 0 + assert "JSON data" in result.output + + with h5py.File(sample_h5ad_file, "r") as f: + assert "uns/metadata" in f + assert "colors" in f["uns/metadata"] + assert "n_pcs" in f["uns/metadata"] + + def test_import_dict_nested(self, sample_h5ad_file, temp_dir): + """Test importing nested JSON.""" + json_file = temp_dir / "config.json" + json_file.write_text( + json.dumps( + { + "settings": { + "threshold": 0.5, + "enabled": True, + }, + "labels": ["A", "B", "C"], + } + ) + ) + + result = runner.invoke( + app, + [ + "import", + "dict", + str(sample_h5ad_file), + "uns/config", + str(json_file), + "--inplace", + ], + ) + assert result.exit_code == 0 + + with h5py.File(sample_h5ad_file, "r") as f: + assert "uns/config/settings" in f + assert "uns/config/labels" in f + + def test_import_dict_requires_output_or_inplace(self, sample_h5ad_file, temp_dir): + """Test that either --output or --inplace is required.""" + json_file = temp_dir / "data.json" + json_file.write_text('{"key": "value"}') + + result = runner.invoke( + app, + [ + "import", + "dict", + str(sample_h5ad_file), + "uns/data", + str(json_file), + ], + ) + assert result.exit_code == 1 + assert "Output file is required" in result.output + + +class TestImportValidation: + def test_replace_existing_object(self, sample_h5ad_file, temp_dir): + """Test that existing objects can be replaced.""" + with h5py.File(sample_h5ad_file, "r") as f: + original_X = np.array(f["X"][...]) + + npy_file = temp_dir / "new_X.npy" + new_arr = np.ones((5, 4), dtype=np.float32) * 999 + np.save(npy_file, new_arr) + + result = runner.invoke( + app, + [ + "import", + "array", + str(sample_h5ad_file), + "X", + str(npy_file), + "--inplace", + ], + ) + assert result.exit_code == 0 + + with h5py.File(sample_h5ad_file, "r") as f: + 
np.testing.assert_allclose(f["X"][...], new_arr) + assert not np.allclose(f["X"][...], original_X) diff --git a/tests/test_info_read.py b/tests/test_info_read.py index 07b9a13..e708fac 100644 --- a/tests/test_info_read.py +++ b/tests/test_info_read.py @@ -3,10 +3,89 @@ import pytest import h5py import numpy as np -from h5ad.info import axis_len, get_axis_group +from h5ad.info import axis_len, get_axis_group, get_entry_type, format_type_info from h5ad.read import decode_str_array, read_categorical_column, col_chunk_as_strings +class TestGetEntryType: + """Tests for get_entry_type function.""" + + def test_get_entry_type_dataframe(self, sample_h5ad_file): + """Test type detection for dataframe (obs/var).""" + with h5py.File(sample_h5ad_file, "r") as f: + info = get_entry_type(f["obs"]) + assert info["type"] == "dataframe" + assert info["export_as"] == "csv" + + def test_get_entry_type_dense_matrix(self, sample_h5ad_file): + """Test type detection for dense matrix.""" + with h5py.File(sample_h5ad_file, "r") as f: + info = get_entry_type(f["X"]) + assert info["type"] == "dense-matrix" + assert info["export_as"] == "npy" + assert info["shape"] == (5, 4) + + def test_get_entry_type_sparse_matrix(self, sample_sparse_csr_h5ad): + """Test type detection for sparse matrix.""" + with h5py.File(sample_sparse_csr_h5ad, "r") as f: + info = get_entry_type(f["X"]) + assert info["type"] == "sparse-matrix" + assert info["export_as"] == "mtx" + assert info["encoding"] == "csr_matrix" + + def test_get_entry_type_dict(self, sample_h5ad_file): + """Test type detection for dict/group.""" + with h5py.File(sample_h5ad_file, "r") as f: + info = get_entry_type(f["uns"]) + assert info["type"] == "dict" + assert info["export_as"] == "json" + + def test_get_entry_type_1d_array(self, temp_dir): + """Test type detection for 1D array.""" + file_path = temp_dir / "test.h5ad" + with h5py.File(file_path, "w") as f: + f.create_dataset("arr", data=np.array([1, 2, 3, 4, 5])) + with h5py.File(file_path, 
"r") as f: + info = get_entry_type(f["arr"]) + assert info["type"] == "array" + assert info["export_as"] == "npy" + + def test_get_entry_type_scalar(self, temp_dir): + """Test type detection for scalar.""" + file_path = temp_dir / "test.h5ad" + with h5py.File(file_path, "w") as f: + f.create_dataset("scalar", data=42) + with h5py.File(file_path, "r") as f: + info = get_entry_type(f["scalar"]) + assert info["type"] == "scalar" + assert info["export_as"] == "json" + + +class TestFormatTypeInfo: + """Tests for format_type_info function.""" + + def test_format_type_info_dataframe(self): + """Test formatting dataframe type info.""" + info = {"type": "dataframe", "export_as": "csv"} + result = format_type_info(info) + assert "" in result + assert "green" in result + + def test_format_type_info_sparse(self): + """Test formatting sparse matrix type info.""" + info = {"type": "sparse-matrix", "export_as": "mtx"} + result = format_type_info(info) + assert "" in result + assert "magenta" in result + + def test_format_type_info_unknown(self): + """Test formatting unknown type info.""" + info = {"type": "unknown", "export_as": None} + result = format_type_info(info) + assert "" in result + assert "red" in result + + class TestAxisLen: """Tests for axis_len function.""" @@ -23,10 +102,28 @@ def test_axis_len_var(self, sample_h5ad_file): assert length == 4 def test_axis_len_nonexistent(self, sample_h5ad_file): - """Test getting length of non-existent axis.""" + """Test getting length of non-existent axis raises KeyError.""" with h5py.File(sample_h5ad_file, "r") as f: - length = axis_len(f, "nonexistent") - assert length is None + with pytest.raises(KeyError, match="'nonexistent' not found"): + axis_len(f, "nonexistent") + + def test_axis_len_not_a_group(self, temp_dir): + """Test that axis_len raises TypeError when axis is not a group.""" + file_path = temp_dir / "test.h5ad" + with h5py.File(file_path, "w") as f: + f.create_dataset("obs", data=np.array([1, 2, 3])) + with 
h5py.File(file_path, "r") as f: + with pytest.raises(TypeError, match="'obs' is not a group"): + axis_len(f, "obs") + + def test_axis_len_missing_index(self, temp_dir): + """Test that axis_len raises KeyError when index dataset is missing.""" + file_path = temp_dir / "test.h5ad" + with h5py.File(file_path, "w") as f: + f.create_group("obs") + with h5py.File(file_path, "r") as f: + with pytest.raises(KeyError, match="Index dataset 'obs_names' not found"): + axis_len(f, "obs") class TestGetAxisGroup: @@ -148,9 +245,59 @@ def test_col_chunk_categorical(self, sample_categorical_h5ad): result = col_chunk_as_strings(f["obs"], "cell_type", 0, 4, cache) assert result == ["TypeA", "TypeB", "TypeA", "TypeC"] - def test_col_chunk_unsupported(self, sample_h5ad_file): - """Test reading unsupported column.""" + def test_col_chunk_not_found(self, sample_h5ad_file): + """Test reading non-existent column.""" with h5py.File(sample_h5ad_file, "r") as f: cache = {} - with pytest.raises(RuntimeError, match="Unsupported column"): + with pytest.raises(RuntimeError, match="not found in group"): col_chunk_as_strings(f["obs"], "nonexistent", 0, 5, cache) + + +class TestLegacyV010Support: + """Tests for legacy v0.1.0 format support.""" + + def test_get_entry_type_legacy_categorical(self, sample_legacy_v010_h5ad): + """Test type detection for legacy categorical column (v0.1.0).""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + info = get_entry_type(f["obs"]["cell_type"]) + assert info["type"] == "categorical" + assert info["version"] == "0.1.0" + assert "Legacy" in info["details"] + + def test_get_entry_type_legacy_dataframe(self, sample_legacy_v010_h5ad): + """Test type detection for legacy dataframe (v0.1.0).""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + info = get_entry_type(f["obs"]) + assert info["type"] == "dataframe" + assert info["version"] == "0.1.0" + assert "legacy" in info["details"].lower() + + def test_read_legacy_categorical_column(self, 
sample_legacy_v010_h5ad): + """Test reading legacy categorical column.""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + cache = {} + result = read_categorical_column( + f["obs"]["cell_type"], 0, 4, cache, f["obs"] + ) + assert result == ["TypeA", "TypeB", "TypeA", "TypeC"] + + def test_col_chunk_legacy_categorical(self, sample_legacy_v010_h5ad): + """Test col_chunk_as_strings with legacy categorical column.""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + cache = {} + result = col_chunk_as_strings(f["obs"], "cell_type", 0, 4, cache) + assert result == ["TypeA", "TypeB", "TypeA", "TypeC"] + + def test_col_chunk_legacy_numeric(self, sample_legacy_v010_h5ad): + """Test col_chunk_as_strings with legacy numeric column.""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + cache = {} + result = col_chunk_as_strings(f["obs"], "n_counts", 0, 4, cache) + assert result == ["100", "200", "150", "300"] + + def test_legacy_categorical_slice(self, sample_legacy_v010_h5ad): + """Test reading slice of legacy categorical column.""" + with h5py.File(sample_legacy_v010_h5ad, "r") as f: + cache = {} + result = col_chunk_as_strings(f["obs"], "cell_type", 1, 3, cache) + assert result == ["TypeB", "TypeA"] diff --git a/tests/test_subset.py b/tests/test_subset.py index 2aa9264..78c5cf8 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -414,3 +414,201 @@ def test_subset_h5ad_sparse_csc(self, sample_sparse_csc_h5ad, temp_dir): if isinstance(encoding, bytes): encoding = encoding.decode("utf-8") assert encoding == "csc_matrix" + + def test_subset_h5ad_obsp_sparse_group(self, temp_dir): + """Test subsetting obsp sparse matrix groups.""" + file_path = temp_dir / "obsp_sparse.h5ad" + with h5py.File(file_path, "w") as f: + obs = f.create_group("obs") + obs.attrs["_index"] = "obs_names" + obs_names = ["cell_1", "cell_2", "cell_3", "cell_4"] + obs.create_dataset("obs_names", data=np.array(obs_names, dtype="S")) + + var = f.create_group("var") + 
var.attrs["_index"] = "var_names" + var_names = ["gene_1", "gene_2"] + var.create_dataset("var_names", data=np.array(var_names, dtype="S")) + + f.create_dataset("X", data=np.zeros((4, 2), dtype=np.float32)) + + obsp = f.create_group("obsp") + conn = obsp.create_group("connectivities") + conn.attrs["encoding-type"] = "csr_matrix" + conn.attrs["encoding-version"] = "0.1.0" + conn.attrs["shape"] = np.array([4, 4], dtype=np.int64) + conn.create_dataset("data", data=np.array([1.0, 2.0, 3.0, 4.0])) + conn.create_dataset("indices", data=np.array([0, 1, 2, 3], dtype=np.int64)) + conn.create_dataset("indptr", data=np.array([0, 1, 2, 3, 4], dtype=np.int64)) + + obs_file = temp_dir / "obs_names.txt" + obs_file.write_text("cell_1\ncell_3\n") + + output = temp_dir / "subset.h5ad" + console = Console(stderr=True) + + subset_h5ad( + file=file_path, + output=output, + obs_file=obs_file, + var_file=None, + chunk_rows=1024, + console=console, + ) + + with h5py.File(output, "r") as f: + conn = f["obsp"]["connectivities"] + encoding = conn.attrs["encoding-type"] + if isinstance(encoding, bytes): + encoding = encoding.decode("utf-8") + assert encoding == "csr_matrix" + assert tuple(conn.attrs["shape"]) == (2, 2) + assert conn["indptr"].shape[0] == 3 + + def test_subset_h5ad_uns_vlen_strings(self, temp_dir): + """Test copying uns datasets with variable-length strings.""" + file_path = temp_dir / "uns_strings.h5ad" + with h5py.File(file_path, "w") as f: + obs = f.create_group("obs") + obs.attrs["_index"] = "obs_names" + obs.create_dataset( + "obs_names", data=np.array(["cell_1", "cell_2"], dtype="S") + ) + + var = f.create_group("var") + var.attrs["_index"] = "var_names" + var.create_dataset( + "var_names", data=np.array(["gene_1", "gene_2"], dtype="S") + ) + + f.create_dataset("X", data=np.zeros((2, 2), dtype=np.float32)) + + uns = f.create_group("uns") + vlen = h5py.string_dtype(encoding="utf-8") + uns.create_dataset("labels", data=["a", "b", "c"], dtype=vlen) + meta = 
uns.create_group("meta") + meta.create_dataset("method", data="test", dtype=vlen) + + obs_file = temp_dir / "obs_names.txt" + obs_file.write_text("cell_1\n") + + output = temp_dir / "subset.h5ad" + console = Console(stderr=True) + + subset_h5ad( + file=file_path, + output=output, + obs_file=obs_file, + var_file=None, + chunk_rows=1024, + console=console, + ) + + with h5py.File(output, "r") as f: + labels = [ + v.decode("utf-8") if isinstance(v, bytes) else v + for v in f["uns"]["labels"][...] + ] + assert labels == ["a", "b", "c"] + method = f["uns"]["meta"]["method"][()] + if isinstance(method, bytes): + method = method.decode("utf-8") + assert method == "test" + + def test_subset_h5ad_inplace(self, sample_h5ad_file, temp_dir): + """Test subsetting with --inplace behavior.""" + obs_file = temp_dir / "obs_names.txt" + obs_file.write_text("cell_1\ncell_3\n") + + console = Console(stderr=True) + + subset_h5ad( + file=sample_h5ad_file, + output=None, + obs_file=obs_file, + var_file=None, + chunk_rows=1024, + console=console, + inplace=True, + ) + + with h5py.File(sample_h5ad_file, "r") as f: + assert f["obs"]["obs_names"].shape[0] == 2 + assert f["X"].shape[0] == 2 + + def test_subset_h5ad_sparse_entries(self, temp_dir): + """Test sparse matrices in layers, obsm, varm, obsp, and varp.""" + file_path = temp_dir / "sparse_entries.h5ad" + + def _csr_group(parent, name, shape): + group = parent.create_group(name) + group.attrs["encoding-type"] = "csr_matrix" + group.attrs["encoding-version"] = "0.1.0" + group.attrs["shape"] = np.array(shape, dtype=np.int64) + n_rows, n_cols = shape + data = [] + indices = [] + indptr = [0] + for r in range(n_rows): + c = r % n_cols + data.append(float(r + 1)) + indices.append(c) + indptr.append(len(indices)) + group.create_dataset("data", data=np.array(data, dtype=np.float32)) + group.create_dataset("indices", data=np.array(indices, dtype=np.int64)) + group.create_dataset("indptr", data=np.array(indptr, dtype=np.int64)) + return group + + 
with h5py.File(file_path, "w") as f: + obs = f.create_group("obs") + obs.attrs["_index"] = "obs_names" + obs.create_dataset( + "obs_names", data=np.array(["cell_1", "cell_2", "cell_3", "cell_4"], dtype="S") + ) + + var = f.create_group("var") + var.attrs["_index"] = "var_names" + var.create_dataset( + "var_names", data=np.array(["gene_1", "gene_2", "gene_3"], dtype="S") + ) + + f.create_dataset("X", data=np.zeros((4, 3), dtype=np.float32)) + + layers = f.create_group("layers") + _csr_group(layers, "counts", (4, 3)) + + obsm = f.create_group("obsm") + _csr_group(obsm, "pca", (4, 2)) + + varm = f.create_group("varm") + _csr_group(varm, "pca", (3, 2)) + + obsp = f.create_group("obsp") + _csr_group(obsp, "connectivities", (4, 4)) + + varp = f.create_group("varp") + _csr_group(varp, "correlations", (3, 3)) + + obs_file = temp_dir / "obs_names.txt" + obs_file.write_text("cell_1\ncell_3\n") + + var_file = temp_dir / "var_names.txt" + var_file.write_text("gene_1\ngene_3\n") + + output = temp_dir / "subset.h5ad" + console = Console(stderr=True) + + subset_h5ad( + file=file_path, + output=output, + obs_file=obs_file, + var_file=var_file, + chunk_rows=1024, + console=console, + ) + + with h5py.File(output, "r") as f: + assert tuple(f["layers"]["counts"].attrs["shape"]) == (2, 2) + assert tuple(f["obsm"]["pca"].attrs["shape"]) == (2, 2) + assert tuple(f["varm"]["pca"].attrs["shape"]) == (2, 2) + assert tuple(f["obsp"]["connectivities"].attrs["shape"]) == (2, 2) + assert tuple(f["varp"]["correlations"].attrs["shape"]) == (2, 2) diff --git a/tests/test_zarr.py b/tests/test_zarr.py new file mode 100644 index 0000000..1008d6a --- /dev/null +++ b/tests/test_zarr.py @@ -0,0 +1,170 @@ +"""Tests for zarr auto-detection support (v2 and v3).""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Optional + +import numpy as np +import pytest +from typer.testing import CliRunner +from rich.console import Console + +from h5ad.cli import app +from 
h5ad.core.subset import subset_h5ad + + +zarr = pytest.importorskip("zarr") + +runner = CliRunner() + + +class UnsupportedZarrFormat(Exception): + pass + + +def _open_zarr_group(path: Path, zarr_format: Optional[int]) -> Any: + if zarr_format is None: + return zarr.open_group(path, mode="w") + + last_exc: Exception | None = None + for kw in ("zarr_format", "zarr_version"): + try: + return zarr.open_group(path, mode="w", **{kw: zarr_format}) + except (TypeError, ValueError) as exc: + last_exc = exc + continue + + raise UnsupportedZarrFormat(str(last_exc)) from last_exc + + +def _create_array(group: Any, name: str, data: np.ndarray) -> Any: + data = np.asarray(data) + if hasattr(group, "create_array"): + try: + return group.create_array(name, data=data) + except TypeError: + return group.create_array( + name, data=data, shape=data.shape, dtype=data.dtype + ) + try: + return group.create_dataset(name, data=data, shape=data.shape) + except TypeError: + return group.create_dataset(name, data=data) + + +def _create_zarr_store(path: Path, *, zarr_format: Optional[int]) -> None: + root = _open_zarr_group(path, zarr_format) + + obs = root.create_group("obs") + obs.attrs["_index"] = "obs_names" + obs_names = ["cell_1", "cell_2", "cell_3", "cell_4", "cell_5"] + _create_array(obs, "obs_names", np.array(obs_names, dtype="S")) + _create_array( + obs, + "cell_type", + np.array(["TypeA", "TypeB", "TypeA", "TypeC", "TypeB"], dtype="S"), + ) + + var = root.create_group("var") + var.attrs["_index"] = "var_names" + var_names = ["gene_1", "gene_2", "gene_3", "gene_4"] + _create_array(var, "var_names", np.array(var_names, dtype="S")) + + X = np.array( + [ + [1.0, 0.0, 2.5, 0.0], + [0.0, 3.2, 0.0, 1.1], + [2.1, 0.0, 1.8, 0.0], + [0.0, 4.5, 0.0, 2.3], + [1.5, 0.0, 3.0, 0.0], + ], + dtype=np.float32, + ) + _create_array(root, "X", X) + + uns = root.create_group("uns") + _create_array(uns, "description", np.array(["Test dataset"], dtype="S")) + + +@pytest.fixture(params=[None, 2], 
ids=["default", "v2"]) +def zarr_format(request) -> Optional[int]: + return request.param + + +def _skip_if_unsupported(exc: Exception, zarr_format: Optional[int]) -> None: + if zarr_format == 2: + pytest.skip("zarr v2 not supported by installed zarr") + raise exc + + +def test_info_zarr_auto_detect(temp_dir, zarr_format): + store_path = temp_dir / f"test_{zarr_format or 'default'}.zarr" + try: + _create_zarr_store(store_path, zarr_format=zarr_format) + except UnsupportedZarrFormat as exc: + _skip_if_unsupported(exc, zarr_format) + + result = runner.invoke(app, ["info", str(store_path)]) + output = result.stdout + (result.stderr or "") + assert result.exit_code == 0, output + assert "5 × 4" in output + + +def test_export_dataframe_zarr(temp_dir, zarr_format): + store_path = temp_dir / f"test_{zarr_format or 'default'}.zarr" + try: + _create_zarr_store(store_path, zarr_format=zarr_format) + except UnsupportedZarrFormat as exc: + _skip_if_unsupported(exc, zarr_format) + output = temp_dir / "obs.csv" + + result = runner.invoke( + app, + ["export", "dataframe", str(store_path), "obs", "--output", str(output)], + ) + if result.exit_code != 0: + raise AssertionError( + f"exit_code={result.exit_code} exception={result.exception!r} output={result.output}" + ) + assert output.exists() + assert "obs_names" in output.read_text(encoding="utf-8") + + +def test_export_dict_zarr(temp_dir, zarr_format): + store_path = temp_dir / f"test_{zarr_format or 'default'}.zarr" + try: + _create_zarr_store(store_path, zarr_format=zarr_format) + except UnsupportedZarrFormat as exc: + _skip_if_unsupported(exc, zarr_format) + output = temp_dir / "uns.json" + + result = runner.invoke( + app, ["export", "dict", str(store_path), "uns", str(output)] + ) + assert result.exit_code == 0 + assert output.exists() + + +def test_subset_zarr_output(temp_dir, zarr_format): + store_path = temp_dir / f"test_{zarr_format or 'default'}.zarr" + try: + _create_zarr_store(store_path, zarr_format=zarr_format) + 
except UnsupportedZarrFormat as exc: + _skip_if_unsupported(exc, zarr_format) + obs_file = temp_dir / "obs.txt" + obs_file.write_text("cell_1\ncell_3\n") + output = temp_dir / "subset.zarr" + + console = Console() + subset_h5ad( + file=store_path, + output=output, + obs_file=obs_file, + var_file=None, + chunk_rows=1024, + console=console, + ) + root = zarr.open_group(output, mode="r") + assert root["obs"]["obs_names"].shape[0] == 2 + assert root["X"].shape == (2, 4) diff --git a/uv.lock b/uv.lock index 71c45c8..266fa80 100644 --- a/uv.lock +++ b/uv.lock @@ -97,15 +97,52 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/48/d9f421cb8da5afaa1a64570d9989e00fb7955e6acddc5a12979f7666ef60/coverage-7.13.1-py3-none-any.whl", hash = "sha256:2016745cb3ba554469d02819d78958b571792bb68e31302610e898f80dd3a573", size = 210722, upload-time = "2025-12-28T15:42:54.901Z" }, ] +[[package]] +name = "donfig" +version = "0.8.1.post1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/25/71/80cc718ff6d7abfbabacb1f57aaa42e9c1552bfdd01e64ddd704e4a03638/donfig-0.8.1.post1.tar.gz", hash = "sha256:3bef3413a4c1c601b585e8d297256d0c1470ea012afa6e8461dc28bfb7c23f52", size = 19506, upload-time = "2024-05-23T14:14:31.513Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/d5/c5db1ea3394c6e1732fb3286b3bd878b59507a8f77d32a2cebda7d7b7cd4/donfig-0.8.1.post1-py3-none-any.whl", hash = "sha256:2a3175ce74a06109ff9307d90a230f81215cbac9a751f4d1c6194644b8204f9d", size = 21592, upload-time = "2024-05-23T14:13:55.283Z" }, +] + +[[package]] +name = "google-crc32c" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/03/41/4b9c02f99e4c5fb477122cd5437403b552873f014616ac1d19ac8221a58d/google_crc32c-1.8.0.tar.gz", hash = "sha256:a428e25fb7691024de47fecfbff7ff957214da51eddded0da0ae0e0f03a2cf79", size = 14192, 
upload-time = "2025-12-16T00:35:25.142Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/5f/7307325b1198b59324c0fa9807cafb551afb65e831699f2ce211ad5c8240/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:4b8286b659c1335172e39563ab0a768b8015e88e08329fa5321f774275fc3113", size = 31300, upload-time = "2025-12-16T00:21:56.723Z" }, + { url = "https://files.pythonhosted.org/packages/21/8e/58c0d5d86e2220e6a37befe7e6a94dd2f6006044b1a33edf1ff6d9f7e319/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:2a3dc3318507de089c5384cc74d54318401410f82aa65b2d9cdde9d297aca7cb", size = 30867, upload-time = "2025-12-16T00:38:31.302Z" }, + { url = "https://files.pythonhosted.org/packages/ce/a9/a780cc66f86335a6019f557a8aaca8fbb970728f0efd2430d15ff1beae0e/google_crc32c-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:14f87e04d613dfa218d6135e81b78272c3b904e2a7053b841481b38a7d901411", size = 33364, upload-time = "2025-12-16T00:40:22.96Z" }, + { url = "https://files.pythonhosted.org/packages/21/3f/3457ea803db0198c9aaca2dd373750972ce28a26f00544b6b85088811939/google_crc32c-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cb5c869c2923d56cb0c8e6bcdd73c009c36ae39b652dbe46a05eb4ef0ad01454", size = 33740, upload-time = "2025-12-16T00:40:23.96Z" }, + { url = "https://files.pythonhosted.org/packages/df/c0/87c2073e0c72515bb8733d4eef7b21548e8d189f094b5dad20b0ecaf64f6/google_crc32c-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:3cc0c8912038065eafa603b238abf252e204accab2a704c63b9e14837a854962", size = 34437, upload-time = "2025-12-16T00:35:21.395Z" }, + { url = "https://files.pythonhosted.org/packages/d1/db/000f15b41724589b0e7bc24bc7a8967898d8d3bc8caf64c513d91ef1f6c0/google_crc32c-1.8.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:3ebb04528e83b2634857f43f9bb8ef5b2bbe7f10f140daeb01b58f972d04736b", size = 31297, upload-time = 
"2025-12-16T00:23:20.709Z" }, + { url = "https://files.pythonhosted.org/packages/d7/0d/8ebed0c39c53a7e838e2a486da8abb0e52de135f1b376ae2f0b160eb4c1a/google_crc32c-1.8.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:450dc98429d3e33ed2926fc99ee81001928d63460f8538f21a5d6060912a8e27", size = 30867, upload-time = "2025-12-16T00:43:14.628Z" }, + { url = "https://files.pythonhosted.org/packages/ce/42/b468aec74a0354b34c8cbf748db20d6e350a68a2b0912e128cabee49806c/google_crc32c-1.8.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3b9776774b24ba76831609ffbabce8cdf6fa2bd5e9df37b594221c7e333a81fa", size = 33344, upload-time = "2025-12-16T00:40:24.742Z" }, + { url = "https://files.pythonhosted.org/packages/1c/e8/b33784d6fc77fb5062a8a7854e43e1e618b87d5ddf610a88025e4de6226e/google_crc32c-1.8.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:89c17d53d75562edfff86679244830599ee0a48efc216200691de8b02ab6b2b8", size = 33694, upload-time = "2025-12-16T00:40:25.505Z" }, + { url = "https://files.pythonhosted.org/packages/92/b1/d3cbd4d988afb3d8e4db94ca953df429ed6db7282ed0e700d25e6c7bfc8d/google_crc32c-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:57a50a9035b75643996fbf224d6661e386c7162d1dfdab9bc4ca790947d1007f", size = 34435, upload-time = "2025-12-16T00:35:22.107Z" }, + { url = "https://files.pythonhosted.org/packages/21/88/8ecf3c2b864a490b9e7010c84fd203ec8cf3b280651106a3a74dd1b0ca72/google_crc32c-1.8.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:e6584b12cb06796d285d09e33f63309a09368b9d806a551d8036a4207ea43697", size = 31301, upload-time = "2025-12-16T00:24:48.527Z" }, + { url = "https://files.pythonhosted.org/packages/36/c6/f7ff6c11f5ca215d9f43d3629163727a272eabc356e5c9b2853df2bfe965/google_crc32c-1.8.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:f4b51844ef67d6cf2e9425983274da75f18b1597bb2c998e1c0a0e8d46f8f651", size = 30868, upload-time = "2025-12-16T00:48:12.163Z" }, 
+ { url = "https://files.pythonhosted.org/packages/56/15/c25671c7aad70f8179d858c55a6ae8404902abe0cdcf32a29d581792b491/google_crc32c-1.8.0-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b0d1a7afc6e8e4635564ba8aa5c0548e3173e41b6384d7711a9123165f582de2", size = 33381, upload-time = "2025-12-16T00:40:26.268Z" }, + { url = "https://files.pythonhosted.org/packages/42/fa/f50f51260d7b0ef5d4898af122d8a7ec5a84e2984f676f746445f783705f/google_crc32c-1.8.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8b3f68782f3cbd1bce027e48768293072813469af6a61a86f6bb4977a4380f21", size = 33734, upload-time = "2025-12-16T00:40:27.028Z" }, + { url = "https://files.pythonhosted.org/packages/08/a5/7b059810934a09fb3ccb657e0843813c1fee1183d3bc2c8041800374aa2c/google_crc32c-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:d511b3153e7011a27ab6ee6bb3a5404a55b994dc1a7322c0b87b29606d9790e2", size = 34878, upload-time = "2025-12-16T00:35:23.142Z" }, +] + [[package]] name = "h5ad" -version = "0.1.0" +version = "0.2.0" source = { editable = "." 
} dependencies = [ { name = "h5py" }, { name = "numpy" }, + { name = "pillow" }, { name = "rich" }, { name = "typer" }, + { name = "zarr" }, ] [package.optional-dependencies] @@ -118,10 +155,12 @@ dev = [ requires-dist = [ { name = "h5py", specifier = ">=3.15.1" }, { name = "numpy", specifier = ">=2.3.5" }, + { name = "pillow", specifier = ">=12.1.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.3.4" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=6.0.0" }, { name = "rich", specifier = ">=14.2.0" }, { name = "typer", specifier = ">=0.20.0" }, + { name = "zarr", specifier = ">=3.1.5" }, ] provides-extras = ["dev"] @@ -190,6 +229,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, ] +[[package]] +name = "numcodecs" +version = "0.16.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/44/bd/8a391e7c356366224734efd24da929cc4796fff468bfb179fe1af6548535/numcodecs-0.16.5.tar.gz", hash = "sha256:0d0fb60852f84c0bd9543cc4d2ab9eefd37fc8efcc410acd4777e62a1d300318", size = 6276387, upload-time = "2025-11-21T02:49:48.986Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/75/cc/55420f3641a67f78392dc0bc5d02cb9eb0a9dcebf2848d1ac77253ca61fa/numcodecs-0.16.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:24e675dc8d1550cd976a99479b87d872cb142632c75cc402fea04c08c4898523", size = 1656287, upload-time = "2025-11-21T02:49:25.755Z" }, + { url = "https://files.pythonhosted.org/packages/f5/6c/86644987505dcb90ba6d627d6989c27bafb0699f9fd00187e06d05ea8594/numcodecs-0.16.5-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:94ddfa4341d1a3ab99989d13b01b5134abb687d3dab2ead54b450aefe4ad5bd6", size = 1148899, upload-time = "2025-11-21T02:49:26.87Z" }, + { url = "https://files.pythonhosted.org/packages/97/1e/98aaddf272552d9fef1f0296a9939d1487914a239e98678f6b20f8b0a5c8/numcodecs-0.16.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b554ab9ecf69de7ca2b6b5e8bc696bd9747559cb4dd5127bd08d7a28bec59c3a", size = 8534814, upload-time = "2025-11-21T02:49:28.547Z" }, + { url = "https://files.pythonhosted.org/packages/fb/53/78c98ef5c8b2b784453487f3e4d6c017b20747c58b470393e230c78d18e8/numcodecs-0.16.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ad1a379a45bd3491deab8ae6548313946744f868c21d5340116977ea3be5b1d6", size = 9173471, upload-time = "2025-11-21T02:49:30.444Z" }, + { url = "https://files.pythonhosted.org/packages/1c/20/2fdec87fc7f8cec950d2b0bea603c12dc9f05b4966dc5924ba5a36a61bf6/numcodecs-0.16.5-cp312-cp312-win_amd64.whl", hash = "sha256:845a9857886ffe4a3172ba1c537ae5bcc01e65068c31cf1fce1a844bd1da050f", size = 801412, upload-time = "2025-11-21T02:49:32.123Z" }, + { url = "https://files.pythonhosted.org/packages/38/38/071ced5a5fd1c85ba0e14ba721b66b053823e5176298c2f707e50bed11d9/numcodecs-0.16.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:25be3a516ab677dad890760d357cfe081a371d9c0a2e9a204562318ac5969de3", size = 1654359, upload-time = "2025-11-21T02:49:33.673Z" }, + { url = "https://files.pythonhosted.org/packages/d1/c0/5f84ba7525577c1b9909fc2d06ef11314825fc4ad4378f61d0e4c9883b4a/numcodecs-0.16.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0107e839ef75b854e969cb577e140b1aadb9847893937636582d23a2a4c6ce50", size = 1144237, upload-time = "2025-11-21T02:49:35.294Z" }, + { url = 
"https://files.pythonhosted.org/packages/0b/00/787ea5f237b8ea7bc67140c99155f9c00b5baf11c49afc5f3bfefa298f95/numcodecs-0.16.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:015a7c859ecc2a06e2a548f64008c0ec3aaecabc26456c2c62f4278d8fc20597", size = 8483064, upload-time = "2025-11-21T02:49:36.454Z" }, + { url = "https://files.pythonhosted.org/packages/c4/e6/d359fdd37498e74d26a167f7a51e54542e642ea47181eb4e643a69a066c3/numcodecs-0.16.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:84230b4b9dad2392f2a84242bd6e3e659ac137b5a1ce3571d6965fca673e0903", size = 9126063, upload-time = "2025-11-21T02:49:38.018Z" }, + { url = "https://files.pythonhosted.org/packages/27/72/6663cc0382ddbb866136c255c837bcb96cc7ce5e83562efec55e1b995941/numcodecs-0.16.5-cp313-cp313-win_amd64.whl", hash = "sha256:5088145502ad1ebf677ec47d00eb6f0fd600658217db3e0c070c321c85d6cf3d", size = 799275, upload-time = "2025-11-21T02:49:39.558Z" }, + { url = "https://files.pythonhosted.org/packages/3c/9e/38e7ca8184c958b51f45d56a4aeceb1134ecde2d8bd157efadc98502cc42/numcodecs-0.16.5-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b05647b8b769e6bc8016e9fd4843c823ce5c9f2337c089fb5c9c4da05e5275de", size = 1654721, upload-time = "2025-11-21T02:49:40.602Z" }, + { url = "https://files.pythonhosted.org/packages/a1/37/260fa42e7b2b08e6e00ad632f8dd620961a60a459426c26cea390f8c68d0/numcodecs-0.16.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3832bd1b5af8bb3e413076b7d93318c8e7d7b68935006b9fa36ca057d1725a8f", size = 1146887, upload-time = "2025-11-21T02:49:41.721Z" }, + { url = "https://files.pythonhosted.org/packages/4e/15/e2e1151b5a8b14a15dfd4bb4abccce7fff7580f39bc34092780088835f3a/numcodecs-0.16.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49f7b7d24f103187f53135bed28bb9f0ed6b2e14c604664726487bb6d7c882e1", size = 8476987, upload-time = 
"2025-11-21T02:49:43.363Z" }, + { url = "https://files.pythonhosted.org/packages/6d/30/16a57fc4d9fb0ba06c600408bd6634f2f1753c54a7a351c99c5e09b51ee2/numcodecs-0.16.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aec9736d81b70f337d89c4070ee3ffeff113f386fd789492fa152d26a15043e4", size = 9102377, upload-time = "2025-11-21T02:49:45.508Z" }, + { url = "https://files.pythonhosted.org/packages/31/a5/a0425af36c20d55a3ea884db4b4efca25a43bea9214ba69ca7932dd997b4/numcodecs-0.16.5-cp314-cp314-win_amd64.whl", hash = "sha256:b16a14303800e9fb88abc39463ab4706c037647ac17e49e297faa5f7d7dbbf1d", size = 819022, upload-time = "2025-11-21T02:49:47.39Z" }, +] + [[package]] name = "numpy" version = "2.3.5" @@ -262,6 +328,75 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, ] +[[package]] +name = "pillow" +version = "12.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d0/02/d52c733a2452ef1ffcc123b68e6606d07276b0e358db70eabad7e40042b7/pillow-12.1.0.tar.gz", hash = "sha256:5c5ae0a06e9ea030ab786b0251b32c7e4ce10e58d983c0d5c56029455180b5b9", size = 46977283, upload-time = "2026-01-02T09:13:29.892Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/31/dc53fe21a2f2996e1b7d92bf671cdb157079385183ef7c1ae08b485db510/pillow-12.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a332ac4ccb84b6dde65dbace8431f3af08874bf9770719d32a635c4ef411b18b", size = 5262642, upload-time = "2026-01-02T09:11:10.138Z" }, + { url = "https://files.pythonhosted.org/packages/ab/c1/10e45ac9cc79419cedf5121b42dcca5a50ad2b601fa080f58c22fb27626e/pillow-12.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:907bfa8a9cb790748a9aa4513e37c88c59660da3bcfffbd24a7d9e6abf224551", size = 4657464, upload-time = "2026-01-02T09:11:12.319Z" }, + { url = "https://files.pythonhosted.org/packages/ad/26/7b82c0ab7ef40ebede7a97c72d473bda5950f609f8e0c77b04af574a0ddb/pillow-12.1.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:efdc140e7b63b8f739d09a99033aa430accce485ff78e6d311973a67b6bf3208", size = 6234878, upload-time = "2026-01-02T09:11:14.096Z" }, + { url = "https://files.pythonhosted.org/packages/76/25/27abc9792615b5e886ca9411ba6637b675f1b77af3104710ac7353fe5605/pillow-12.1.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bef9768cab184e7ae6e559c032e95ba8d07b3023c289f79a2bd36e8bf85605a5", size = 8044868, upload-time = "2026-01-02T09:11:15.903Z" }, + { url = "https://files.pythonhosted.org/packages/0a/ea/f200a4c36d836100e7bc738fc48cd963d3ba6372ebc8298a889e0cfc3359/pillow-12.1.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:742aea052cf5ab5034a53c3846165bc3ce88d7c38e954120db0ab867ca242661", size = 6349468, upload-time = "2026-01-02T09:11:17.631Z" }, + { url = "https://files.pythonhosted.org/packages/11/8f/48d0b77ab2200374c66d344459b8958c86693be99526450e7aee714e03e4/pillow-12.1.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6dfc2af5b082b635af6e08e0d1f9f1c4e04d17d4e2ca0ef96131e85eda6eb17", size = 7041518, upload-time = "2026-01-02T09:11:19.389Z" }, + { url = "https://files.pythonhosted.org/packages/1d/23/c281182eb986b5d31f0a76d2a2c8cd41722d6fb8ed07521e802f9bba52de/pillow-12.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:609e89d9f90b581c8d16358c9087df76024cf058fa693dd3e1e1620823f39670", size = 6462829, upload-time = "2026-01-02T09:11:21.28Z" }, + { url = "https://files.pythonhosted.org/packages/25/ef/7018273e0faac099d7b00982abdcc39142ae6f3bd9ceb06de09779c4a9d6/pillow-12.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:43b4899cfd091a9693a1278c4982f3e50f7fb7cff5153b05174b4afc9593b616", size = 7166756, upload-time = "2026-01-02T09:11:23.559Z" }, + { url = "https://files.pythonhosted.org/packages/8f/c8/993d4b7ab2e341fe02ceef9576afcf5830cdec640be2ac5bee1820d693d4/pillow-12.1.0-cp312-cp312-win32.whl", hash = "sha256:aa0c9cc0b82b14766a99fbe6084409972266e82f459821cd26997a488a7261a7", size = 6328770, upload-time = "2026-01-02T09:11:25.661Z" }, + { url = "https://files.pythonhosted.org/packages/a7/87/90b358775a3f02765d87655237229ba64a997b87efa8ccaca7dd3e36e7a7/pillow-12.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:d70534cea9e7966169ad29a903b99fc507e932069a881d0965a1a84bb57f6c6d", size = 7033406, upload-time = "2026-01-02T09:11:27.474Z" }, + { url = "https://files.pythonhosted.org/packages/5d/cf/881b457eccacac9e5b2ddd97d5071fb6d668307c57cbf4e3b5278e06e536/pillow-12.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:65b80c1ee7e14a87d6a068dd3b0aea268ffcabfe0498d38661b00c5b4b22e74c", size = 2452612, upload-time = "2026-01-02T09:11:29.309Z" }, + { url = "https://files.pythonhosted.org/packages/dd/c7/2530a4aa28248623e9d7f27316b42e27c32ec410f695929696f2e0e4a778/pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:7b5dd7cbae20285cdb597b10eb5a2c13aa9de6cde9bb64a3c1317427b1db1ae1", size = 4062543, upload-time = "2026-01-02T09:11:31.566Z" }, + { url = "https://files.pythonhosted.org/packages/8f/1f/40b8eae823dc1519b87d53c30ed9ef085506b05281d313031755c1705f73/pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:29a4cef9cb672363926f0470afc516dbf7305a14d8c54f7abbb5c199cd8f8179", size = 4138373, upload-time = "2026-01-02T09:11:33.367Z" }, + { url = "https://files.pythonhosted.org/packages/d4/77/6fa60634cf06e52139fd0e89e5bbf055e8166c691c42fb162818b7fda31d/pillow-12.1.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:681088909d7e8fa9e31b9799aaa59ba5234c58e5e4f1951b4c4d1082a2e980e0", size = 3601241, upload-time = "2026-01-02T09:11:35.011Z" }, 
+ { url = "https://files.pythonhosted.org/packages/4f/bf/28ab865de622e14b747f0cd7877510848252d950e43002e224fb1c9ababf/pillow-12.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:983976c2ab753166dc66d36af6e8ec15bb511e4a25856e2227e5f7e00a160587", size = 5262410, upload-time = "2026-01-02T09:11:36.682Z" }, + { url = "https://files.pythonhosted.org/packages/1c/34/583420a1b55e715937a85bd48c5c0991598247a1fd2eb5423188e765ea02/pillow-12.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:db44d5c160a90df2d24a24760bbd37607d53da0b34fb546c4c232af7192298ac", size = 4657312, upload-time = "2026-01-02T09:11:38.535Z" }, + { url = "https://files.pythonhosted.org/packages/1d/fd/f5a0896839762885b3376ff04878f86ab2b097c2f9a9cdccf4eda8ba8dc0/pillow-12.1.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6b7a9d1db5dad90e2991645874f708e87d9a3c370c243c2d7684d28f7e133e6b", size = 6232605, upload-time = "2026-01-02T09:11:40.602Z" }, + { url = "https://files.pythonhosted.org/packages/98/aa/938a09d127ac1e70e6ed467bd03834350b33ef646b31edb7452d5de43792/pillow-12.1.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6258f3260986990ba2fa8a874f8b6e808cf5abb51a94015ca3dc3c68aa4f30ea", size = 8041617, upload-time = "2026-01-02T09:11:42.721Z" }, + { url = "https://files.pythonhosted.org/packages/17/e8/538b24cb426ac0186e03f80f78bc8dc7246c667f58b540bdd57c71c9f79d/pillow-12.1.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e115c15e3bc727b1ca3e641a909f77f8ca72a64fff150f666fcc85e57701c26c", size = 6346509, upload-time = "2026-01-02T09:11:44.955Z" }, + { url = "https://files.pythonhosted.org/packages/01/9a/632e58ec89a32738cabfd9ec418f0e9898a2b4719afc581f07c04a05e3c9/pillow-12.1.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6741e6f3074a35e47c77b23a4e4f2d90db3ed905cb1c5e6e0d49bff2045632bc", size = 7038117, upload-time = "2026-01-02T09:11:46.736Z" }, + { url = 
"https://files.pythonhosted.org/packages/c7/a2/d40308cf86eada842ca1f3ffa45d0ca0df7e4ab33c83f81e73f5eaed136d/pillow-12.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:935b9d1aed48fcfb3f838caac506f38e29621b44ccc4f8a64d575cb1b2a88644", size = 6460151, upload-time = "2026-01-02T09:11:48.625Z" }, + { url = "https://files.pythonhosted.org/packages/f1/88/f5b058ad6453a085c5266660a1417bdad590199da1b32fb4efcff9d33b05/pillow-12.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5fee4c04aad8932da9f8f710af2c1a15a83582cfb884152a9caa79d4efcdbf9c", size = 7164534, upload-time = "2026-01-02T09:11:50.445Z" }, + { url = "https://files.pythonhosted.org/packages/19/ce/c17334caea1db789163b5d855a5735e47995b0b5dc8745e9a3605d5f24c0/pillow-12.1.0-cp313-cp313-win32.whl", hash = "sha256:a786bf667724d84aa29b5db1c61b7bfdde380202aaca12c3461afd6b71743171", size = 6332551, upload-time = "2026-01-02T09:11:52.234Z" }, + { url = "https://files.pythonhosted.org/packages/e5/07/74a9d941fa45c90a0d9465098fe1ec85de3e2afbdc15cc4766622d516056/pillow-12.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:461f9dfdafa394c59cd6d818bdfdbab4028b83b02caadaff0ffd433faf4c9a7a", size = 7040087, upload-time = "2026-01-02T09:11:54.822Z" }, + { url = "https://files.pythonhosted.org/packages/88/09/c99950c075a0e9053d8e880595926302575bc742b1b47fe1bbcc8d388d50/pillow-12.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:9212d6b86917a2300669511ed094a9406888362e085f2431a7da985a6b124f45", size = 2452470, upload-time = "2026-01-02T09:11:56.522Z" }, + { url = "https://files.pythonhosted.org/packages/b5/ba/970b7d85ba01f348dee4d65412476321d40ee04dcb51cd3735b9dc94eb58/pillow-12.1.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:00162e9ca6d22b7c3ee8e61faa3c3253cd19b6a37f126cad04f2f88b306f557d", size = 5264816, upload-time = "2026-01-02T09:11:58.227Z" }, + { url = 
"https://files.pythonhosted.org/packages/10/60/650f2fb55fdba7a510d836202aa52f0baac633e50ab1cf18415d332188fb/pillow-12.1.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7d6daa89a00b58c37cb1747ec9fb7ac3bc5ffd5949f5888657dfddde6d1312e0", size = 4660472, upload-time = "2026-01-02T09:12:00.798Z" }, + { url = "https://files.pythonhosted.org/packages/2b/c0/5273a99478956a099d533c4f46cbaa19fd69d606624f4334b85e50987a08/pillow-12.1.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e2479c7f02f9d505682dc47df8c0ea1fc5e264c4d1629a5d63fe3e2334b89554", size = 6268974, upload-time = "2026-01-02T09:12:02.572Z" }, + { url = "https://files.pythonhosted.org/packages/b4/26/0bf714bc2e73d5267887d47931d53c4ceeceea6978148ed2ab2a4e6463c4/pillow-12.1.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f188d580bd870cda1e15183790d1cc2fa78f666e76077d103edf048eed9c356e", size = 8073070, upload-time = "2026-01-02T09:12:04.75Z" }, + { url = "https://files.pythonhosted.org/packages/43/cf/1ea826200de111a9d65724c54f927f3111dc5ae297f294b370a670c17786/pillow-12.1.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0fde7ec5538ab5095cc02df38ee99b0443ff0e1c847a045554cf5f9af1f4aa82", size = 6380176, upload-time = "2026-01-02T09:12:06.626Z" }, + { url = "https://files.pythonhosted.org/packages/03/e0/7938dd2b2013373fd85d96e0f38d62b7a5a262af21ac274250c7ca7847c9/pillow-12.1.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0ed07dca4a8464bada6139ab38f5382f83e5f111698caf3191cb8dbf27d908b4", size = 7067061, upload-time = "2026-01-02T09:12:08.624Z" }, + { url = "https://files.pythonhosted.org/packages/86/ad/a2aa97d37272a929a98437a8c0ac37b3cf012f4f8721e1bd5154699b2518/pillow-12.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f45bd71d1fa5e5749587613037b172e0b3b23159d1c00ef2fc920da6f470e6f0", size = 6491824, upload-time = "2026-01-02T09:12:10.488Z" }, + { url = 
"https://files.pythonhosted.org/packages/a4/44/80e46611b288d51b115826f136fb3465653c28f491068a72d3da49b54cd4/pillow-12.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:277518bf4fe74aa91489e1b20577473b19ee70fb97c374aa50830b279f25841b", size = 7190911, upload-time = "2026-01-02T09:12:12.772Z" }, + { url = "https://files.pythonhosted.org/packages/86/77/eacc62356b4cf81abe99ff9dbc7402750044aed02cfd6a503f7c6fc11f3e/pillow-12.1.0-cp313-cp313t-win32.whl", hash = "sha256:7315f9137087c4e0ee73a761b163fc9aa3b19f5f606a7fc08d83fd3e4379af65", size = 6336445, upload-time = "2026-01-02T09:12:14.775Z" }, + { url = "https://files.pythonhosted.org/packages/e7/3c/57d81d0b74d218706dafccb87a87ea44262c43eef98eb3b164fd000e0491/pillow-12.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:0ddedfaa8b5f0b4ffbc2fa87b556dc59f6bb4ecb14a53b33f9189713ae8053c0", size = 7045354, upload-time = "2026-01-02T09:12:16.599Z" }, + { url = "https://files.pythonhosted.org/packages/ac/82/8b9b97bba2e3576a340f93b044a3a3a09841170ab4c1eb0d5c93469fd32f/pillow-12.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:80941e6d573197a0c28f394753de529bb436b1ca990ed6e765cf42426abc39f8", size = 2454547, upload-time = "2026-01-02T09:12:18.704Z" }, + { url = "https://files.pythonhosted.org/packages/8c/87/bdf971d8bbcf80a348cc3bacfcb239f5882100fe80534b0ce67a784181d8/pillow-12.1.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:5cb7bc1966d031aec37ddb9dcf15c2da5b2e9f7cc3ca7c54473a20a927e1eb91", size = 4062533, upload-time = "2026-01-02T09:12:20.791Z" }, + { url = "https://files.pythonhosted.org/packages/ff/4f/5eb37a681c68d605eb7034c004875c81f86ec9ef51f5be4a63eadd58859a/pillow-12.1.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:97e9993d5ed946aba26baf9c1e8cf18adbab584b99f452ee72f7ee8acb882796", size = 4138546, upload-time = "2026-01-02T09:12:23.664Z" }, + { url = 
"https://files.pythonhosted.org/packages/11/6d/19a95acb2edbace40dcd582d077b991646b7083c41b98da4ed7555b59733/pillow-12.1.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:414b9a78e14ffeb98128863314e62c3f24b8a86081066625700b7985b3f529bd", size = 3601163, upload-time = "2026-01-02T09:12:26.338Z" }, + { url = "https://files.pythonhosted.org/packages/fc/36/2b8138e51cb42e4cc39c3297713455548be855a50558c3ac2beebdc251dd/pillow-12.1.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e6bdb408f7c9dd2a5ff2b14a3b0bb6d4deb29fb9961e6eb3ae2031ae9a5cec13", size = 5266086, upload-time = "2026-01-02T09:12:28.782Z" }, + { url = "https://files.pythonhosted.org/packages/53/4b/649056e4d22e1caa90816bf99cef0884aed607ed38075bd75f091a607a38/pillow-12.1.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3413c2ae377550f5487991d444428f1a8ae92784aac79caa8b1e3b89b175f77e", size = 4657344, upload-time = "2026-01-02T09:12:31.117Z" }, + { url = "https://files.pythonhosted.org/packages/6c/6b/c5742cea0f1ade0cd61485dc3d81f05261fc2276f537fbdc00802de56779/pillow-12.1.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e5dcbe95016e88437ecf33544ba5db21ef1b8dd6e1b434a2cb2a3d605299e643", size = 6232114, upload-time = "2026-01-02T09:12:32.936Z" }, + { url = "https://files.pythonhosted.org/packages/bf/8f/9f521268ce22d63991601aafd3d48d5ff7280a246a1ef62d626d67b44064/pillow-12.1.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d0a7735df32ccbcc98b98a1ac785cc4b19b580be1bdf0aeb5c03223220ea09d5", size = 8042708, upload-time = "2026-01-02T09:12:34.78Z" }, + { url = "https://files.pythonhosted.org/packages/1a/eb/257f38542893f021502a1bbe0c2e883c90b5cff26cc33b1584a841a06d30/pillow-12.1.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0c27407a2d1b96774cbc4a7594129cc027339fd800cd081e44497722ea1179de", size = 6347762, upload-time = "2026-01-02T09:12:36.748Z" }, + { url = 
"https://files.pythonhosted.org/packages/c4/5a/8ba375025701c09b309e8d5163c5a4ce0102fa86bbf8800eb0d7ac87bc51/pillow-12.1.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15c794d74303828eaa957ff8070846d0efe8c630901a1c753fdc63850e19ecd9", size = 7039265, upload-time = "2026-01-02T09:12:39.082Z" }, + { url = "https://files.pythonhosted.org/packages/cf/dc/cf5e4cdb3db533f539e88a7bbf9f190c64ab8a08a9bc7a4ccf55067872e4/pillow-12.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c990547452ee2800d8506c4150280757f88532f3de2a58e3022e9b179107862a", size = 6462341, upload-time = "2026-01-02T09:12:40.946Z" }, + { url = "https://files.pythonhosted.org/packages/d0/47/0291a25ac9550677e22eda48510cfc4fa4b2ef0396448b7fbdc0a6946309/pillow-12.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b63e13dd27da389ed9475b3d28510f0f954bca0041e8e551b2a4eb1eab56a39a", size = 7165395, upload-time = "2026-01-02T09:12:42.706Z" }, + { url = "https://files.pythonhosted.org/packages/4f/4c/e005a59393ec4d9416be06e6b45820403bb946a778e39ecec62f5b2b991e/pillow-12.1.0-cp314-cp314-win32.whl", hash = "sha256:1a949604f73eb07a8adab38c4fe50791f9919344398bdc8ac6b307f755fc7030", size = 6431413, upload-time = "2026-01-02T09:12:44.944Z" }, + { url = "https://files.pythonhosted.org/packages/1c/af/f23697f587ac5f9095d67e31b81c95c0249cd461a9798a061ed6709b09b5/pillow-12.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:4f9f6a650743f0ddee5593ac9e954ba1bdbc5e150bc066586d4f26127853ab94", size = 7176779, upload-time = "2026-01-02T09:12:46.727Z" }, + { url = "https://files.pythonhosted.org/packages/b3/36/6a51abf8599232f3e9afbd16d52829376a68909fe14efe29084445db4b73/pillow-12.1.0-cp314-cp314-win_arm64.whl", hash = "sha256:808b99604f7873c800c4840f55ff389936ef1948e4e87645eaf3fccbc8477ac4", size = 2543105, upload-time = "2026-01-02T09:12:49.243Z" }, + { url = 
"https://files.pythonhosted.org/packages/82/54/2e1dd20c8749ff225080d6ba465a0cab4387f5db0d1c5fb1439e2d99923f/pillow-12.1.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:bc11908616c8a283cf7d664f77411a5ed2a02009b0097ff8abbba5e79128ccf2", size = 5268571, upload-time = "2026-01-02T09:12:51.11Z" }, + { url = "https://files.pythonhosted.org/packages/57/61/571163a5ef86ec0cf30d265ac2a70ae6fc9e28413d1dc94fa37fae6bda89/pillow-12.1.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:896866d2d436563fa2a43a9d72f417874f16b5545955c54a64941e87c1376c61", size = 4660426, upload-time = "2026-01-02T09:12:52.865Z" }, + { url = "https://files.pythonhosted.org/packages/5e/e1/53ee5163f794aef1bf84243f755ee6897a92c708505350dd1923f4afec48/pillow-12.1.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8e178e3e99d3c0ea8fc64b88447f7cac8ccf058af422a6cedc690d0eadd98c51", size = 6269908, upload-time = "2026-01-02T09:12:54.884Z" }, + { url = "https://files.pythonhosted.org/packages/bc/0b/b4b4106ff0ee1afa1dc599fde6ab230417f800279745124f6c50bcffed8e/pillow-12.1.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:079af2fb0c599c2ec144ba2c02766d1b55498e373b3ac64687e43849fbbef5bc", size = 8074733, upload-time = "2026-01-02T09:12:56.802Z" }, + { url = "https://files.pythonhosted.org/packages/19/9f/80b411cbac4a732439e629a26ad3ef11907a8c7fc5377b7602f04f6fe4e7/pillow-12.1.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bdec5e43377761c5dbca620efb69a77f6855c5a379e32ac5b158f54c84212b14", size = 6381431, upload-time = "2026-01-02T09:12:58.823Z" }, + { url = "https://files.pythonhosted.org/packages/8f/b7/d65c45db463b66ecb6abc17c6ba6917a911202a07662247e1355ce1789e7/pillow-12.1.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:565c986f4b45c020f5421a4cea13ef294dde9509a8577f29b2fc5edc7587fff8", size = 7068529, upload-time = "2026-01-02T09:13:00.885Z" }, + { url = 
"https://files.pythonhosted.org/packages/50/96/dfd4cd726b4a45ae6e3c669fc9e49deb2241312605d33aba50499e9d9bd1/pillow-12.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:43aca0a55ce1eefc0aefa6253661cb54571857b1a7b2964bd8a1e3ef4b729924", size = 6492981, upload-time = "2026-01-02T09:13:03.314Z" }, + { url = "https://files.pythonhosted.org/packages/4d/1c/b5dc52cf713ae46033359c5ca920444f18a6359ce1020dd3e9c553ea5bc6/pillow-12.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0deedf2ea233722476b3a81e8cdfbad786f7adbed5d848469fa59fe52396e4ef", size = 7191878, upload-time = "2026-01-02T09:13:05.276Z" }, + { url = "https://files.pythonhosted.org/packages/53/26/c4188248bd5edaf543864fe4834aebe9c9cb4968b6f573ce014cc42d0720/pillow-12.1.0-cp314-cp314t-win32.whl", hash = "sha256:b17fbdbe01c196e7e159aacb889e091f28e61020a8abeac07b68079b6e626988", size = 6438703, upload-time = "2026-01-02T09:13:07.491Z" }, + { url = "https://files.pythonhosted.org/packages/b8/0e/69ed296de8ea05cb03ee139cee600f424ca166e632567b2d66727f08c7ed/pillow-12.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27b9baecb428899db6c0de572d6d305cfaf38ca1596b5c0542a5182e3e74e8c6", size = 7182927, upload-time = "2026-01-02T09:13:09.841Z" }, + { url = "https://files.pythonhosted.org/packages/fc/f5/68334c015eed9b5cff77814258717dec591ded209ab5b6fb70e2ae873d1d/pillow-12.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f61333d817698bdcdd0f9d7793e365ac3d2a21c1f1eb02b32ad6aefb8d8ea831", size = 2545104, upload-time = "2026-01-02T09:13:12.068Z" }, +] + [[package]] name = "pluggy" version = "1.6.0" @@ -310,6 +445,52 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = "2025-09-09T10:57:00.695Z" }, ] +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" 
} +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" }, + { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" }, + { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" }, + { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" }, + { url = 
"https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" }, + { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" }, + { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" }, + { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" }, + { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, + { url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" }, + { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" }, + { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" }, + { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" }, + { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" }, + { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" }, + { url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" }, + { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = 
"sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" }, + { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" }, + { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" }, + { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, + { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, + { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, + { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, + { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, + { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, + { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = 
"2025-09-25T21:32:44.377Z" }, + { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, + { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, + { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" }, + { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, + { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time 
= "2025-09-25T21:32:54.537Z" }, + { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, + { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, +] + [[package]] name = "rich" version = "14.2.0" @@ -355,3 +536,20 @@ sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac8 wheels = [ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ] + +[[package]] +name = "zarr" +version = "3.1.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "donfig" }, + { name = "google-crc32c" }, + { name = "numcodecs" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fc/76/7fa87f57c112c7b9c82f0a730f8b6f333e792574812872e2cd45ab604199/zarr-3.1.5.tar.gz", hash = "sha256:fbe0c79675a40c996de7ca08e80a1c0a20537bd4a9f43418b6d101395c0bba2b", size = 366825, upload-time = "2025-11-21T14:06:01.492Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/44/15/bb13b4913ef95ad5448490821eee4671d0e67673342e4d4070854e5fe081/zarr-3.1.5-py3-none-any.whl", hash = "sha256:29cd905afb6235b94c09decda4258c888fcb79bb6c862ef7c0b8fe009b5c8563", size = 284067, upload-time = "2025-11-21T14:05:59.235Z" }, +]