diff --git a/.codeocean/datasets.json b/.codeocean/datasets.json index 716efc2..d16d13c 100644 --- a/.codeocean/datasets.json +++ b/.codeocean/datasets.json @@ -17,6 +17,10 @@ "id": "78a80081-c645-4e38-beb7-b9d9308a35d9", "mount": "microns1412" }, + { + "id": "aafc99cc-92ee-4d04-b152-92f1063a3268", + "mount": "v1dd_1196" + }, { "id": "aff09b9b-5cdc-49ef-8e39-358a8ead98d8", "mount": "visp-patchseq-taxonomy-info" diff --git a/.github/instructions/changelog.instructions.md b/.github/instructions/changelog.instructions.md new file mode 100644 index 0000000..5b3ebda --- /dev/null +++ b/.github/instructions/changelog.instructions.md @@ -0,0 +1,54 @@ +--- +description: "Use when editing CHANGELOG.md, drafting release notes, or summarizing user-visible changes. Enforces Keep a Changelog format, SemVer scope, and the user-voice rule." +applyTo: "CHANGELOG.md" +--- +# Changelog rules + +The changelog is the user-facing log of what changed in +`connects_common_connectivity`. It is **not** an internal work journal. + +## Format +- [Keep a Changelog 1.1.0](https://keepachangelog.com/en/1.1.0/) + + [SemVer](https://semver.org/spec/v2.0.0.html). +- All new entries go under `## [Unreleased]` until a release is cut. +- Use only the standard sections: `Added`, `Changed`, `Deprecated`, `Removed`, + `Fixed`, `Security`. Omit empty sections in released versions; keep them as + empty headers under `[Unreleased]` so contributors see the slots. +- Newest version on top. Releases are `## [X.Y.Z] - YYYY-MM-DD`. + +## Voice and scope (the rule that actually matters) +- Write in **user voice**: what changed for someone who imports + `connects_common_connectivity`, runs the `ccc` CLI, or follows the README. +- One bullet per change. Past tense, present-perfect-style is fine + (`Added …`, `Moved …`, `Fixed …`). No first person, no narrative. 
+- **Include**: new public names, removed public names, moved import paths, + changed signatures, changed defaults, behavior fixes a user could observe, + new CLI flags, new config keys, dropped Python versions. +- **Exclude**: internal refactors, test-only changes, planning-doc edits, + prompt/agent-customization edits, dev-tooling tweaks, comment-only changes. + If a user couldn't notice it, it doesn't belong here. +- If a change has both an internal and a user-visible side, log only the + user-visible side. + +## Linking +- Reference public names in backticks: `` `write_models` ``, `` `io.writers` ``. +- Link to issues/PRs only when they add information a user would want + (`#123`); do not link to internal planning docs. + +## Deprecations and removals +- Announce in `Deprecated` first (one release minimum) before moving to + `Removed`, except for genuinely unused or never-released names. +- Name the replacement when there is one: "Deprecated `X`; use `Y` instead." + +## Releasing (manual for now) +1. Rename `## [Unreleased]` to `## [X.Y.Z] - YYYY-MM-DD` (today's date). +2. Drop empty subsections from the released block. +3. Add a fresh `## [Unreleased]` at the top with all six empty sub-headers. +4. Bump the version in `pyproject.toml` in the same commit. + +## Anti-patterns +- "Refactored internals." — internal, drop it. +- "Updated planning docs." — internal, drop it. +- "Various fixes." — split into specific bullets or drop. +- "Added new feature." — name the public symbol or describe the behavior. +- Long prose paragraphs — one bullet, one change. diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..0d8c810 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,90 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [Unreleased] + +### Added + +### Changed + +### Deprecated + +### Removed + +### Fixed + +### Security + +## [0.2.0] - 2026-06-23 + +### Added + +- Added `connects_common_connectivity.config` with `Settings`, + `get_settings()`, `find_config_file()`, `output_root()`, and + `table_path()`. Settings are discovered from a `ccc_config.yaml` at (or + above) the cwd; `CCC_OUTPUT_ROOT` overrides `output_root`. Relative + `output_root` values are anchored at the config file's directory so a + notebook in `code/` and a script at the repo root resolve to the same + place. +- Added curated public API at `connects_common_connectivity.io`: + `write_models()` (single dispatch core for all generated pydantic + models), `write_projection_matrix()`, `WriteResult`, + `WRITABLE_CLASSES`, and re-exports of `get_settings`, `Settings`, and + `table_path`. The surface is pinned by `__all__`. +- Added write-time validation: `write_models()` now re-validates each + model through a runtime-derived strict subclass that flips + `WriteSpec.required_for_write` slots to non-optional, raising + `ValueError` before any IO if a write-required slot is missing or + `None`. Public helpers `strict_model_for()` and `validate_for_write()` + live in `connects_common_connectivity.io.write_validation`. +- Added `WriteSpec` registry entries for `AlgorithmRun` and + `HierarchyCategory` (both project-agnostic, scope=`["id"]`, + `overwrite_scoped`). These classes are now writable through + `write_models(...)` and surface in `WRITABLE_CLASSES`. +- Added an `output_root=` keyword to `write_models()` and + `write_projection_matrix()` for per-call overrides of the on-disk root. + Accepts a `str` or `Path` and writes to `{output_root}/{table}/`, + bypassing `ccc_config.yaml` for that call. Mutually exclusive with + `settings=` (passing both raises `TypeError`). Lets a single notebook + redirect its writes (e.g. an isolated test dataset) without mutating + process-global config or environment variables. 
+- Added `populate_region_coverage()` in + `connects_common_connectivity.io.write_utils` for deriving + `ProjectionMeasurementMatrix.region_coverage` from a dense matrix. +- Added `CALCIUM_IMAGING` value to the `Modality` enum for calcium + imaging based functional correlations. + +### Changed + +- Migrated `code/etl_*.ipynb` notebooks to the curated IO API: + hardcoded `OUTPUT_ROOT = "../scratch/..."` strings are replaced with + `output_root()` from `connects_common_connectivity.config`, and + hand-rolled `write_deltalake(..., mode=..., predicate=..., partition_by=...)` + calls for every registry-backed model are replaced with `write_models(...)` + (and `write_projection_matrix(...)` for projection-matrix metadata rows). + Wide cell-feature / projection-matrix parquets and `CellCellConnectivityLong` + writes remain on raw `write_deltalake` pending registry support. +- Moved `connects_common_connectivity.arrow_utils` and + `connects_common_connectivity.write_utils` under + `connects_common_connectivity.io.*`. + +### Removed + +- Removed the deprecated re-export shims + `connects_common_connectivity.arrow_utils` and + `connects_common_connectivity.write_utils`. Import from + `connects_common_connectivity.io.arrow_utils` / + `connects_common_connectivity.io.write_utils` instead. + +### Fixed + +- Fixed `DataSet` writes to scope on `(project_id, id)` instead of + `project_id` alone, so sibling notebooks sharing a `project_id` (e.g. + patchseq exc/inh) no longer overwrite each other's `DataSet` rows. +- Fixed `write_models()` to honor `Settings.dry_run=True`: writes are now + skipped, `rows_written` is reported as `0`, and no Delta table + directories are created. 
diff --git a/README.md b/README.md index 7517cd0..7820126 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ The pilot of the Common Connectivity Pilot is focused on developing a framework - Packaged with `pyproject.toml` and intended to be managed via `uv` - BrainRegion ETL example from Parquet (S3/local) via `examples/etl_brain_regions.py` or CLI `ccc etl-brain-regions` - Generic Parquet→LinkML loader utility (`parquet_loader.py`) for any class in the schema +- Curated IO layer (`connects_common_connectivity.io`) for writing generated pydantic models to a shared Delta lake — `write_models(...)` / `write_projection_matrix(...)` dispatched via a `WriteSpec` registry, with output location resolved from `ccc_config.yaml` ## Getting Started (with uv) @@ -147,7 +148,7 @@ Pydantic models; this repository currently favors agility for early design. ## ETL Notebooks -A set of ETL Jupyter notebooks in `code/` registers real datasets into the shared Delta Lake store under `results/em_patchseq_wnm_v1/`. These serve as concrete working examples for every schema class. +A set of ETL Jupyter notebooks in `code/` registers real datasets into a shared Delta Lake store via the `connects_common_connectivity.io` layer (`write_models`, `write_projection_matrix`). The output location is resolved from `ccc_config.yaml` at the repo root (or the `CCC_OUTPUT_ROOT` environment variable), so notebooks do not hard-code a destination path. These serve as concrete working examples for every schema class. - **`code/etl_examples_readme.ipynb`** — markdown-only overview of all registered datasets and feature sets: what each dataset contains, why cell counts differ between sources, and how shared feature sets work across projects. Start here if you're new to the data. diff --git a/ccc_config.yaml b/ccc_config.yaml new file mode 100644 index 0000000..6a121c9 --- /dev/null +++ b/ccc_config.yaml @@ -0,0 +1,5 @@ +# Package-wide settings for ConnectsCommonConnectivity. 
+# Discovered by walking up from cwd (pyproject.toml/ruff/pytest pattern). +# Edit this file (or set CCC_OUTPUT_ROOT) to repoint writers/readers. +output_root: scratch/em_patchseq_wnm_v2/ +dry_run: false diff --git a/code/etl_examples_readme.ipynb b/code/etl_examples_readme.ipynb index d4fd6b5..2224376 100644 --- a/code/etl_examples_readme.ipynb +++ b/code/etl_examples_readme.ipynb @@ -8,7 +8,7 @@ "\n", "A quick-reference guide to what was registered and why. Use this notebook to orient yourself before diving into a specific ETL notebook.\n", "\n", - "> **All notebooks live in `code/`.** Outputs land in `../scratch/em_patchseq_wnm_v1/`." + "> **All notebooks live in `code/`.** Outputs land in the `output_root` configured in `ccc_config.yaml` (or overridden by `CCC_OUTPUT_ROOT`). Registry-backed model tables are written with `write_models(...)` (projection rows use `write_projection_matrix(...)`)." ] }, { @@ -123,7 +123,7 @@ "| `etl_tasic_01_cluster.ipynb` | `tasic_2018_visp_taxonomy` | Tasic 2018 VISp scRNA-seq taxonomy (class → subclass → cluster) |\n", "| `etl_visp_met_types_01_cluster.ipynb` | `visp_met_types_taxonomy` | VISp MET-types (class → cluster), 45 leaves |\n", "\n", - "Both write `algorithmrun/`, `clusterhierarchy/`, `cluster/`, and `hierarchycategory/` rows. No `project_id`; rows are scoped by `hierarchy_id` so multiple taxonomies can coexist in the same Delta tables.\n" + "Both write `algorithmrun/`, `clusterhierarchy/`, `cluster/`, and `hierarchycategory/` rows. No `project_id`; `Cluster` rows are scoped by `hierarchy_id`, while the others are id-scoped in the write registry.\n" ] }, { @@ -289,7 +289,7 @@ "\n", "Source: `ProjectionMatrix_tip_and_branch_roll_up.csv`. Cell ids are the SWC filename with `.swc` stripped (matches `_01`).\n", "\n", - "Adds **+4 new cells** found in the projection CSV but not yet in `dataitem/` — the same late-addition pattern as the `_02` notebooks. 
Registered via `append_new_dataitems`.\n" + "Adds **+4 new cells** found in the projection CSV but not yet in `dataitem/` — the same late-addition pattern as the `_02` notebooks. Registered via `write_models(DataItem(...))` (append-new-by-id mode).\n" ] }, { diff --git a/code/etl_minnie_01_dataset_dataitem.ipynb b/code/etl_minnie_01_dataset_dataitem.ipynb index 81c4028..b150252 100644 --- a/code/etl_minnie_01_dataset_dataitem.ipynb +++ b/code/etl_minnie_01_dataset_dataitem.ipynb @@ -6,20 +6,13 @@ "source": [ "# ETL — Minnie65: DataSet & DataItem\n", "\n", - "Writes one `DataSet` record (`dataset_id = \"minnie65_v1412_nuclei\"`, `project_id = \"minnie65\"`) and one `DataItem` per nucleus from the CAVE `nucleus_detection_lookup_v1` view at materialization version 1412, plus the corresponding `DataItemDataSetAssociation` links. Cohort DataSets (e.g. `minnie65_v1412_csm_cluster`) and cell features are written by later notebooks." + "Writes one `DataSet` record (`dataset_id = \"minnie65_v1300_nuclei\"`, `project_id = \"minnie65\"`) and one `DataItem` per nucleus from the CAVE `nucleus_detection_lookup_v1` view at materialization version 1300, plus the corresponding `DataItemDataSetAssociation` links. Cohort DataSets (e.g. `minnie65_v1300_csm_cluster`) and cell features are written by later notebooks." 
] }, { "cell_type": "code", "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:19.297771Z", - "iopub.status.busy": "2026-04-30T23:47:19.297592Z", - "iopub.status.idle": "2026-04-30T23:47:20.707966Z", - "shell.execute_reply": "2026-04-30T23:47:20.707186Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -37,53 +30,41 @@ "import pandas as pd\n", "import polars as pl\n", "import pyarrow as pa\n", - "from deltalake import write_deltalake\n", "\n", - "from connects_common_connectivity.arrow_utils import (\n", - " build_arrow_schema,\n", - " models_to_table,\n", - " attach_linkml_metadata,\n", - ")\n", "from connects_common_connectivity.models import (\n", " DataSet,\n", " DataItem,\n", " DataItemDataSetAssociation,\n", " Modality,\n", ")\n", - "from connects_common_connectivity.write_utils import append_new_dataitems" + "from connects_common_connectivity.config import output_root\n", + "from connects_common_connectivity.io import write_models\n" ] }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:20.710164Z", - "iopub.status.busy": "2026-04-30T23:47:20.709772Z", - "iopub.status.idle": "2026-04-30T23:47:20.713941Z", - "shell.execute_reply": "2026-04-30T23:47:20.713229Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v1/\n", + "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v3/\n", "PROJECT_ID : minnie65\n", - "DATASET_ID : minnie65_v1412_nuclei\n", + "DATASET_ID : minnie65_v1300_nuclei\n", "CAVE_DATASTACK : minnie65_phase3_v1\n", - "CAVE_VERSION : 1412\n", + "CAVE_VERSION : 1300\n", "CAVE_VIEW : nucleus_detection_lookup_v1\n" ] } ], "source": [ - "OUTPUT_ROOT = \"../scratch/em_patchseq_wnm_v1/\"\n", + "OUTPUT_ROOT = output_root()\n", "PROJECT_ID = \"minnie65\"\n", - "DATASET_ID = \"minnie65_v1412_nuclei\"\n", + "DATASET_ID = \"minnie65_v1300_nuclei\"\n", 
"CAVE_DATASTACK = \"minnie65_phase3_v1\"\n", - "CAVE_VERSION = 1412\n", + "CAVE_VERSION = 1300\n", "CAVE_VIEW = \"nucleus_detection_lookup_v1\"\n", "\n", "print(f\"OUTPUT_ROOT : {OUTPUT_ROOT}\")\n", @@ -104,102 +85,24 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:20.715508Z", - "iopub.status.busy": "2026-04-30T23:47:20.715326Z", - "iopub.status.idle": "2026-04-30T23:47:24.276080Z", - "shell.execute_reply": "2026-04-30T23:47:24.275422Z" - } - }, + "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Shape: (133969, 7)\n" + "ename": "HTTPError", + "evalue": "503 Server Error: Service Temporarily Unavailable for url: https://minnie.microns-daf.com/materialize/version content:b'\\r\\n503 Service Temporarily Unavailable\\r\\n\\r\\n

503 Service Temporarily Unavailable

\\r\\n
nginx
\\r\\n\\r\\n\\r\\n'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mHTTPError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m client = caveclient.CAVEclient(CAVE_DATASTACK, auth_token=os.environ[\u001b[33m\"CUSTOM_KEY\"\u001b[39m])\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m client.materialize.version = CAVE_VERSION\n\u001b[32m 3\u001b[39m \n\u001b[32m 4\u001b[39m nuc_df = client.materialize.query_view(CAVE_VIEW)\n\u001b[32m 5\u001b[39m nuc_df = nuc_df.query(\u001b[33m\"pt_root_id != 0\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m/opt/conda/lib/python3.12/site-packages/caveclient/frameworkclient.py:633\u001b[39m, in \u001b[36mCAVEclientFull.materialize\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 628\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 629\u001b[39m \u001b[33;03mA client for the materialization service. 
See [client.materialize](../api/materialize.md)\u001b[39;00m\n\u001b[32m 630\u001b[39m \u001b[33;03mfor more information.\u001b[39;00m\n\u001b[32m 631\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 632\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m._materialize \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m633\u001b[39m \u001b[38;5;28mself\u001b[39m._materialize = \u001b[30;43mMaterializationClient\u001b[39;49m\u001b[30;43m(\u001b[39;49m\n\u001b[32m 634\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mserver_address\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mself\u001b[39;49m\u001b[30;43m.\u001b[39;49m\u001b[30;43mlocal_server\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 635\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mauth_client\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mself\u001b[39;49m\u001b[30;43m.\u001b[39;49m\u001b[30;43mauth\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 636\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mdatastack_name\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mself\u001b[39;49m\u001b[30;43m.\u001b[39;49m\u001b[30;43m_datastack_name\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 637\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43msynapse_table\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mself\u001b[39;49m\u001b[30;43m.\u001b[39;49m\u001b[30;43minfo\u001b[39;49m\u001b[30;43m.\u001b[39;49m\u001b[30;43mget_datastack_info\u001b[39;49m\u001b[30;43m(\u001b[39;49m\u001b[30;43m)\u001b[39;49m\u001b[30;43m.\u001b[39;49m\u001b[30;43mget\u001b[39;49m\u001b[30;43m(\u001b[39;49m\u001b[30;43m\"\u001b[39;49m\u001b[30;43msynapse_table\u001b[39;49m\u001b[30;43m\"\u001b[39;49m\u001b[30;43m,\u001b[39;49m\u001b[30;43m \u001b[39;49m\u001b[30;43;01mNone\u001b[39;49;00m\u001b[30;43m)\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 638\u001b[39m \u001b[30;43m 
\u001b[39;49m\u001b[30;43mmax_retries\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mself\u001b[39;49m\u001b[30;43m.\u001b[39;49m\u001b[30;43m_max_retries\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 639\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mpool_maxsize\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mself\u001b[39;49m\u001b[30;43m.\u001b[39;49m\u001b[30;43m_pool_maxsize\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 640\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mpool_block\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mself\u001b[39;49m\u001b[30;43m.\u001b[39;49m\u001b[30;43m_pool_block\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 641\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mover_client\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mself\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 642\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mdesired_resolution\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mself\u001b[39;49m\u001b[30;43m.\u001b[39;49m\u001b[30;43mdesired_resolution\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 643\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43m)\u001b[39;49m\n\u001b[32m 644\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._materialize\n", + "\u001b[36mFile \u001b[39m\u001b[32m/opt/conda/lib/python3.12/site-packages/caveclient/materializationengine.py:221\u001b[39m, in \u001b[36mMaterializationClient.__init__\u001b[39m\u001b[34m(self, server_address, datastack_name, auth_client, cg_client, synapse_table, api_version, version, verify, max_retries, pool_maxsize, pool_block, desired_resolution, over_client)\u001b[39m\n\u001b[32m 209\u001b[39m auth_header = auth_client.request_header\n\u001b[32m 210\u001b[39m endpoints, api_version = _api_endpoints(\n\u001b[32m 211\u001b[39m api_version,\n\u001b[32m 212\u001b[39m SERVER_KEY,\n\u001b[32m (...)\u001b[39m\u001b[32m 218\u001b[39m verify=verify,\n\u001b[32m 219\u001b[39m 
)\n\u001b[32m--> \u001b[39m\u001b[32m221\u001b[39m \u001b[30;43msuper\u001b[39;49m\u001b[30;43m(\u001b[39;49m\u001b[30;43mMaterializationClient\u001b[39;49m\u001b[30;43m,\u001b[39;49m\u001b[30;43m \u001b[39;49m\u001b[30;43mself\u001b[39;49m\u001b[30;43m)\u001b[39;49m\u001b[30;43m.\u001b[39;49m\u001b[30;43m__init__\u001b[39;49m\u001b[30;43m(\u001b[39;49m\n\u001b[32m 222\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mserver_address\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 223\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mauth_header\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 224\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mapi_version\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 225\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mendpoints\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 226\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mSERVER_KEY\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 227\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mverify\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mverify\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 228\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mmax_retries\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mmax_retries\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 229\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mpool_maxsize\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mpool_maxsize\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 230\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mpool_block\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mpool_block\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 231\u001b[39m \u001b[30;43m \u001b[39;49m\u001b[30;43mover_client\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mover_client\u001b[39;49m\u001b[30;43m,\u001b[39;49m\n\u001b[32m 232\u001b[39m \u001b[30;43m\u001b[39;49m\u001b[30;43m)\u001b[39;49m\n\u001b[32m 233\u001b[39m \u001b[38;5;28mself\u001b[39m._datastack_name = 
datastack_name\n\u001b[32m 234\u001b[39m \u001b[38;5;28mself\u001b[39m._version = version\n", + "\u001b[36mFile \u001b[39m\u001b[32m/opt/conda/lib/python3.12/site-packages/caveclient/base.py:217\u001b[39m, in \u001b[36mClientBase.__init__\u001b[39m\u001b[34m(self, server_address, auth_header, api_version, endpoints, server_name, verify, max_retries, pool_maxsize, pool_block, over_client)\u001b[39m\n\u001b[32m 215\u001b[39m \u001b[38;5;28mself\u001b[39m._endpoints = endpoints\n\u001b[32m 216\u001b[39m \u001b[38;5;28mself\u001b[39m._fc = over_client\n\u001b[32m--> \u001b[39m\u001b[32m217\u001b[39m \u001b[38;5;28mself\u001b[39m._server_version = \u001b[30;43mself\u001b[39;49m\u001b[30;43m.\u001b[39;49m\u001b[30;43m_get_version\u001b[39;49m\u001b[30;43m(\u001b[39;49m\u001b[30;43m)\u001b[39;49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m/opt/conda/lib/python3.12/site-packages/caveclient/base.py:246\u001b[39m, in \u001b[36mClientBase._get_version\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 244\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 245\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m246\u001b[39m version_str = \u001b[30;43mhandle_response\u001b[39;49m\u001b[30;43m(\u001b[39;49m\u001b[30;43mresponse\u001b[39;49m\u001b[30;43m,\u001b[39;49m\u001b[30;43m \u001b[39;49m\u001b[30;43mas_json\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43;01mTrue\u001b[39;49;00m\u001b[30;43m)\u001b[39;49m\n\u001b[32m 247\u001b[39m version = Version(version_str)\n\u001b[32m 248\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m version\n", + "\u001b[36mFile \u001b[39m\u001b[32m/opt/conda/lib/python3.12/site-packages/caveclient/base.py:94\u001b[39m, in \u001b[36mhandle_response\u001b[39m\u001b[34m(response, as_json, log_warning)\u001b[39m\n\u001b[32m 92\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"Deal with potential errors in endpoint response and return json for default 
case\"\"\"\u001b[39;00m\n\u001b[32m 93\u001b[39m \u001b[38;5;66;03m# NOTE: consider adding \"None on 404\" as an option?\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m94\u001b[39m \u001b[30;43m_raise_for_status\u001b[39;49m\u001b[30;43m(\u001b[39;49m\u001b[30;43mresponse\u001b[39;49m\u001b[30;43m,\u001b[39;49m\u001b[30;43m \u001b[39;49m\u001b[30;43mlog_warning\u001b[39;49m\u001b[30;43m=\u001b[39;49m\u001b[30;43mlog_warning\u001b[39;49m\u001b[30;43m)\u001b[39;49m\n\u001b[32m 95\u001b[39m _check_authorization_redirect(response)\n\u001b[32m 96\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m as_json:\n", + "\u001b[36mFile \u001b[39m\u001b[32m/opt/conda/lib/python3.12/site-packages/caveclient/base.py:84\u001b[39m, in \u001b[36m_raise_for_status\u001b[39m\u001b[34m(r, log_warning)\u001b[39m\n\u001b[32m 76\u001b[39m http_error_msg = \u001b[33m\"\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m Server Error: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m for url: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m content:\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m % (\n\u001b[32m 77\u001b[39m r.status_code,\n\u001b[32m 78\u001b[39m reason,\n\u001b[32m 79\u001b[39m r.url,\n\u001b[32m 80\u001b[39m r.content,\n\u001b[32m 81\u001b[39m )\n\u001b[32m 83\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m http_error_msg:\n\u001b[32m---> \u001b[39m\u001b[32m84\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m requests.HTTPError(http_error_msg, response=r)\n\u001b[32m 85\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m log_warning:\n\u001b[32m 86\u001b[39m warning = r.headers.get(\u001b[33m\"\u001b[39m\u001b[33mWarning\u001b[39m\u001b[33m\"\u001b[39m)\n", + "\u001b[31mHTTPError\u001b[39m: 503 Server Error: Service Temporarily Unavailable for url: https://minnie.microns-daf.com/materialize/version content:b'\\r\\n503 Service Temporarily Unavailable\\r\\n\\r\\n

503 Service Temporarily Unavailable

\\r\\n
nginx
\\r\\n\\r\\n\\r\\n'" ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idvolumept_root_idorig_root_idpt_supervoxel_idpt_positionpt_position_lookup
1373879229.04504386469113609013560786469113609013560796218056992431305[228816, 239776, 19593][228816, 239776, 19593]
320185893.75383686469113537389367886469113537389367884955554103121097[146848, 213600, 26267][146848, 213600, 26267]
4600774135.1897918646911356823787440111493022281121981[339120, 276112, 19442][339520, 276480, 19506]
\n", - "
" - ], - "text/plain": [ - " id volume pt_root_id orig_root_id \\\n", - "1 373879 229.045043 864691136090135607 864691136090135607 \n", - "3 201858 93.753836 864691135373893678 864691135373893678 \n", - "4 600774 135.189791 864691135682378744 0 \n", - "\n", - " pt_supervoxel_id pt_position pt_position_lookup \n", - "1 96218056992431305 [228816, 239776, 19593] [228816, 239776, 19593] \n", - "3 84955554103121097 [146848, 213600, 26267] [146848, 213600, 26267] \n", - "4 111493022281121981 [339120, 276112, 19442] [339520, 276480, 19506] " - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ @@ -222,61 +125,25 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:24.278066Z", - "iopub.status.busy": "2026-04-30T23:47:24.277862Z", - "iopub.status.idle": "2026-04-30T23:47:24.364744Z", - "shell.execute_reply": "2026-04-30T23:47:24.364040Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "DataSet written: (1, 5)\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "dataset = DataSet(\n", " id=DATASET_ID,\n", - " name=\"Minnie65 v1412 nucleus catalog\",\n", + " name=\"Minnie65 v1300 nucleus catalog\",\n", " publication=\"doi.org/10.1038/s41586-025-08778-6\",\n", " modality=Modality.ELECTRON_MICROSCOPY.value,\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_ds = build_arrow_schema(DataSet)\n", - "table_ds = models_to_table([dataset], schema=schema_ds)\n", - "table_ds = attach_linkml_metadata(table_ds, linkml_class=\"DataSet\")\n", - "\n", - "# mode='overwrite' makes re-runs idempotent instead of appending duplicates.\n", - "# predicate scopes the overwrite to this project only — other projects' rows\n", - "# in the shared Delta table are left untouched.\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"dataset/\",\n", - " table_ds,\n", - " mode=\"overwrite\",\n", - " 
predicate=f\"project_id = '{PROJECT_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"DataSet written:\", table_ds.shape)" + "result = write_models([dataset], output_root=OUTPUT_ROOT)\n", + "print(f\"DataSet written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:24.366592Z", - "iopub.status.busy": "2026-04-30T23:47:24.366391Z", - "iopub.status.idle": "2026-04-30T23:47:24.398220Z", - "shell.execute_reply": "2026-04-30T23:47:24.397292Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -289,7 +156,7 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ str ┆ str ┆ str │\n", "╞══════════════════════╪═════════════════╪══════════════════════╪═════════════════════╪════════════╡\n", - "│ minnie65_v1412_nucle ┆ Minnie65 v1412 ┆ doi.org/10.1038/s415 ┆ ELECTRON_MICROSCOPY ┆ minnie65 │\n", + "│ minnie65_v1300_nucle ┆ Minnie65 v1300 ┆ doi.org/10.1038/s415 ┆ ELECTRON_MICROSCOPY ┆ minnie65 │\n", "│ i ┆ nucleus catalog ┆ 86-025-087… ┆ ┆ │\n", "└──────────────────────┴─────────────────┴──────────────────────┴─────────────────────┴────────────┘\n" ] @@ -299,7 +166,7 @@ "# Verification\n", "ds_verify = (\n", " pl.read_delta(OUTPUT_ROOT + \"dataset/\")\n", - " .filter(pl.col(\"project_id\") == PROJECT_ID)\n", + " .filter((pl.col(\"project_id\") == PROJECT_ID) & (pl.col(\"id\") == DATASET_ID))\n", " .filter(pl.col(\"id\") == DATASET_ID)\n", ")\n", "print(ds_verify.shape)\n", @@ -318,20 +185,13 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:24.400176Z", - "iopub.status.busy": "2026-04-30T23:47:24.399893Z", - "iopub.status.idle": "2026-04-30T23:47:26.643935Z", - "shell.execute_reply": "2026-04-30T23:47:26.643138Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DataItem rows appended: 0 (total in batch: 133969)\n" + 
"DataItem rows appended: 133969 (total in batch: 133969)\n" ] } ], @@ -340,29 +200,14 @@ " DataItem(id=str(row.id), name=str(row.pt_root_id), project_id=PROJECT_ID)\n", " for row in nuc_df.itertuples()\n", "]\n", - "\n", - "schema_di = build_arrow_schema(DataItem)\n", - "table_di = models_to_table(dataitems, schema=schema_di)\n", - "table_di = attach_linkml_metadata(table_di, linkml_class=\"DataItem\")\n", - "\n", - "# append_new_dataitems checks which ids already exist for this project and appends\n", - "# only new rows — safe when multiple _01 notebooks share a project_id, since\n", - "# each dataset's cells are registered without wiping the other's rows.\n", - "n_appended = append_new_dataitems(OUTPUT_ROOT + \"dataitem/\", table_di, project_id=PROJECT_ID)\n", + "n_appended = write_models(dataitems, output_root=OUTPUT_ROOT).rows_written\n", "print(f\"DataItem rows appended: {n_appended} (total in batch: {len(dataitems)})\")" ] }, { "cell_type": "code", "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:26.645628Z", - "iopub.status.busy": "2026-04-30T23:47:26.645427Z", - "iopub.status.idle": "2026-04-30T23:47:26.721724Z", - "shell.execute_reply": "2026-04-30T23:47:26.720975Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -408,20 +253,13 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:26.723482Z", - "iopub.status.busy": "2026-04-30T23:47:26.723266Z", - "iopub.status.idle": "2026-04-30T23:47:28.450853Z", - "shell.execute_reply": "2026-04-30T23:47:28.450093Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DataItemDataSetAssociation written: (133969, 3)\n" + "DataItemDataSetAssociation written: 133969 rows\n" ] } ], @@ -434,35 +272,14 @@ " )\n", " for item in dataitems\n", "]\n", - "\n", - "schema_assoc = build_arrow_schema(DataItemDataSetAssociation)\n", - "table_assoc = 
models_to_table(associations, schema=schema_assoc)\n", - "table_assoc = attach_linkml_metadata(table_assoc, linkml_class=\"DataItemDataSetAssociation\")\n", - "\n", - "# mode='overwrite' makes re-runs idempotent instead of appending duplicates.\n", - "# predicate scopes the overwrite to this project only — other projects' rows\n", - "# in the shared Delta table are left untouched.\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"dataitem_dataset_association/\",\n", - " table_assoc,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND dataset_id = '{DATASET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"DataItemDataSetAssociation written:\", table_assoc.shape)" + "result = write_models(associations, output_root=OUTPUT_ROOT)\n", + "print(f\"DataItemDataSetAssociation written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:28.452562Z", - "iopub.status.busy": "2026-04-30T23:47:28.452363Z", - "iopub.status.idle": "2026-04-30T23:47:28.497155Z", - "shell.execute_reply": "2026-04-30T23:47:28.496377Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -475,11 +292,11 @@ "│ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ str │\n", "╞═════════════╪═══════════════════════╪════════════╡\n", - "│ 373879 ┆ minnie65_v1412_nuclei ┆ minnie65 │\n", - "│ 201858 ┆ minnie65_v1412_nuclei ┆ minnie65 │\n", - "│ 600774 ┆ minnie65_v1412_nuclei ┆ minnie65 │\n", - "│ 408486 ┆ minnie65_v1412_nuclei ┆ minnie65 │\n", - "│ 598774 ┆ minnie65_v1412_nuclei ┆ minnie65 │\n", + "│ 373879 ┆ minnie65_v1300_nuclei ┆ minnie65 │\n", + "│ 201858 ┆ minnie65_v1300_nuclei ┆ minnie65 │\n", + "│ 600774 ┆ minnie65_v1300_nuclei ┆ minnie65 │\n", + "│ 408486 ┆ minnie65_v1300_nuclei ┆ minnie65 │\n", + "│ 598774 ┆ minnie65_v1300_nuclei ┆ minnie65 │\n", "└─────────────┴───────────────────────┴────────────┘\n" ] } @@ -512,7 +329,7 @@ "| 
`dataitem_dataset_association/` | `DataItemDataSetAssociation` | `len(nuc_df)` |\n", "\n", "**Intentionally not written here:**\n", - "- Cohort DataSets (e.g. `minnie65_v1412_csm_cluster`) — each cohort is an additional `DataSet` row plus `DataItemDataSetAssociation` rows pointing at the same `DataItem` ids; written by `_02`/`_03` notebooks.\n", + "- Cohort DataSets (e.g. `minnie65_v1300_csm_cluster`) — each cohort is an additional `DataSet` row plus `DataItemDataSetAssociation` rows pointing at the same `DataItem` ids; written by `_02`/`_03` notebooks.\n", "- Cell features (`pt_position`, cell type labels, etc.) — written in `_02` as `CellFeature` records." ] }, diff --git a/code/etl_minnie_02_cell_features.ipynb b/code/etl_minnie_02_cell_features.ipynb index 08ae4fc..dd20d19 100644 --- a/code/etl_minnie_02_cell_features.ipynb +++ b/code/etl_minnie_02_cell_features.ipynb @@ -4,22 +4,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# ETL — Minnie65: Cell Features\n", + "# ETL \u2014 Minnie65: Cell Features\n", "\n", - "Writes the CSM dendrite-ultrastructure cohort `DataSet` (`minnie65_v1412_csm_cluster`), its `DataItemDataSetAssociation` links, `CellFeatureDefinition` rows, `CellFeatureSet` rows, wide-form feature parquet tables, and `CellFeatureMatrix` pointer rows for two feature sets. Each feature-set section is independently idempotent. Prerequisite: `etl_minnie_01_dataset_dataitem.ipynb`." + "Writes the CSM dendrite-ultrastructure cohort `DataSet` (`minnie65_v1300_csm_cluster`), its `DataItemDataSetAssociation` links, `CellFeatureDefinition` rows, `CellFeatureSet` rows, wide-form feature parquet tables, and `CellFeatureMatrix` pointer rows for two feature sets. Each feature-set section is independently idempotent. Prerequisite: `etl_minnie_01_dataset_dataitem.ipynb`." 
] }, { "cell_type": "code", "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:37.269601Z", - "iopub.status.busy": "2026-04-30T23:47:37.269413Z", - "iopub.status.idle": "2026-04-30T23:47:39.329482Z", - "shell.execute_reply": "2026-04-30T23:47:39.328688Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -42,12 +35,6 @@ "import standard_transform\n", "from deltalake import write_deltalake\n", "\n", - "from connects_common_connectivity.arrow_utils import (\n", - " attach_linkml_metadata,\n", - " build_arrow_schema,\n", - " build_cell_feature_matrix_schema,\n", - " models_to_table,\n", - ")\n", "from connects_common_connectivity.models import (\n", " CellFeatureDefinition,\n", " CellFeatureMatrix,\n", @@ -56,28 +43,24 @@ " DataSet,\n", " Modality,\n", " Unit,\n", - ")" + ")\n", + "from connects_common_connectivity.io.arrow_utils import build_cell_feature_matrix_schema\n", + "from connects_common_connectivity.config import output_root\n", + "from connects_common_connectivity.io import write_models\n" ] }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:39.331475Z", - "iopub.status.busy": "2026-04-30T23:47:39.331078Z", - "iopub.status.idle": "2026-04-30T23:47:39.335310Z", - "shell.execute_reply": "2026-04-30T23:47:39.334773Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v1/\n", + "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v2/\n", "PROJECT_ID : minnie65\n", - "COHORT_DATASET_ID : minnie65_v1412_csm_cluster\n", + "COHORT_DATASET_ID : minnie65_v1300_csm_cluster\n", "FSI_CSM : csm_cluster_features\n", "FSI_STD : minnie65_std_transform_coordinates\n" ] @@ -86,13 +69,13 @@ "source": [ "FEATURES_PARQUET = \"/data/minnie1412/minnie_features.parquet\"\n", "FEATURES_CSV = \"/data/minnie1412/minnie_cell_features.csv\"\n", - "OUTPUT_ROOT = 
\"../scratch/em_patchseq_wnm_v1/\"\n", + "OUTPUT_ROOT = output_root()\n", "PROJECT_ID = \"minnie65\"\n", - "COHORT_DATASET_ID = \"minnie65_v1412_csm_cluster\"\n", + "COHORT_DATASET_ID = \"minnie65_v1300_csm_cluster\"\n", "FSI_CSM = \"csm_cluster_features\"\n", "FSI_STD = \"minnie65_std_transform_coordinates\"\n", "CAVE_DATASTACK = \"minnie65_phase3_v1\"\n", - "CAVE_VERSION = 1412\n", + "CAVE_VERSION = 1300\n", "CAVE_VIEW = \"nucleus_detection_lookup_v1\"\n", "\n", "print(f\"OUTPUT_ROOT : {OUTPUT_ROOT}\")\n", @@ -112,29 +95,22 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:39.337347Z", - "iopub.status.busy": "2026-04-30T23:47:39.337158Z", - "iopub.status.idle": "2026-04-30T23:47:39.378022Z", - "shell.execute_reply": "2026-04-30T23:47:39.377250Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Prerequisite OK: minnie65_v1412_nuclei\n" + "Prerequisite OK: minnie65_v1300_nuclei\n" ] } ], "source": [ "prereq = (\n", " pl.read_delta(OUTPUT_ROOT + \"dataset/\")\n", - " .filter(pl.col(\"id\") == \"minnie65_v1412_nuclei\")\n", + " .filter(pl.col(\"id\") == \"minnie65_v1300_nuclei\")\n", ")\n", - "assert prereq.shape[0] == 1, \"etl_minnie_01 must be run first — minnie65_v1412_nuclei DataSet not found\"\n", + "assert prereq.shape[0] == 1, \"etl_minnie_01 must be run first \u2014 minnie65_v1300_nuclei DataSet not found\"\n", "print(\"Prerequisite OK:\", prereq[\"id\"][0])" ] }, @@ -148,20 +124,13 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:39.380017Z", - "iopub.status.busy": "2026-04-30T23:47:39.379810Z", - "iopub.status.idle": "2026-04-30T23:47:39.518990Z", - "shell.execute_reply": "2026-04-30T23:47:39.518178Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Dropped 4 duplicate id row(s): 35787 → 35783\n", + "Dropped 4 
duplicate id row(s): 35787 \u2192 35783\n", "Features parquet shape: (35783, 112)\n", "Feature metadata CSV shape: (82, 6)\n" ] @@ -285,7 +254,7 @@ " \n", " \n", "\n", - "

3 rows × 112 columns

\n", + "

3 rows \u00d7 112 columns

\n", "" ], "text/plain": [ @@ -415,7 +384,7 @@ "# table cell-indexed (one row per nucleus id).\n", "n_before = len(feat_df)\n", "feat_df = feat_df.drop_duplicates(subset=\"id\", keep=\"first\")\n", - "print(f\"Dropped {n_before - len(feat_df)} duplicate id row(s): {n_before} → {len(feat_df)}\")\n", + "print(f\"Dropped {n_before - len(feat_df)} duplicate id row(s): {n_before} \u2192 {len(feat_df)}\")\n", "\n", "print(\"Features parquet shape:\", feat_df.shape)\n", "print(\"Feature metadata CSV shape:\", feat_meta.shape)\n", @@ -432,66 +401,37 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:39.520747Z", - "iopub.status.busy": "2026-04-30T23:47:39.520544Z", - "iopub.status.idle": "2026-04-30T23:47:40.175516Z", - "shell.execute_reply": "2026-04-30T23:47:40.174643Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "DataSet written: (1, 5)\n", - "DataItemDataSetAssociation written: (35783, 3)\n" - ] - } - ], + "execution_count": 12, + "metadata": {}, + "outputs": [], "source": [ "cohort_ds = DataSet(\n", " id=COHORT_DATASET_ID,\n", - " name=\"Minnie65 v1412 CSM dendrite ultrastructure cohort\",\n", + " name=\"Minnie65 v1300 CSM dendrite ultrastructure cohort\",\n", " modality=Modality.ELECTRON_MICROSCOPY.value,\n", " project_id=PROJECT_ID,\n", ")\n", + "result = write_models([cohort_ds], output_root=OUTPUT_ROOT)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ "cell_ids = feat_df[\"id\"].astype(str).tolist()\n", "associations = [\n", " DataItemDataSetAssociation(dataitem_id=cid, dataset_id=COHORT_DATASET_ID, project_id=PROJECT_ID)\n", " for cid in cell_ids\n", "]\n", - "\n", - "schema_ds = build_arrow_schema(DataSet)\n", - "table_ds = attach_linkml_metadata(models_to_table([cohort_ds], schema=schema_ds), linkml_class=\"DataSet\")\n", - "schema_assoc = 
build_arrow_schema(DataItemDataSetAssociation)\n", - "table_assoc = attach_linkml_metadata(models_to_table(associations, schema=schema_assoc), linkml_class=\"DataItemDataSetAssociation\")\n", - "\n", - "# Predicate must include id=COHORT_DATASET_ID, not just project_id. The dataset/ table\n", - "# is shared across all notebooks for this project — a predicate of project_id='minnie65'\n", - "# alone would wipe the nucleus catalog row (minnie65_v1412_nuclei) written by\n", - "# etl_minnie_01, forcing that notebook to be rerun before this one works again.\n", - "write_deltalake(OUTPUT_ROOT + \"dataset/\", table_ds,\n", - " mode=\"overwrite\", predicate=f\"project_id = '{PROJECT_ID}' AND id = '{COHORT_DATASET_ID}'\", partition_by=[\"project_id\"])\n", - "write_deltalake(OUTPUT_ROOT + \"dataitem_dataset_association/\", table_assoc,\n", - " mode=\"overwrite\", predicate=f\"project_id = '{PROJECT_ID}'\", partition_by=[\"project_id\"])\n", - "print(\"DataSet written:\", table_ds.shape)\n", - "print(\"DataItemDataSetAssociation written:\", table_assoc.shape)" + "result = write_models(associations, output_root=OUTPUT_ROOT)" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:40.177331Z", - "iopub.status.busy": "2026-04-30T23:47:40.177117Z", - "iopub.status.idle": "2026-04-30T23:47:40.222700Z", - "shell.execute_reply": "2026-04-30T23:47:40.221892Z" - } - }, + "execution_count": 14, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -499,14 +439,14 @@ "text": [ "DataSet: (1, 5)\n", "shape: (1, 5)\n", - "┌────────────────────────────┬────────────────────┬─────────────┬─────────────────────┬────────────┐\n", - "│ id ┆ name ┆ publication ┆ modality ┆ project_id │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str ┆ str │\n", - "╞════════════════════════════╪════════════════════╪═════════════╪═════════════════════╪════════════╡\n", - "│ minnie65_v1412_csm_cluster ┆ Minnie65 v1412 CSM ┆ 
null ┆ ELECTRON_MICROSCOPY ┆ minnie65 │\n", - "│ ┆ dendrite ul… ┆ ┆ ┆ │\n", - "└────────────────────────────┴────────────────────┴─────────────┴─────────────────────┴────────────┘\n", + "\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n", + "\u2502 id \u2506 name \u2506 publication \u2506 modality \u2506 project_id \u2502\n", + "\u2502 --- \u2506 --- \u2506 --- \u2506 --- \u2506 --- \u2502\n", + "\u2502 str \u2506 str \u2506 str \u2506 str \u2506 str \u2502\n", + "\u255e\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2561\n", + "\u2502 minnie65_v1300_csm_cluster \u2506 Minnie65 v1300 CSM \u2506 null \u2506 ELECTRON_MICROSCOPY \u2506 minnie65 \u2502\n", + "\u2502 \u2506 dendrite ul\u2026 \u2506 \u2506 \u2506 \u2502\n", + 
"\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n", "Associations: (35783, 3)\n" ] } @@ -534,21 +474,14 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:40.224480Z", - "iopub.status.busy": "2026-04-30T23:47:40.224274Z", - "iopub.status.idle": "2026-04-30T23:47:40.318717Z", - "shell.execute_reply": "2026-04-30T23:47:40.318057Z" - } - }, + "execution_count": 15, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CSM CellFeatureDefinition written: (82, 8)\n" + "CellFeatureDefinition written: 82 rows\n" ] } ], @@ -569,32 +502,14 @@ " if pd.notna(row[\"range_max\"]):\n", " kwargs[\"range_max\"] = float(row[\"range_max\"])\n", " csm_fds.append(CellFeatureDefinition(**kwargs))\n", - "\n", - "schema_cfd = build_arrow_schema(CellFeatureDefinition)\n", - "table_cfd_csm = attach_linkml_metadata(\n", - " models_to_table(csm_fds, schema=schema_cfd), linkml_class=\"CellFeatureDefinition\"\n", - ")\n", - "# Predicate scopes this overwrite to the CSM feature set only — STD definitions are untouched.\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"cellfeaturedefinition/\", table_cfd_csm,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND feature_set_id = '{FSI_CSM}'\",\n", - " partition_by=[\"project_id\", \"feature_set_id\"],\n", - ")\n", - "print(\"CSM CellFeatureDefinition written:\", 
table_cfd_csm.shape)" + "result = write_models(csm_fds, output_root=OUTPUT_ROOT)\n", + "print(f\"CellFeatureDefinition written: {result.rows_written} rows\")" ] }, { "cell_type": "code", - "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:40.320667Z", - "iopub.status.busy": "2026-04-30T23:47:40.320468Z", - "iopub.status.idle": "2026-04-30T23:47:40.360176Z", - "shell.execute_reply": "2026-04-30T23:47:40.359367Z" - } - }, + "execution_count": 16, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -602,23 +517,23 @@ "text": [ "(82, 8)\n", "shape: (3, 8)\n", - "┌────────────┬────────────┬────────────┬───────────┬───────────┬───────────┬───────────┬───────────┐\n", - "│ id ┆ descriptio ┆ unit ┆ data_type ┆ range_min ┆ range_max ┆ project_i ┆ feature_s │\n", - "│ --- ┆ n ┆ --- ┆ --- ┆ --- ┆ --- ┆ d ┆ et_id │\n", - "│ str ┆ --- ┆ str ┆ str ┆ f64 ┆ f64 ┆ --- ┆ --- │\n", - "│ ┆ str ┆ ┆ ┆ ┆ ┆ str ┆ str │\n", - "╞════════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡\n", - "│ nucleus_vo ┆ Nucleus ┆ MICRONS_CU ┆ \n", + "shape: (3, 9)
pre_pt_root_idpost_pt_root_idn_synsum_sizepre_nuc_idpost_nuc_id__index_level_0__pre_nuc_id_strpost_nuc_id_str
i64i64i64i64i64i64i64strstr
8646911351368998658646911348847431621598433717530404343176584"337175""304043"
8646911353603462008646911348847567301592033016733914243181492"330167""339142"
8646911353736017368646911348847567301385227359533914243181499"273595""339142"
" + ], + "text/plain": [ + "shape: (3, 9)\n", + "\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n", + "\u2502 pre_pt_roo \u2506 post_pt_ro \u2506 n_syn \u2506 sum_size \u2506 \u2026 \u2506 post_nuc_i \u2506 __index_le \u2506 pre_nuc_i \u2506 post_nuc_ \u2502\n", + "\u2502 t_id \u2506 ot_id \u2506 --- \u2506 --- \u2506 \u2506 d \u2506 vel_0__ \u2506 d_str \u2506 id_str \u2502\n", + "\u2502 --- \u2506 --- \u2506 i64 \u2506 i64 \u2506 \u2506 --- \u2506 --- \u2506 --- \u2506 --- \u2502\n", + "\u2502 i64 \u2506 i64 \u2506 \u2506 \u2506 \u2506 i64 \u2506 i64 \u2506 str \u2506 str \u2502\n", + "\u255e\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2561\n", + "\u2502 8646911351 \u2506 8646911348 \u2506 1 \u2506 5984 \u2506 \u2026 \u2506 304043 \u2506 43176584 \u2506 337175 \u2506 304043 \u2502\n", + "\u2502 36899865 \u2506 84743162 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2502\n", + 
"\u2502 8646911353 \u2506 8646911348 \u2506 1 \u2506 5920 \u2506 \u2026 \u2506 339142 \u2506 43181492 \u2506 330167 \u2506 339142 \u2502\n", + "\u2502 60346200 \u2506 84756730 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2502\n", + "\u2502 8646911353 \u2506 8646911348 \u2506 1 \u2506 3852 \u2506 \u2026 \u2506 339142 \u2506 43181499 \u2506 273595 \u2506 339142 \u2502\n", + "\u2502 73601736 \u2506 84756730 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2502\n", + "\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "conn_df = pl.read_parquet(PARQUET_PATH)\n", "print(f\"Raw parquet rows: {conn_df.shape[0]}\")\n", @@ -314,31 +439,47 @@ "metadata": {}, "source": [ "---\n", - "## Example 1 — (proofread ∩ CSM)-pre × CSM-post\n", + "## Example 1 \u2014 (proofread \u2229 CSM)-pre \u00d7 CSM-post\n", "\n", - "Filter to pre-synaptic cells in the proofread ∩ CSM set and post-synaptic cells in the full CSM set. Write two `CellCellConnectivityLong` measurement types per pair (`SYNAPSE_COUNT`, `SUM_ANATOMICAL_SIZE`)." + "Filter to pre-synaptic cells in the proofread \u2229 CSM set and post-synaptic cells in the full CSM set. Write two `CellCellConnectivityLong` measurement types per pair (`SYNAPSE_COUNT`, `SUM_ANATOMICAL_SIZE`)." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "6c002b88", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Example 1 filtered rows ((proofread \u2229 CSM)-pre \u00d7 CSM-post): 583750\n" + ] + } + ], "source": [ "conn_ex1 = conn_df.filter(\n", " pl.col(\"pre_nuc_id_str\").is_in(proofread_nuc_ids)\n", " & pl.col(\"post_nuc_id_str\").is_in(csm_nuc_ids)\n", ")\n", - "print(f\"Example 1 filtered rows ((proofread ∩ CSM)-pre × CSM-post): {conn_ex1.shape[0]}\")" + "print(f\"Example 1 filtered rows ((proofread \u2229 CSM)-pre \u00d7 CSM-post): {conn_ex1.shape[0]}\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "91251dcc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All 34313 unique cell ids confirmed in dataitem/.\n" + ] + } + ], "source": [ "pre_ids_ex1 = set(conn_ex1[\"pre_nuc_id_str\"].to_list())\n", "post_ids_ex1 = set(conn_ex1[\"post_nuc_id_str\"].to_list())\n", @@ -351,10 +492,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "ebf80af9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CellCellConnectivityLong rows (Example 1): 1167500\n" + ] + } + ], "source": [ "rows_ex1 = []\n", "for row in conn_ex1.iter_rows(named=True):\n", @@ -390,10 +539,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "e0ef2071", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Written to cellcellconnectivitylong_proofread_pre_to_csm_post/: 1167500 rows\n" + ] + } + ], "source": [ "schema_cc = build_arrow_schema(CellCellConnectivityLong)\n", "table_ex1 = attach_linkml_metadata(\n", @@ -413,10 +570,44 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 29, "id": "a494d0fe", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape: (1167500, 9)\n", + "shape: (2, 2)\n", + "\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n", + "\u2502 measurement_type \u2506 len \u2502\n", + "\u2502 --- \u2506 --- \u2502\n", + "\u2502 str \u2506 u32 \u2502\n", + "\u255e\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2561\n", + "\u2502 SYNAPSE_COUNT \u2506 583750 \u2502\n", + "\u2502 SUM_ANATOMICAL_SIZE \u2506 583750 \u2502\n", + "\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n", + "shape: (3, 9)\n", + "\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n", + "\u2502 id \u2506 descriptio \u2506 presynapti \u2506 postsynapt \u2506 \u2026 \u2506 value \u2506 unit \u2506 project_id \u2506 measuremen \u2502\n", + "\u2502 --- \u2506 n \u2506 c_cell \u2506 ic_cell \u2506 \u2506 --- \u2506 --- \u2506 --- \u2506 t_type \u2502\n", + "\u2502 str \u2506 --- \u2506 --- \u2506 --- \u2506 \u2506 f64 \u2506 str 
\u2506 str \u2506 --- \u2502\n", + "\u2502 \u2506 str \u2506 str \u2506 str \u2506 \u2506 \u2506 \u2506 \u2506 str \u2502\n", + "\u255e\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2561\n", + "\u2502 301245_3614 \u2506 null \u2506 301245 \u2506 361468 \u2506 \u2026 \u2506 1.0 \u2506 COUNT \u2506 minnie65 \u2506 SYNAPSE_CO \u2502\n", + "\u2502 68_SYNAPSE_ \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 UNT \u2502\n", + "\u2502 COUNT \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2502\n", + "\u2502 262722_2564 \u2506 null \u2506 262722 \u2506 256466 \u2506 \u2026 \u2506 1.0 \u2506 COUNT \u2506 minnie65 \u2506 SYNAPSE_CO \u2502\n", + "\u2502 66_SYNAPSE_ \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 UNT \u2502\n", + "\u2502 COUNT \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2502\n", + "\u2502 296668_2285 \u2506 null \u2506 296668 \u2506 228553 \u2506 \u2026 \u2506 1.0 \u2506 COUNT \u2506 minnie65 \u2506 SYNAPSE_CO \u2502\n", + "\u2502 53_SYNAPSE_ \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 UNT \u2502\n", + "\u2502 COUNT \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2502\n", + 
"\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n" + ] + } + ], "source": [ "ex1_v = pl.read_delta(OUTPUT_ROOT + \"cellcellconnectivitylong_proofread_pre_to_csm_post/\").filter(\n", " pl.col(\"project_id\") == PROJECT_ID\n", @@ -434,14 +625,14 @@ "metadata": {}, "source": [ "---\n", - "## Example 2 — (proofread ∩ CSM)-pre × (proofread ∩ CSM)-post\n", + "## Example 2 \u2014 (proofread \u2229 CSM)-pre \u00d7 (proofread \u2229 CSM)-post\n", "\n", - "Filter the same parquet to pairs where both pre and post are in the proofread ∩ CSM set. Write `SYNAPSE_COUNT` only." + "Filter the same parquet to pairs where both pre and post are in the proofread \u2229 CSM set. Write `SYNAPSE_COUNT` only." 
] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 30, "id": "34be2c33", "metadata": {}, "outputs": [ @@ -449,7 +640,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Example 2 filtered rows ((proofread ∩ CSM) × (proofread ∩ CSM)): 96788\n" + "Example 2 filtered rows ((proofread \u2229 CSM) \u00d7 (proofread \u2229 CSM)): 83877\n" ] } ], @@ -458,12 +649,12 @@ " pl.col(\"pre_nuc_id_str\").is_in(proofread_nuc_ids)\n", " & pl.col(\"post_nuc_id_str\").is_in(proofread_nuc_ids)\n", ")\n", - "print(f\"Example 2 filtered rows ((proofread ∩ CSM) × (proofread ∩ CSM)): {conn_ex2.shape[0]}\")" + "print(f\"Example 2 filtered rows ((proofread \u2229 CSM) \u00d7 (proofread \u2229 CSM)): {conn_ex2.shape[0]}\")" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 31, "id": "b0aa6ab0", "metadata": {}, "outputs": [ @@ -471,7 +662,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "All 1861 unique cell ids confirmed in dataitem/.\n" + "All 1636 unique cell ids confirmed in dataitem/.\n" ] } ], @@ -487,7 +678,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 32, "id": "6e4a2ee8", "metadata": {}, "outputs": [ @@ -495,7 +686,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "CellCellConnectivityLong rows (Example 2): 96788\n" + "CellCellConnectivityLong rows (Example 2): 83877\n" ] } ], @@ -522,7 +713,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 33, "id": "65ffa4e9", "metadata": {}, "outputs": [ @@ -530,7 +721,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Written to cellcellconnectivitylong_proofread_to_proofread/: 96788 rows\n" + "Written to cellcellconnectivitylong_proofread_to_proofread/: 83877 rows\n" ] } ], @@ -552,7 +743,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 34, "id": "e40d8180", "metadata": {}, "outputs": [ @@ -560,32 +751,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "Shape: 
(96788, 9)\n", + "Shape: (83877, 9)\n", "shape: (1, 2)\n", - "┌──────────────────┬───────┐\n", - "│ measurement_type ┆ len │\n", - "│ --- ┆ --- │\n", - "│ str ┆ u32 │\n", - "╞══════════════════╪═══════╡\n", - "│ SYNAPSE_COUNT ┆ 96788 │\n", - "└──────────────────┴───────┘\n", + "\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n", + "\u2502 measurement_type \u2506 len \u2502\n", + "\u2502 --- \u2506 --- \u2502\n", + "\u2502 str \u2506 u32 \u2502\n", + "\u255e\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2561\n", + "\u2502 SYNAPSE_COUNT \u2506 83877 \u2502\n", + "\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n", "shape: (3, 9)\n", - "┌─────────────┬────────────┬────────────┬────────────┬───┬───────┬───────┬────────────┬────────────┐\n", - "│ id ┆ descriptio ┆ presynapti ┆ postsynapt ┆ … ┆ value ┆ unit ┆ project_id ┆ measuremen │\n", - "│ --- ┆ n ┆ c_cell ┆ ic_cell ┆ ┆ --- ┆ --- ┆ --- ┆ t_type │\n", - "│ str ┆ --- ┆ --- ┆ --- ┆ ┆ f64 ┆ str ┆ str ┆ --- │\n", - "│ ┆ str ┆ str ┆ str ┆ ┆ ┆ ┆ ┆ str │\n", - "╞═════════════╪════════════╪════════════╪════════════╪═══╪═══════╪═══════╪════════════╪════════════╡\n", - "│ 226128_5188 ┆ null ┆ 226128 ┆ 518848 ┆ … ┆ 1.0 ┆ COUNT ┆ minnie65 ┆ SYNAPSE_CO │\n", - "│ 48_SYNAPSE_ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ UNT │\n", - "│ COUNT ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 394967_5188 ┆ null ┆ 394967 ┆ 518848 ┆ … ┆ 1.0 ┆ COUNT ┆ minnie65 ┆ SYNAPSE_CO │\n", - "│ 48_SYNAPSE_ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ UNT │\n", - "│ COUNT ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - "│ 292685_5188 ┆ null ┆ 292685 ┆ 518848 ┆ … ┆ 1.0 ┆ COUNT ┆ minnie65 ┆ SYNAPSE_CO │\n", - "│ 48_SYNAPSE_ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ UNT │\n", - "│ COUNT ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", - 
"└─────────────┴────────────┴────────────┴────────────┴───┴───────┴───────┴────────────┴────────────┘\n" + "\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n", + "\u2502 id \u2506 descriptio \u2506 presynapti \u2506 postsynapt \u2506 \u2026 \u2506 value \u2506 unit \u2506 project_id \u2506 measuremen \u2502\n", + "\u2502 --- \u2506 n \u2506 c_cell \u2506 ic_cell \u2506 \u2506 --- \u2506 --- \u2506 --- \u2506 t_type \u2502\n", + "\u2502 str \u2506 --- \u2506 --- \u2506 --- \u2506 \u2506 f64 \u2506 str \u2506 str \u2506 --- \u2502\n", + "\u2502 \u2506 str \u2506 str \u2506 str \u2506 \u2506 \u2506 \u2506 \u2506 str \u2502\n", + "\u255e\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u256a\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2561\n", + "\u2502 226128_5188 \u2506 null \u2506 226128 \u2506 518848 \u2506 \u2026 \u2506 1.0 \u2506 COUNT \u2506 minnie65 \u2506 SYNAPSE_CO \u2502\n", + "\u2502 48_SYNAPSE_ \u2506 \u2506 \u2506 \u2506 \u2506 
\u2506 \u2506 \u2506 UNT \u2502\n", + "\u2502 COUNT \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2502\n", + "\u2502 394967_5188 \u2506 null \u2506 394967 \u2506 518848 \u2506 \u2026 \u2506 1.0 \u2506 COUNT \u2506 minnie65 \u2506 SYNAPSE_CO \u2502\n", + "\u2502 48_SYNAPSE_ \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 UNT \u2502\n", + "\u2502 COUNT \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2502\n", + "\u2502 292685_5188 \u2506 null \u2506 292685 \u2506 518848 \u2506 \u2026 \u2506 1.0 \u2506 COUNT \u2506 minnie65 \u2506 SYNAPSE_CO \u2502\n", + "\u2502 48_SYNAPSE_ \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 UNT \u2502\n", + "\u2502 COUNT \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2506 \u2502\n", + "\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2534\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n" ] } ], @@ -611,10 +802,10 @@ "\n", "| Path | Rows |\n", "|------|------|\n", - "| `dataset/` | +1 (`minnie65_v1412_proofread` = proofread ∩ CSM cells) |\n", - "| `dataitem_dataset_association/` | one per proofread ∩ CSM cell |\n", - "| `cellcellconnectivitylong_proofread_pre_to_csm_post/` | 2 × filtered pairs: (proofread ∩ CSM)-pre × CSM-post (`SYNAPSE_COUNT` + `SUM_ANATOMICAL_SIZE`) |\n", - "| `cellcellconnectivitylong_proofread_to_proofread/` | 1 × filtered pairs: (proofread ∩ CSM) × (proofread ∩ CSM) (`SYNAPSE_COUNT` only) |\n", + "| `dataset/` | +1 (`minnie65_v1300_proofread` = proofread \u2229 CSM cells) |\n", + "| 
`dataitem_dataset_association/` | one per proofread \u2229 CSM cell |\n", + "| `cellcellconnectivitylong_proofread_pre_to_csm_post/` | 2 \u00d7 filtered pairs: (proofread \u2229 CSM)-pre \u00d7 CSM-post (`SYNAPSE_COUNT` + `SUM_ANATOMICAL_SIZE`) |\n", + "| `cellcellconnectivitylong_proofread_to_proofread/` | 1 \u00d7 filtered pairs: (proofread \u2229 CSM) \u00d7 (proofread \u2229 CSM) (`SYNAPSE_COUNT` only) |\n", "\n", "Both examples use the same precomputed `minnie_soma_soma_connectivity.parquet` with different filters." ] diff --git a/code/etl_tasic_01_cluster.ipynb b/code/etl_tasic_01_cluster.ipynb index d9037b8..66c78dc 100644 --- a/code/etl_tasic_01_cluster.ipynb +++ b/code/etl_tasic_01_cluster.ipynb @@ -29,14 +29,7 @@ "cell_type": "code", "execution_count": 1, "id": "49b62c21", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:12:43.721647Z", - "iopub.status.busy": "2026-05-01T05:12:43.721467Z", - "iopub.status.idle": "2026-05-01T05:12:44.756044Z", - "shell.execute_reply": "2026-05-01T05:12:44.755256Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", @@ -44,40 +37,29 @@ "import pandas as pd\n", "import polars as pl\n", "import pyarrow as pa\n", - "from deltalake import write_deltalake\n", "\n", - "from connects_common_connectivity.arrow_utils import (\n", - " attach_linkml_metadata,\n", - " build_arrow_schema,\n", - " models_to_table,\n", - ")\n", "from connects_common_connectivity.models import (\n", " AlgorithmRun,\n", " Cluster,\n", " ClusterHierarchy,\n", " HierarchyCategory,\n", - ")" + ")\n", + "from connects_common_connectivity.config import output_root\n", + "from connects_common_connectivity.io import write_models\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "a6acd535", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:12:44.758092Z", - "iopub.status.busy": "2026-05-01T05:12:44.757807Z", - "iopub.status.idle": "2026-05-01T05:12:44.761868Z", - 
"shell.execute_reply": "2026-05-01T05:12:44.761112Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INPUT_FEATHER : /data/visp-patchseq-taxonomy-info/anno.feather\n", - "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v1/\n", + "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v2/\n", "HIERARCHY_ID : tasic_2018_visp_taxonomy\n", "RUN_ID : tasic_2018_visp_clustering\n" ] @@ -85,7 +67,7 @@ ], "source": [ "INPUT_FEATHER = \"/data/visp-patchseq-taxonomy-info/anno.feather\"\n", - "OUTPUT_ROOT = \"../scratch/em_patchseq_wnm_v1/\"\n", + "OUTPUT_ROOT = output_root()\n", "HIERARCHY_ID = \"tasic_2018_visp_taxonomy\"\n", "RUN_ID = \"tasic_2018_visp_clustering\"\n", "ROOT_ID = \"cell\"\n", @@ -109,14 +91,7 @@ "cell_type": "code", "execution_count": 3, "id": "333984ed", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:12:44.763424Z", - "iopub.status.busy": "2026-05-01T05:12:44.763226Z", - "iopub.status.idle": "2026-05-01T05:12:44.842095Z", - "shell.execute_reply": "2026-05-01T05:12:44.841343Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -175,20 +150,13 @@ "cell_type": "code", "execution_count": 4, "id": "1685c251", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:12:44.843833Z", - "iopub.status.busy": "2026-05-01T05:12:44.843604Z", - "iopub.status.idle": "2026-05-01T05:12:44.932844Z", - "shell.execute_reply": "2026-05-01T05:12:44.932059Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "HierarchyCategory written: (4, 3)\n" + "HierarchyCategory written: 4 rows\n" ] } ], @@ -203,32 +171,15 @@ "]\n", "CATEGORY_IDS = [c.id for c in category_rows]\n", "\n", - "schema_cat = build_arrow_schema(HierarchyCategory)\n", - "table_cat = attach_linkml_metadata(\n", - " models_to_table(category_rows, schema=schema_cat),\n", - " linkml_class=\"HierarchyCategory\",\n", - ")\n", - "quoted = \", \".join(f\"'{i}'\" for i in CATEGORY_IDS)\n", - 
"write_deltalake(\n", - " OUTPUT_ROOT + \"hierarchycategory/\", table_cat,\n", - " mode=\"overwrite\",\n", - " predicate=f\"id IN ({quoted})\",\n", - ")\n", - "print(\"HierarchyCategory written:\", table_cat.shape)" + "result = write_models(category_rows, output_root=OUTPUT_ROOT)\n", + "print(f\"HierarchyCategory written: {result.rows_written} rows\")\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "72271432", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:12:44.934473Z", - "iopub.status.busy": "2026-05-01T05:12:44.934274Z", - "iopub.status.idle": "2026-05-01T05:12:44.952946Z", - "shell.execute_reply": "2026-05-01T05:12:44.952180Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -268,20 +219,13 @@ "cell_type": "code", "execution_count": 6, "id": "025bb878", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:12:44.954653Z", - "iopub.status.busy": "2026-05-01T05:12:44.954455Z", - "iopub.status.idle": "2026-05-01T05:12:45.040072Z", - "shell.execute_reply": "2026-05-01T05:12:45.039247Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "AlgorithmRun written: (1, 9)\n" + "AlgorithmRun written: 1 rows\n" ] } ], @@ -298,31 +242,15 @@ " # algorithmrun/ row would duplicate state. 
Future: schema flip to inlined: false (list of ids).\n", ")\n", "\n", - "schema_run = build_arrow_schema(AlgorithmRun)\n", - "table_run = attach_linkml_metadata(\n", - " models_to_table([run_row], schema=schema_run),\n", - " linkml_class=\"AlgorithmRun\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"algorithmrun/\", table_run,\n", - " mode=\"overwrite\",\n", - " predicate=f\"id = '{RUN_ID}'\",\n", - ")\n", - "print(\"AlgorithmRun written:\", table_run.shape)" + "result = write_models([run_row], output_root=OUTPUT_ROOT)\n", + "print(f\"AlgorithmRun written: {result.rows_written} rows\")\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "067fb84c", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:12:45.041726Z", - "iopub.status.busy": "2026-05-01T05:12:45.041524Z", - "iopub.status.idle": "2026-05-01T05:12:45.056344Z", - "shell.execute_reply": "2026-05-01T05:12:45.055614Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -364,27 +292,14 @@ "cell_type": "code", "execution_count": 8, "id": "0d1e5c5c", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:12:45.058227Z", - "iopub.status.busy": "2026-05-01T05:12:45.057879Z", - "iopub.status.idle": "2026-05-01T05:12:45.134989Z", - "shell.execute_reply": "2026-05-01T05:12:45.134166Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Cluster rows built: 138\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cluster written: (138, 9)\n" + "Cluster rows built: 138\n", + "Cluster written: 138 rows\n" ] } ], @@ -440,33 +355,15 @@ "\n", "assert len(cluster_rows) == 1 + len(class_labels) + len(subclass_labels) + len(cluster_labels)\n", "print(f\"Cluster rows built: {len(cluster_rows)}\")\n", - "\n", - "schema_clu = build_arrow_schema(Cluster)\n", - "table_clu = attach_linkml_metadata(\n", - " models_to_table(cluster_rows, schema=schema_clu),\n", - " linkml_class=\"Cluster\",\n", - 
")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"cluster/\", table_clu,\n", - " mode=\"overwrite\",\n", - " predicate=f\"hierarchy_id = '{HIERARCHY_ID}'\",\n", - " partition_by=[\"hierarchy_id\"],\n", - ")\n", - "print(\"Cluster written:\", table_clu.shape)" + "result = write_models(cluster_rows, output_root=OUTPUT_ROOT)\n", + "print(f\"Cluster written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "9aa57316", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:12:45.136667Z", - "iopub.status.busy": "2026-05-01T05:12:45.136439Z", - "iopub.status.idle": "2026-05-01T05:12:45.151140Z", - "shell.execute_reply": "2026-05-01T05:12:45.150336Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -508,20 +405,13 @@ "cell_type": "code", "execution_count": 10, "id": "31d4b276", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:12:45.152759Z", - "iopub.status.busy": "2026-05-01T05:12:45.152568Z", - "iopub.status.idle": "2026-05-01T05:12:45.240060Z", - "shell.execute_reply": "2026-05-01T05:12:45.239253Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "ClusterHierarchy written: (1, 4)\n" + "ClusterHierarchy written: 1 rows\n" ] } ], @@ -532,32 +422,15 @@ " root=ROOT_ID,\n", " clusters=[c.id for c in cluster_rows],\n", ")\n", - "\n", - "schema_h = build_arrow_schema(ClusterHierarchy)\n", - "table_h = attach_linkml_metadata(\n", - " models_to_table([hierarchy_row], schema=schema_h),\n", - " linkml_class=\"ClusterHierarchy\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"clusterhierarchy/\", table_h,\n", - " mode=\"overwrite\",\n", - " predicate=f\"id = '{HIERARCHY_ID}'\",\n", - ")\n", - "print(\"ClusterHierarchy written:\", table_h.shape)" + "result = write_models([hierarchy_row], output_root=OUTPUT_ROOT)\n", + "print(f\"ClusterHierarchy written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 
11, "id": "5072ae81", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:12:45.241640Z", - "iopub.status.busy": "2026-05-01T05:12:45.241449Z", - "iopub.status.idle": "2026-05-01T05:12:45.259267Z", - "shell.execute_reply": "2026-05-01T05:12:45.258529Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -597,6 +470,14 @@ "\n", "Tasic taxonomy: 1 synthetic root + 3 classes + 23 subclasses + 111 leaf clusters. No `DataItem`s registered (out of scope). Idempotent." ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c123e95-ff37-4190-8381-5593ed47082d", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/code/etl_v1dd_00_explore.ipynb b/code/etl_v1dd_00_explore.ipynb new file mode 100644 index 0000000..6df6557 --- /dev/null +++ b/code/etl_v1dd_00_explore.ipynb @@ -0,0 +1,1047 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3458f478", + "metadata": {}, + "source": [ + "# ETL — V1DD (release 1196) exploration\n", + "\n", + "Loads every artifact in `/data/v1dd_1196/` and prints shape, columns, and a small head for each. Each load is followed by a short note proposing which common-connectivity schema the file maps to. No writes — this notebook is a planning aid for the subsequent `etl_v1dd_01_*`, `etl_v1dd_02_*`, etc. notebooks.\n", + "\n", + "V1DD is the same modality as MICrONS Minnie (EM connectomics + coregistered 2P functional imaging). Schema mapping mirrors the `etl_minnie_*` notebooks where applicable." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ea6bef6e", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "77c09ebf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DATA_ROOT : /data/v1dd_1196\n", + "PROJECT_ID : v1dd\n", + "RELEASE : 1196\n", + "\n", + "Contents:\n", + " - cell_cell_correlations_by_stimulus.feather\n", + " - cell_cell_correlations_by_stimulus_coregistered.feather\n", + " - coregistration_1196.feather\n", + " - data_description.json\n", + " - metadata.nd.json\n", + " - original_metadata\n", + " - proofread_axon_list_1196.npy\n", + " - proofread_dendrite_list_1196.npy\n", + " - snr_by_cell.feather\n", + " - soma_and_cell_type_1196.feather\n", + " - subject.json\n", + " - syn_df_all_to_proofread_to_all_1196.feather\n", + " - syn_label_df_all_to_proofread_to_all_1196.feather\n" + ] + } + ], + "source": [ + "DATA_ROOT = Path(\"/data/v1dd_1196\")\n", + "PROJECT_ID = \"v1dd\"\n", + "RELEASE = \"1196\"\n", + "\n", + "print(f\"DATA_ROOT : {DATA_ROOT}\")\n", + "print(f\"PROJECT_ID : {PROJECT_ID}\")\n", + "print(f\"RELEASE : {RELEASE}\")\n", + "print()\n", + "print(\"Contents:\")\n", + "for p in sorted(DATA_ROOT.iterdir()):\n", + " print(\" -\", p.name)" + ] + }, + { + "cell_type": "markdown", + "id": "3b985e2b", + "metadata": {}, + "source": [ + "## 1. Provenance metadata (JSON)\n", + "\n", + "`data_description.json`, `subject.json`, `metadata.nd.json` are aind-data-schema records that describe the release as a whole." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "df1c9581", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-16T18:43:20.515800Z", + "iopub.status.busy": "2026-06-16T18:43:20.515579Z", + "iopub.status.idle": "2026-06-16T18:43:20.634537Z", + "shell.execute_reply": "2026-06-16T18:43:20.633727Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "name : v1dd-analysis-1196-1_2025-08-14_16-38-00\n", + "project_name : V1 Deep Dive\n", + "modalities : ['EM']\n", + "institution : AIBS\n", + "license : CC-BY-4.0\n", + "subject_id : 409828\n", + "genotype : Slc17a7-IRES2-Cre/wt;Camk2a-tTA/wt;Ai94(TITL-GCaMP6s)/wt\n", + "sex : Male\n", + "species : House mouse\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "S3 location : s3://aind-open-data/v1dd-analysis-1196-1_2025-08-14_16-38-00\n" + ] + } + ], + "source": [ + "data_desc = json.loads((DATA_ROOT / \"data_description.json\").read_text())\n", + "subject = json.loads((DATA_ROOT / \"subject.json\").read_text())\n", + "\n", + "print(\"name :\", data_desc[\"name\"])\n", + "print(\"project_name :\", data_desc[\"project_name\"])\n", + "print(\"modalities :\", [m[\"abbreviation\"] for m in data_desc[\"modalities\"]])\n", + "print(\"institution :\", data_desc[\"institution\"][\"abbreviation\"])\n", + "print(\"license :\", data_desc[\"license\"])\n", + "print(\"subject_id :\", data_desc[\"subject_id\"])\n", + "print(\"genotype :\", subject[\"subject_details\"][\"genotype\"])\n", + "print(\"sex :\", subject[\"subject_details\"][\"sex\"])\n", + "print(\"species :\", subject[\"subject_details\"][\"species\"][\"common_name\"])\n", + "print(\"S3 location :\", json.loads((DATA_ROOT / \"metadata.nd.json\").read_text())[\"location\"])" + ] + }, + { + "cell_type": "markdown", + "id": "506cb0c6", + "metadata": {}, + "source": [ + "**Schema mapping:** `core_schema.yaml::DataSet` — exactly one row, `project_id=\"v1dd\"`, modality 
`ELECTRON_MICROSCOPY`. The `publication` slot can hold the V1DD release reference; `name` comes from `data_description.name`. Subject metadata (genotype, sex, species) has no slot in the current core schema and would be dropped or recorded only in `DataSet.name`/notes." + ] + }, + { + "cell_type": "markdown", + "id": "23ac200e", + "metadata": {}, + "source": [ + "## 2. EM soma table — `soma_and_cell_type_1196.feather`\n", + "\n", + "The catalog of EM somas detected in the V1DD volume. Direct analogue of MICrONS' `nucleus_detection_lookup_v1` CAVE view used by `etl_minnie_01_dataset_dataitem.ipynb`." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "86b933de", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-16T18:43:20.636256Z", + "iopub.status.busy": "2026-06-16T18:43:20.636056Z", + "iopub.status.idle": "2026-06-16T18:43:21.545481Z", + "shell.execute_reply": "2026-06-16T18:43:21.544724Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape : (207455, 11)\n", + "cols : ['id', 'pt_position_x', 'pt_position_y', 'pt_position_z', 'pt_position_trform_x', 'pt_position_trform_y', 'pt_position_trform_z', 'pt_root_id', 'volume', 'cell_type_coarse', 'cell_type']\n", + "dtypes :\n", + " id int64\n", + "pt_position_x int64\n", + "pt_position_y int64\n", + "pt_position_z int64\n", + "pt_position_trform_x float64\n", + "pt_position_trform_y float64\n", + "pt_position_trform_z float64\n", + "pt_root_id int64\n", + "volume float64\n", + "cell_type_coarse object\n", + "cell_type object\n", + "dtype: object\n", + "n_unique pt_root_id : 163064\n", + "n_unique id : 207455\n", + "cell_type_coarse counts:\n", + " cell_type_coarse\n", + "None 158263\n", + "E 42495\n", + "I 6697\n", + "Name: count, dtype: int64\n", + "cell_type counts (top 10):\n", + " cell_type\n", + "None 158263\n", + "L6-CT 11260\n", + "L4-IT 7955\n", + "L3-IT 6361\n", + "L6-IT 6044\n", + "L5-IT 5090\n", + "L2-IT 3073\n", + "PTC 2951\n", + 
"L5-ET 2013\n", + "DTC 1933\n", + "Name: count, dtype: int64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idpt_position_xpt_position_ypt_position_zpt_position_trform_xpt_position_trform_ypt_position_trform_zpt_root_idvolumecell_type_coarsecell_type
0228132632828749849738270-323721.447979549910.283106392909.832613864691132737039043458.464831NoneNone
1543247130492297791583880330339.020171595962.275760-306424.55135486469113273083998873.345940NoneNone
2203262624680531094283770-252082.627894203770.72823521544.029756864691132654552792338.276613EL3-IT
\n", + "
" + ], + "text/plain": [ + " id pt_position_x pt_position_y pt_position_z pt_position_trform_x \\\n", + "0 228132 632828 749849 738270 -323721.447979 \n", + "1 543247 1304922 977915 83880 330339.020171 \n", + "2 203262 624680 531094 283770 -252082.627894 \n", + "\n", + " pt_position_trform_y pt_position_trform_z pt_root_id volume \\\n", + "0 549910.283106 392909.832613 864691132737039043 458.464831 \n", + "1 595962.275760 -306424.551354 864691132730839988 73.345940 \n", + "2 203770.728235 21544.029756 864691132654552792 338.276613 \n", + "\n", + " cell_type_coarse cell_type \n", + "0 None None \n", + "1 None None \n", + "2 E L3-IT " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "soma_df = pd.read_feather(DATA_ROOT / \"soma_and_cell_type_1196.feather\")\n", + "print(\"shape :\", soma_df.shape)\n", + "print(\"cols :\", list(soma_df.columns))\n", + "print(\"dtypes :\\n\", soma_df.dtypes)\n", + "print(\"n_unique pt_root_id :\", soma_df['pt_root_id'].nunique())\n", + "print(\"n_unique id :\", soma_df['id'].nunique())\n", + "print(\"cell_type_coarse counts:\\n\", soma_df['cell_type_coarse'].value_counts(dropna=False).head())\n", + "print(\"cell_type counts (top 10):\\n\", soma_df['cell_type'].value_counts(dropna=False).head(10))\n", + "soma_df.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "744dd27f", + "metadata": {}, + "source": [ + "**Schema mapping:**\n", + "\n", + "- `core_schema.yaml::DataItem` — one row per nucleus, `id = str(row.id)` (the soma id), `name = str(row.pt_root_id)`. Mirrors `etl_minnie_01`. Also `DataItemDataSetAssociation` linking each soma to the V1DD `DataSet`.\n", + "- `cell_features_schema.yaml::CellFeatureMatrix` — `pt_position_{x,y,z}` (voxel-space soma centroid), `pt_position_trform_{x,y,z}` (transformed/CCF coords), and `volume` make a numeric feature set (e.g. `feature_set_id = \"v1dd_soma_geometry\"`).\n", + "- `cell_type_coarse` / `cell_type` — categorical labels. 
Two options: (a) write as categorical columns inside a `CellFeatureMatrix` (cf. Minnie's CSM coarse types), or (b) treat the V1DD coarse/fine cell-type taxonomy as a `clustering_schema.yaml::ClusterHierarchy` and write `ClusterMembership` rows. Pattern (b) matches `etl_minnie_03_cluster_and_cluster_membership.ipynb`." + ] + }, + { + "cell_type": "markdown", + "id": "60aee4b0", + "metadata": {}, + "source": [ + "## 3. Proofread axon / dendrite lists — `.npy`\n", + "\n", + "Lists of `pt_root_id`s whose axon (resp. dendrite) has been manually proofread. These define the proofread cohort used in the synapse table below." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9a1cf28b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "axon list : shape (1210,) dtype int64\n", + " n_unique : 1210\n", + " sample : [864691132534275418, 864691132534315610, 864691132535664474, 864691132536286810, 864691132536904794]\n", + "\n", + "dendrite list : shape (63986,) dtype int64\n", + " n_unique : 63986\n", + " sample : [864691132496108732, 864691132511800666, 864691132525163794, 864691132533275738, 864691132533347418]\n", + "\n", + "overlap axon ∩ dendrite : 1148\n" + ] + } + ], + "source": [ + "axon_ids = np.load(DATA_ROOT / \"proofread_axon_list_1196.npy\", allow_pickle=True)\n", + "dend_ids = np.load(DATA_ROOT / \"proofread_dendrite_list_1196.npy\", allow_pickle=True)\n", + "\n", + "print(\"axon list : shape\", axon_ids.shape, \"dtype\", axon_ids.dtype)\n", + "print(\" n_unique :\", len(set(axon_ids.tolist())))\n", + "print(\" sample :\", axon_ids[:5].tolist())\n", + "print()\n", + "print(\"dendrite list : shape\", dend_ids.shape, \"dtype\", dend_ids.dtype)\n", + "print(\" n_unique :\", len(set(dend_ids.tolist())))\n", + "print(\" sample :\", dend_ids[:5].tolist())\n", + "print()\n", + "print(\"overlap axon ∩ dendrite :\", len(set(axon_ids.tolist()) & set(dend_ids.tolist())))" + ] + }, + { + "cell_type": 
"markdown", + "id": "284b3da6", + "metadata": {}, + "source": [ + "**Schema mapping:** These are cohort definitions, not features. Best modelled as two extra `core_schema.yaml::DataSet` rows (e.g. `v1dd_1196_proofread_axons`, `v1dd_1196_proofread_dendrites`) with their own `DataItemDataSetAssociation` rows pointing at the existing soma `DataItem` ids. Same pattern as the Minnie cohort DataSets noted in `etl_minnie_01`'s summary cell. Note: ids here are `pt_root_id` (int64); the soma `DataItem`s above are keyed by the soma `id` column — a `pt_root_id → soma_id` join is required before writing the associations." + ] + }, + { + "cell_type": "markdown", + "id": "783658a6", + "metadata": {}, + "source": [ + "## 4. Functional coregistration — `coregistration_1196.feather`\n", + "\n", + "Maps EM `pt_root_id`s to functional 2P ROIs (volume / column / plane / roi tuple)." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "55cc1599", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape : (571, 5)\n", + "cols : ['pt_root_id', 'column', 'volume', 'plane', 'roi']\n", + "n_unique pt_root_id : 553\n", + "n_unique (volume,column,plane,roi): 565\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pt_root_idcolumnvolumeplaneroi
0864691132830842994130143
186469113274146645713240
286469113277089372913398
\n", + "
" + ], + "text/plain": [ + " pt_root_id column volume plane roi\n", + "0 864691132830842994 1 3 0 143\n", + "1 864691132741466457 1 3 2 40\n", + "2 864691132770893729 1 3 3 98" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "coreg_df = pd.read_feather(DATA_ROOT / \"coregistration_1196.feather\")\n", + "print(\"shape :\", coreg_df.shape)\n", + "print(\"cols :\", list(coreg_df.columns))\n", + "print(\"n_unique pt_root_id :\", coreg_df['pt_root_id'].nunique())\n", + "print(\"n_unique (volume,column,plane,roi):\", coreg_df.drop_duplicates(['volume','column','plane','roi']).shape[0])\n", + "coreg_df.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "831e6318", + "metadata": {}, + "source": [ + "**Schema mapping:** A cross-modal cell-to-cell link table. Two reasonable options:\n", + "\n", + "- `mappings_schema.yaml` — if a `CellToCellMapping` (or similar cross-cell mapping) class exists, this is the natural home (EM cell ↔ functional cell).\n", + "- Otherwise, register the coregistered functional cells as `DataItem`s in a `v1dd_coregistered_functional_cells` `DataSet` (id = the 4-tuple stringified), then write association rows. The mapping itself (EM ↔ functional) can be a `CellCellConnectivityLong` row with a relation tag like `coregistration` — but that is a stretch and a dedicated mapping class is preferred. Schema-fit decision deferred to notebook `_03`." + ] + }, + { + "cell_type": "markdown", + "id": "1f3ecd8e", + "metadata": {}, + "source": [ + "## 5. Functional SNR — `snr_by_cell.feather`\n", + "\n", + "One SNR scalar per functional ROI (keyed by the same `volume / column / plane / roi` tuple as the coregistration table)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "6a252201", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape : (4458, 5)\n", + "cols : ['column', 'volume', 'plane', 'roi', 'snr']\n", + "n_unique cells: 4458\n", + "snr describe :\n", + " count 4458.000000\n", + "mean 4.196671\n", + "std 4.135927\n", + "min 0.953515\n", + "25% 2.021927\n", + "50% 3.285306\n", + "75% 4.877459\n", + "max 93.560258\n", + "Name: snr, dtype: float64\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
columnvolumeplaneroisnr
013002.974124
113012.304902
213021.442091
\n", + "
" + ], + "text/plain": [ + " column volume plane roi snr\n", + "0 1 3 0 0 2.974124\n", + "1 1 3 0 1 2.304902\n", + "2 1 3 0 2 1.442091" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "snr_df = pd.read_feather(DATA_ROOT / \"snr_by_cell.feather\")\n", + "print(\"shape :\", snr_df.shape)\n", + "print(\"cols :\", list(snr_df.columns))\n", + "print(\"n_unique cells:\", snr_df.drop_duplicates(['volume','column','plane','roi']).shape[0])\n", + "print(\"snr describe :\\n\", snr_df['snr'].describe())\n", + "snr_df.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "44dab7c2", + "metadata": {}, + "source": [ + "**Schema mapping:** `cell_features_schema.yaml::CellFeatureMatrix` with one `CellFeatureDefinition` (`snr`, dtype `\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idpre_pt_position_xpre_pt_position_ypre_pt_position_zpost_pt_position_xpost_pt_position_ypost_pt_position_zctr_pt_position_xctr_pt_position_yctr_pt_position_zsizepre_pt_root_idpost_pt_root_id
0354386968758200.5802316.1304380.0757861.0802558.6304650.0757967.7802597.4304380.0240864691132536286810864691132734919083
1378070488792063.2514342.5183735.0792664.6514284.3183915.0792412.4514294.0183735.03056864691132572190492864691132606767301
2499493001977071.3390075.8191340.0976974.3390104.9190935.0976838.5390337.7190935.01346864691132573738810864691132747578447
\n", + "" + ], + "text/plain": [ + " id pre_pt_position_x pre_pt_position_y pre_pt_position_z \\\n", + "0 354386968 758200.5 802316.1 304380.0 \n", + "1 378070488 792063.2 514342.5 183735.0 \n", + "2 499493001 977071.3 390075.8 191340.0 \n", + "\n", + " post_pt_position_x post_pt_position_y post_pt_position_z \\\n", + "0 757861.0 802558.6 304650.0 \n", + "1 792664.6 514284.3 183915.0 \n", + "2 976974.3 390104.9 190935.0 \n", + "\n", + " ctr_pt_position_x ctr_pt_position_y ctr_pt_position_z size \\\n", + "0 757967.7 802597.4 304380.0 240 \n", + "1 792412.4 514294.0 183735.0 3056 \n", + "2 976838.5 390337.7 190935.0 1346 \n", + "\n", + " pre_pt_root_id post_pt_root_id \n", + "0 864691132536286810 864691132734919083 \n", + "1 864691132572190492 864691132606767301 \n", + "2 864691132573738810 864691132747578447 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "syn_df = pd.read_feather(DATA_ROOT / \"syn_df_all_to_proofread_to_all_1196.feather\")\n", + "syn_label_df = pd.read_feather(DATA_ROOT / \"syn_label_df_all_to_proofread_to_all_1196.feather\")\n", + "\n", + "print(\"syn_df shape :\", syn_df.shape)\n", + "print(\"syn_df cols :\", list(syn_df.columns))\n", + "print(\"n_unique pre_pt_root :\", syn_df['pre_pt_root_id'].nunique())\n", + "print(\"n_unique post_pt_root :\", syn_df['post_pt_root_id'].nunique())\n", + "print(\"size describe :\\n\", syn_df['size'].describe())\n", + "print()\n", + "print(\"syn_label_df shape :\", syn_label_df.shape)\n", + "print(\"syn_label_df cols :\", list(syn_label_df.columns))\n", + "print(\"syn_label_df index :\", syn_label_df.index.name)\n", + "print(\"tag counts :\\n\", syn_label_df['tag'].value_counts(dropna=False).head())\n", + "syn_df.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "a848e37a", + "metadata": {}, + "source": [ + "**Schema mapping:**\n", + "\n", + "- `cell_cell_schema.yaml::CellCellConnectivityLong` — aggregate synapses per (pre, post) pair into 
synapse-count and total-size weights, written to a dedicated subdirectory per §5g of the prompt guide (e.g. `cellcellconnectivitylong_all_to_proofread_to_all/`). Mirrors `etl_minnie_04_cell_cell.ipynb`.\n", + "- Raw per-synapse rows (8.2M) do **not** fit any current schema — there is no per-synapse class in the common schemas. They would either stay as a parquet sidecar or be summarized away. The label table (spine vs other) is per-synapse and would be summarized in the same aggregation (e.g. as `n_spine_synapses` weight or a separate connectivity matrix)." + ] + }, + { + "cell_type": "markdown", + "id": "1e0ee419", + "metadata": {}, + "source": [ + "## 7. Functional cell–cell correlations\n", + "\n", + "`cell_cell_correlations_by_stimulus.feather` — all functional ROI pairs, one Pearson correlation per stimulus condition.\n", + "\n", + "`cell_cell_correlations_by_stimulus_coregistered.feather` — same, but restricted to coregistered EM cells and keyed by `pt_root_id` rather than ROI tuple." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "23774f12", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-16T18:43:34.090024Z", + "iopub.status.busy": "2026-06-16T18:43:34.089746Z", + "iopub.status.idle": "2026-06-16T18:43:43.292708Z", + "shell.execute_reply": "2026-06-16T18:43:43.292033Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "corr_df shape : (8846260, 13)\n", + "corr_df cols : ['pre_roi', 'post_roi', 'pre_plane', 'post_plane', 'column', 'volume', 'drifting_gratings_full', 'drifting_gratings_windowed', 'locally_sparse_noise', 'natural_images', 'natural_images_12', 'natural_movie', 'spontaneous']\n", + "stimulus columns : ['drifting_gratings_full', 'drifting_gratings_windowed', 'locally_sparse_noise', 'natural_images', 'natural_images_12', 'natural_movie', 'spontaneous']\n", + "\n", + "corr_co_df shape : (148728, 9)\n", + "corr_co_df cols : ['pre_pt_root_id', 'post_pt_root_id', 'drifting_gratings_full', 'drifting_gratings_windowed', 'locally_sparse_noise', 'natural_images', 'natural_images_12', 'natural_movie', 'spontaneous']\n", + "corr_co_df n_unique pre : 551\n", + "corr_co_df n_unique post: 551\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pre_pt_root_idpost_pt_root_iddrifting_gratings_fulldrifting_gratings_windowedlocally_sparse_noisenatural_imagesnatural_images_12natural_moviespontaneous
08646911326318723548646911329937477010.0501420.0278190.1544720.1817240.1589090.0049570.078471
18646911326318723548646911327864477560.0552670.0182840.1194180.1155870.1240210.0102960.197349
28646911326318723548646911326179615370.0654440.0623670.1436600.0650780.0732970.0531970.108968
\n", + "
" + ], + "text/plain": [ + " pre_pt_root_id post_pt_root_id drifting_gratings_full \\\n", + "0 864691132631872354 864691132993747701 0.050142 \n", + "1 864691132631872354 864691132786447756 0.055267 \n", + "2 864691132631872354 864691132617961537 0.065444 \n", + "\n", + " drifting_gratings_windowed locally_sparse_noise natural_images \\\n", + "0 0.027819 0.154472 0.181724 \n", + "1 0.018284 0.119418 0.115587 \n", + "2 0.062367 0.143660 0.065078 \n", + "\n", + " natural_images_12 natural_movie spontaneous \n", + "0 0.158909 0.004957 0.078471 \n", + "1 0.124021 0.010296 0.197349 \n", + "2 0.073297 0.053197 0.108968 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "corr_df = pd.read_feather(DATA_ROOT / \"cell_cell_correlations_by_stimulus.feather\")\n", + "corr_co_df = pd.read_feather(DATA_ROOT / \"cell_cell_correlations_by_stimulus_coregistered.feather\")\n", + "\n", + "stim_cols = ['drifting_gratings_full','drifting_gratings_windowed','locally_sparse_noise',\n", + " 'natural_images','natural_images_12','natural_movie','spontaneous']\n", + "\n", + "print(\"corr_df shape :\", corr_df.shape)\n", + "print(\"corr_df cols :\", list(corr_df.columns))\n", + "print(\"stimulus columns :\", stim_cols)\n", + "print()\n", + "print(\"corr_co_df shape :\", corr_co_df.shape)\n", + "print(\"corr_co_df cols :\", list(corr_co_df.columns))\n", + "print(\"corr_co_df n_unique pre :\", corr_co_df['pre_pt_root_id'].nunique())\n", + "print(\"corr_co_df n_unique post:\", corr_co_df['post_pt_root_id'].nunique())\n", + "corr_co_df.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "eebce5a3", + "metadata": {}, + "source": [ + "**Schema mapping:** Both tables are cell-pair × scalar-per-stimulus → `cell_cell_schema.yaml::CellCellConnectivityLong`, one folder per stimulus condition (§5g pattern) **per table**:\n", + "\n", + "- `cellcellconnectivitylong_func_corr_/` — keyed by functional-cell ids (from §4 registration). 
8.8M rows × 7 stimuli ≈ 62M rows total; may want to threshold or sample.\n", + "- `cellcellconnectivitylong_func_corr_coreg_/` — keyed by EM `pt_root_id` (i.e. by soma `DataItem` ids). 149k rows per stimulus; small.\n", + "\n", + "The coregistered version is the one with direct anatomical interpretability and should be prioritized." + ] + }, + { + "cell_type": "markdown", + "id": "e1417e82", + "metadata": {}, + "source": [ + "## Summary — proposed notebook split\n", + "\n", + "| File(s) | Schema target | Future notebook |\n", + "|---|---|---|\n", + "| `data_description.json`, `subject.json`, `soma_and_cell_type_1196.feather` | `DataSet` + `DataItem` + `DataItemDataSetAssociation` | `etl_v1dd_01_dataset_dataitem.ipynb` |\n", + "| `proofread_axon_list_1196.npy`, `proofread_dendrite_list_1196.npy` | extra cohort `DataSet`s + associations | `etl_v1dd_01_dataset_dataitem.ipynb` (or `_01b`) |\n", + "| `soma_and_cell_type_1196.feather` (numeric cols), `snr_by_cell.feather` | `CellFeatureMatrix` | `etl_v1dd_02_cell_features.ipynb` |\n", + "| `soma_and_cell_type_1196.feather` (`cell_type` / `cell_type_coarse`) | `ClusterHierarchy` + `ClusterMembership` | `etl_v1dd_03_cluster_and_cluster_membership.ipynb` |\n", + "| `coregistration_1196.feather` | cross-modal mapping (schema TBD; see §4 note) | `etl_v1dd_03_mapping.ipynb` |\n", + "| `syn_df_…_1196.feather` (+ labels) | `CellCellConnectivityLong` (aggregated) | `etl_v1dd_04_cell_cell.ipynb` |\n", + "| `cell_cell_correlations_by_stimulus_coregistered.feather` | `CellCellConnectivityLong` (one folder per stimulus) | `etl_v1dd_04_cell_cell.ipynb` |\n", + "| `cell_cell_correlations_by_stimulus.feather` | same, functional-cell-keyed | `etl_v1dd_04_cell_cell.ipynb` (defer if functional cells aren't registered) |\n", + "\n", + "**Open questions for the schema owner before writing _01:**\n", + "1. 
1. Is there a canonical cross-modal cell-link class for the EM↔functional coregistration table, or should it ride on `CellCellConnectivityLong`?\n", + "2. Should non-coregistered functional cells (the 4458 in `snr_by_cell` minus the 571 coregistered) be registered as `DataItem`s? If yes, with what id scheme — the `(volume, column, plane, roi)` 4-tuple stringified?\n", + "3. Does the V1DD cell-type taxonomy already exist as a `ClusterHierarchy` somewhere (shared with MICrONS CSM), or does this dataset own it?" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/code/etl_v1dd_01_v1196.ipynb b/code/etl_v1dd_01_v1196.ipynb new file mode 100644 index 0000000..4b629f2 --- /dev/null +++ b/code/etl_v1dd_01_v1196.ipynb @@ -0,0 +1,1060 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "title", + "metadata": {}, + "source": [ + "# ETL — V1DD release 1196 (single-notebook)\n", + "\n", + "Writes the full V1DD 1196 release into the common-connectivity schemas under one notebook, project `v1dd`.\n", + "\n", + "**Two DataSets inside project `v1dd`:**\n", + "- `v1dd_1196_em` — one EM cell per unique non-zero `pt_root_id` in `soma_and_cell_type_1196.feather` (DataItem id = `str(pt_root_id)`; see the master decision below).\n", + "- `v1dd_1196_func` — every functional ROI in `snr_by_cell.feather` (DataItem id = `f\"{volume}-{column}-{plane}-{roi}\"`).\n", + "\n", + "**Additional cohort DataSets (subsets of `v1dd_1196_em`):**\n", + "- `v1dd_1196_proofread_axons` — `proofread_axon_list_1196.npy`.\n", + "- `v1dd_1196_proofread_dendrites` — `proofread_dendrite_list_1196.npy`.\n", + "\n", + "**Additional cohort DataSet (subset of `v1dd_1196_func`):**\n", + "- 
`v1dd_1196_func_coregistered` — functional ROIs that appear in `coregistration_1196.feather`.\n", + "\n", + "Sections marked **TODO** are skeletons only; we will fill them together. Each section ends with an **open questions** list." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "imports", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-17T01:07:55.435587Z", + "iopub.status.busy": "2026-06-17T01:07:55.435338Z", + "iopub.status.idle": "2026-06-17T01:07:56.615920Z", + "shell.execute_reply": "2026-06-17T01:07:56.615055Z" + } + }, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import polars as pl\n", + "import pyarrow as pa\n", + "\n", + "from connects_common_connectivity.models import (\n", + " AlgorithmRun,\n", + " CellCellConnectivityLong,\n", + " CellFeatureDefinition,\n", + " CellFeatureMatrix,\n", + " CellFeatureSet,\n", + " CellToCellMapping,\n", + " Cluster,\n", + " ClusterHierarchy,\n", + " ClusterMembership,\n", + " DataItem,\n", + " DataItemDataSetAssociation,\n", + " DataSet,\n", + " MappingSet,\n", + " Modality,\n", + " SpatialLocation,\n", + ")\n", + "from connects_common_connectivity.io import write_models" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "constants", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-17T01:07:56.617795Z", + "iopub.status.busy": "2026-06-17T01:07:56.617515Z", + "iopub.status.idle": "2026-06-17T01:07:56.621880Z", + "shell.execute_reply": "2026-06-17T01:07:56.621183Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DATA_ROOT : /data/v1dd_1196\n", + "OUTPUT_ROOT : ../scratch/v1dd_1196_v1/\n", + "PROJECT_ID : v1dd\n", + "RELEASE : 1196\n", + "DATASET_EM : v1dd_1196_em\n", + "DATASET_FUNC : v1dd_1196_func\n", + "DATASET_PROOFREAD_AXON : v1dd_1196_proofread_axons\n", + "DATASET_PROOFREAD_DEND : v1dd_1196_proofread_dendrites\n", + 
"DATASET_FUNC_COREG : v1dd_1196_func_coregistered\n", + "HIERARCHY_ID_V1DD : v1dd_cell_types\n", + "HIERARCHY_ID_MINNIE : minnie65_csm_cell_types\n", + "FS_EM_SOMA_GEOM : v1dd_em_soma_geometry\n", + "FS_FUNC_QC : v1dd_func_qc\n", + "FS_FUNC_POSITION : v1dd_func_imaging_position\n" + ] + } + ], + "source": [ + "DATA_ROOT = Path(\"/data/v1dd_1196\")\n", + "OUTPUT_ROOT = \"../scratch/v1dd_1196_v1/\"\n", + "PROJECT_ID = \"v1dd\"\n", + "RELEASE = \"1196\"\n", + "\n", + "DATASET_EM = \"v1dd_1196_em\"\n", + "DATASET_FUNC = \"v1dd_1196_func\"\n", + "DATASET_PROOFREAD_AXON = \"v1dd_1196_proofread_axons\"\n", + "DATASET_PROOFREAD_DEND = \"v1dd_1196_proofread_dendrites\"\n", + "DATASET_FUNC_COREG = \"v1dd_1196_func_coregistered\"\n", + "\n", + "HIERARCHY_ID_V1DD = \"v1dd_cell_types\"\n", + "HIERARCHY_ID_MINNIE = \"minnie65_csm_cell_types\" # for comparison only\n", + "\n", + "FS_EM_SOMA_GEOM = \"v1dd_em_soma_geometry\"\n", + "FS_FUNC_QC = \"v1dd_func_qc\"\n", + "FS_FUNC_POSITION = \"v1dd_func_imaging_position\"\n", + "\n", + "for k, v in list(locals().items()):\n", + " if k.isupper():\n", + " print(f\"{k:24s}: {v}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "prereq", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-17T01:07:56.623448Z", + "iopub.status.busy": "2026-06-17T01:07:56.623169Z", + "iopub.status.idle": "2026-06-17T01:07:56.788552Z", + "shell.execute_reply": "2026-06-17T01:07:56.787849Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All input files present.\n" + ] + } + ], + "source": [ + "# Sanity check: every expected input file is on disk.\n", + "expected = [\n", + " \"data_description.json\",\n", + " \"subject.json\",\n", + " \"soma_and_cell_type_1196.feather\",\n", + " \"proofread_axon_list_1196.npy\",\n", + " \"proofread_dendrite_list_1196.npy\",\n", + " \"snr_by_cell.feather\",\n", + " \"coregistration_1196.feather\",\n", + " 
\"syn_df_all_to_proofread_to_all_1196.feather\",\n", + " \"syn_label_df_all_to_proofread_to_all_1196.feather\",\n", + " \"cell_cell_correlations_by_stimulus.feather\",\n", + " \"cell_cell_correlations_by_stimulus_coregistered.feather\",\n", + "]\n", + "missing = [f for f in expected if not (DATA_ROOT / f).exists()]\n", + "assert not missing, f\"Missing input files: {missing}\"\n", + "print(\"All input files present.\")" + ] + }, + { + "cell_type": "markdown", + "id": "master-id-decision", + "metadata": {}, + "source": [ + "## Master decision — `DataItem.id = pt_root_id`\n", + "\n", + "Every downstream file in this release is keyed by `pt_root_id`; only `soma_and_cell_type_1196.feather` carries the per-detection soma `id`. We therefore use `str(pt_root_id)` as the EM `DataItem.id` for the whole notebook and treat soma-centroid data as features attached to the cell.\n", + "\n", + "### Where each id appears\n", + "\n", + "| File | soma `id` | `pt_root_id` |\n", + "|---|:---:|:---:|\n", + "| `soma_and_cell_type_1196.feather` | ✅ | ✅ |\n", + "| `proofread_axon_list_1196.npy` | — | ✅ |\n", + "| `proofread_dendrite_list_1196.npy` | — | ✅ |\n", + "| `coregistration_1196.feather` | — | ✅ |\n", + "| `syn_df_all_to_proofread_to_all_1196.feather` | — | ✅ (`pre_pt_root_id`, `post_pt_root_id`) |\n", + "| `cell_cell_correlations_by_stimulus_coregistered.feather` | — | ✅ |\n", + "| `snr_by_cell.feather`, `cell_cell_correlations_by_stimulus.feather` | — | — (functional ROI tuples) |\n", + "\n", + "### Key counts from `soma_and_cell_type` (207,455 rows)\n", + "\n", + "| quantity | value |\n", + "|---|---:|\n", + "| unique soma `id` | 207,455 |\n", + "| unique `pt_root_id` | 163,064 |\n", + "| rows with `pt_root_id == 0` (orphan detections) | 3,835 |\n", + "| rows with non-zero `pt_root_id` | 203,620 |\n", + "| unique non-zero `pt_root_id` | 163,063 |\n", + "| `pt_root_id`s with > 1 soma row | 19,615 (~12 %) |\n", + "| max soma rows for a single `pt_root_id` | 184 |\n", + "\n", + 
"### Policy\n", + "\n", + "- **EM `DataItem.id = str(pt_root_id)`** (one row per segment), `name = str(pt_root_id)`.\n", + "- **Drop `pt_root_id == 0` rows** — they cannot be referenced from any other file and so cannot be cohort-associated or linked.\n", + "- **Collapse multi-soma `pt_root_id`s** to one DataItem by picking the soma row with the largest `volume` (largest nucleus detection is the most plausible primary soma); the other rows are dropped from the cell-features matrix. Number of collapsed cells: 19,615; rows discarded: 207,455 − 163,063 − 3,835 = 40,557.\n", + "- **Downstream joins are direct lookups** on `pt_root_id` everywhere. No `pt_root_id → soma_id` resolution step is needed in §2, §5, §8, §9, §10.\n", + "\n", + "This matches the way the rest of the V1DD release is keyed and aligns with Minnie's nucleus-per-segment convention (the 12 % multi-detection rate is the only thing that differs — Minnie's `nucleus_detection_lookup_v1` already does the collapse at the source)." + ] + }, + { + "cell_type": "markdown", + "id": "s1-md", + "metadata": {}, + "source": [ + "## 1. `DataSet` rows\n", + "\n", + "Five DataSet rows under `project_id=\"v1dd\"`. 
Provenance comes from `data_description.json`; `publication` points at the V1DD physiology repository.\n", + "\n", + "| id | modality | parent |\n", + "|---|---|---|\n", + "| `v1dd_1196_em` | `ELECTRON_MICROSCOPY` | — |\n", + "| `v1dd_1196_proofread_axons` | `ELECTRON_MICROSCOPY` | subset of `v1dd_1196_em` |\n", + "| `v1dd_1196_proofread_dendrites` | `ELECTRON_MICROSCOPY` | subset of `v1dd_1196_em` |\n", + "| `v1dd_1196_func` | `CALCIUM_IMAGING` | — |\n", + "| `v1dd_1196_func_coregistered` | `CALCIUM_IMAGING` | subset of `v1dd_1196_func` |" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "s1-code", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-17T01:07:56.790316Z", + "iopub.status.busy": "2026-06-17T01:07:56.790121Z", + "iopub.status.idle": "2026-06-17T01:07:57.314099Z", + "shell.execute_reply": "2026-06-17T01:07:57.296497Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataSet rows written: 5\n" + ] + } + ], + "source": [ + "V1DD_PUBLICATION = \"https://github.com/AllenInstitute/v1dd_physiology\"\n", + "\n", + "datasets = [\n", + " DataSet(\n", + " id=DATASET_EM,\n", + " name=\"V1DD release 1196 — EM somas\",\n", + " publication=V1DD_PUBLICATION,\n", + " modality=Modality.ELECTRON_MICROSCOPY.value,\n", + " project_id=PROJECT_ID,\n", + " ),\n", + " DataSet(\n", + " id=DATASET_PROOFREAD_AXON,\n", + " name=\"V1DD release 1196 — proofread axons cohort\",\n", + " publication=V1DD_PUBLICATION,\n", + " modality=Modality.ELECTRON_MICROSCOPY.value,\n", + " project_id=PROJECT_ID,\n", + " ),\n", + " DataSet(\n", + " id=DATASET_PROOFREAD_DEND,\n", + " name=\"V1DD release 1196 — proofread dendrites cohort\",\n", + " publication=V1DD_PUBLICATION,\n", + " modality=Modality.ELECTRON_MICROSCOPY.value,\n", + " project_id=PROJECT_ID,\n", + " ),\n", + " DataSet(\n", + " id=DATASET_FUNC,\n", + " name=\"V1DD release 1196 — functional 2P ROIs\",\n", + " publication=V1DD_PUBLICATION,\n", + " 
modality=Modality.CALCIUM_IMAGING.value,\n", + " project_id=PROJECT_ID,\n", + " ),\n", + " DataSet(\n", + " id=DATASET_FUNC_COREG,\n", + " name=\"V1DD release 1196 — coregistered functional ROIs cohort\",\n", + " publication=V1DD_PUBLICATION,\n", + " modality=Modality.CALCIUM_IMAGING.value,\n", + " project_id=PROJECT_ID,\n", + " ),\n", + "]\n", + "\n", + "result = write_models(datasets, output_root=OUTPUT_ROOT)\n", + "print(f\"DataSet rows written: {result.rows_written}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "s1-verify", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-17T01:07:57.316012Z", + "iopub.status.busy": "2026-06-17T01:07:57.315796Z", + "iopub.status.idle": "2026-06-17T01:07:57.353718Z", + "shell.execute_reply": "2026-06-17T01:07:57.352843Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (5, 5)\n", + " id name modality publication\n", + " v1dd_1196_func_coregistered V1DD release 1196 — coregistered functional ROIs cohort CALCIUM_IMAGING https://github.com/AllenInstitute/v1dd_physiology\n", + " v1dd_1196_func V1DD release 1196 — functional 2P ROIs CALCIUM_IMAGING https://github.com/AllenInstitute/v1dd_physiology\n", + "v1dd_1196_proofread_dendrites V1DD release 1196 — proofread dendrites cohort ELECTRON_MICROSCOPY https://github.com/AllenInstitute/v1dd_physiology\n", + " v1dd_1196_proofread_axons V1DD release 1196 — proofread axons cohort ELECTRON_MICROSCOPY https://github.com/AllenInstitute/v1dd_physiology\n", + " v1dd_1196_em V1DD release 1196 — EM somas ELECTRON_MICROSCOPY https://github.com/AllenInstitute/v1dd_physiology\n", + "\n", + "OK — 5 DataSet rows for project v1dd.\n" + ] + } + ], + "source": [ + "ds_verify = (\n", + " pl.read_delta(OUTPUT_ROOT + \"dataset/\")\n", + " .filter(pl.col(\"project_id\") == PROJECT_ID)\n", + ")\n", + "print(\"shape:\", ds_verify.shape)\n", + "print(ds_verify.select([\"id\", \"name\", \"modality\", 
\"publication\"]).to_pandas().to_string(index=False))\n", + "\n", + "expected_ids = {DATASET_EM, DATASET_PROOFREAD_AXON, DATASET_PROOFREAD_DEND,\n", + " DATASET_FUNC, DATASET_FUNC_COREG}\n", + "got_ids = set(ds_verify[\"id\"].to_list())\n", + "assert expected_ids <= got_ids, f\"missing DataSet ids: {expected_ids - got_ids}\"\n", + "assert ds_verify[\"id\"].n_unique() == ds_verify.shape[0], \"duplicate DataSet ids\"\n", + "modalities = dict(zip(ds_verify[\"id\"].to_list(), ds_verify[\"modality\"].to_list()))\n", + "for em_id in (DATASET_EM, DATASET_PROOFREAD_AXON, DATASET_PROOFREAD_DEND):\n", + " assert modalities[em_id] == Modality.ELECTRON_MICROSCOPY.value, em_id\n", + "for fn_id in (DATASET_FUNC, DATASET_FUNC_COREG):\n", + " assert modalities[fn_id] == Modality.CALCIUM_IMAGING.value, fn_id\n", + "print(\"\\nOK — 5 DataSet rows for project v1dd.\")" + ] + }, + { + "cell_type": "markdown", + "id": "s2-md", + "metadata": {}, + "source": [ + "## 2. EM `DataItem`s and `DataItemDataSetAssociation`s\n", + "\n", + "Per the master decision: one EM `DataItem` per unique non-zero `pt_root_id` in `soma_and_cell_type_1196.feather`. `id = name = str(pt_root_id)`. Where multiple soma rows share a `pt_root_id`, keep the one with the largest `volume`.\n", + "\n", + "Associations:\n", + "- Every kept EM cell → `v1dd_1196_em`.\n", + "- Cells whose `pt_root_id` ∈ `proofread_axon_list_1196.npy` → also `v1dd_1196_proofread_axons` (46/1210 proofread roots are absent from the soma catalog and will be skipped — see exploration cell).\n", + "- Cells whose `pt_root_id` ∈ `proofread_dendrite_list_1196.npy` → also `v1dd_1196_proofread_dendrites` (63986/63986 present).\n", + "\n", + "**Open questions:**\n", + "1. Proofread axon roots missing from the soma catalog (46/1210) — skip silently or log + skip? (Leaning: log + skip; they're real proofread cells without a soma centroid in this release.)\n", + "2. 
`neuroglancer_link` slot — V1DD has a public neuroglancer state; if a URL template is available we should populate it. Leaving null until a template is confirmed." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "s2-load", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-17T01:07:57.355615Z", + "iopub.status.busy": "2026-06-17T01:07:57.355409Z", + "iopub.status.idle": "2026-06-17T01:07:57.686941Z", + "shell.execute_reply": "2026-06-17T01:07:57.686050Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "soma_df shape : (207455, 11)\n", + "pt_root_id unique : 163064\n", + "id unique : 207455\n", + "axon ids unique : 1210\n", + "dend ids unique : 63986\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "axon ∩ soma roots : 1164\n", + "dend ∩ soma roots : 63986\n" + ] + } + ], + "source": [ + "soma_df = pd.read_feather(DATA_ROOT / \"soma_and_cell_type_1196.feather\")\n", + "axon_ids = np.load(DATA_ROOT / \"proofread_axon_list_1196.npy\", allow_pickle=True)\n", + "dend_ids = np.load(DATA_ROOT / \"proofread_dendrite_list_1196.npy\", allow_pickle=True)\n", + "\n", + "print(\"soma_df shape :\", soma_df.shape)\n", + "print(\"pt_root_id unique :\", soma_df['pt_root_id'].nunique())\n", + "print(\"id unique :\", soma_df['id'].nunique())\n", + "print(\"axon ids unique :\", len(set(axon_ids.tolist())))\n", + "print(\"dend ids unique :\", len(set(dend_ids.tolist())))\n", + "print(\"axon ∩ soma roots :\", len(set(axon_ids.tolist()) & set(soma_df['pt_root_id'].tolist())))\n", + "print(\"dend ∩ soma roots :\", len(set(dend_ids.tolist()) & set(soma_df['pt_root_id'].tolist())))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "s2-code", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-17T01:07:57.688584Z", + "iopub.status.busy": "2026-06-17T01:07:57.688389Z", + "iopub.status.idle": "2026-06-17T01:07:57.691184Z", + "shell.execute_reply": 
"2026-06-17T01:07:57.690436Z" + } + }, + "outputs": [], + "source": [ + "# TODO: build EM DataItem rows + base associations to DATASET_EM.\n", + "# TODO: join axon/dend lists on pt_root_id to derive cohort association rows.\n", + "# TODO: write_models(...) + verify counts.\n", + "pass" + ] + }, + { + "cell_type": "markdown", + "id": "s3-md", + "metadata": {}, + "source": [ + "## 3. EM soma `CellFeatureMatrix` (`v1dd_em_soma_geometry`)\n", + "\n", + "Numeric features per EM DataItem:\n", + "- `pt_position_x`, `pt_position_y`, `pt_position_z` — voxel coordinates in the EM volume.\n", + "- `volume` — soma volume (units to confirm; likely µm³).\n", + "\n", + "Three `CellFeatureDefinition` rows (dtype ` soma DataItem id.\n", + "# TODO: write mappingset/ and celltocellmapping/ with correct predicates.\n", + "pass" + ] + }, + { + "cell_type": "markdown", + "id": "s9-md", + "metadata": {}, + "source": [ + "## 9. Synapses — `CellCellConnectivityLong`\n", + "\n", + "Source: `syn_df_all_to_proofread_to_all_1196.feather` (8.2M rows) + `syn_label_df_all_to_proofread_to_all_1196.feather` (6.7M tag rows, indexed by synapse `id`).\n", + "\n", + "Aggregate per (`pre_pt_root_id`, `post_pt_root_id`) pair into:\n", + "- `synapse_count` — number of synapses (count, dimensionless).\n", + "- `synapse_size_sum` — total `size` (voxel-count; units to confirm).\n", + "- *(optional)* `spine_synapse_count` — count of synapses tagged `spine`.\n", + "\n", + "Write to its own subdirectory per §5g: `cellcellconnectivitylong_proofread_to_proofread/` (folder name from the source feather). Pre/post cell ids are EM `DataItem` ids — i.e. `str(pt_root_id)` directly, no join needed.\n", + "\n", + "**Open question — defer decision:** *Should raw per-synapse rows get their own schema?* Today `CellCellConnectivityLong` collapses to one row per cell pair, which loses per-synapse position, size, and label information. 
Two paths:\n", + "- Keep aggregated only; ship raw rows as a parquet sidecar outside the common schema.\n", + "- Propose a new `Synapse` class (slots: id, pre_cell, post_cell, ctr_position, size, tag) — would require a schema PR.\n", + "\n", + "Leaving this **open**; the skeleton implements the aggregated form only.\n", + "\n", + "**Other open questions:**\n", + "1. What `measurement_type` enum value covers `synapse_count` and `synapse_size_sum`? Need to read `SynapticMeasurementType` enum values.\n", + "2. Unit for `synapse_size_sum` — `size` is in voxels (need confirmation); convert to nm³ or leave as voxel counts?\n", + "3. Most synapse endpoints (≈4.2M roots, of which only ~59k are in the soma catalog) have no matching EM `DataItem` — `CellCellConnectivityLong` requires both endpoints to be registered DataItems, so the un-cataloged endpoints must be dropped or we must register additional \"synapse-partner\" DataItems for them. Leaning: drop the un-cataloged side and keep only edges where both endpoints are in `v1dd_1196_em`.\n", + "4. The label feather is indexed by synapse `id` but is shorter than the main synapse table (6.7M vs 8.2M) — unlabelled synapses should be treated as `tag=null`, not implicitly `non-spine`.\n", + "5. Connectome discriminator — per §5g, the folder name scopes the example; confirm `cellcellconnectivitylong_proofread_to_proofread/` is the right convention." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "s9-code", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-17T01:07:57.982113Z", + "iopub.status.busy": "2026-06-17T01:07:57.981950Z", + "iopub.status.idle": "2026-06-17T01:07:57.984248Z", + "shell.execute_reply": "2026-06-17T01:07:57.983638Z" + } + }, + "outputs": [], + "source": [ + "# TODO: load syn_df + syn_label_df; join labels on synapse id.\n", + "# TODO: groupby (pre, post) -> synapse_count, synapse_size_sum, spine_count.\n", + "# TODO: join pre/post pt_root_id -> EM DataItem id.\n", + "# TODO: build CellCellConnectivityLong rows (one per pair per measurement_type).\n", + "# TODO: write to cellcellconnectivitylong_proofread_to_proofread/.\n", + "pass" + ] + }, + { + "cell_type": "markdown", + "id": "s10-md", + "metadata": {}, + "source": [ + "## 10. Functional cell-cell correlations — `CellCellConnectivityLong`\n", + "\n", + "Two source tables, seven stimulus conditions each:\n", + "- `cell_cell_correlations_by_stimulus_coregistered.feather` — keyed by `pre_pt_root_id` / `post_pt_root_id` (which **are** the EM DataItem ids). Pairs **do** repeat (multiple ROIs per EM cell, ~4 %). Three options: (a) write rows directly as (EM, EM) pairs and let consumers see the duplicates, (b) average correlations within each (pre, post) pair, (c) explode back into functional DataItem ids via coreg and write at (func, func) level.\n", + "- `cell_cell_correlations_by_stimulus.feather` — keyed by `(volume, column, plane, roi)` × 2. Tuples are unique. Maps cleanly to functional DataItem ids.\n", + "\n", + "Skeleton plan: one folder per (table, stimulus), e.g. `cellcellconnectivitylong_func_corr_drifting_gratings_full/`, `cellcellconnectivitylong_func_corr_coreg_drifting_gratings_full/`. 7 stimuli × 2 tables = 14 folders.\n", + "\n", + "Verified earlier: in the coregistered table, 148728 rows reduce to 142410 unique (pre_root, post_root) pairs — i.e. ~4% of rows share a pair with another row. 
12 self-pairs exist. Pre-set == post-set (551 cells, fully symmetric).\n", + "\n", + "**Open questions:**\n", + "1. **Which key for the coregistered table?** With EM ids = pt_root_id, options (a)/(b)/(c) above are all on the table. (a) is the most direct; (b) loses ROI-level information; (c) requires picking which of the multiple coreg ROIs gets the correlation when collapsing pre side and same on post.\n", + "2. **Symmetry** — Pearson correlation is symmetric (corr(a,b) == corr(b,a)). The table appears to include both directions (8.8M rows ≈ N*(N-1) not N*(N-1)/2). Should we deduplicate, or keep as-is for ease of querying? `CellCellConnectivityLong` doesn't enforce direction.\n", + "3. **Self-pairs** — drop the 12 self-pair rows in the coregistered table?\n", + "4. **Measurement type / unit** — need a `SynapticMeasurementType` enum value for \"Pearson correlation\". If none exists, propose `pearson_correlation`?\n", + "5. **Scale** — 7 stimuli × 8.8M = 62M rows for the all-ROI table. Write all of it, or threshold (|r| > 0.1) and keep a sparse view? Storage cost vs query utility tradeoff.\n", + "6. Per §5g, do we need a per-stimulus folder, or can we use one folder with `measurement_type` as the discriminator? 
(Schema has only one `measurement_type` enum per row, so per-stimulus folders are the natural fit unless the enum has a `pearson_correlation_` variant — unlikely.)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "s10-explore", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-17T01:07:57.985687Z", + "iopub.status.busy": "2026-06-17T01:07:57.985523Z", + "iopub.status.idle": "2026-06-17T01:08:11.820420Z", + "shell.execute_reply": "2026-06-17T01:08:11.819635Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== coregistered table ==\n", + "rows : 148728\n", + "unique pre roots : 551\n", + "unique post roots : 551\n", + "unique (pre,post) : 142410\n", + "self pairs : 12\n", + "pre set == post set : True\n", + "\n", + "== all-ROI table ==\n", + "rows : 8846260\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "unique tuples : 8846260\n", + "self pairs (same vol,col,pln,roi): 0\n" + ] + } + ], + "source": [ + "corr_df = pd.read_feather(DATA_ROOT / \"cell_cell_correlations_by_stimulus.feather\")\n", + "corr_co_df = pd.read_feather(DATA_ROOT / \"cell_cell_correlations_by_stimulus_coregistered.feather\")\n", + "\n", + "print(\"== coregistered table ==\")\n", + "print(\"rows :\", len(corr_co_df))\n", + "print(\"unique pre roots :\", corr_co_df['pre_pt_root_id'].nunique())\n", + "print(\"unique post roots :\", corr_co_df['post_pt_root_id'].nunique())\n", + "print(\"unique (pre,post) :\", corr_co_df.drop_duplicates(['pre_pt_root_id','post_pt_root_id']).shape[0])\n", + "print(\"self pairs :\", (corr_co_df['pre_pt_root_id']==corr_co_df['post_pt_root_id']).sum())\n", + "print(\"pre set == post set :\", set(corr_co_df['pre_pt_root_id'].unique()) == set(corr_co_df['post_pt_root_id'].unique()))\n", + "print()\n", + "key = ['pre_roi','post_roi','pre_plane','post_plane','column','volume']\n", + "print(\"== all-ROI table ==\")\n", + "print(\"rows :\", len(corr_df))\n", + 
"print(\"unique tuples :\", corr_df.drop_duplicates(key).shape[0])\n", + "print(\"self pairs (same vol,col,pln,roi):\",\n", + " ((corr_df['pre_roi']==corr_df['post_roi']) & (corr_df['pre_plane']==corr_df['post_plane'])).sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "s10-code", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-17T01:08:11.822465Z", + "iopub.status.busy": "2026-06-17T01:08:11.822184Z", + "iopub.status.idle": "2026-06-17T01:08:11.824801Z", + "shell.execute_reply": "2026-06-17T01:08:11.824101Z" + } + }, + "outputs": [], + "source": [ + "# TODO: pivot each table -> long form (one row per pair per stimulus).\n", + "# TODO: resolve open questions above (key choice, dedup, threshold).\n", + "# TODO: write 14 folders cellcellconnectivitylong_func_corr_{coreg_,}/.\n", + "pass" + ] + }, + { + "cell_type": "markdown", + "id": "summary-md", + "metadata": {}, + "source": [ + "## Summary (skeleton)\n", + "\n", + "| Output path | Class | Rows | Status |\n", + "|---|---|---|---|\n", + "| `dataset/` | `DataSet` × 5 | 5 | skeleton |\n", + "| `dataitem/` | `DataItem` | ~207k EM + ~4.5k func | skeleton |\n", + "| `dataitem_dataset_association/` | `DataItemDataSetAssociation` | ~213k base + cohort rows | skeleton |\n", + "| `cellfeaturedefinition/`, `cellfeatureset/`, `cellfeaturematrix/`, `cellfeatures/v1dd_em_soma_geometry/` | EM soma geometry | 4 defs, 1 set, 1 matrix, 207k cell rows | skeleton |\n", + "| `singlecellreconstruction/` | `SingleCellReconstruction` | ≤207k (drops NaN trform) | skeleton |\n", + "| `cluster/`, `clusterhierarchy/`, `algorithmrun/`, `clustermembership/` | V1DD taxonomy | 15, 1, 1, parent-propagated | skeleton |\n", + "| `cellfeatures/v1dd_func_qc/`, `cellfeatures/v1dd_func_imaging_position/` | Functional features | 1 + 4 defs, 2 sets, 2 matrices | skeleton |\n", + "| `mappingset/`, `celltocellmapping/` | EM↔func coregistration | 1 set, 571 rows | skeleton |\n", + "| 
`cellcellconnectivitylong_proofread_to_proofread/` | Synapse aggregation | ~N pairs × M measurement types | skeleton |\n", + "| `cellcellconnectivitylong_func_corr_<stim>/` × 7 | All-ROI correlations | ~8.8M per stim | skeleton |\n", + "| `cellcellconnectivitylong_func_corr_coreg_<stim>/` × 7 | Coreg correlations | ~149k per stim | skeleton |\n", + "\n", + "**Cross-section open questions (need answers before we wire the writes):**\n", + "- pt_root_id → soma_id join policy (multi-match, missing) — §2, §8, §9.\n", + "- Modality enum value for calcium imaging — §1.\n", + "- Reference space + units for `pt_position_trform_*` — §3, §4.\n", + "- Per-synapse schema vs aggregation-only — §9.\n", + "- Correlation key (EM cell vs functional ROI) and symmetry handling — §10." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/code/etl_visp_exc_patchseq_01_dataset_dataitem.ipynb b/code/etl_visp_exc_patchseq_01_dataset_dataitem.ipynb index 823a840..849b83b 100644 --- a/code/etl_visp_exc_patchseq_01_dataset_dataitem.ipynb +++ b/code/etl_visp_exc_patchseq_01_dataset_dataitem.ipynb @@ -12,53 +12,34 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:04.757520Z", - "iopub.status.busy": "2026-04-30T23:48:04.757339Z", - "iopub.status.idle": "2026-04-30T23:48:05.726578Z", - "shell.execute_reply": "2026-04-30T23:48:05.725798Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import polars as pl\n", "import pyarrow as pa\n", - "from deltalake import write_deltalake\n", "\n", - "from 
connects_common_connectivity.arrow_utils import (\n", - " build_arrow_schema,\n", - " models_to_table,\n", - " attach_linkml_metadata,\n", - ")\n", "from connects_common_connectivity.models import (\n", " DataSet,\n", " DataItem,\n", " DataItemDataSetAssociation,\n", " Modality,\n", ")\n", - "from connects_common_connectivity.write_utils import append_new_dataitems" + "from connects_common_connectivity.config import output_root\n", + "from connects_common_connectivity.io import write_models\n" ] }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:05.728644Z", - "iopub.status.busy": "2026-04-30T23:48:05.728254Z", - "iopub.status.idle": "2026-04-30T23:48:05.731888Z", - "shell.execute_reply": "2026-04-30T23:48:05.731306Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INPUT_CSV : /data/visp-features-and-mapping/inferred_met_types.csv\n", - "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v1/\n", + "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v2/\n", "PROJECT_ID : visp_patchseq\n", "DATASET_ID : visp_exc_patchseq\n" ] @@ -66,7 +47,7 @@ ], "source": [ "INPUT_CSV = \"/data/visp-features-and-mapping/inferred_met_types.csv\"\n", - "OUTPUT_ROOT = \"../scratch/em_patchseq_wnm_v1/\"\n", + "OUTPUT_ROOT = output_root()\n", "PROJECT_ID = \"visp_patchseq\"\n", "DATASET_ID = \"visp_exc_patchseq\"\n", "\n", @@ -86,14 +67,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:05.767988Z", - "iopub.status.busy": "2026-04-30T23:48:05.767676Z", - "iopub.status.idle": "2026-04-30T23:48:05.784996Z", - "shell.execute_reply": "2026-04-30T23:48:05.784401Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -180,20 +154,13 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:05.786870Z", - "iopub.status.busy": 
"2026-04-30T23:48:05.786601Z", - "iopub.status.idle": "2026-04-30T23:48:05.874844Z", - "shell.execute_reply": "2026-04-30T23:48:05.874120Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DataSet written: (1, 5)\n" + "DataSet written: 1 rows\n" ] } ], @@ -205,35 +172,14 @@ " modality=Modality.MORPHOLOGY.value,\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_ds = build_arrow_schema(DataSet)\n", - "table_ds = models_to_table([dataset], schema=schema_ds)\n", - "table_ds = attach_linkml_metadata(table_ds, linkml_class=\"DataSet\")\n", - "\n", - "# mode='overwrite' makes re-runs idempotent instead of appending duplicates.\n", - "# predicate scopes the overwrite to this project only — other projects' rows\n", - "# in the shared Delta table are left untouched.\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"dataset/\",\n", - " table_ds,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"DataSet written:\", table_ds.shape)" + "result = write_models([dataset], output_root=OUTPUT_ROOT)\n", + "print(f\"DataSet written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:05.876591Z", - "iopub.status.busy": "2026-04-30T23:48:05.876391Z", - "iopub.status.idle": "2026-04-30T23:48:05.903970Z", - "shell.execute_reply": "2026-04-30T23:48:05.903193Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -256,12 +202,12 @@ "# Verification\n", "ds_verify = (\n", " pl.read_delta(OUTPUT_ROOT + \"dataset/\")\n", - " .filter(pl.col(\"project_id\") == PROJECT_ID)\n", + " .filter((pl.col(\"project_id\") == PROJECT_ID) & (pl.col(\"id\") == DATASET_ID))\n", " .filter(pl.col(\"id\") == DATASET_ID)\n", ")\n", "print(ds_verify.shape)\n", "print(ds_verify.head())\n", - "assert ds_verify.shape[0] == 1, f\"Expected 1 DataSet row, got 
{ds_verify.shape[0]}\"\n", + "assert ds_verify.shape[0] == 1, f\"Expected 1 DataSet row for {DATASET_ID}, got {ds_verify.shape[0]}\"\n", "assert ds_verify[\"id\"][0] == DATASET_ID, \"DataSet id mismatch\"" ] }, @@ -275,14 +221,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:05.905653Z", - "iopub.status.busy": "2026-04-30T23:48:05.905463Z", - "iopub.status.idle": "2026-04-30T23:48:05.956577Z", - "shell.execute_reply": "2026-04-30T23:48:05.955878Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -299,29 +238,14 @@ " DataItem(id=cid, name=cid, project_id=PROJECT_ID)\n", " for cid in cell_ids\n", "]\n", - "\n", - "schema_di = build_arrow_schema(DataItem)\n", - "table_di = models_to_table(dataitems, schema=schema_di)\n", - "table_di = attach_linkml_metadata(table_di, linkml_class=\"DataItem\")\n", - "\n", - "# append_new_dataitems checks which ids already exist for this project and appends\n", - "# only new rows — safe when multiple _01 notebooks share a project_id, since\n", - "# each dataset's cells are registered without wiping the other's rows.\n", - "n_appended = append_new_dataitems(OUTPUT_ROOT + \"dataitem/\", table_di, project_id=PROJECT_ID)\n", + "n_appended = write_models(dataitems, output_root=OUTPUT_ROOT).rows_written\n", "print(f\"DataItem rows appended: {n_appended} (total in batch: {len(cell_ids)})\")" ] }, { "cell_type": "code", "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:05.958361Z", - "iopub.status.busy": "2026-04-30T23:48:05.958078Z", - "iopub.status.idle": "2026-04-30T23:48:05.988036Z", - "shell.execute_reply": "2026-04-30T23:48:05.987297Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -334,11 +258,11 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ str ┆ str │\n", "╞═══════════╪═══════════╪═══════════════════╪═══════════════╡\n", - "│ 908902400 ┆ 908902400 ┆ null ┆ visp_patchseq │\n", - 
"│ 965091329 ┆ 965091329 ┆ null ┆ visp_patchseq │\n", - "│ 978149378 ┆ 978149378 ┆ null ┆ visp_patchseq │\n", - "│ 834891776 ┆ 834891776 ┆ null ┆ visp_patchseq │\n", - "│ 897003522 ┆ 897003522 ┆ null ┆ visp_patchseq │\n", + "│ 601790961 ┆ 601790961 ┆ null ┆ visp_patchseq │\n", + "│ 602535278 ┆ 602535278 ┆ null ┆ visp_patchseq │\n", + "│ 604646725 ┆ 604646725 ┆ null ┆ visp_patchseq │\n", + "│ 623326230 ┆ 623326230 ┆ null ┆ visp_patchseq │\n", + "│ 623434306 ┆ 623434306 ┆ null ┆ visp_patchseq │\n", "└───────────┴───────────┴───────────────────┴───────────────┘\n" ] } @@ -366,20 +290,13 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:05.989687Z", - "iopub.status.busy": "2026-04-30T23:48:05.989496Z", - "iopub.status.idle": "2026-04-30T23:48:06.099487Z", - "shell.execute_reply": "2026-04-30T23:48:06.098701Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DataItemDataSetAssociation written: (1528, 3)\n" + "DataItemDataSetAssociation written: 1528 rows\n" ] } ], @@ -392,35 +309,14 @@ " )\n", " for cid in cell_ids\n", "]\n", - "\n", - "schema_assoc = build_arrow_schema(DataItemDataSetAssociation)\n", - "table_assoc = models_to_table(associations, schema=schema_assoc)\n", - "table_assoc = attach_linkml_metadata(table_assoc, linkml_class=\"DataItemDataSetAssociation\")\n", - "\n", - "# mode='overwrite' makes re-runs idempotent instead of appending duplicates.\n", - "# predicate scopes the overwrite to this project only — other projects' rows\n", - "# in the shared Delta table are left untouched.\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"dataitem_dataset_association/\",\n", - " table_assoc,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND dataset_id = '{DATASET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"DataItemDataSetAssociation written:\", table_assoc.shape)" + "result = 
write_models(associations, output_root=OUTPUT_ROOT)\n", + "print(f\"DataItemDataSetAssociation written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:06.101171Z", - "iopub.status.busy": "2026-04-30T23:48:06.100968Z", - "iopub.status.idle": "2026-04-30T23:48:06.126639Z", - "shell.execute_reply": "2026-04-30T23:48:06.125980Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", diff --git a/code/etl_visp_exc_patchseq_02_cell_features.ipynb b/code/etl_visp_exc_patchseq_02_cell_features.ipynb index 7a622b0..1d82194 100644 --- a/code/etl_visp_exc_patchseq_02_cell_features.ipynb +++ b/code/etl_visp_exc_patchseq_02_cell_features.ipynb @@ -12,14 +12,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:09.239172Z", - "iopub.status.busy": "2026-04-30T23:48:09.238988Z", - "iopub.status.idle": "2026-04-30T23:48:10.290552Z", - "shell.execute_reply": "2026-04-30T23:48:10.289788Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import os\n", @@ -31,37 +24,27 @@ "import pyarrow as pa\n", "from deltalake import write_deltalake\n", "\n", - "from connects_common_connectivity.arrow_utils import (\n", - " attach_linkml_metadata,\n", - " build_arrow_schema,\n", - " build_cell_feature_matrix_schema,\n", - " models_to_table,\n", - ")\n", "from connects_common_connectivity.models import (\n", " CellFeatureDefinition,\n", " CellFeatureMatrix,\n", " CellFeatureSet,\n", " Unit,\n", - ")" + ")\n", + "from connects_common_connectivity.io.arrow_utils import build_cell_feature_matrix_schema\n", + "from connects_common_connectivity.config import output_root\n", + "from connects_common_connectivity.io import write_models\n" ] }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:10.292861Z", - "iopub.status.busy": 
"2026-04-30T23:48:10.292520Z", - "iopub.status.idle": "2026-04-30T23:48:10.296435Z", - "shell.execute_reply": "2026-04-30T23:48:10.295830Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v1/\n", + "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v2/\n", "PROJECT_ID : visp_patchseq\n", "DATASET_ID : visp_exc_patchseq\n", "FEATURE_SET_ID : exc_visp_morph_features\n" @@ -71,7 +54,7 @@ "source": [ "DEFS_CSV = \"/data/visp-features-and-mapping/exc_visp_patchseq_morph_feature_definitions.csv\"\n", "WIDE_CSV = \"/data/visp-features-and-mapping/morph_features_mMET_exc_wide_unnormalized.csv\"\n", - "OUTPUT_ROOT = \"../scratch/em_patchseq_wnm_v1/\"\n", + "OUTPUT_ROOT = output_root()\n", "PROJECT_ID = \"visp_patchseq\"\n", "DATASET_ID = \"visp_exc_patchseq\"\n", "FEATURE_SET_ID = \"exc_visp_morph_features\"\n", @@ -92,14 +75,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:10.333830Z", - "iopub.status.busy": "2026-04-30T23:48:10.333493Z", - "iopub.status.idle": "2026-04-30T23:48:10.381149Z", - "shell.execute_reply": "2026-04-30T23:48:10.380385Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -143,14 +119,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:10.382962Z", - "iopub.status.busy": "2026-04-30T23:48:10.382762Z", - "iopub.status.idle": "2026-04-30T23:48:10.397104Z", - "shell.execute_reply": "2026-04-30T23:48:10.396352Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -258,20 +227,13 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:10.398859Z", - "iopub.status.busy": "2026-04-30T23:48:10.398669Z", - "iopub.status.idle": "2026-04-30T23:48:10.499404Z", - "shell.execute_reply": "2026-04-30T23:48:10.498654Z" - } - }, + "metadata": {}, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellFeatureDefinition written: (50, 8)\n" + "CellFeatureDefinition written: 50 rows\n" ] } ], @@ -291,31 +253,14 @@ " if pd.notna(row[\"range_max\"]):\n", " kwargs[\"range_max\"] = float(row[\"range_max\"])\n", " feature_defs.append(CellFeatureDefinition(**kwargs))\n", - "\n", - "schema_cfd = build_arrow_schema(CellFeatureDefinition)\n", - "table_cfd = attach_linkml_metadata(\n", - " models_to_table(feature_defs, schema=schema_cfd), linkml_class=\"CellFeatureDefinition\"\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"cellfeaturedefinition/\", table_cfd,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND feature_set_id = '{FEATURE_SET_ID}'\",\n", - " partition_by=[\"project_id\", \"feature_set_id\"],\n", - ")\n", - "print(\"CellFeatureDefinition written:\", table_cfd.shape)" + "result = write_models(feature_defs, output_root=OUTPUT_ROOT)\n", + "print(f\"CellFeatureDefinition written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:10.501253Z", - "iopub.status.busy": "2026-04-30T23:48:10.501047Z", - "iopub.status.idle": "2026-04-30T23:48:10.534370Z", - "shell.execute_reply": "2026-04-30T23:48:10.533551Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -364,20 +309,13 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:10.536452Z", - "iopub.status.busy": "2026-04-30T23:48:10.536158Z", - "iopub.status.idle": "2026-04-30T23:48:10.638863Z", - "shell.execute_reply": "2026-04-30T23:48:10.638182Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellFeatureSet written: (1, 5)\n" + "CellFeatureSet written: 1 rows\n" ] } ], @@ -394,31 +332,14 @@ " extraction_method=\"Computed via 
https://github.com/AllenInstitute/skeleton_keys.\",\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_cfs = build_arrow_schema(CellFeatureSet)\n", - "table_cfs = attach_linkml_metadata(\n", - " models_to_table([feature_set], schema=schema_cfs), linkml_class=\"CellFeatureSet\"\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"cellfeatureset/\", table_cfs,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND id = '{FEATURE_SET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"CellFeatureSet written:\", table_cfs.shape)" + "result = write_models([feature_set], output_root=OUTPUT_ROOT)\n", + "print(f\"CellFeatureSet written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:10.640977Z", - "iopub.status.busy": "2026-04-30T23:48:10.640772Z", - "iopub.status.idle": "2026-04-30T23:48:10.671088Z", - "shell.execute_reply": "2026-04-30T23:48:10.670239Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -455,14 +376,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:10.672890Z", - "iopub.status.busy": "2026-04-30T23:48:10.672697Z", - "iopub.status.idle": "2026-04-30T23:48:10.708439Z", - "shell.execute_reply": "2026-04-30T23:48:10.707666Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -668,14 +582,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:10.710134Z", - "iopub.status.busy": "2026-04-30T23:48:10.709940Z", - "iopub.status.idle": "2026-04-30T23:48:10.817703Z", - "shell.execute_reply": "2026-04-30T23:48:10.816905Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -701,14 +608,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:10.819464Z", 
- "iopub.status.busy": "2026-04-30T23:48:10.819253Z", - "iopub.status.idle": "2026-04-30T23:48:10.844564Z", - "shell.execute_reply": "2026-04-30T23:48:10.843762Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -758,20 +658,13 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:10.846336Z", - "iopub.status.busy": "2026-04-30T23:48:10.846143Z", - "iopub.status.idle": "2026-04-30T23:48:11.021468Z", - "shell.execute_reply": "2026-04-30T23:48:11.020647Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellFeatureMatrix written: (1, 5)\n" + "CellFeatureMatrix written: 1 rows\n" ] } ], @@ -784,31 +677,14 @@ " cell_index_column=\"id\",\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_cfm = build_arrow_schema(CellFeatureMatrix)\n", - "table_cfm = attach_linkml_metadata(\n", - " models_to_table([cfm], schema=schema_cfm), linkml_class=\"CellFeatureMatrix\"\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"cellfeaturematrix/\", table_cfm,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND feature_set_id = '{FEATURE_SET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"CellFeatureMatrix written:\", table_cfm.shape)" + "result = write_models([cfm], output_root=OUTPUT_ROOT)\n", + "print(f\"CellFeatureMatrix written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 13, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:11.023215Z", - "iopub.status.busy": "2026-04-30T23:48:11.023018Z", - "iopub.status.idle": "2026-04-30T23:48:11.067847Z", - "shell.execute_reply": "2026-04-30T23:48:11.067154Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", diff --git a/code/etl_visp_exc_patchseq_03_cluster_membership_and_mapping.ipynb b/code/etl_visp_exc_patchseq_03_cluster_membership_and_mapping.ipynb index c716a47..742bbb6 100644 
--- a/code/etl_visp_exc_patchseq_03_cluster_membership_and_mapping.ipynb +++ b/code/etl_visp_exc_patchseq_03_cluster_membership_and_mapping.ipynb @@ -22,52 +22,34 @@ "cell_type": "code", "execution_count": 1, "id": "fbe20dd9", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:57.322938Z", - "iopub.status.busy": "2026-05-01T19:07:57.322757Z", - "iopub.status.idle": "2026-05-01T19:07:58.394389Z", - "shell.execute_reply": "2026-05-01T19:07:58.393649Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import polars as pl\n", - "from deltalake import write_deltalake\n", "\n", - "from connects_common_connectivity.arrow_utils import (\n", - " attach_linkml_metadata,\n", - " build_arrow_schema,\n", - " models_to_table,\n", - ")\n", "from connects_common_connectivity.models import (\n", " CellToClusterMapping,\n", " ClusterMembership,\n", " MappingSet,\n", ")\n", - "from connects_common_connectivity.write_utils import walk_ancestors\n" + "from connects_common_connectivity.io.write_utils import walk_ancestors\n", + "from connects_common_connectivity.config import output_root\n", + "from connects_common_connectivity.io import write_models\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "f48baf63", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:58.396799Z", - "iopub.status.busy": "2026-05-01T19:07:58.396502Z", - "iopub.status.idle": "2026-05-01T19:07:58.402261Z", - "shell.execute_reply": "2026-05-01T19:07:58.401583Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INPUT_CSV : /data/visp-features-and-mapping/inferred_met_types.csv\n", - "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v1/\n", + "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v2/\n", "PROJECT_ID : visp_patchseq\n", "DATASET_ID : visp_exc_patchseq\n", "TTYPE_HIERARCHY_ID : tasic_2018_visp_taxonomy\n", @@ -79,7 +61,7 @@ ], "source": [ "INPUT_CSV = 
\"/data/visp-features-and-mapping/inferred_met_types.csv\"\n", - "OUTPUT_ROOT = \"../scratch/em_patchseq_wnm_v1/\"\n", + "OUTPUT_ROOT = output_root()\n", "\n", "PROJECT_ID = \"visp_patchseq\"\n", "DATASET_ID = \"visp_exc_patchseq\"\n", @@ -114,14 +96,7 @@ "cell_type": "code", "execution_count": 3, "id": "9ffb70f0", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:58.404227Z", - "iopub.status.busy": "2026-05-01T19:07:58.404038Z", - "iopub.status.idle": "2026-05-01T19:07:58.482759Z", - "shell.execute_reply": "2026-05-01T19:07:58.481916Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -170,14 +145,7 @@ "cell_type": "code", "execution_count": 4, "id": "363edbba", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:58.484558Z", - "iopub.status.busy": "2026-05-01T19:07:58.484364Z", - "iopub.status.idle": "2026-05-01T19:07:58.502958Z", - "shell.execute_reply": "2026-05-01T19:07:58.502267Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -262,14 +230,7 @@ "cell_type": "code", "execution_count": 5, "id": "d6fe8d5a", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:58.504930Z", - "iopub.status.busy": "2026-05-01T19:07:58.504740Z", - "iopub.status.idle": "2026-05-01T19:07:58.508607Z", - "shell.execute_reply": "2026-05-01T19:07:58.507875Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -304,14 +265,7 @@ "cell_type": "code", "execution_count": 6, "id": "1bf18692", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:58.510280Z", - "iopub.status.busy": "2026-05-01T19:07:58.510091Z", - "iopub.status.idle": "2026-05-01T19:07:58.515303Z", - "shell.execute_reply": "2026-05-01T19:07:58.514671Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -335,20 +289,13 @@ "cell_type": "code", "execution_count": 7, "id": "66a5586b", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:58.517007Z", - 
"iopub.status.busy": "2026-05-01T19:07:58.516828Z", - "iopub.status.idle": "2026-05-01T19:07:58.616792Z", - "shell.execute_reply": "2026-05-01T19:07:58.616090Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "MappingSet written: (1, 13)\n" + "MappingSet written: 1 rows\n" ] } ], @@ -368,33 +315,15 @@ " target_hierarchy=TTYPE_HIERARCHY_ID,\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_ms = build_arrow_schema(MappingSet)\n", - "table_ms = attach_linkml_metadata(\n", - " models_to_table([ttype_mapping_set], schema=schema_ms),\n", - " linkml_class=\"MappingSet\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"mappingset/\", table_ms,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND id = '{MAPPING_SET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"MappingSet written:\", table_ms.shape)\n" + "result = write_models([ttype_mapping_set], output_root=OUTPUT_ROOT)\n", + "print(f\"MappingSet written: {result.rows_written} rows\")\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "2614cb03", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:58.618769Z", - "iopub.status.busy": "2026-05-01T19:07:58.618560Z", - "iopub.status.idle": "2026-05-01T19:07:58.648431Z", - "shell.execute_reply": "2026-05-01T19:07:58.647683Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -434,27 +363,14 @@ "cell_type": "code", "execution_count": 9, "id": "7a0ca37b", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:58.650259Z", - "iopub.status.busy": "2026-05-01T19:07:58.650024Z", - "iopub.status.idle": "2026-05-01T19:07:58.871300Z", - "shell.execute_reply": "2026-05-01T19:07:58.870510Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellToClusterMapping rows built: 6112\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"CellToClusterMapping written: (6112, 8)\n" + "CellToClusterMapping rows built: 6112\n", + "CellToClusterMapping written: 6112 rows\n" ] } ], @@ -474,33 +390,15 @@ " project_id=PROJECT_ID,\n", " ))\n", "print(f\"CellToClusterMapping rows built: {len(ttype_mappings)}\")\n", - "\n", - "schema_ccm = build_arrow_schema(CellToClusterMapping)\n", - "table_ccm = attach_linkml_metadata(\n", - " models_to_table(ttype_mappings, schema=schema_ccm),\n", - " linkml_class=\"CellToClusterMapping\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"celltoclustermapping/\", table_ccm,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND mapping_set = '{MAPPING_SET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"CellToClusterMapping written:\", table_ccm.shape)\n" + "result = write_models(ttype_mappings, output_root=OUTPUT_ROOT)\n", + "print(f\"CellToClusterMapping written: {result.rows_written} rows\")\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "1e3146fb", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:58.873195Z", - "iopub.status.busy": "2026-05-01T19:07:58.872979Z", - "iopub.status.idle": "2026-05-01T19:07:58.907259Z", - "shell.execute_reply": "2026-05-01T19:07:58.906462Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -562,14 +460,7 @@ "cell_type": "code", "execution_count": 11, "id": "435ca181", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:58.909071Z", - "iopub.status.busy": "2026-05-01T19:07:58.908872Z", - "iopub.status.idle": "2026-05-01T19:07:58.914624Z", - "shell.execute_reply": "2026-05-01T19:07:58.913982Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -593,31 +484,15 @@ "cell_type": "code", "execution_count": 12, "id": "65ce039a", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:58.916379Z", - "iopub.status.busy": "2026-05-01T19:07:58.916200Z", - "iopub.status.idle": 
"2026-05-01T19:07:59.161676Z", - "shell.execute_reply": "2026-05-01T19:07:59.160879Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "New ClusterMembership rows built: 1152\n", - "Existing rows under predicate: 2637\n", - " rows owned by other notebooks (kept): 1485\n", - " rows owned by this notebook (dropped, will rewrite): 1152\n", - "Total ClusterMembership rows to write: 2637\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ClusterMembership written: (2637, 7)\n" + "Existing rows under predicate: 1152; kept (other notebooks): 0; new: 1152\n", + "ClusterMembership written: 1152 rows\n" ] } ], @@ -635,72 +510,56 @@ " ))\n", "print(f\"New ClusterMembership rows built: {len(memberships)}\")\n", "\n", - "# Merge step: read existing rows under the predicate, drop rows we own, union with new.\n", "our_cell_ids = set(met_df.index.tolist())\n", - "existing_cm = (\n", - " pl.read_delta(OUTPUT_ROOT + \"clustermembership/\")\n", - " .filter((pl.col(\"project_id\") == PROJECT_ID) & (pl.col(\"hierarchy_id\") == METTYPE_HIERARCHY_ID))\n", - ")\n", - "print(f\"Existing rows under predicate: {existing_cm.shape[0]}\")\n", "\n", - "other_cm = existing_cm.filter(~pl.col(\"item\").is_in(list(our_cell_ids)))\n", - "print(f\" rows owned by other notebooks (kept): {other_cm.shape[0]}\")\n", - "print(f\" rows owned by this notebook (dropped, will rewrite): \"\n", - " f\"{existing_cm.shape[0] - other_cm.shape[0]}\")\n", - "\n", - "# Re-validate kept rows through the Pydantic model (integrity check + uniform schema).\n", + "# Merge-then-overwrite: ClusterMembership is overwrite_scoped on\n", + "# (project_id, hierarchy_id), so a plain overwrite here would clobber rows\n", + "# written under the same predicate by sibling notebooks (e.g.\n", + "# etl_visp_inh_patchseq_03's 495 GABAergic-MET cells). 
Read existing rows,\n", + "# keep the ones this notebook does not own (item NOT IN our_cell_ids), and\n", + "# union them with the new rows before re-writing the full scope.\n", + "try:\n", + " existing_cm = (\n", + " pl.read_delta(OUTPUT_ROOT + \"clustermembership/\")\n", + " .filter((pl.col(\"project_id\") == PROJECT_ID) & (pl.col(\"hierarchy_id\") == METTYPE_HIERARCHY_ID))\n", + " )\n", + "except Exception:\n", + " existing_cm = pl.DataFrame(schema={\"item\": pl.Utf8})\n", + "other_cm = existing_cm.filter(~pl.col(\"item\").is_in(list(our_cell_ids))) if existing_cm.shape[0] else existing_cm\n", "other_memberships = [ClusterMembership(**row) for row in other_cm.to_dicts()]\n", "all_memberships = other_memberships + memberships\n", - "print(f\"Total ClusterMembership rows to write: {len(all_memberships)}\")\n", - "\n", - "schema_cm = build_arrow_schema(ClusterMembership)\n", - "table_cm = attach_linkml_metadata(\n", - " models_to_table(all_memberships, schema=schema_cm),\n", - " linkml_class=\"ClusterMembership\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"clustermembership/\", table_cm,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND hierarchy_id = '{METTYPE_HIERARCHY_ID}'\",\n", - " partition_by=[\"project_id\", \"hierarchy_id\"],\n", - ")\n", - "print(\"ClusterMembership written:\", table_cm.shape)\n" + "print(f\"Existing rows under predicate: {existing_cm.shape[0]}; kept (other notebooks): {other_cm.shape[0]}; new: {len(memberships)}\")\n", + "result = write_models(all_memberships, output_root=OUTPUT_ROOT)\n", + "print(f\"ClusterMembership written: {result.rows_written} rows\")\n" ] }, { "cell_type": "code", "execution_count": 13, "id": "6c93328b", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:59.163552Z", - "iopub.status.busy": "2026-05-01T19:07:59.163345Z", - "iopub.status.idle": "2026-05-01T19:07:59.187839Z", - "shell.execute_reply": "2026-05-01T19:07:59.187100Z" - } - }, + 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(2637, 7)\n", + "(1152, 7)\n", "shape: (3, 7)\n", - "┌───────────┬───────────┬────────────────┬─────────────┬──────────┬───────────────┬────────────────┐\n", - "│ item ┆ cluster ┆ membership_sco ┆ probability ┆ distance ┆ project_id ┆ hierarchy_id │\n", - "│ --- ┆ --- ┆ re ┆ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ --- ┆ f64 ┆ f64 ┆ str ┆ str │\n", - "│ ┆ ┆ f64 ┆ ┆ ┆ ┆ │\n", - "╞═══════════╪═══════════╪════════════════╪═════════════╪══════════╪═══════════════╪════════════════╡\n", - "│ 601506507 ┆ Vip-MET-2 ┆ null ┆ null ┆ null ┆ visp_patchseq ┆ visp_met_types │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ _taxonomy │\n", - "│ 601506507 ┆ GABAergic ┆ null ┆ null ┆ null ┆ visp_patchseq ┆ visp_met_types │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ _taxonomy │\n", - "│ 601506507 ┆ cell ┆ null ┆ null ┆ null ┆ visp_patchseq ┆ visp_met_types │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ _taxonomy │\n", - "└───────────┴───────────┴────────────────┴─────────────┴──────────┴───────────────┴────────────────┘\n", + "┌────────────┬───────────────┬──────────────┬─────────────┬──────────┬──────────────┬──────────────┐\n", + "│ item ┆ cluster ┆ membership_s ┆ probability ┆ distance ┆ project_id ┆ hierarchy_id │\n", + "│ --- ┆ --- ┆ core ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ --- ┆ f64 ┆ f64 ┆ str ┆ str │\n", + "│ ┆ ┆ f64 ┆ ┆ ┆ ┆ │\n", + "╞════════════╪═══════════════╪══════════════╪═════════════╪══════════╪══════════════╪══════════════╡\n", + "│ 1039273993 ┆ L6b ┆ null ┆ null ┆ null ┆ visp_patchse ┆ visp_met_typ │\n", + "│ ┆ ┆ ┆ ┆ ┆ q ┆ es_taxonomy │\n", + "│ 1039273993 ┆ Glutamatergic ┆ null ┆ null ┆ null ┆ visp_patchse ┆ visp_met_typ │\n", + "│ ┆ ┆ ┆ ┆ ┆ q ┆ es_taxonomy │\n", + "│ 1039273993 ┆ cell ┆ null ┆ null ┆ null ┆ visp_patchse ┆ visp_met_typ │\n", + "│ ┆ ┆ ┆ ┆ ┆ q ┆ es_taxonomy │\n", + "└────────────┴───────────────┴──────────────┴─────────────┴──────────┴──────────────┴──────────────┘\n", "Our cells present: 384 / 384\n", - "Other-notebook 
rows preserved: 1485\n" + "Other-notebook rows preserved: 0\n" ] } ], @@ -750,14 +609,7 @@ "cell_type": "code", "execution_count": 14, "id": "a9e13592", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:59.189559Z", - "iopub.status.busy": "2026-05-01T19:07:59.189366Z", - "iopub.status.idle": "2026-05-01T19:07:59.197123Z", - "shell.execute_reply": "2026-05-01T19:07:59.196470Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -791,20 +643,13 @@ "cell_type": "code", "execution_count": 15, "id": "74b1a5ca", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:59.198654Z", - "iopub.status.busy": "2026-05-01T19:07:59.198475Z", - "iopub.status.idle": "2026-05-01T19:07:59.290568Z", - "shell.execute_reply": "2026-05-01T19:07:59.289816Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "MappingSet (inferred) written: (1, 13)\n" + "MappingSet written: 1 rows\n" ] } ], @@ -826,33 +671,15 @@ " target_hierarchy=METTYPE_HIERARCHY_ID,\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_ms_inf = build_arrow_schema(MappingSet)\n", - "table_ms_inf = attach_linkml_metadata(\n", - " models_to_table([inferred_mapping_set], schema=schema_ms_inf),\n", - " linkml_class=\"MappingSet\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"mappingset/\", table_ms_inf,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND id = '{MAPPING_SET_INFERRED_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"MappingSet (inferred) written:\", table_ms_inf.shape)\n" + "result = write_models([inferred_mapping_set], output_root=OUTPUT_ROOT)\n", + "print(f\"MappingSet written: {result.rows_written} rows\")\n" ] }, { "cell_type": "code", "execution_count": 16, "id": "cd687756", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:59.292224Z", - "iopub.status.busy": "2026-05-01T19:07:59.292034Z", - "iopub.status.idle": 
"2026-05-01T19:07:59.347344Z", - "shell.execute_reply": "2026-05-01T19:07:59.346573Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -898,27 +725,14 @@ "cell_type": "code", "execution_count": 17, "id": "dcdef943", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:59.349093Z", - "iopub.status.busy": "2026-05-01T19:07:59.348810Z", - "iopub.status.idle": "2026-05-01T19:07:59.497900Z", - "shell.execute_reply": "2026-05-01T19:07:59.497182Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellToClusterMapping (inferred) rows built: 3159\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CellToClusterMapping (inferred) written: (3159, 8)\n" + "CellToClusterMapping (inferred) rows built: 3159\n", + "CellToClusterMapping written: 3159 rows\n" ] } ], @@ -935,33 +749,15 @@ " project_id=PROJECT_ID,\n", " ))\n", "print(f\"CellToClusterMapping (inferred) rows built: {len(inferred_mappings)}\")\n", - "\n", - "schema_ccm_inf = build_arrow_schema(CellToClusterMapping)\n", - "table_ccm_inf = attach_linkml_metadata(\n", - " models_to_table(inferred_mappings, schema=schema_ccm_inf),\n", - " linkml_class=\"CellToClusterMapping\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"celltoclustermapping/\", table_ccm_inf,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND mapping_set = '{MAPPING_SET_INFERRED_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"CellToClusterMapping (inferred) written:\", table_ccm_inf.shape)\n" + "result = write_models(inferred_mappings, output_root=OUTPUT_ROOT)\n", + "print(f\"CellToClusterMapping written: {result.rows_written} rows\")\n" ] }, { "cell_type": "code", "execution_count": 18, "id": "2406a7b7", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T19:07:59.499786Z", - "iopub.status.busy": "2026-05-01T19:07:59.499491Z", - "iopub.status.idle": 
"2026-05-01T19:07:59.573613Z", - "shell.execute_reply": "2026-05-01T19:07:59.572844Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", diff --git a/code/etl_visp_inh_patchseq_01_dataset_dataitem.ipynb b/code/etl_visp_inh_patchseq_01_dataset_dataitem.ipynb index 1ada0b3..0e8db3f 100644 --- a/code/etl_visp_inh_patchseq_01_dataset_dataitem.ipynb +++ b/code/etl_visp_inh_patchseq_01_dataset_dataitem.ipynb @@ -12,53 +12,34 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:56.180164Z", - "iopub.status.busy": "2026-04-30T23:47:56.179975Z", - "iopub.status.idle": "2026-04-30T23:47:57.107995Z", - "shell.execute_reply": "2026-04-30T23:47:57.107250Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import polars as pl\n", "import pyarrow as pa\n", - "from deltalake import write_deltalake\n", "\n", - "from connects_common_connectivity.arrow_utils import (\n", - " build_arrow_schema,\n", - " models_to_table,\n", - " attach_linkml_metadata,\n", - ")\n", "from connects_common_connectivity.models import (\n", " DataSet,\n", " DataItem,\n", " DataItemDataSetAssociation,\n", " Modality,\n", ")\n", - "from connects_common_connectivity.write_utils import append_new_dataitems" + "from connects_common_connectivity.config import output_root\n", + "from connects_common_connectivity.io import write_models\n" ] }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:57.110118Z", - "iopub.status.busy": "2026-04-30T23:47:57.109835Z", - "iopub.status.idle": "2026-04-30T23:47:57.113453Z", - "shell.execute_reply": "2026-04-30T23:47:57.112829Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INPUT_CSV : /data/visp-features-and-mapping/patchseq_tx_cell_ttype_labels.csv\n", - "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v1/\n", + "OUTPUT_ROOT : 
../scratch/em_patchseq_wnm_v2/\n", "PROJECT_ID : visp_patchseq\n", "DATASET_ID : visp_inh_patchseq\n" ] @@ -66,7 +47,7 @@ ], "source": [ "INPUT_CSV = \"/data/visp-features-and-mapping/patchseq_tx_cell_ttype_labels.csv\"\n", - "OUTPUT_ROOT = \"../scratch/em_patchseq_wnm_v1/\"\n", + "OUTPUT_ROOT = output_root()\n", "PROJECT_ID = \"visp_patchseq\"\n", "DATASET_ID = \"visp_inh_patchseq\"\n", "\n", @@ -86,14 +67,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:57.149546Z", - "iopub.status.busy": "2026-04-30T23:47:57.149348Z", - "iopub.status.idle": "2026-04-30T23:47:57.166055Z", - "shell.execute_reply": "2026-04-30T23:47:57.165393Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -176,20 +150,13 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:57.167637Z", - "iopub.status.busy": "2026-04-30T23:47:57.167453Z", - "iopub.status.idle": "2026-04-30T23:47:57.254083Z", - "shell.execute_reply": "2026-04-30T23:47:57.253239Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DataSet written: (1, 5)\n" + "DataSet written: 1 rows\n" ] } ], @@ -201,35 +168,14 @@ " modality=Modality.MORPHOLOGY.value,\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_ds = build_arrow_schema(DataSet)\n", - "table_ds = models_to_table([dataset], schema=schema_ds)\n", - "table_ds = attach_linkml_metadata(table_ds, linkml_class=\"DataSet\")\n", - "\n", - "# mode='overwrite' makes re-runs idempotent instead of appending duplicates.\n", - "# predicate scopes the overwrite to this project only — other projects' rows\n", - "# in the shared Delta table are left untouched.\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"dataset/\",\n", - " table_ds,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - 
"print(\"DataSet written:\", table_ds.shape)" + "result = write_models([dataset], output_root=OUTPUT_ROOT)\n", + "print(f\"DataSet written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:57.255866Z", - "iopub.status.busy": "2026-04-30T23:47:57.255666Z", - "iopub.status.idle": "2026-04-30T23:47:57.284453Z", - "shell.execute_reply": "2026-04-30T23:47:57.283762Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -252,11 +198,11 @@ "# Verification\n", "ds_verify = (\n", " pl.read_delta(OUTPUT_ROOT + \"dataset/\")\n", - " .filter(pl.col(\"project_id\") == PROJECT_ID)\n", + " .filter((pl.col(\"project_id\") == PROJECT_ID) & (pl.col(\"id\") == DATASET_ID))\n", ")\n", "print(ds_verify.shape)\n", "print(ds_verify.head())\n", - "assert ds_verify.shape[0] == 1, f\"Expected 1 DataSet row, got {ds_verify.shape[0]}\"\n", + "assert ds_verify.shape[0] == 1, f\"Expected 1 DataSet row for {DATASET_ID}, got {ds_verify.shape[0]}\"\n", "assert ds_verify[\"id\"][0] == DATASET_ID, \"DataSet id mismatch\"" ] }, @@ -270,14 +216,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:57.286221Z", - "iopub.status.busy": "2026-04-30T23:47:57.286032Z", - "iopub.status.idle": "2026-04-30T23:47:57.362675Z", - "shell.execute_reply": "2026-04-30T23:47:57.361906Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -294,29 +233,14 @@ " DataItem(id=cid, name=cid, project_id=PROJECT_ID)\n", " for cid in cell_ids\n", "]\n", - "\n", - "schema_di = build_arrow_schema(DataItem)\n", - "table_di = models_to_table(dataitems, schema=schema_di)\n", - "table_di = attach_linkml_metadata(table_di, linkml_class=\"DataItem\")\n", - "\n", - "# append_new_dataitems checks which ids already exist for this project and appends\n", - "# only new rows — safe when multiple _01 notebooks share a project_id, since\n", - "# 
each dataset's cells are registered without wiping the other's rows.\n", - "n_appended = append_new_dataitems(OUTPUT_ROOT + \"dataitem/\", table_di, project_id=PROJECT_ID)\n", + "n_appended = write_models(dataitems, output_root=OUTPUT_ROOT).rows_written\n", "print(f\"DataItem rows appended: {n_appended} (total in batch: {len(cell_ids)})\")" ] }, { "cell_type": "code", "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:57.364409Z", - "iopub.status.busy": "2026-04-30T23:47:57.364127Z", - "iopub.status.idle": "2026-04-30T23:47:57.396288Z", - "shell.execute_reply": "2026-04-30T23:47:57.395594Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -329,11 +253,11 @@ "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ str ┆ str │\n", "╞═══════════╪═══════════╪═══════════════════╪═══════════════╡\n", - "│ 908902400 ┆ 908902400 ┆ null ┆ visp_patchseq │\n", - "│ 965091329 ┆ 965091329 ┆ null ┆ visp_patchseq │\n", - "│ 978149378 ┆ 978149378 ┆ null ┆ visp_patchseq │\n", - "│ 834891776 ┆ 834891776 ┆ null ┆ visp_patchseq │\n", - "│ 897003522 ┆ 897003522 ┆ null ┆ visp_patchseq │\n", + "│ 601790961 ┆ 601790961 ┆ null ┆ visp_patchseq │\n", + "│ 602535278 ┆ 602535278 ┆ null ┆ visp_patchseq │\n", + "│ 604646725 ┆ 604646725 ┆ null ┆ visp_patchseq │\n", + "│ 623326230 ┆ 623326230 ┆ null ┆ visp_patchseq │\n", + "│ 623434306 ┆ 623434306 ┆ null ┆ visp_patchseq │\n", "└───────────┴───────────┴───────────────────┴───────────────┘\n" ] } @@ -361,20 +285,13 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:57.398059Z", - "iopub.status.busy": "2026-04-30T23:47:57.397866Z", - "iopub.status.idle": "2026-04-30T23:47:57.530900Z", - "shell.execute_reply": "2026-04-30T23:47:57.530109Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DataItemDataSetAssociation written: (2759, 3)\n" + "DataItemDataSetAssociation written: 2759 
rows\n" ] } ], @@ -387,35 +304,14 @@ " )\n", " for cid in cell_ids\n", "]\n", - "\n", - "schema_assoc = build_arrow_schema(DataItemDataSetAssociation)\n", - "table_assoc = models_to_table(associations, schema=schema_assoc)\n", - "table_assoc = attach_linkml_metadata(table_assoc, linkml_class=\"DataItemDataSetAssociation\")\n", - "\n", - "# mode='overwrite' makes re-runs idempotent instead of appending duplicates.\n", - "# predicate scopes the overwrite to this project only — other projects' rows\n", - "# in the shared Delta table are left untouched.\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"dataitem_dataset_association/\",\n", - " table_assoc,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND dataset_id = '{DATASET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"DataItemDataSetAssociation written:\", table_assoc.shape)" + "result = write_models(associations, output_root=OUTPUT_ROOT)\n", + "print(f\"DataItemDataSetAssociation written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:47:57.532673Z", - "iopub.status.busy": "2026-04-30T23:47:57.532468Z", - "iopub.status.idle": "2026-04-30T23:47:57.562596Z", - "shell.execute_reply": "2026-04-30T23:47:57.561856Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", diff --git a/code/etl_visp_inh_patchseq_02_cell_features.ipynb b/code/etl_visp_inh_patchseq_02_cell_features.ipynb index 7d0e385..645e5c7 100644 --- a/code/etl_visp_inh_patchseq_02_cell_features.ipynb +++ b/code/etl_visp_inh_patchseq_02_cell_features.ipynb @@ -12,7 +12,14 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:50.096869Z", + "iopub.status.busy": "2026-06-23T15:20:50.096673Z", + "iopub.status.idle": "2026-06-23T15:20:51.049345Z", + "shell.execute_reply": "2026-06-23T15:20:51.048486Z" + } + 
}, "outputs": [], "source": [ "import os\n", @@ -24,11 +31,8 @@ "import pyarrow as pa\n", "from deltalake import write_deltalake\n", "\n", - "from connects_common_connectivity.arrow_utils import (\n", - " attach_linkml_metadata,\n", - " build_arrow_schema,\n", + "from connects_common_connectivity.io.arrow_utils import (\n", " build_cell_feature_matrix_schema,\n", - " models_to_table,\n", ")\n", "from connects_common_connectivity.models import (\n", " CellFeatureDefinition,\n", @@ -38,19 +42,27 @@ " DataItemDataSetAssociation,\n", " Unit,\n", ")\n", - "from connects_common_connectivity.write_utils import append_new_dataitems" + "from connects_common_connectivity.config import output_root\n", + "from connects_common_connectivity.io import write_models\n" ] }, { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:51.051494Z", + "iopub.status.busy": "2026-06-23T15:20:51.051177Z", + "iopub.status.idle": "2026-06-23T15:20:51.056074Z", + "shell.execute_reply": "2026-06-23T15:20:51.055398Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v1/\n", + "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v2/\n", "PROJECT_ID : visp_patchseq\n", "DATASET_ID : visp_inh_patchseq\n", "FEATURE_SET_ID : inh_visp_morph_features\n" @@ -60,7 +72,7 @@ "source": [ "DEFS_CSV = \"/data/visp-features-and-mapping/inh_visp_patchseq_morph_feature_definitions.csv\"\n", "WIDE_CSV = \"/data/visp-features-and-mapping/inh_ivscc_features_wide_unnormalized.csv\"\n", - "OUTPUT_ROOT = \"../scratch/em_patchseq_wnm_v1/\"\n", + "OUTPUT_ROOT = output_root()\n", "PROJECT_ID = \"visp_patchseq\"\n", "DATASET_ID = \"visp_inh_patchseq\"\n", "FEATURE_SET_ID = \"inh_visp_morph_features\"\n", @@ -81,13 +93,20 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:51.093387Z", + 
"iopub.status.busy": "2026-06-23T15:20:51.093158Z", + "iopub.status.idle": "2026-06-23T15:20:51.248230Z", + "shell.execute_reply": "2026-06-23T15:20:51.247573Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Prerequisite OK: 4287 DataItem rows for project_id='visp_patchseq'\n" + "Prerequisite OK: 4407 DataItem rows for project_id='visp_patchseq'\n" ] } ], @@ -108,22 +127,32 @@ "source": [ "## Register new cells from the wide CSV\n", "\n", - "Check which cell ids in the wide CSV are not yet in the `DataItem` table, register any new ones\n", - "via `append_new_dataitems`, and add `DataItemDataSetAssociation` rows for those new cells." + "Check which cell ids in the wide CSV are not yet in the `DataItem` table and register any\n", + "new ones via `write_models`. Then re-assert the full\n", + "`(project_id, dataset_id)` association scope as the **union** of the existing scope and the\n", + "wide-CSV ids — `DataItemDataSetAssociation` is `overwrite_scoped`, so passing only the\n", + "wide-CSV ids would clobber rows written by `_01` for the same scope." 
] }, { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:51.250117Z", + "iopub.status.busy": "2026-06-23T15:20:51.249923Z", + "iopub.status.idle": "2026-06-23T15:20:51.265020Z", + "shell.execute_reply": "2026-06-23T15:20:51.264236Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cells in wide CSV : 520\n", - "Already in DataItem : 400\n", - "New to register : 120\n" + "Already in DataItem : 520\n", + "New to register : 0\n" ] } ], @@ -142,58 +171,75 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:51.266754Z", + "iopub.status.busy": "2026-06-23T15:20:51.266462Z", + "iopub.status.idle": "2026-06-23T15:20:51.503978Z", + "shell.execute_reply": "2026-06-23T15:20:51.503186Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DataItems appended: 120\n", - "Associations appended: 120\n" + "No new cells to register — all already present.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Associations written for (visp_patchseq, visp_inh_patchseq): 2879\n" ] } ], "source": [ "if new_ids:\n", - " schema_di = build_arrow_schema(DataItem)\n", - " new_di_table = attach_linkml_metadata(\n", - " models_to_table(\n", - " [DataItem(id=cid, name=cid, project_id=PROJECT_ID) for cid in new_ids],\n", - " schema=schema_di,\n", - " ),\n", - " linkml_class=\"DataItem\",\n", - " )\n", - " # append_new_dataitems is idempotent: on re-run, new_ids will be empty (cells already registered)\n", - " # so this block will be skipped entirely.\n", - " n_di = append_new_dataitems(OUTPUT_ROOT + \"dataitem/\", new_di_table, project_id=PROJECT_ID)\n", + " n_di = write_models(\n", + " [DataItem(id=cid, name=cid, project_id=PROJECT_ID) for cid in new_ids],\n", + " output_root=OUTPUT_ROOT,\n", + " ).rows_written\n", " print(f\"DataItems 
appended: {n_di}\")\n", + "else:\n", + " print(\"No new cells to register \\u2014 all already present.\")\n", "\n", - " schema_assoc = build_arrow_schema(DataItemDataSetAssociation)\n", - " new_assoc_table = attach_linkml_metadata(\n", - " models_to_table(\n", - " [\n", - " DataItemDataSetAssociation(dataitem_id=cid, dataset_id=DATASET_ID, project_id=PROJECT_ID)\n", - " for cid in new_ids\n", - " ],\n", - " schema=schema_assoc,\n", - " ),\n", - " linkml_class=\"DataItemDataSetAssociation\",\n", - " )\n", - " # mode=\"append\" is safe here: new_ids only contains cells not yet in DataItem.\n", - " # Re-runs skip this block (new_ids is empty), so no duplicate associations accumulate.\n", - " write_deltalake(\n", - " OUTPUT_ROOT + \"dataitem_dataset_association/\", new_assoc_table,\n", - " mode=\"append\", partition_by=[\"project_id\"],\n", + "# Re-assert the full (project_id, dataset_id) association scope. The\n", + "# DataItemDataSetAssociation WriteSpec is overwrite_scoped on those two\n", + "# columns, so we must pass every id that should remain in scope — not\n", + "# just the wide-CSV ids — otherwise rows registered by earlier notebooks\n", + "# (e.g. `_01`'s ttype-CSV cells) would be clobbered. 
Union the existing\n", + "# scope with the wide-CSV ids; the write is idempotent and self-heals\n", + "# any partial prior run.\n", + "try:\n", + " existing_assoc_ids = set(\n", + " pl.read_delta(OUTPUT_ROOT + \"dataitem_dataset_association/\")\n", + " .filter((pl.col(\"project_id\") == PROJECT_ID) & (pl.col(\"dataset_id\") == DATASET_ID))\n", + " [\"dataitem_id\"].to_list()\n", " )\n", - " print(f\"Associations appended: {len(new_ids)}\")\n", - "else:\n", - " print(\"No new cells to register — all already present.\")" + "except Exception:\n", + " existing_assoc_ids = set()\n", + "full_assoc_ids = sorted(existing_assoc_ids | set(all_wide_ids))\n", + "n_assoc = write_models(\n", + " [DataItemDataSetAssociation(dataitem_id=cid, dataset_id=DATASET_ID, project_id=PROJECT_ID)\n", + " for cid in full_assoc_ids],\n", + " output_root=OUTPUT_ROOT,\n", + ").rows_written\n", + "print(f\"Associations written for ({PROJECT_ID}, {DATASET_ID}): {n_assoc}\")\n" ] }, { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:51.505735Z", + "iopub.status.busy": "2026-06-23T15:20:51.505441Z", + "iopub.status.idle": "2026-06-23T15:20:51.724583Z", + "shell.execute_reply": "2026-06-23T15:20:51.723848Z" + } + }, "outputs": [ { "name": "stdout", @@ -231,7 +277,14 @@ { "cell_type": "code", "execution_count": 7, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:51.795339Z", + "iopub.status.busy": "2026-06-23T15:20:51.795074Z", + "iopub.status.idle": "2026-06-23T15:20:51.810825Z", + "shell.execute_reply": "2026-06-23T15:20:51.810089Z" + } + }, "outputs": [ { "name": "stdout", @@ -334,13 +387,20 @@ { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:51.812587Z", + "iopub.status.busy": "2026-06-23T15:20:51.812386Z", + "iopub.status.idle": "2026-06-23T15:20:51.911130Z", + 
"shell.execute_reply": "2026-06-23T15:20:51.910362Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellFeatureDefinition written: (46, 8)\n" + "CellFeatureDefinition written: 46 rows\n" ] } ], @@ -360,24 +420,21 @@ " if pd.notna(row[\"range_max\"]):\n", " kwargs[\"range_max\"] = float(row[\"range_max\"])\n", " feature_defs.append(CellFeatureDefinition(**kwargs))\n", - "\n", - "schema_cfd = build_arrow_schema(CellFeatureDefinition)\n", - "table_cfd = attach_linkml_metadata(\n", - " models_to_table(feature_defs, schema=schema_cfd), linkml_class=\"CellFeatureDefinition\"\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"cellfeaturedefinition/\", table_cfd,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND feature_set_id = '{FEATURE_SET_ID}'\",\n", - " partition_by=[\"project_id\", \"feature_set_id\"],\n", - ")\n", - "print(\"CellFeatureDefinition written:\", table_cfd.shape)" + "result = write_models(feature_defs, output_root=OUTPUT_ROOT)\n", + "print(f\"CellFeatureDefinition written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 9, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:51.912804Z", + "iopub.status.busy": "2026-06-23T15:20:51.912596Z", + "iopub.status.idle": "2026-06-23T15:20:51.955067Z", + "shell.execute_reply": "2026-06-23T15:20:51.954279Z" + } + }, "outputs": [ { "name": "stdout", @@ -428,13 +485,20 @@ { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:51.956779Z", + "iopub.status.busy": "2026-06-23T15:20:51.956575Z", + "iopub.status.idle": "2026-06-23T15:20:52.056779Z", + "shell.execute_reply": "2026-06-23T15:20:52.056078Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellFeatureSet written: (1, 5)\n" + "CellFeatureSet written: 1 rows\n" ] } ], @@ -451,24 +515,21 @@ " 
extraction_method=\"Computed via https://github.com/AllenInstitute/skeleton_keys.\",\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_cfs = build_arrow_schema(CellFeatureSet)\n", - "table_cfs = attach_linkml_metadata(\n", - " models_to_table([feature_set], schema=schema_cfs), linkml_class=\"CellFeatureSet\"\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"cellfeatureset/\", table_cfs,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND id = '{FEATURE_SET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"CellFeatureSet written:\", table_cfs.shape)" + "result = write_models([feature_set], output_root=OUTPUT_ROOT)\n", + "print(f\"CellFeatureSet written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 11, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:52.058498Z", + "iopub.status.busy": "2026-06-23T15:20:52.058295Z", + "iopub.status.idle": "2026-06-23T15:20:52.092684Z", + "shell.execute_reply": "2026-06-23T15:20:52.091917Z" + } + }, "outputs": [ { "name": "stdout", @@ -505,7 +566,14 @@ { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:52.094443Z", + "iopub.status.busy": "2026-06-23T15:20:52.094155Z", + "iopub.status.idle": "2026-06-23T15:20:52.131184Z", + "shell.execute_reply": "2026-06-23T15:20:52.130494Z" + } + }, "outputs": [ { "name": "stdout", @@ -701,7 +769,14 @@ { "cell_type": "code", "execution_count": 13, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:52.132768Z", + "iopub.status.busy": "2026-06-23T15:20:52.132581Z", + "iopub.status.idle": "2026-06-23T15:20:52.239611Z", + "shell.execute_reply": "2026-06-23T15:20:52.238847Z" + } + }, "outputs": [ { "name": "stdout", @@ -727,7 +802,14 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "execution": { + 
"iopub.execute_input": "2026-06-23T15:20:52.241324Z", + "iopub.status.busy": "2026-06-23T15:20:52.241119Z", + "iopub.status.idle": "2026-06-23T15:20:52.260862Z", + "shell.execute_reply": "2026-06-23T15:20:52.260110Z" + } + }, "outputs": [ { "name": "stdout", @@ -777,13 +859,20 @@ { "cell_type": "code", "execution_count": 15, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:52.262523Z", + "iopub.status.busy": "2026-06-23T15:20:52.262324Z", + "iopub.status.idle": "2026-06-23T15:20:52.368326Z", + "shell.execute_reply": "2026-06-23T15:20:52.367544Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellFeatureMatrix written: (1, 5)\n" + "CellFeatureMatrix written: 1 rows\n" ] } ], @@ -796,24 +885,21 @@ " cell_index_column=\"id\",\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_cfm = build_arrow_schema(CellFeatureMatrix)\n", - "table_cfm = attach_linkml_metadata(\n", - " models_to_table([cfm], schema=schema_cfm), linkml_class=\"CellFeatureMatrix\"\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"cellfeaturematrix/\", table_cfm,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND feature_set_id = '{FEATURE_SET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"CellFeatureMatrix written:\", table_cfm.shape)" + "result = write_models([cfm], output_root=OUTPUT_ROOT)\n", + "print(f\"CellFeatureMatrix written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 16, - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-23T15:20:52.370303Z", + "iopub.status.busy": "2026-06-23T15:20:52.370029Z", + "iopub.status.idle": "2026-06-23T15:20:52.430668Z", + "shell.execute_reply": "2026-06-23T15:20:52.429928Z" + } + }, "outputs": [ { "name": "stdout", @@ -826,7 +912,7 @@ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ str ┆ str ┆ str │\n", 
"╞════════════════════╪════════════════════╪════════════════════╪═══════════════════╪═══════════════╡\n", - "│ visp_patchseq_inh_ ┆ inh_visp_morph_fea ┆ file:///results/em ┆ id ┆ visp_patchseq │\n", + "│ visp_patchseq_inh_ ┆ inh_visp_morph_fea ┆ file:///scratch/em ┆ id ┆ visp_patchseq │\n", "│ visp_morph_f… ┆ tures ┆ _patchseq_wn… ┆ ┆ │\n", "└────────────────────┴────────────────────┴────────────────────┴───────────────────┴───────────────┘\n" ] @@ -851,13 +937,13 @@ "| Output path | Class | Rows |\n", "|---|---|---|\n", "| `dataitem/` | `DataItem` | +new cells from wide CSV (≤ 520 total, 120 new on first run) |\n", - "| `dataitem_dataset_association/` | `DataItemDataSetAssociation` | +new cells from wide CSV |\n", + "| `dataitem_dataset_association/` | `DataItemDataSetAssociation` | full scope = existing ∪ wide-CSV ids (re-written every run) |\n", "| `cellfeaturedefinition/` | `CellFeatureDefinition` | 46 |\n", "| `cellfeatureset/` | `CellFeatureSet` | 1 (`inh_visp_morph_features`) |\n", "| `cellfeatures/inh_visp_morph_features/` | wide parquet | 520 cells × 46 features |\n", "| `cellfeaturematrix/` | `CellFeatureMatrix` | 1 |\n", "\n", - "`dataitem/` and `dataitem_dataset_association/` use `append_new_dataitems` / `mode=\"append\"` scoped to new cells only — re-running is idempotent and never wipes rows from `etl_visp_inh_patchseq_01`. All other writes use `mode=\"overwrite\"` with a scoped predicate." + "`dataitem/` uses `append_new_dataitems` scoped to new cells only — re-running is idempotent and never wipes rows from `etl_visp_inh_patchseq_01`. `dataitem_dataset_association/` is `overwrite_scoped` on `(project_id, dataset_id)`; this notebook re-asserts the full scope as `existing ∪ wide-CSV ids` so siblings under the same scope (e.g. `_01`'s ttype cells, `_03`'s MET cells) are preserved. All other writes use `mode=\"overwrite\"` with a scoped predicate." 
] }, { @@ -884,7 +970,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.13.13" } }, "nbformat": 4, diff --git a/code/etl_visp_inh_patchseq_03_cluster_membership_and_mapping.ipynb b/code/etl_visp_inh_patchseq_03_cluster_membership_and_mapping.ipynb index f2d89e7..cb5d7b9 100644 --- a/code/etl_visp_inh_patchseq_03_cluster_membership_and_mapping.ipynb +++ b/code/etl_visp_inh_patchseq_03_cluster_membership_and_mapping.ipynb @@ -19,37 +19,39 @@ " `visp_met_cell_assignments_text_names.csv`, column `met_type`, indexed by cell id.\n", "\n", "Both writes use **parent propagation**: one row per (cell, ancestor) pair walked from the\n", - "leaf to the root via `walk_ancestors` (in `connects_common_connectivity.write_utils`).\n", + "leaf to the root via `walk_ancestors` (in `connects_common_connectivity.io.write_utils`).\n", "`probability` is left null (no probability column in either source).\n", "\n", - "## Section 0 — register missing inhibitory dataset associations\n", + "## Section 0 — re-assert the inhibitory dataset associations\n", "\n", "The MET CSV has 495 cells. All 495 are already in `dataitem/` (registered by earlier\n", - "notebooks), but only 392 are associated with `dataset_id=\"visp_inh_patchseq\"`. The other\n", - "103 are GABAergic MET-types with no dataset association at all. Section 0 appends the\n", - "missing 103 `DataItemDataSetAssociation` rows so every MET cell has the proper inh\n", - "dataset link before membership is written. (This mirrors the pattern in\n", - "`etl_visp_inh_patchseq_02_cell_features.ipynb`.)\n", + "notebooks). `DataItemDataSetAssociation` is `overwrite_scoped` on\n", + "`(project_id, dataset_id)`, so any `write_models` call replaces *every* row in the\n", + "scope. 
Multiple notebooks contribute rows to this same scope\n", + "(`_01` writes the 2759 ttype cells, `_02` writes the wide-CSV cells, `_03` adds the\n", + "GABAergic MET cells), so a plain overwrite from any one of them clobbers the\n", + "others. Section 0 therefore reads the existing scope, **unions** it with the 495\n", + "MET CSV cells, and re-writes the full set. The result is idempotent and self-heals\n", + "any cells missed by a prior partial run.\n", "\n", "## Merge-then-overwrite for `clustermembership/`\n", "\n", - "`etl_visp_exc_patchseq_03_cluster_membership_and_mapping.ipynb` already wrote 1152\n", + "`etl_visp_exc_patchseq_03_cluster_membership_and_mapping.ipynb` writes 1152\n", "ClusterMembership rows under predicate\n", "`project_id='visp_patchseq' AND hierarchy_id='visp_met_types_taxonomy'`. This notebook\n", "writes under the **same predicate**, so a plain overwrite would clobber the exc rows.\n", "Instead, the membership write uses **merge-then-overwrite**: read existing rows, drop\n", "the rows this notebook owns (`item IN `), union with new rows, then\n", - "overwrite. This makes both notebooks idempotent and order-independent, matching the\n", - "codebase pattern used in `_02` for `dataitem_dataset_association`.\n", + "overwrite. 
This makes both notebooks idempotent and order-independent.\n", "\n", - "## Outputs (under `../scratch/em_patchseq_wnm_v1/`)\n", + "## Outputs \n", "\n", "| Path | Class | Rows added |\n", "|---|---|---|\n", - "| `dataitem_dataset_association/` | `DataItemDataSetAssociation` | +103 (first run); 0 on re-run |\n", + "| `dataitem_dataset_association/` | `DataItemDataSetAssociation` | union of existing scope ∪ 495 MET cells (full scope re-written every run) |\n", "| `mappingset/` | `MappingSet` | 1 (`visp_inh_patchseq_ttype_mapping`) |\n", "| `celltoclustermapping/` | `CellToClusterMapping` | 2759 cells × 4 ancestors = 11036 |\n", - "| `clustermembership/` | `ClusterMembership` | 495 cells × 3 ancestors = 1485 (merged with exc's 1152 → 2637 total under predicate) |\n" + "| `clustermembership/` | `ClusterMembership` | 495 cells × 3 ancestors = 1485, merged with exc's 1152 → 2637 total under predicate |\n" ] }, { @@ -58,31 +60,26 @@ "id": "18153e32", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:42.803275Z", - "iopub.status.busy": "2026-05-01T19:06:42.802990Z", - "iopub.status.idle": "2026-05-01T19:06:43.734089Z", - "shell.execute_reply": "2026-05-01T19:06:43.733251Z" + "iopub.execute_input": "2026-06-23T15:20:55.810196Z", + "iopub.status.busy": "2026-06-23T15:20:55.809925Z", + "iopub.status.idle": "2026-06-23T15:20:56.809125Z", + "shell.execute_reply": "2026-06-23T15:20:56.808287Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import polars as pl\n", - "import pyarrow as pa\n", - "from deltalake import write_deltalake\n", "\n", - "from connects_common_connectivity.arrow_utils import (\n", - " attach_linkml_metadata,\n", - " build_arrow_schema,\n", - " models_to_table,\n", - ")\n", "from connects_common_connectivity.models import (\n", " CellToClusterMapping,\n", " ClusterMembership,\n", " DataItemDataSetAssociation,\n", " MappingSet,\n", ")\n", - "from connects_common_connectivity.write_utils import walk_ancestors\n" + "from 
connects_common_connectivity.io.write_utils import walk_ancestors\n", + "from connects_common_connectivity.config import output_root\n", + "from connects_common_connectivity.io import write_models\n" ] }, { @@ -91,10 +88,10 @@ "id": "989cb0f6", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:43.736089Z", - "iopub.status.busy": "2026-05-01T19:06:43.735788Z", - "iopub.status.idle": "2026-05-01T19:06:43.741477Z", - "shell.execute_reply": "2026-05-01T19:06:43.740785Z" + "iopub.execute_input": "2026-06-23T15:20:56.811285Z", + "iopub.status.busy": "2026-06-23T15:20:56.810951Z", + "iopub.status.idle": "2026-06-23T15:20:56.816451Z", + "shell.execute_reply": "2026-06-23T15:20:56.815736Z" } }, "outputs": [ @@ -104,7 +101,7 @@ "text": [ "TTYPE_CSV : /data/visp-features-and-mapping/patchseq_tx_cell_ttype_labels.csv\n", "METTYPE_CSV : /data/visp-features-and-mapping/visp_met_cell_assignments_text_names.csv\n", - "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v1/\n", + "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v2/\n", "PROJECT_ID : visp_patchseq\n", "DATASET_ID : visp_inh_patchseq\n", "TTYPE_HIERARCHY_ID : tasic_2018_visp_taxonomy\n", @@ -116,7 +113,7 @@ "source": [ "TTYPE_CSV = \"/data/visp-features-and-mapping/patchseq_tx_cell_ttype_labels.csv\"\n", "METTYPE_CSV = \"/data/visp-features-and-mapping/visp_met_cell_assignments_text_names.csv\"\n", - "OUTPUT_ROOT = \"../scratch/em_patchseq_wnm_v1/\"\n", + "OUTPUT_ROOT = output_root()\n", "\n", "PROJECT_ID = \"visp_patchseq\"\n", "DATASET_ID = \"visp_inh_patchseq\"\n", @@ -150,10 +147,10 @@ "id": "8888859f", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:43.743089Z", - "iopub.status.busy": "2026-05-01T19:06:43.742909Z", - "iopub.status.idle": "2026-05-01T19:06:43.817751Z", - "shell.execute_reply": "2026-05-01T19:06:43.816961Z" + "iopub.execute_input": "2026-06-23T15:20:56.818120Z", + "iopub.status.busy": "2026-06-23T15:20:56.817924Z", + "iopub.status.idle": "2026-06-23T15:20:57.032494Z", + 
"shell.execute_reply": "2026-06-23T15:20:57.031770Z" } }, "outputs": [ @@ -161,7 +158,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "DataItems for project_id='visp_patchseq': 4407\n", + "DataItems for project_id='visp_patchseq': 4407\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "Clusters loaded: tasic_2018_visp_taxonomy=138 visp_met_types_taxonomy=48\n" ] } @@ -194,12 +197,15 @@ "id": "7d1b7045", "metadata": {}, "source": [ - "## Section 0 — register missing `visp_inh_patchseq` associations\n", - "\n", - "Cells in `visp_met_cell_assignments_text_names.csv` that exist in `dataitem/` for\n", - "`project_id='visp_patchseq'` but lack a `dataset_id='visp_inh_patchseq'` association\n", - "get one appended here. `mode=\"append\"` is safe because we only emit rows for ids that\n", - "are not yet associated; on re-run the to-register set is empty and the block no-ops.\n" + "## Section 0 — re-assert `visp_inh_patchseq` associations\n", + "\n", + "`DataItemDataSetAssociation` is `overwrite_scoped` on `(project_id, dataset_id)`,\n", + "so a `write_models` call replaces the entire scope. Several notebooks contribute\n", + "rows to `(visp_patchseq, visp_inh_patchseq)` — `_01` writes the 2759 ttype cells,\n", + "`_02` writes the wide-CSV cells, and `_03` adds the 495 GABAergic MET cells. To\n", + "avoid clobbering siblings, this section reads the existing scope, unions it with\n", + "the MET CSV ids, and re-writes the full set. 
On re-run the set is unchanged so\n", + "the write is a no-op in content.\n" ] }, { @@ -208,10 +214,10 @@ "id": "f7cce99b", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:43.819485Z", - "iopub.status.busy": "2026-05-01T19:06:43.819286Z", - "iopub.status.idle": "2026-05-01T19:06:43.835273Z", - "shell.execute_reply": "2026-05-01T19:06:43.834585Z" + "iopub.execute_input": "2026-06-23T15:20:57.034186Z", + "iopub.status.busy": "2026-06-23T15:20:57.033963Z", + "iopub.status.idle": "2026-06-23T15:20:57.048121Z", + "shell.execute_reply": "2026-06-23T15:20:57.047461Z" } }, "outputs": [ @@ -294,10 +300,10 @@ "id": "9e0f8d3e", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:43.836885Z", - "iopub.status.busy": "2026-05-01T19:06:43.836702Z", - "iopub.status.idle": "2026-05-01T19:06:43.902643Z", - "shell.execute_reply": "2026-05-01T19:06:43.901756Z" + "iopub.execute_input": "2026-06-23T15:20:57.049626Z", + "iopub.status.busy": "2026-06-23T15:20:57.049448Z", + "iopub.status.idle": "2026-06-23T15:20:57.113248Z", + "shell.execute_reply": "2026-06-23T15:20:57.112507Z" } }, "outputs": [ @@ -306,8 +312,8 @@ "output_type": "stream", "text": [ "All 495 MET CSV cells exist in DataItem.\n", - "Existing visp_inh_patchseq associations: 2759\n", - "MET cells needing inh-dataset association: 103\n" + "Existing visp_inh_patchseq associations: 2879\n", + "MET cells needing inh-dataset association: 0\n" ] } ], @@ -338,10 +344,10 @@ "id": "2c8d4901", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:43.904358Z", - "iopub.status.busy": "2026-05-01T19:06:43.904162Z", - "iopub.status.idle": "2026-05-01T19:06:44.028708Z", - "shell.execute_reply": "2026-05-01T19:06:44.027964Z" + "iopub.execute_input": "2026-06-23T15:20:57.115015Z", + "iopub.status.busy": "2026-06-23T15:20:57.114800Z", + "iopub.status.idle": "2026-06-23T15:20:57.337830Z", + "shell.execute_reply": "2026-06-23T15:20:57.337011Z" } }, "outputs": [ @@ -349,38 +355,26 @@ 
"name": "stdout", "output_type": "stream", "text": [ - "Associations appended: 103\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total visp_inh_patchseq associations now: 2862\n" + "Associations written for (visp_patchseq, visp_inh_patchseq): 2879\n", + "Total visp_inh_patchseq associations now: 2879\n" ] } ], "source": [ - "if ids_needing_assoc:\n", - " schema_assoc = build_arrow_schema(DataItemDataSetAssociation)\n", - " new_assoc_table = attach_linkml_metadata(\n", - " models_to_table(\n", - " [DataItemDataSetAssociation(dataitem_id=cid, dataset_id=DATASET_ID, project_id=PROJECT_ID)\n", - " for cid in ids_needing_assoc],\n", - " schema=schema_assoc,\n", - " ),\n", - " linkml_class=\"DataItemDataSetAssociation\",\n", - " )\n", - " # mode=\"append\" is idempotent here: ids_needing_assoc only contains ids without an\n", - " # existing (project, dataset) association. On re-run, the set is empty and we skip.\n", - " write_deltalake(\n", - " OUTPUT_ROOT + \"dataitem_dataset_association/\", new_assoc_table,\n", - " mode=\"append\",\n", - " partition_by=[\"project_id\"],\n", - " )\n", - " print(f\"Associations appended: {len(ids_needing_assoc)}\")\n", - "else:\n", - " print(\"No new associations needed — all MET cells already linked to inh dataset.\")\n", + "# Re-assert the full (project_id, dataset_id) association scope.\n", + "# DataItemDataSetAssociation is overwrite_scoped on (project_id, dataset_id),\n", + "# so we must pass every id that should remain in scope — not just the MET\n", + "# CSV ids — otherwise rows registered by earlier notebooks (e.g. `_01`'s\n", + "# ttype-CSV cells and `_02`'s wide-CSV cells) would be clobbered. 
Union the\n", + "# existing scope with the MET CSV ids; the write is idempotent and self-\n", + "# heals any cells that were missed by a prior partial run.\n", + "full_assoc_ids = sorted(existing_inh_ids | met_csv_ids)\n", + "n_assoc = write_models(\n", + " [DataItemDataSetAssociation(dataitem_id=cid, dataset_id=DATASET_ID, project_id=PROJECT_ID)\n", + " for cid in full_assoc_ids],\n", + " output_root=OUTPUT_ROOT,\n", + ").rows_written\n", + "print(f\"Associations written for ({PROJECT_ID}, {DATASET_ID}): {n_assoc}\")\n", "\n", "# Verify post-condition: every MET cell now has the inh dataset association.\n", "post_assoc = (\n", @@ -410,10 +404,10 @@ "id": "321ae589", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:44.030528Z", - "iopub.status.busy": "2026-05-01T19:06:44.030228Z", - "iopub.status.idle": "2026-05-01T19:06:44.045003Z", - "shell.execute_reply": "2026-05-01T19:06:44.044281Z" + "iopub.execute_input": "2026-06-23T15:20:57.339643Z", + "iopub.status.busy": "2026-06-23T15:20:57.339427Z", + "iopub.status.idle": "2026-06-23T15:20:57.353843Z", + "shell.execute_reply": "2026-06-23T15:20:57.353141Z" } }, "outputs": [ @@ -501,10 +495,10 @@ "id": "be05809d", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:44.046635Z", - "iopub.status.busy": "2026-05-01T19:06:44.046442Z", - "iopub.status.idle": "2026-05-01T19:06:44.052535Z", - "shell.execute_reply": "2026-05-01T19:06:44.051767Z" + "iopub.execute_input": "2026-06-23T15:20:57.355599Z", + "iopub.status.busy": "2026-06-23T15:20:57.355306Z", + "iopub.status.idle": "2026-06-23T15:20:57.361378Z", + "shell.execute_reply": "2026-06-23T15:20:57.360677Z" } }, "outputs": [ @@ -540,10 +534,10 @@ "id": "1719a00c", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:44.054354Z", - "iopub.status.busy": "2026-05-01T19:06:44.054078Z", - "iopub.status.idle": "2026-05-01T19:06:44.253974Z", - "shell.execute_reply": "2026-05-01T19:06:44.253194Z" + "iopub.execute_input": 
"2026-06-23T15:20:57.363017Z", + "iopub.status.busy": "2026-06-23T15:20:57.362820Z", + "iopub.status.idle": "2026-06-23T15:20:57.532173Z", + "shell.execute_reply": "2026-06-23T15:20:57.531422Z" } }, "outputs": [ @@ -551,7 +545,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "MappingSet written: (1, 13)\n" + "MappingSet written: 1 rows\n" ] } ], @@ -571,19 +565,8 @@ " target_hierarchy=TTYPE_HIERARCHY_ID,\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_ms = build_arrow_schema(MappingSet)\n", - "table_ms = attach_linkml_metadata(\n", - " models_to_table([ttype_mapping_set], schema=schema_ms),\n", - " linkml_class=\"MappingSet\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"mappingset/\", table_ms,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND id = '{MAPPING_SET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"MappingSet written:\", table_ms.shape)\n" + "result = write_models([ttype_mapping_set], output_root=OUTPUT_ROOT)\n", + "print(f\"MappingSet written: {result.rows_written} rows\")\n" ] }, { @@ -592,10 +575,10 @@ "id": "8ba5cb2d", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:44.255856Z", - "iopub.status.busy": "2026-05-01T19:06:44.255653Z", - "iopub.status.idle": "2026-05-01T19:06:44.288094Z", - "shell.execute_reply": "2026-05-01T19:06:44.287377Z" + "iopub.execute_input": "2026-06-23T15:20:57.533854Z", + "iopub.status.busy": "2026-06-23T15:20:57.533639Z", + "iopub.status.idle": "2026-06-23T15:20:57.570176Z", + "shell.execute_reply": "2026-06-23T15:20:57.569421Z" } }, "outputs": [ @@ -639,10 +622,10 @@ "id": "e9784282", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:44.289845Z", - "iopub.status.busy": "2026-05-01T19:06:44.289645Z", - "iopub.status.idle": "2026-05-01T19:06:44.650947Z", - "shell.execute_reply": "2026-05-01T19:06:44.650132Z" + "iopub.execute_input": "2026-06-23T15:20:57.571869Z", + "iopub.status.busy": 
"2026-06-23T15:20:57.571658Z", + "iopub.status.idle": "2026-06-23T15:20:57.934109Z", + "shell.execute_reply": "2026-06-23T15:20:57.933171Z" } }, "outputs": [ @@ -657,7 +640,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "CellToClusterMapping written: (11036, 8)\n" + "CellToClusterMapping written: 11036 rows\n" ] } ], @@ -677,19 +660,8 @@ " project_id=PROJECT_ID,\n", " ))\n", "print(f\"CellToClusterMapping rows built: {len(ttype_mappings)}\")\n", - "\n", - "schema_ccm = build_arrow_schema(CellToClusterMapping)\n", - "table_ccm = attach_linkml_metadata(\n", - " models_to_table(ttype_mappings, schema=schema_ccm),\n", - " linkml_class=\"CellToClusterMapping\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"celltoclustermapping/\", table_ccm,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND mapping_set = '{MAPPING_SET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"CellToClusterMapping written:\", table_ccm.shape)\n" + "result = write_models(ttype_mappings, output_root=OUTPUT_ROOT)\n", + "print(f\"CellToClusterMapping written: {result.rows_written} rows\")\n" ] }, { @@ -698,10 +670,10 @@ "id": "0f5e1931", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:44.652753Z", - "iopub.status.busy": "2026-05-01T19:06:44.652528Z", - "iopub.status.idle": "2026-05-01T19:06:44.689069Z", - "shell.execute_reply": "2026-05-01T19:06:44.688395Z" + "iopub.execute_input": "2026-06-23T15:20:57.935888Z", + "iopub.status.busy": "2026-06-23T15:20:57.935680Z", + "iopub.status.idle": "2026-06-23T15:20:57.977838Z", + "shell.execute_reply": "2026-06-23T15:20:57.977022Z" } }, "outputs": [ @@ -768,10 +740,10 @@ "id": "1c6330d0", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:44.691132Z", - "iopub.status.busy": "2026-05-01T19:06:44.690925Z", - "iopub.status.idle": "2026-05-01T19:06:44.698318Z", - "shell.execute_reply": "2026-05-01T19:06:44.697495Z" + "iopub.execute_input": 
"2026-06-23T15:20:57.979675Z", + "iopub.status.busy": "2026-06-23T15:20:57.979460Z", + "iopub.status.idle": "2026-06-23T15:20:57.986158Z", + "shell.execute_reply": "2026-06-23T15:20:57.985423Z" } }, "outputs": [ @@ -804,10 +776,10 @@ "id": "3d5fd53f", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:44.700177Z", - "iopub.status.busy": "2026-05-01T19:06:44.699923Z", - "iopub.status.idle": "2026-05-01T19:06:44.711751Z", - "shell.execute_reply": "2026-05-01T19:06:44.711101Z" + "iopub.execute_input": "2026-06-23T15:20:57.987882Z", + "iopub.status.busy": "2026-06-23T15:20:57.987675Z", + "iopub.status.idle": "2026-06-23T15:20:57.995516Z", + "shell.execute_reply": "2026-06-23T15:20:57.994782Z" } }, "outputs": [ @@ -841,10 +813,10 @@ "id": "e0e80e17", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:44.713432Z", - "iopub.status.busy": "2026-05-01T19:06:44.713246Z", - "iopub.status.idle": "2026-05-01T19:06:44.748326Z", - "shell.execute_reply": "2026-05-01T19:06:44.747640Z" + "iopub.execute_input": "2026-06-23T15:20:57.997261Z", + "iopub.status.busy": "2026-06-23T15:20:57.997054Z", + "iopub.status.idle": "2026-06-23T15:20:58.048872Z", + "shell.execute_reply": "2026-06-23T15:20:58.048055Z" } }, "outputs": [ @@ -852,30 +824,29 @@ "name": "stdout", "output_type": "stream", "text": [ - "Existing rows under predicate: 1152\n", - " rows owned by other notebooks (kept): 1152\n", - " rows owned by this notebook (dropped, will rewrite): 0\n", + "Existing rows under predicate: 2637; kept (other notebooks): 1152; new: 1485\n", "Total ClusterMembership rows to write: 2637\n" ] } ], "source": [ - "# Merge step: read existing rows under the predicate, drop rows we own, union with new.\n", - "existing_cm = (\n", - " pl.read_delta(OUTPUT_ROOT + \"clustermembership/\")\n", - " .filter((pl.col(\"project_id\") == PROJECT_ID) & (pl.col(\"hierarchy_id\") == METTYPE_HIERARCHY_ID))\n", - ")\n", - "print(f\"Existing rows under predicate: 
{existing_cm.shape[0]}\")\n", - "\n", - "other_cm = existing_cm.filter(~pl.col(\"item\").is_in(list(our_cell_ids)))\n", - "print(f\" rows owned by other notebooks (kept): {other_cm.shape[0]}\")\n", - "print(f\" rows owned by this notebook (dropped, will rewrite): {existing_cm.shape[0] - other_cm.shape[0]}\")\n", - "\n", - "# Re-validate kept rows through the Pydantic model (integrity check + uniform schema).\n", - "other_memberships = [\n", - " ClusterMembership(**row) for row in other_cm.to_dicts()\n", - "]\n", + "# Merge-then-overwrite: ClusterMembership is overwrite_scoped on\n", + "# (project_id, hierarchy_id), so a plain overwrite here would clobber rows\n", + "# written under the same predicate by sibling notebooks (e.g.\n", + "# etl_visp_exc_patchseq_03's 1152 mMET-exc cells). Read existing rows,\n", + "# keep the ones this notebook does not own (item NOT IN our_cell_ids), and\n", + "# union them with the new rows before re-writing the full scope.\n", + "try:\n", + " existing_cm = (\n", + " pl.read_delta(OUTPUT_ROOT + \"clustermembership/\")\n", + " .filter((pl.col(\"project_id\") == PROJECT_ID) & (pl.col(\"hierarchy_id\") == METTYPE_HIERARCHY_ID))\n", + " )\n", + "except Exception:\n", + " existing_cm = pl.DataFrame(schema={\"item\": pl.Utf8})\n", + "other_cm = existing_cm.filter(~pl.col(\"item\").is_in(list(our_cell_ids))) if existing_cm.shape[0] else existing_cm\n", + "other_memberships = [ClusterMembership(**row) for row in other_cm.to_dicts()]\n", "all_memberships = other_memberships + new_memberships\n", + "print(f\"Existing rows under predicate: {existing_cm.shape[0]}; kept (other notebooks): {other_cm.shape[0]}; new: {len(new_memberships)}\")\n", "print(f\"Total ClusterMembership rows to write: {len(all_memberships)}\")\n" ] }, @@ -885,10 +856,10 @@ "id": "be955ac8", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:44.749906Z", - "iopub.status.busy": "2026-05-01T19:06:44.749718Z", - "iopub.status.idle": 
"2026-05-01T19:06:44.847240Z", - "shell.execute_reply": "2026-05-01T19:06:44.846459Z" + "iopub.execute_input": "2026-06-23T15:20:58.050716Z", + "iopub.status.busy": "2026-06-23T15:20:58.050415Z", + "iopub.status.idle": "2026-06-23T15:20:58.201025Z", + "shell.execute_reply": "2026-06-23T15:20:58.200282Z" } }, "outputs": [ @@ -896,23 +867,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "ClusterMembership written: (2637, 7)\n" + "ClusterMembership written: 2637 rows\n" ] } ], "source": [ - "schema_cm = build_arrow_schema(ClusterMembership)\n", - "table_cm = attach_linkml_metadata(\n", - " models_to_table(all_memberships, schema=schema_cm),\n", - " linkml_class=\"ClusterMembership\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"clustermembership/\", table_cm,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND hierarchy_id = '{METTYPE_HIERARCHY_ID}'\",\n", - " partition_by=[\"project_id\", \"hierarchy_id\"],\n", - ")\n", - "print(\"ClusterMembership written:\", table_cm.shape)\n" + "result = write_models(all_memberships, output_root=OUTPUT_ROOT)\n", + "print(f\"ClusterMembership written: {result.rows_written} rows\")\n" ] }, { @@ -921,10 +882,10 @@ "id": "4f82b279", "metadata": { "execution": { - "iopub.execute_input": "2026-05-01T19:06:44.848977Z", - "iopub.status.busy": "2026-05-01T19:06:44.848784Z", - "iopub.status.idle": "2026-05-01T19:06:44.871249Z", - "shell.execute_reply": "2026-05-01T19:06:44.870592Z" + "iopub.execute_input": "2026-06-23T15:20:58.203086Z", + "iopub.status.busy": "2026-06-23T15:20:58.202793Z", + "iopub.status.idle": "2026-06-23T15:20:58.240103Z", + "shell.execute_reply": "2026-06-23T15:20:58.239359Z" } }, "outputs": [ @@ -1018,7 +979,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.13.13" } }, "nbformat": 4, diff --git a/code/etl_visp_met_types_01_cluster.ipynb b/code/etl_visp_met_types_01_cluster.ipynb index 
ef86407..6c7bbe4 100644 --- a/code/etl_visp_met_types_01_cluster.ipynb +++ b/code/etl_visp_met_types_01_cluster.ipynb @@ -16,14 +16,7 @@ "cell_type": "code", "execution_count": 1, "id": "c11cf781", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:28:38.643082Z", - "iopub.status.busy": "2026-05-01T05:28:38.642887Z", - "iopub.status.idle": "2026-05-01T05:28:39.552182Z", - "shell.execute_reply": "2026-05-01T05:28:39.551342Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", @@ -31,33 +24,22 @@ "\n", "import pandas as pd\n", "import polars as pl\n", - "from deltalake import write_deltalake\n", "\n", - "from connects_common_connectivity.arrow_utils import (\n", - " attach_linkml_metadata,\n", - " build_arrow_schema,\n", - " models_to_table,\n", - ")\n", "from connects_common_connectivity.models import (\n", " AlgorithmRun,\n", " Cluster,\n", " ClusterHierarchy,\n", " HierarchyCategory,\n", - ")" + ")\n", + "from connects_common_connectivity.config import output_root\n", + "from connects_common_connectivity.io import write_models\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "b00131aa", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:28:39.554244Z", - "iopub.status.busy": "2026-05-01T05:28:39.553945Z", - "iopub.status.idle": "2026-05-01T05:28:39.558308Z", - "shell.execute_reply": "2026-05-01T05:28:39.557653Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -65,7 +47,7 @@ "text": [ "INPUT_JSON : /data/visp-patchseq-taxonomy-info/met_type_colors.json\n", "TASIC_FEATHER : /data/visp-patchseq-taxonomy-info/anno.feather\n", - "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v1/\n", + "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v2/\n", "HIERARCHY_ID : visp_met_types_taxonomy\n", "RUN_ID : visp_met_types_clustering\n" ] @@ -74,7 +56,7 @@ "source": [ "INPUT_JSON = \"/data/visp-patchseq-taxonomy-info/met_type_colors.json\"\n", "TASIC_FEATHER = 
\"/data/visp-patchseq-taxonomy-info/anno.feather\"\n", - "OUTPUT_ROOT = \"../scratch/em_patchseq_wnm_v1/\"\n", + "OUTPUT_ROOT = output_root()\n", "HIERARCHY_ID = \"visp_met_types_taxonomy\"\n", "RUN_ID = \"visp_met_types_clustering\"\n", "ROOT_ID = \"cell\"\n", @@ -100,14 +82,7 @@ "cell_type": "code", "execution_count": 3, "id": "2a561a2d", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:28:39.560142Z", - "iopub.status.busy": "2026-05-01T05:28:39.559956Z", - "iopub.status.idle": "2026-05-01T05:28:39.620725Z", - "shell.execute_reply": "2026-05-01T05:28:39.619674Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -154,20 +129,13 @@ "cell_type": "code", "execution_count": 4, "id": "761bfedf", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:28:39.622909Z", - "iopub.status.busy": "2026-05-01T05:28:39.622641Z", - "iopub.status.idle": "2026-05-01T05:28:39.786928Z", - "shell.execute_reply": "2026-05-01T05:28:39.785961Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "HierarchyCategory written: (3, 3)\n" + "HierarchyCategory written: 3 rows\n" ] } ], @@ -180,32 +148,15 @@ "]\n", "CATEGORY_IDS = [c.id for c in category_rows]\n", "\n", - "schema_cat = build_arrow_schema(HierarchyCategory)\n", - "table_cat = attach_linkml_metadata(\n", - " models_to_table(category_rows, schema=schema_cat),\n", - " linkml_class=\"HierarchyCategory\",\n", - ")\n", - "quoted = \", \".join(f\"'{i}'\" for i in CATEGORY_IDS)\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"hierarchycategory/\", table_cat,\n", - " mode=\"overwrite\",\n", - " predicate=f\"id IN ({quoted})\",\n", - ")\n", - "print(\"HierarchyCategory written:\", table_cat.shape)" + "result = write_models(category_rows, output_root=OUTPUT_ROOT)\n", + "print(f\"HierarchyCategory written: {result.rows_written} rows\")\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "c9e74c2d", - "metadata": { - "execution": { - 
"iopub.execute_input": "2026-05-01T05:28:39.788846Z", - "iopub.status.busy": "2026-05-01T05:28:39.788468Z", - "iopub.status.idle": "2026-05-01T05:28:39.809219Z", - "shell.execute_reply": "2026-05-01T05:28:39.808481Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -246,20 +197,13 @@ "cell_type": "code", "execution_count": 6, "id": "b472c110", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:28:39.810925Z", - "iopub.status.busy": "2026-05-01T05:28:39.810728Z", - "iopub.status.idle": "2026-05-01T05:28:39.890086Z", - "shell.execute_reply": "2026-05-01T05:28:39.889402Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "AlgorithmRun written: (1, 9)\n" + "AlgorithmRun written: 1 rows\n" ] } ], @@ -274,31 +218,15 @@ " # produced_hierarchies omitted: schema declares it as inlined dict[id, ClusterHierarchy].\n", ")\n", "\n", - "schema_run = build_arrow_schema(AlgorithmRun)\n", - "table_run = attach_linkml_metadata(\n", - " models_to_table([run_row], schema=schema_run),\n", - " linkml_class=\"AlgorithmRun\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"algorithmrun/\", table_run,\n", - " mode=\"overwrite\",\n", - " predicate=f\"id = '{RUN_ID}'\",\n", - ")\n", - "print(\"AlgorithmRun written:\", table_run.shape)" + "result = write_models([run_row], output_root=OUTPUT_ROOT)\n", + "print(f\"AlgorithmRun written: {result.rows_written} rows\")\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "d4e82c41", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:28:39.891808Z", - "iopub.status.busy": "2026-05-01T05:28:39.891604Z", - "iopub.status.idle": "2026-05-01T05:28:39.907935Z", - "shell.execute_reply": "2026-05-01T05:28:39.907197Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -340,21 +268,14 @@ "cell_type": "code", "execution_count": 8, "id": "932c050e", - "metadata": { - "execution": { - "iopub.execute_input": 
"2026-05-01T05:28:39.909553Z", - "iopub.status.busy": "2026-05-01T05:28:39.909369Z", - "iopub.status.idle": "2026-05-01T05:28:39.982978Z", - "shell.execute_reply": "2026-05-01T05:28:39.982231Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cluster rows built: 48\n", - "Cluster written: (48, 9)\n" + "Cluster written: 48 rows\n" ] } ], @@ -406,33 +327,15 @@ "\n", "assert len(cluster_rows) == 1 + 2 + 45 == 48\n", "print(f\"Cluster rows built: {len(cluster_rows)}\")\n", - "\n", - "schema_clu = build_arrow_schema(Cluster)\n", - "table_clu = attach_linkml_metadata(\n", - " models_to_table(cluster_rows, schema=schema_clu),\n", - " linkml_class=\"Cluster\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"cluster/\", table_clu,\n", - " mode=\"overwrite\",\n", - " predicate=f\"hierarchy_id = '{HIERARCHY_ID}'\",\n", - " partition_by=[\"hierarchy_id\"],\n", - ")\n", - "print(\"Cluster written:\", table_clu.shape)" + "result = write_models(cluster_rows, output_root=OUTPUT_ROOT)\n", + "print(f\"Cluster written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "69e176bd", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:28:39.984785Z", - "iopub.status.busy": "2026-05-01T05:28:39.984479Z", - "iopub.status.idle": "2026-05-01T05:28:39.999045Z", - "shell.execute_reply": "2026-05-01T05:28:39.998323Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -472,20 +375,13 @@ "cell_type": "code", "execution_count": 10, "id": "6322bb75", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:28:40.000627Z", - "iopub.status.busy": "2026-05-01T05:28:40.000440Z", - "iopub.status.idle": "2026-05-01T05:28:40.075371Z", - "shell.execute_reply": "2026-05-01T05:28:40.074635Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "ClusterHierarchy written: (1, 4)\n" + "ClusterHierarchy written: 1 rows\n" ] } ], 
@@ -496,32 +392,15 @@ " root=ROOT_ID,\n", " clusters=[c.id for c in cluster_rows],\n", ")\n", - "\n", - "schema_h = build_arrow_schema(ClusterHierarchy)\n", - "table_h = attach_linkml_metadata(\n", - " models_to_table([hierarchy_row], schema=schema_h),\n", - " linkml_class=\"ClusterHierarchy\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"clusterhierarchy/\", table_h,\n", - " mode=\"overwrite\",\n", - " predicate=f\"id = '{HIERARCHY_ID}'\",\n", - ")\n", - "print(\"ClusterHierarchy written:\", table_h.shape)" + "result = write_models([hierarchy_row], output_root=OUTPUT_ROOT)\n", + "print(f\"ClusterHierarchy written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 11, "id": "759595ff", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T05:28:40.076994Z", - "iopub.status.busy": "2026-05-01T05:28:40.076799Z", - "iopub.status.idle": "2026-05-01T05:28:40.092872Z", - "shell.execute_reply": "2026-05-01T05:28:40.092160Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -560,6 +439,14 @@ "\n", "Coexists alongside the Tasic taxonomy in the same global tables. Idempotent." 
] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffe9882c-bf92-428d-bb65-f3fb574bbc13", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/code/etl_wnm_exc_01_dataset_dataitem.ipynb b/code/etl_wnm_exc_01_dataset_dataitem.ipynb index 01fa280..ab9d264 100644 --- a/code/etl_wnm_exc_01_dataset_dataitem.ipynb +++ b/code/etl_wnm_exc_01_dataset_dataitem.ipynb @@ -12,53 +12,34 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:14.065449Z", - "iopub.status.busy": "2026-04-30T23:48:14.065189Z", - "iopub.status.idle": "2026-04-30T23:48:15.118656Z", - "shell.execute_reply": "2026-04-30T23:48:15.117861Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import polars as pl\n", "import pyarrow as pa\n", - "from deltalake import write_deltalake\n", "\n", - "from connects_common_connectivity.arrow_utils import (\n", - " build_arrow_schema,\n", - " models_to_table,\n", - " attach_linkml_metadata,\n", - ")\n", "from connects_common_connectivity.models import (\n", " DataSet,\n", " DataItem,\n", " DataItemDataSetAssociation,\n", " Modality,\n", ")\n", - "from connects_common_connectivity.write_utils import append_new_dataitems" + "from connects_common_connectivity.config import output_root\n", + "from connects_common_connectivity.io import write_models\n" ] }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:15.120699Z", - "iopub.status.busy": "2026-04-30T23:48:15.120419Z", - "iopub.status.idle": "2026-04-30T23:48:15.124117Z", - "shell.execute_reply": "2026-04-30T23:48:15.123418Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INPUT_CSV : /data/visp-features-and-mapping/FullMorphMetaData_Master.csv\n", - "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v1/\n", + "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v2/\n", 
"PROJECT_ID : visp_wnm\n", "DATASET_ID : visp_exc_wnm\n" ] @@ -66,7 +47,7 @@ ], "source": [ "INPUT_CSV = \"/data/visp-features-and-mapping/FullMorphMetaData_Master.csv\" # same as /data/exc_vis_manuscript_wnm_axon_projection/FullMorphMetaData_Master.csv\n", - "OUTPUT_ROOT = \"../scratch/em_patchseq_wnm_v1/\"\n", + "OUTPUT_ROOT = output_root()\n", "PROJECT_ID = \"visp_wnm\"\n", "DATASET_ID = \"visp_exc_wnm\"\n", "\n", @@ -86,14 +67,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:15.159715Z", - "iopub.status.busy": "2026-04-30T23:48:15.159506Z", - "iopub.status.idle": "2026-04-30T23:48:15.183476Z", - "shell.execute_reply": "2026-04-30T23:48:15.182853Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -278,20 +252,13 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:15.185336Z", - "iopub.status.busy": "2026-04-30T23:48:15.185151Z", - "iopub.status.idle": "2026-04-30T23:48:15.274137Z", - "shell.execute_reply": "2026-04-30T23:48:15.273360Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DataSet written: (1, 5)\n" + "DataSet written: 1 rows\n" ] } ], @@ -303,35 +270,14 @@ " modality=Modality.MORPHOLOGY.value,\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_ds = build_arrow_schema(DataSet)\n", - "table_ds = models_to_table([dataset], schema=schema_ds)\n", - "table_ds = attach_linkml_metadata(table_ds, linkml_class=\"DataSet\")\n", - "\n", - "# mode='overwrite' makes re-runs idempotent instead of appending duplicates.\n", - "# predicate scopes the overwrite to this project only — other projects' rows\n", - "# in the shared Delta table are left untouched.\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"dataset/\",\n", - " table_ds,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}'\",\n", - " 
partition_by=[\"project_id\"],\n", - ")\n", - "print(\"DataSet written:\", table_ds.shape)" + "result = write_models([dataset], output_root=OUTPUT_ROOT)\n", + "print(f\"DataSet written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:15.275798Z", - "iopub.status.busy": "2026-04-30T23:48:15.275601Z", - "iopub.status.idle": "2026-04-30T23:48:15.303015Z", - "shell.execute_reply": "2026-04-30T23:48:15.302315Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -354,7 +300,7 @@ "# Verification\n", "ds_verify = (\n", " pl.read_delta(OUTPUT_ROOT + \"dataset/\")\n", - " .filter(pl.col(\"project_id\") == PROJECT_ID)\n", + " .filter((pl.col(\"project_id\") == PROJECT_ID) & (pl.col(\"id\") == DATASET_ID))\n", " .filter(pl.col(\"id\") == DATASET_ID)\n", ")\n", "print(ds_verify.shape)\n", @@ -373,20 +319,13 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:15.304692Z", - "iopub.status.busy": "2026-04-30T23:48:15.304506Z", - "iopub.status.idle": "2026-04-30T23:48:15.342531Z", - "shell.execute_reply": "2026-04-30T23:48:15.341804Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DataItem rows appended: 0 (total in batch: 341)\n" + "DataItem rows appended: 341 (total in batch: 341)\n" ] } ], @@ -397,53 +336,32 @@ " DataItem(id=cid, name=cid, project_id=PROJECT_ID)\n", " for cid in cell_ids\n", "]\n", - "\n", - "schema_di = build_arrow_schema(DataItem)\n", - "table_di = models_to_table(dataitems, schema=schema_di)\n", - "table_di = attach_linkml_metadata(table_di, linkml_class=\"DataItem\")\n", - "\n", - "# append_new_dataitems checks which ids already exist for this project and appends\n", - "# only new rows — safe when multiple _01 notebooks share a project_id, since\n", - "# each dataset's cells are registered without wiping the other's 
rows.\n", - "n_appended = append_new_dataitems(OUTPUT_ROOT + \"dataitem/\", table_di, project_id=PROJECT_ID)\n", + "n_appended = write_models(dataitems, output_root=OUTPUT_ROOT).rows_written\n", "print(f\"DataItem rows appended: {n_appended} (total in batch: {len(cell_ids)})\")" ] }, { "cell_type": "code", "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:15.344171Z", - "iopub.status.busy": "2026-04-30T23:48:15.343978Z", - "iopub.status.idle": "2026-04-30T23:48:15.374605Z", - "shell.execute_reply": "2026-04-30T23:48:15.373897Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(345, 4)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "(341, 4)\n", "shape: (5, 4)\n", - "┌───────────────────────────────┬───────────────────────────────┬───────────────────┬────────────┐\n", - "│ id ┆ name ┆ neuroglancer_link ┆ project_id │\n", - "│ --- ┆ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str ┆ str │\n", - "╞═══════════════════════════════╪═══════════════════════════════╪═══════════════════╪════════════╡\n", - "│ 17109_6801-X7432-Y4405_reg ┆ 17109_6801-X7432-Y4405_reg ┆ null ┆ visp_wnm │\n", - "│ 211541_6961-X18505-Y15909_reg ┆ 211541_6961-X18505-Y15909_reg ┆ null ┆ visp_wnm │\n", - "│ 220309_5824-X3486-Y10261_reg ┆ 220309_5824-X3486-Y10261_reg ┆ null ┆ visp_wnm │\n", - "│ 221686_5481-X4093-Y13144_reg ┆ 221686_5481-X4093-Y13144_reg ┆ null ┆ visp_wnm │\n", - "│ 182709_6984-X2452-Y12423_reg ┆ 182709_6984-X2452-Y12423_reg ┆ null ┆ visp_wnm │\n", - "└───────────────────────────────┴───────────────────────────────┴───────────────────┴────────────┘\n" + "┌──────────────────────────────┬──────────────────────────────┬───────────────────┬────────────┐\n", + "│ id ┆ name ┆ neuroglancer_link ┆ project_id │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str ┆ str │\n", + 
"╞══════════════════════════════╪══════════════════════════════╪═══════════════════╪════════════╡\n", + "│ 182709_6984-X2452-Y12423_reg ┆ 182709_6984-X2452-Y12423_reg ┆ null ┆ visp_wnm │\n", + "│ 182709_7126-X2913-Y10535_reg ┆ 182709_7126-X2913-Y10535_reg ┆ null ┆ visp_wnm │\n", + "│ 182724_5937-X3804-Y11955_reg ┆ 182724_5937-X3804-Y11955_reg ┆ null ┆ visp_wnm │\n", + "│ 182724_6175-X3782-Y10859_reg ┆ 182724_6175-X3782-Y10859_reg ┆ null ┆ visp_wnm │\n", + "│ 182724_6354-X4834-Y8105_reg ┆ 182724_6354-X4834-Y8105_reg ┆ null ┆ visp_wnm │\n", + "└──────────────────────────────┴──────────────────────────────┴───────────────────┴────────────┘\n" ] } ], @@ -470,20 +388,13 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:15.376252Z", - "iopub.status.busy": "2026-04-30T23:48:15.376066Z", - "iopub.status.idle": "2026-04-30T23:48:15.479340Z", - "shell.execute_reply": "2026-04-30T23:48:15.478626Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DataItemDataSetAssociation written: (341, 3)\n" + "DataItemDataSetAssociation written: 341 rows\n" ] } ], @@ -496,35 +407,14 @@ " )\n", " for cid in cell_ids\n", "]\n", - "\n", - "schema_assoc = build_arrow_schema(DataItemDataSetAssociation)\n", - "table_assoc = models_to_table(associations, schema=schema_assoc)\n", - "table_assoc = attach_linkml_metadata(table_assoc, linkml_class=\"DataItemDataSetAssociation\")\n", - "\n", - "# mode='overwrite' makes re-runs idempotent instead of appending duplicates.\n", - "# predicate scopes the overwrite to this project only — other projects' rows\n", - "# in the shared Delta table are left untouched.\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"dataitem_dataset_association/\",\n", - " table_assoc,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND dataset_id = '{DATASET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - 
"print(\"DataItemDataSetAssociation written:\", table_assoc.shape)" + "result = write_models(associations, output_root=OUTPUT_ROOT)\n", + "print(f\"DataItemDataSetAssociation written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:15.481138Z", - "iopub.status.busy": "2026-04-30T23:48:15.480945Z", - "iopub.status.idle": "2026-04-30T23:48:15.508840Z", - "shell.execute_reply": "2026-04-30T23:48:15.508076Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", diff --git a/code/etl_wnm_exc_02_cell_features.ipynb b/code/etl_wnm_exc_02_cell_features.ipynb index 87a08c7..dfce782 100644 --- a/code/etl_wnm_exc_02_cell_features.ipynb +++ b/code/etl_wnm_exc_02_cell_features.ipynb @@ -12,14 +12,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:18.628894Z", - "iopub.status.busy": "2026-04-30T23:48:18.628709Z", - "iopub.status.idle": "2026-04-30T23:48:19.603134Z", - "shell.execute_reply": "2026-04-30T23:48:19.602313Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import os\n", @@ -29,14 +22,10 @@ "import pandas as pd\n", "import polars as pl\n", "import pyarrow as pa\n", - "from deltalake import DeltaTable, write_deltalake\n", - "from deltalake.exceptions import TableNotFoundError\n", + "from deltalake import write_deltalake\n", "\n", - "from connects_common_connectivity.arrow_utils import (\n", - " attach_linkml_metadata,\n", - " build_arrow_schema,\n", + "from connects_common_connectivity.io.arrow_utils import (\n", " build_cell_feature_matrix_schema,\n", - " models_to_table,\n", ")\n", "from connects_common_connectivity.models import (\n", " CellFeatureDefinition,\n", @@ -45,26 +34,20 @@ " DataItem,\n", " DataItemDataSetAssociation,\n", ")\n", - "from connects_common_connectivity.write_utils import append_new_dataitems" + "from connects_common_connectivity.config import 
output_root\n", + "from connects_common_connectivity.io import write_models\n" ] }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:19.605227Z", - "iopub.status.busy": "2026-04-30T23:48:19.604924Z", - "iopub.status.idle": "2026-04-30T23:48:19.609127Z", - "shell.execute_reply": "2026-04-30T23:48:19.608474Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v1/\n", + "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v2/\n", "PROJECT_ID : visp_wnm\n", "DATASET_ID : visp_exc_wnm\n", "FSI_SHARED : exc_visp_morph_features\n", @@ -76,7 +59,7 @@ "source": [ "WIDE_CSV_SET1 = \"/data/visp-features-and-mapping/RawFeaturesWide_ChamferCorr.csv\"\n", "WIDE_CSV_SET2 = \"/data/exc_vis_manuscript_wnm_axon_projection/AxonRawReatureWide.csv\"\n", - "OUTPUT_ROOT = \"../scratch/em_patchseq_wnm_v1/\"\n", + "OUTPUT_ROOT = output_root()\n", "PROJECT_ID = \"visp_wnm\"\n", "DATASET_ID = \"visp_exc_wnm\" # prereq assertion only\n", "FSI_SHARED = \"exc_visp_morph_features\"\n", @@ -101,14 +84,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:19.646258Z", - "iopub.status.busy": "2026-04-30T23:48:19.646020Z", - "iopub.status.idle": "2026-04-30T23:48:19.711345Z", - "shell.execute_reply": "2026-04-30T23:48:19.710566Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -156,14 +132,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:19.713091Z", - "iopub.status.busy": "2026-04-30T23:48:19.712889Z", - "iopub.status.idle": "2026-04-30T23:48:19.742976Z", - "shell.execute_reply": "2026-04-30T23:48:19.742232Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -190,14 +159,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "execution": { - "iopub.execute_input": 
"2026-04-30T23:48:19.744765Z", - "iopub.status.busy": "2026-04-30T23:48:19.744457Z", - "iopub.status.idle": "2026-04-30T23:48:19.775512Z", - "shell.execute_reply": "2026-04-30T23:48:19.774800Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -404,14 +366,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:19.777119Z", - "iopub.status.busy": "2026-04-30T23:48:19.776927Z", - "iopub.status.idle": "2026-04-30T23:48:19.782780Z", - "shell.execute_reply": "2026-04-30T23:48:19.782088Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -445,14 +400,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:19.784306Z", - "iopub.status.busy": "2026-04-30T23:48:19.784123Z", - "iopub.status.idle": "2026-04-30T23:48:19.788073Z", - "shell.execute_reply": "2026-04-30T23:48:19.787410Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -479,27 +427,14 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:19.789628Z", - "iopub.status.busy": "2026-04-30T23:48:19.789451Z", - "iopub.status.idle": "2026-04-30T23:48:19.962801Z", - "shell.execute_reply": "2026-04-30T23:48:19.962014Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Appended 0 new DataItem rows\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Appended 4 new DataItemDataSetAssociation rows\n" + "Appended 4 new DataItem rows\n", + "Associations written for (visp_wnm, visp_exc_wnm): 345\n" ] } ], @@ -507,65 +442,40 @@ "# Register new cells (DataItem + DataItemDataSetAssociation) for those in Set1 not yet in _01.\n", "if new_ids_set1:\n", " new_items = [DataItem(id=i, name=i, project_id=PROJECT_ID) for i in new_ids_set1]\n", - " schema_di = build_arrow_schema(DataItem)\n", - " table_di = 
attach_linkml_metadata(\n", - " models_to_table(new_items, schema=schema_di), linkml_class=\"DataItem\"\n", - " )\n", - " n_appended = append_new_dataitems(\n", - " OUTPUT_ROOT + \"dataitem/\", table_di, project_id=PROJECT_ID\n", - " )\n", + " n_appended = write_models(new_items, output_root=OUTPUT_ROOT).rows_written\n", " print(f\"Appended {n_appended} new DataItem rows\")\n", - "\n", - " new_assoc = [\n", - " DataItemDataSetAssociation(dataitem_id=i, dataset_id=DATASET_ID, project_id=PROJECT_ID)\n", - " for i in new_ids_set1\n", - " ]\n", - " schema_assoc = build_arrow_schema(DataItemDataSetAssociation)\n", - " table_assoc = attach_linkml_metadata(\n", - " models_to_table(new_assoc, schema=schema_assoc),\n", - " linkml_class=\"DataItemDataSetAssociation\",\n", - " )\n", - " # Append — association table uses append_new_dataitems pattern\n", - " # (no overwrite predicate since we only add new rows here)\n", - " existing_assoc = pl.read_delta(OUTPUT_ROOT + \"dataitem_dataset_association/\").filter(\n", - " (pl.col(\"project_id\") == PROJECT_ID) & (pl.col(\"dataset_id\") == DATASET_ID)\n", - " )\n", - " existing_assoc_ids = set(existing_assoc[\"dataitem_id\"].to_list())\n", - " truly_new_assoc = [a for a in new_assoc if a.dataitem_id not in existing_assoc_ids]\n", - " if truly_new_assoc:\n", - " table_new_assoc = attach_linkml_metadata(\n", - " models_to_table(truly_new_assoc, schema=schema_assoc),\n", - " linkml_class=\"DataItemDataSetAssociation\",\n", - " )\n", - " write_deltalake(\n", - " OUTPUT_ROOT + \"dataitem_dataset_association/\", table_new_assoc,\n", - " mode=\"append\", partition_by=[\"project_id\"],\n", - " )\n", - " print(f\"Appended {len(truly_new_assoc)} new DataItemDataSetAssociation rows\")\n", - " else:\n", - " print(\"No new association rows to append.\")\n", "else:\n", - " print(\"All Set1 cells already registered — no new DataItem or association writes.\")\n", + " print(\"All Set1 cells already registered \\u2014 no new DataItem writes.\")\n", + 
"\n", + "# Re-assert the full (project_id, dataset_id) association scope as the union\n", + "# of any existing assoc rows and the Set1 ids. DataItemDataSetAssociation is\n", + "# overwrite_scoped on (project_id, dataset_id), so passing the full intended\n", + "# set is idempotent and self-heals partial prior runs.\n", + "existing_assoc = (\n", + " pl.read_delta(OUTPUT_ROOT + \"dataitem_dataset_association/\")\n", + " .filter((pl.col(\"project_id\") == PROJECT_ID) & (pl.col(\"dataset_id\") == DATASET_ID))\n", + ")\n", + "existing_assoc_ids = set(existing_assoc[\"dataitem_id\"].to_list())\n", + "all_assoc_ids = sorted(existing_assoc_ids | set(set1_ids))\n", + "n_assoc = write_models(\n", + " [DataItemDataSetAssociation(dataitem_id=i, dataset_id=DATASET_ID, project_id=PROJECT_ID)\n", + " for i in all_assoc_ids],\n", + " output_root=OUTPUT_ROOT,\n", + ").rows_written\n", + "print(f\"Associations written for ({PROJECT_ID}, {DATASET_ID}): {n_assoc}\")\n", "\n", "# Refresh registered ids so Set2 coverage check reflects newly added cells.\n", "wnm_registered_ids = set(\n", " pl.read_delta(OUTPUT_ROOT + \"dataitem_dataset_association/\")\n", " .filter((pl.col(\"project_id\") == PROJECT_ID) & (pl.col(\"dataset_id\") == DATASET_ID))\n", " [\"dataitem_id\"].to_list()\n", - ")" + ")\n" ] }, { "cell_type": "code", "execution_count": 9, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:19.964762Z", - "iopub.status.busy": "2026-04-30T23:48:19.964525Z", - "iopub.status.idle": "2026-04-30T23:48:19.985174Z", - "shell.execute_reply": "2026-04-30T23:48:19.984485Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -616,14 +526,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:19.986864Z", - "iopub.status.busy": "2026-04-30T23:48:19.986586Z", - "iopub.status.idle": "2026-04-30T23:48:20.086613Z", - "shell.execute_reply": "2026-04-30T23:48:20.085905Z" - } - }, + "metadata": 
{}, "outputs": [ { "name": "stdout", @@ -647,14 +550,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.088517Z", - "iopub.status.busy": "2026-04-30T23:48:20.088316Z", - "iopub.status.idle": "2026-04-30T23:48:20.109520Z", - "shell.execute_reply": "2026-04-30T23:48:20.108797Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -689,20 +585,13 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.111126Z", - "iopub.status.busy": "2026-04-30T23:48:20.110940Z", - "iopub.status.idle": "2026-04-30T23:48:20.210760Z", - "shell.execute_reply": "2026-04-30T23:48:20.210083Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellFeatureMatrix (Set1) written: (1, 5)\n" + "CellFeatureMatrix written: 1 rows\n" ] } ], @@ -715,31 +604,14 @@ " cell_index_column=\"id\",\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_cfm = build_arrow_schema(CellFeatureMatrix)\n", - "table_cfm1 = attach_linkml_metadata(\n", - " models_to_table([cfm1], schema=schema_cfm), linkml_class=\"CellFeatureMatrix\"\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"cellfeaturematrix/\", table_cfm1,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND feature_set_id = '{FSI_SHARED}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"CellFeatureMatrix (Set1) written:\", table_cfm1.shape)" + "result = write_models([cfm1], output_root=OUTPUT_ROOT)\n", + "print(f\"CellFeatureMatrix written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 13, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.212491Z", - "iopub.status.busy": "2026-04-30T23:48:20.212303Z", - "iopub.status.idle": "2026-04-30T23:48:20.243356Z", - "shell.execute_reply": "2026-04-30T23:48:20.242654Z" - } - }, + "metadata": {}, 
"outputs": [ { "name": "stdout", @@ -778,14 +650,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.245004Z", - "iopub.status.busy": "2026-04-30T23:48:20.244824Z", - "iopub.status.idle": "2026-04-30T23:48:20.269510Z", - "shell.execute_reply": "2026-04-30T23:48:20.268857Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -982,14 +847,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.271056Z", - "iopub.status.busy": "2026-04-30T23:48:20.270877Z", - "iopub.status.idle": "2026-04-30T23:48:20.274359Z", - "shell.execute_reply": "2026-04-30T23:48:20.273683Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1012,14 +870,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.275844Z", - "iopub.status.busy": "2026-04-30T23:48:20.275669Z", - "iopub.status.idle": "2026-04-30T23:48:20.279278Z", - "shell.execute_reply": "2026-04-30T23:48:20.278634Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1047,49 +898,26 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.280747Z", - "iopub.status.busy": "2026-04-30T23:48:20.280573Z", - "iopub.status.idle": "2026-04-30T23:48:20.359262Z", - "shell.execute_reply": "2026-04-30T23:48:20.358507Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellFeatureDefinition (Set2) written: (51, 8)\n" + "CellFeatureDefinition written: 51 rows\n" ] } ], "source": [ "# Write CellFeatureDefinition for Set2.\n", - "schema_cfd2 = build_arrow_schema(CellFeatureDefinition)\n", - "table_cfd2 = attach_linkml_metadata(\n", - " models_to_table(feature_defs_2, schema=schema_cfd2), linkml_class=\"CellFeatureDefinition\"\n", - ")\n", - "write_deltalake(\n", - " 
OUTPUT_ROOT + \"cellfeaturedefinition/\", table_cfd2,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND feature_set_id = '{FSI_LOCAL}'\",\n", - " partition_by=[\"project_id\", \"feature_set_id\"],\n", - ")\n", - "print(\"CellFeatureDefinition (Set2) written:\", table_cfd2.shape)" + "result = write_models(feature_defs_2, output_root=OUTPUT_ROOT)\n", + "print(f\"CellFeatureDefinition written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 18, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.360992Z", - "iopub.status.busy": "2026-04-30T23:48:20.360804Z", - "iopub.status.idle": "2026-04-30T23:48:20.385097Z", - "shell.execute_reply": "2026-04-30T23:48:20.384343Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1129,20 +957,13 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.386705Z", - "iopub.status.busy": "2026-04-30T23:48:20.386520Z", - "iopub.status.idle": "2026-04-30T23:48:20.481839Z", - "shell.execute_reply": "2026-04-30T23:48:20.481073Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellFeatureSet (Set2) written: (1, 5)\n" + "CellFeatureSet written: 1 rows\n" ] } ], @@ -1154,31 +975,14 @@ " feature_definition_ids=[fd.id for fd in feature_defs_2],\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_cfs2 = build_arrow_schema(CellFeatureSet)\n", - "table_cfs2 = attach_linkml_metadata(\n", - " models_to_table([feature_set_2], schema=schema_cfs2), linkml_class=\"CellFeatureSet\"\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"cellfeatureset/\", table_cfs2,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND id = '{FSI_LOCAL}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"CellFeatureSet (Set2) written:\", table_cfs2.shape)" + "result = write_models([feature_set_2], 
output_root=OUTPUT_ROOT)\n", + "print(f\"CellFeatureSet written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 20, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.483976Z", - "iopub.status.busy": "2026-04-30T23:48:20.483642Z", - "iopub.status.idle": "2026-04-30T23:48:20.574398Z", - "shell.execute_reply": "2026-04-30T23:48:20.506833Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1210,14 +1014,7 @@ { "cell_type": "code", "execution_count": 21, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.576314Z", - "iopub.status.busy": "2026-04-30T23:48:20.576111Z", - "iopub.status.idle": "2026-04-30T23:48:20.694421Z", - "shell.execute_reply": "2026-04-30T23:48:20.693601Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1255,14 +1052,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.696496Z", - "iopub.status.busy": "2026-04-30T23:48:20.696299Z", - "iopub.status.idle": "2026-04-30T23:48:20.712959Z", - "shell.execute_reply": "2026-04-30T23:48:20.712169Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1297,20 +1087,13 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.714882Z", - "iopub.status.busy": "2026-04-30T23:48:20.714536Z", - "iopub.status.idle": "2026-04-30T23:48:20.820187Z", - "shell.execute_reply": "2026-04-30T23:48:20.819316Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellFeatureMatrix (Set2) written: (1, 5)\n" + "CellFeatureMatrix written: 1 rows\n" ] } ], @@ -1323,30 +1106,14 @@ " cell_index_column=\"id\",\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "table_cfm2 = attach_linkml_metadata(\n", - " models_to_table([cfm2], schema=schema_cfm), linkml_class=\"CellFeatureMatrix\"\n", - ")\n", - "write_deltalake(\n", - " 
OUTPUT_ROOT + \"cellfeaturematrix/\", table_cfm2,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND feature_set_id = '{FSI_LOCAL}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"CellFeatureMatrix (Set2) written:\", table_cfm2.shape)" + "result = write_models([cfm2], output_root=OUTPUT_ROOT)\n", + "print(f\"CellFeatureMatrix written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 24, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.821892Z", - "iopub.status.busy": "2026-04-30T23:48:20.821686Z", - "iopub.status.idle": "2026-04-30T23:48:20.847276Z", - "shell.execute_reply": "2026-04-30T23:48:20.846500Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1385,14 +1152,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.848961Z", - "iopub.status.busy": "2026-04-30T23:48:20.848770Z", - "iopub.status.idle": "2026-04-30T23:48:20.868832Z", - "shell.execute_reply": "2026-04-30T23:48:20.868109Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1585,14 +1345,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.870393Z", - "iopub.status.busy": "2026-04-30T23:48:20.870203Z", - "iopub.status.idle": "2026-04-30T23:48:20.874070Z", - "shell.execute_reply": "2026-04-30T23:48:20.873304Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1615,14 +1368,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.875780Z", - "iopub.status.busy": "2026-04-30T23:48:20.875594Z", - "iopub.status.idle": "2026-04-30T23:48:20.879387Z", - "shell.execute_reply": "2026-04-30T23:48:20.878661Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1650,49 +1396,26 @@ { "cell_type": "code", "execution_count": 28, - 
"metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.881069Z", - "iopub.status.busy": "2026-04-30T23:48:20.880887Z", - "iopub.status.idle": "2026-04-30T23:48:20.971161Z", - "shell.execute_reply": "2026-04-30T23:48:20.970401Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellFeatureDefinition (Set3) written: (18, 8)\n" + "CellFeatureDefinition written: 18 rows\n" ] } ], "source": [ "# Write CellFeatureDefinition for Set3.\n", - "schema_cfd3 = build_arrow_schema(CellFeatureDefinition)\n", - "table_cfd3 = attach_linkml_metadata(\n", - " models_to_table(feature_defs_3, schema=schema_cfd3), linkml_class=\"CellFeatureDefinition\"\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"cellfeaturedefinition/\", table_cfd3,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND feature_set_id = '{FSI_FMOST}'\",\n", - " partition_by=[\"project_id\", \"feature_set_id\"],\n", - ")\n", - "print(\"CellFeatureDefinition (Set3) written:\", table_cfd3.shape)" + "result = write_models(feature_defs_3, output_root=OUTPUT_ROOT)\n", + "print(f\"CellFeatureDefinition written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 29, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:20.972788Z", - "iopub.status.busy": "2026-04-30T23:48:20.972593Z", - "iopub.status.idle": "2026-04-30T23:48:20.999724Z", - "shell.execute_reply": "2026-04-30T23:48:20.999000Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1732,20 +1455,13 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:21.001260Z", - "iopub.status.busy": "2026-04-30T23:48:21.001051Z", - "iopub.status.idle": "2026-04-30T23:48:21.104522Z", - "shell.execute_reply": "2026-04-30T23:48:21.103874Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - 
"CellFeatureSet (Set3) written: (1, 5)\n" + "CellFeatureSet written: 1 rows\n" ] } ], @@ -1760,31 +1476,14 @@ " feature_definition_ids=[fd.id for fd in feature_defs_3],\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_cfs3 = build_arrow_schema(CellFeatureSet)\n", - "table_cfs3 = attach_linkml_metadata(\n", - " models_to_table([feature_set_3], schema=schema_cfs3), linkml_class=\"CellFeatureSet\"\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"cellfeatureset/\", table_cfs3,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND id = '{FSI_FMOST}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"CellFeatureSet (Set3) written:\", table_cfs3.shape)" + "result = write_models([feature_set_3], output_root=OUTPUT_ROOT)\n", + "print(f\"CellFeatureSet written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 31, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:21.106154Z", - "iopub.status.busy": "2026-04-30T23:48:21.105974Z", - "iopub.status.idle": "2026-04-30T23:48:21.135399Z", - "shell.execute_reply": "2026-04-30T23:48:21.134661Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1816,14 +1515,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:21.136970Z", - "iopub.status.busy": "2026-04-30T23:48:21.136790Z", - "iopub.status.idle": "2026-04-30T23:48:21.234141Z", - "shell.execute_reply": "2026-04-30T23:48:21.233443Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1861,14 +1553,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:21.235822Z", - "iopub.status.busy": "2026-04-30T23:48:21.235639Z", - "iopub.status.idle": "2026-04-30T23:48:21.251778Z", - "shell.execute_reply": "2026-04-30T23:48:21.251158Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -1903,20 +1588,13 
@@ { "cell_type": "code", "execution_count": 34, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:21.253449Z", - "iopub.status.busy": "2026-04-30T23:48:21.253273Z", - "iopub.status.idle": "2026-04-30T23:48:21.363584Z", - "shell.execute_reply": "2026-04-30T23:48:21.362860Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellFeatureMatrix (Set3) written: (1, 5)\n" + "CellFeatureMatrix written: 1 rows\n" ] } ], @@ -1929,30 +1607,14 @@ " cell_index_column=\"id\",\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "table_cfm3 = attach_linkml_metadata(\n", - " models_to_table([cfm3], schema=schema_cfm), linkml_class=\"CellFeatureMatrix\"\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"cellfeaturematrix/\", table_cfm3,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND feature_set_id = '{FSI_FMOST}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"CellFeatureMatrix (Set3) written:\", table_cfm3.shape)" + "result = write_models([cfm3], output_root=OUTPUT_ROOT)\n", + "print(f\"CellFeatureMatrix written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 35, - "metadata": { - "execution": { - "iopub.execute_input": "2026-04-30T23:48:21.365309Z", - "iopub.status.busy": "2026-04-30T23:48:21.365012Z", - "iopub.status.idle": "2026-04-30T23:48:21.400225Z", - "shell.execute_reply": "2026-04-30T23:48:21.399441Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -2002,6 +1664,13 @@ "\n", "Set 1 defs/set not written here — owned by `etl_visp_exc_patchseq_02_cell_features.ipynb`." 
] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/code/etl_wnm_exc_03_cell_to_cluster_mapping.ipynb b/code/etl_wnm_exc_03_cell_to_cluster_mapping.ipynb index eef27e2..f7b2aa1 100644 --- a/code/etl_wnm_exc_03_cell_to_cluster_mapping.ipynb +++ b/code/etl_wnm_exc_03_cell_to_cluster_mapping.ipynb @@ -18,51 +18,33 @@ "cell_type": "code", "execution_count": 1, "id": "d7f3dc14", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T18:33:48.832132Z", - "iopub.status.busy": "2026-05-01T18:33:48.831811Z", - "iopub.status.idle": "2026-05-01T18:33:49.789232Z", - "shell.execute_reply": "2026-05-01T18:33:49.788520Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import polars as pl\n", - "from deltalake import write_deltalake\n", "\n", - "from connects_common_connectivity.arrow_utils import (\n", - " attach_linkml_metadata,\n", - " build_arrow_schema,\n", - " models_to_table,\n", - ")\n", "from connects_common_connectivity.models import (\n", " CellToClusterMapping,\n", " MappingSet,\n", ")\n", - "from connects_common_connectivity.write_utils import walk_ancestors\n" + "from connects_common_connectivity.io.write_utils import walk_ancestors\n", + "from connects_common_connectivity.config import output_root\n", + "from connects_common_connectivity.io import write_models\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "a21f350b", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T18:33:49.791790Z", - "iopub.status.busy": "2026-05-01T18:33:49.791490Z", - "iopub.status.idle": "2026-05-01T18:33:49.795973Z", - "shell.execute_reply": "2026-05-01T18:33:49.795234Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INPUT_CSV : /data/visp-features-and-mapping/FullMorphMetaData_Master.csv\n", - "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v1/\n", + "OUTPUT_ROOT : 
../scratch/em_patchseq_wnm_v2/\n", "PROJECT_ID : visp_wnm\n", "DATASET_ID : visp_exc_wnm\n", "METTYPE_HIERARCHY_ID : visp_met_types_taxonomy\n", @@ -73,7 +55,7 @@ ], "source": [ "INPUT_CSV = \"/data/visp-features-and-mapping/FullMorphMetaData_Master.csv\"\n", - "OUTPUT_ROOT = \"../scratch/em_patchseq_wnm_v1/\"\n", + "OUTPUT_ROOT = output_root()\n", "\n", "PROJECT_ID = \"visp_wnm\"\n", "DATASET_ID = \"visp_exc_wnm\"\n", @@ -105,14 +87,7 @@ "cell_type": "code", "execution_count": 3, "id": "41c33171", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T18:33:49.797878Z", - "iopub.status.busy": "2026-05-01T18:33:49.797693Z", - "iopub.status.idle": "2026-05-01T18:33:49.975623Z", - "shell.execute_reply": "2026-05-01T18:33:49.974830Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -159,14 +134,7 @@ "cell_type": "code", "execution_count": 4, "id": "bd1355ee", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T18:33:49.977372Z", - "iopub.status.busy": "2026-05-01T18:33:49.977168Z", - "iopub.status.idle": "2026-05-01T18:33:49.997698Z", - "shell.execute_reply": "2026-05-01T18:33:49.996913Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -247,14 +215,7 @@ "cell_type": "code", "execution_count": 5, "id": "8b8edd30", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T18:33:49.999400Z", - "iopub.status.busy": "2026-05-01T18:33:49.999168Z", - "iopub.status.idle": "2026-05-01T18:33:50.005268Z", - "shell.execute_reply": "2026-05-01T18:33:50.004494Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -296,20 +257,13 @@ "cell_type": "code", "execution_count": 6, "id": "75c8f92f", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T18:33:50.006831Z", - "iopub.status.busy": "2026-05-01T18:33:50.006651Z", - "iopub.status.idle": "2026-05-01T18:33:50.111706Z", - "shell.execute_reply": "2026-05-01T18:33:50.110952Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ - "MappingSet written: (1, 13)\n" + "MappingSet written: 1 rows\n" ] } ], @@ -328,33 +282,15 @@ " target_hierarchy=METTYPE_HIERARCHY_ID,\n", " project_id=PROJECT_ID,\n", ")\n", - "\n", - "schema_ms = build_arrow_schema(MappingSet)\n", - "table_ms = attach_linkml_metadata(\n", - " models_to_table([mapping_set], schema=schema_ms),\n", - " linkml_class=\"MappingSet\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"mappingset/\", table_ms,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND id = '{MAPPING_SET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"MappingSet written:\", table_ms.shape)\n" + "result = write_models([mapping_set], output_root=OUTPUT_ROOT)\n", + "print(f\"MappingSet written: {result.rows_written} rows\")\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "ce55a05f", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T18:33:50.113389Z", - "iopub.status.busy": "2026-05-01T18:33:50.113192Z", - "iopub.status.idle": "2026-05-01T18:33:50.144146Z", - "shell.execute_reply": "2026-05-01T18:33:50.143379Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -405,27 +341,14 @@ "cell_type": "code", "execution_count": 8, "id": "2372cceb", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T18:33:50.145899Z", - "iopub.status.busy": "2026-05-01T18:33:50.145651Z", - "iopub.status.idle": "2026-05-01T18:33:50.265237Z", - "shell.execute_reply": "2026-05-01T18:33:50.264459Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CellToClusterMapping rows built: 1023\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CellToClusterMapping written: (1023, 8)\n" + "CellToClusterMapping rows built: 1023\n", + "CellToClusterMapping written: 1023 rows\n" ] } ], @@ -447,33 +370,15 @@ " project_id=PROJECT_ID,\n", " ))\n", "print(f\"CellToClusterMapping rows 
built: {len(mappings)}\")\n", - "\n", - "schema_ccm = build_arrow_schema(CellToClusterMapping)\n", - "table_ccm = attach_linkml_metadata(\n", - " models_to_table(mappings, schema=schema_ccm),\n", - " linkml_class=\"CellToClusterMapping\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"celltoclustermapping/\", table_ccm,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND mapping_set = '{MAPPING_SET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"CellToClusterMapping written:\", table_ccm.shape)\n" + "result = write_models(mappings, output_root=OUTPUT_ROOT)\n", + "print(f\"CellToClusterMapping written: {result.rows_written} rows\")\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "99654342", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T18:33:50.267100Z", - "iopub.status.busy": "2026-05-01T18:33:50.266792Z", - "iopub.status.idle": "2026-05-01T18:33:50.301009Z", - "shell.execute_reply": "2026-05-01T18:33:50.300137Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", diff --git a/code/etl_wnm_exc_04_projection_matrix.ipynb b/code/etl_wnm_exc_04_projection_matrix.ipynb index 49325da..6ec399b 100644 --- a/code/etl_wnm_exc_04_projection_matrix.ipynb +++ b/code/etl_wnm_exc_04_projection_matrix.ipynb @@ -34,14 +34,7 @@ "cell_type": "code", "execution_count": 1, "id": "631cddfd", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:12.914446Z", - "iopub.status.busy": "2026-05-01T02:56:12.914262Z", - "iopub.status.idle": "2026-05-01T02:56:13.871104Z", - "shell.execute_reply": "2026-05-01T02:56:13.870358Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", @@ -52,11 +45,6 @@ "import pyarrow as pa\n", "from deltalake import write_deltalake\n", "\n", - "from connects_common_connectivity.arrow_utils import (\n", - " attach_linkml_metadata,\n", - " build_arrow_schema,\n", - " models_to_table,\n", - ")\n", "from 
connects_common_connectivity.models import (\n", " DataItem,\n", " DataItemDataSetAssociation,\n", @@ -66,28 +54,22 @@ " ProjectionMeasurementType,\n", " Unit,\n", ")\n", - "from connects_common_connectivity.write_utils import append_new_dataitems" + "from connects_common_connectivity.config import output_root\n", + "from connects_common_connectivity.io import write_models, write_projection_matrix\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "ecbab904", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:13.873631Z", - "iopub.status.busy": "2026-05-01T02:56:13.873337Z", - "iopub.status.idle": "2026-05-01T02:56:13.877411Z", - "shell.execute_reply": "2026-05-01T02:56:13.876702Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INPUT_CSV : /data/exc_vis_manuscript_wnm_axon_projection/ProjectionMatrix_tip_and_branch_roll_up.csv\n", - "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v1/\n", + "OUTPUT_ROOT : ../scratch/em_patchseq_wnm_v2/\n", "PROJECT_ID : visp_wnm\n", "DATASET_ID : visp_exc_wnm\n", "FSI_IPSI : wnm_exc_proj_ipsi\n", @@ -97,7 +79,7 @@ ], "source": [ "INPUT_CSV = \"/data/exc_vis_manuscript_wnm_axon_projection/ProjectionMatrix_tip_and_branch_roll_up.csv\"\n", - "OUTPUT_ROOT = \"../scratch/em_patchseq_wnm_v1/\"\n", + "OUTPUT_ROOT = output_root()\n", "PROJECT_ID = \"visp_wnm\"\n", "DATASET_ID = \"visp_exc_wnm\"\n", "\n", @@ -116,14 +98,7 @@ "cell_type": "code", "execution_count": 3, "id": "12f64431", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:13.879048Z", - "iopub.status.busy": "2026-05-01T02:56:13.878866Z", - "iopub.status.idle": "2026-05-01T02:56:13.925364Z", - "shell.execute_reply": "2026-05-01T02:56:13.924679Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -158,14 +133,7 @@ "cell_type": "code", "execution_count": 4, "id": "faa98c9e", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:13.927278Z", - 
"iopub.status.busy": "2026-05-01T02:56:13.927087Z", - "iopub.status.idle": "2026-05-01T02:56:13.954825Z", - "shell.execute_reply": "2026-05-01T02:56:13.954175Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -277,14 +245,7 @@ "cell_type": "code", "execution_count": 5, "id": "e931e9cc", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:13.956649Z", - "iopub.status.busy": "2026-05-01T02:56:13.956462Z", - "iopub.status.idle": "2026-05-01T02:56:13.961510Z", - "shell.execute_reply": "2026-05-01T02:56:13.960883Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -324,14 +285,7 @@ "cell_type": "code", "execution_count": 6, "id": "360d2dce", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:13.963273Z", - "iopub.status.busy": "2026-05-01T02:56:13.963091Z", - "iopub.status.idle": "2026-05-01T02:56:13.967068Z", - "shell.execute_reply": "2026-05-01T02:56:13.966432Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -360,14 +314,7 @@ "cell_type": "code", "execution_count": 7, "id": "88c01dbf", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:13.968765Z", - "iopub.status.busy": "2026-05-01T02:56:13.968588Z", - "iopub.status.idle": "2026-05-01T02:56:13.972362Z", - "shell.execute_reply": "2026-05-01T02:56:13.971627Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -385,13 +332,7 @@ " DataItem(id=cid, name=cid, project_id=PROJECT_ID, modality=Modality.MORPHOLOGY.value)\n", " for cid in new_ids\n", " ]\n", - " schema_di = build_arrow_schema(DataItem)\n", - " table_di = attach_linkml_metadata(\n", - " models_to_table(new_items, schema=schema_di), linkml_class=\"DataItem\"\n", - " )\n", - " n_appended = append_new_dataitems(\n", - " OUTPUT_ROOT + \"dataitem/\", table_di, project_id=PROJECT_ID\n", - " )\n", + " n_appended = write_models(new_items, output_root=OUTPUT_ROOT).rows_written\n", " print(f\"Appended {n_appended} new DataItem rows\")\n", 
"else:\n", " print(\"All cells already in DataItem; nothing to append.\")" @@ -401,14 +342,7 @@ "cell_type": "code", "execution_count": 8, "id": "9f5495f0", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:13.973937Z", - "iopub.status.busy": "2026-05-01T02:56:13.973754Z", - "iopub.status.idle": "2026-05-01T02:56:14.011147Z", - "shell.execute_reply": "2026-05-01T02:56:14.010347Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -449,66 +383,57 @@ "id": "6f6bf4aa", "metadata": {}, "source": [ - "## Write `DataItemDataSetAssociation` for all 345 cells" + "## Write `DataItemDataSetAssociation` as `existing ∪ 345 cells`\n", + "\n", + "`DataItemDataSetAssociation` is `overwrite_scoped` on `(project_id, dataset_id)`, so passing\n", + "only the 345 cell ids from this CSV would clobber rows written by `_01`/`_02` for the same\n", + "scope. Union with the existing scope before re-writing." ] }, { "cell_type": "code", "execution_count": 9, "id": "cd9f70b7", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:14.012903Z", - "iopub.status.busy": "2026-05-01T02:56:14.012716Z", - "iopub.status.idle": "2026-05-01T02:56:14.110969Z", - "shell.execute_reply": "2026-05-01T02:56:14.110193Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "DataItemDataSetAssociation written: (345, 3)\n" + "DataItemDataSetAssociation written: 345 rows\n" ] } ], "source": [ - "# Two-level predicate scopes the overwrite to (project_id, dataset_id), so other datasets\n", - "# sharing project_id (none today, but future-proof) are untouched. _01's 341 associations are\n", - "# a strict subset of these 345, so this overwrite is a safe superset.\n", + "# Re-assert the full (project_id, dataset_id) association scope as the union\n", + "# of any existing assoc rows and this CSV's cell_ids. 
DataItemDataSetAssociation\n", + "# is overwrite_scoped on (project_id, dataset_id), so passing only this CSV's\n", + "# ids would clobber rows registered by `_01` or `_02` for the same scope.\n", + "# Union with existing ids — the write is idempotent and self-heals partial runs.\n", + "try:\n", + " existing_assoc_ids = set(\n", + " pl.read_delta(OUTPUT_ROOT + \"dataitem_dataset_association/\")\n", + " .filter((pl.col(\"project_id\") == PROJECT_ID) & (pl.col(\"dataset_id\") == DATASET_ID))\n", + " [\"dataitem_id\"].to_list()\n", + " )\n", + "except Exception:\n", + " existing_assoc_ids = set()\n", + "full_assoc_ids = sorted(existing_assoc_ids | set(cell_ids))\n", "associations = [\n", " DataItemDataSetAssociation(\n", " dataitem_id=cid, dataset_id=DATASET_ID, project_id=PROJECT_ID,\n", " )\n", - " for cid in cell_ids\n", + " for cid in full_assoc_ids\n", "]\n", - "schema_assoc = build_arrow_schema(DataItemDataSetAssociation)\n", - "table_assoc = attach_linkml_metadata(\n", - " models_to_table(associations, schema=schema_assoc),\n", - " linkml_class=\"DataItemDataSetAssociation\",\n", - ")\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"dataitem_dataset_association/\", table_assoc,\n", - " mode=\"overwrite\",\n", - " predicate=f\"project_id = '{PROJECT_ID}' AND dataset_id = '{DATASET_ID}'\",\n", - " partition_by=[\"project_id\"],\n", - ")\n", - "print(\"DataItemDataSetAssociation written:\", table_assoc.shape)" + "result = write_models(associations, output_root=OUTPUT_ROOT)\n", + "print(f\"DataItemDataSetAssociation written: {result.rows_written} rows\")" ] }, { "cell_type": "code", "execution_count": 10, "id": "8a91a81d", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:14.112726Z", - "iopub.status.busy": "2026-05-01T02:56:14.112525Z", - "iopub.status.idle": "2026-05-01T02:56:14.142604Z", - "shell.execute_reply": "2026-05-01T02:56:14.141864Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -516,15 +441,15 @@ "text": [ 
"(345, 3)\n", "shape: (3, 3)\n", - "┌───────────────────────────────┬──────────────┬────────────┐\n", - "│ dataitem_id ┆ dataset_id ┆ project_id │\n", - "│ --- ┆ --- ┆ --- │\n", - "│ str ┆ str ┆ str │\n", - "╞═══════════════════════════════╪══════════════╪════════════╡\n", - "│ 18864_6734-X4899-Y27447_reg ┆ visp_exc_wnm ┆ visp_wnm │\n", - "│ 191812_7938-X6892-Y25312_reg ┆ visp_exc_wnm ┆ visp_wnm │\n", - "│ 211550_7718-X19461-Y16950_reg ┆ visp_exc_wnm ┆ visp_wnm │\n", - "└───────────────────────────────┴──────────────┴────────────┘\n" + "┌─────────────────────────────┬──────────────┬────────────┐\n", + "│ dataitem_id ┆ dataset_id ┆ project_id │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ str │\n", + "╞═════════════════════════════╪══════════════╪════════════╡\n", + "│ 17109_6201-X4328-Y6753_reg ┆ visp_exc_wnm ┆ visp_wnm │\n", + "│ 17109_6301-X4756-Y24516_reg ┆ visp_exc_wnm ┆ visp_wnm │\n", + "│ 17109_6601-X4384-Y7436_reg ┆ visp_exc_wnm ┆ visp_wnm │\n", + "└─────────────────────────────┴──────────────┴────────────┘\n" ] } ], @@ -554,14 +479,7 @@ "cell_type": "code", "execution_count": 11, "id": "4af58dbe", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:14.144439Z", - "iopub.status.busy": "2026-05-01T02:56:14.144242Z", - "iopub.status.idle": "2026-05-01T02:56:14.262603Z", - "shell.execute_reply": "2026-05-01T02:56:14.261905Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -602,14 +520,7 @@ "cell_type": "code", "execution_count": 12, "id": "f097ee84", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:14.264678Z", - "iopub.status.busy": "2026-05-01T02:56:14.264476Z", - "iopub.status.idle": "2026-05-01T02:56:14.281433Z", - "shell.execute_reply": "2026-05-01T02:56:14.280735Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -657,14 +568,7 @@ "cell_type": "code", "execution_count": 13, "id": "eb78fbca", - "metadata": { - "execution": { - "iopub.execute_input": 
"2026-05-01T02:56:14.283261Z", - "iopub.status.busy": "2026-05-01T02:56:14.283078Z", - "iopub.status.idle": "2026-05-01T02:56:14.369508Z", - "shell.execute_reply": "2026-05-01T02:56:14.368820Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -702,14 +606,7 @@ "cell_type": "code", "execution_count": 14, "id": "b6dacbdf", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:14.371472Z", - "iopub.status.busy": "2026-05-01T02:56:14.371264Z", - "iopub.status.idle": "2026-05-01T02:56:14.386451Z", - "shell.execute_reply": "2026-05-01T02:56:14.385720Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -755,14 +652,7 @@ "cell_type": "code", "execution_count": 15, "id": "43d47d58", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:14.388147Z", - "iopub.status.busy": "2026-05-01T02:56:14.387965Z", - "iopub.status.idle": "2026-05-01T02:56:14.395733Z", - "shell.execute_reply": "2026-05-01T02:56:14.394916Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -787,20 +677,13 @@ "cell_type": "code", "execution_count": 16, "id": "0f278d8a", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:14.397385Z", - "iopub.status.busy": "2026-05-01T02:56:14.397205Z", - "iopub.status.idle": "2026-05-01T02:56:14.479276Z", - "shell.execute_reply": "2026-05-01T02:56:14.478519Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "ProjectionMeasurementMatrix written: (2, 10)\n" + "ProjectionMeasurementMatrix written: 2 rows\n" ] } ], @@ -834,34 +717,16 @@ " unit=Unit.MICRONS_LENGTH,\n", ")\n", "\n", - "schema_pmm = build_arrow_schema(ProjectionMeasurementMatrix)\n", - "table_pmm = attach_linkml_metadata(\n", - " models_to_table([ipsi_matrix, contra_matrix], schema=schema_pmm),\n", - " linkml_class=\"ProjectionMeasurementMatrix\",\n", - ")\n", - "# Predicate is id IN (...) 
only — ProjectionMeasurementMatrix is not ProjectScoped (see schema gap #1).\n", - "# This scope pins the overwrite to exactly the two rows this notebook owns; other rows in the\n", - "# shared table are untouched.\n", - "write_deltalake(\n", - " OUTPUT_ROOT + \"projectionmeasurementmatrix/\", table_pmm,\n", - " mode=\"overwrite\",\n", - " predicate=f\"id IN ('{FSI_IPSI}', '{FSI_CONTRA}')\",\n", - ")\n", - "print(\"ProjectionMeasurementMatrix written:\", table_pmm.shape)" + "write_projection_matrix(ipsi_matrix, df[ipsi_cols].to_numpy(), output_root=OUTPUT_ROOT)\n", + "write_projection_matrix(contra_matrix, df[contra_cols].to_numpy(), output_root=OUTPUT_ROOT)\n", + "print(\"ProjectionMeasurementMatrix written: 2 rows\")\n" ] }, { "cell_type": "code", "execution_count": 17, "id": "b5fca905", - "metadata": { - "execution": { - "iopub.execute_input": "2026-05-01T02:56:14.481024Z", - "iopub.status.busy": "2026-05-01T02:56:14.480833Z", - "iopub.status.idle": "2026-05-01T02:56:14.498588Z", - "shell.execute_reply": "2026-05-01T02:56:14.497899Z" - } - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -914,7 +779,7 @@ "| Output path | Class | Rows |\n", "|---|---|---|\n", "| `dataitem/` | `DataItem` | +N new cells (4 expected; via `append_new_dataitems`) |\n", - "| `dataitem_dataset_association/` | `DataItemDataSetAssociation` | 345 (overwrite scoped to `project_id` + `dataset_id`) |\n", + "| `dataitem_dataset_association/` | `DataItemDataSetAssociation` | full scope = existing ∪ 345 cells (overwrite_scoped on `(project_id, dataset_id)`; union preserves rows from `_01`/`_02`) |\n", "| `projectionmeasurementmatrix/wnm_exc_proj_ipsi/` | wide parquet | 345 cells × 152 ipsilateral region columns |\n", "| `projectionmeasurementmatrix/wnm_exc_proj_contra/` | wide parquet | 345 cells × 68 contralateral region columns |\n", "| `projectionmeasurementmatrix/` | `ProjectionMeasurementMatrix` | 2 (one per laterality) |\n", diff --git a/code/parse_minnie_clustering.ipynb 
b/code/parse_minnie_clustering.ipynb index 7313538..4edf484 100644 --- a/code/parse_minnie_clustering.ipynb +++ b/code/parse_minnie_clustering.ipynb @@ -44,7 +44,7 @@ "metadata": {}, "outputs": [], "source": [ - "from connects_common_connectivity.arrow_utils import build_arrow_schema, models_to_table, attach_linkml_metadata\n" + "from connects_common_connectivity.io.arrow_utils import build_arrow_schema, models_to_table, attach_linkml_metadata\n" ] }, { @@ -798,7 +798,7 @@ "metadata": {}, "outputs": [], "source": [ - "from connects_common_connectivity.arrow_utils import build_cell_feature_matrix_schema\n", + "from connects_common_connectivity.io.arrow_utils import build_cell_feature_matrix_schema\n", "schema = build_cell_feature_matrix_schema(cfs, fds, cell_index_column=\"id\")" ] }, diff --git a/etl_example_prompt.md b/etl_example_prompt.md index 1f56b22..664908c 100644 --- a/etl_example_prompt.md +++ b/etl_example_prompt.md @@ -25,10 +25,11 @@ schemas/single_cell_schema.yaml ### Package utilities (read-only reference) ``` src/connects_common_connectivity/models.py # Pydantic models — read to understand fields -src/connects_common_connectivity/arrow_utils.py # build_arrow_schema, models_to_table, - # attach_linkml_metadata, - # build_cell_feature_matrix_schema -src/connects_common_connectivity/write_utils.py # append_new_dataitems, walk_ancestors +src/connects_common_connectivity/io/arrow_utils.py # build_arrow_schema, models_to_table, + # attach_linkml_metadata, + # build_cell_feature_matrix_schema +src/connects_common_connectivity/io/write_utils.py # walk_ancestors +src/connects_common_connectivity/io/writers.py # write_models, write_projection_matrix ``` ### Example notebooks (read for patterns) @@ -110,45 +111,30 @@ Every notebook follows this cell order. Do not skip sections. ## 5. Write pattern reference -### 5a. `dataitem/` — use `append_new_dataitems` (never overwrite, never plain append) +### 5a. 
Registry-backed tables — use `write_models` (do not hand-build predicates) ```python -from connects_common_connectivity.write_utils import append_new_dataitems +from connects_common_connectivity.io import write_models -n = append_new_dataitems(OUTPUT_ROOT + "dataitem/", arrow_table, project_id=PROJECT_ID) -print(f"Appended {n} new DataItem rows") +result = write_models(rows, output_root=OUTPUT_ROOT) +print(result.mode, result.rows_written, result.predicates) ``` -- Reads existing `(project_id, id)` pairs, appends only rows whose `id` is new. -- Re-running appends nothing. Two notebooks sharing the same `project_id` do not clobber each other. -- **Never** use `write_deltalake(..., mode="overwrite", predicate="project_id=...")` for `dataitem/`. A predicate-scoped overwrite wipes the entire partition, deleting the other dataset's cells. +- `write_models` infers class from `rows`, then applies the registered `subdir`, `partition_by`, scope predicate, and write mode from `io/write_spec.py`. +- For registry tables, notebook code should **not** call `write_deltalake` directly and should not build `predicate=` strings by hand. +- Re-running is idempotent when you pass the full intended scope slice (the standard pattern in current notebooks). -### 5b. All registry tables — `mode="overwrite"` with a two-level predicate +### 5b. `DataItem` writes are id-deduped append via `write_models` ```python -write_deltalake( - OUTPUT_ROOT + "/", arrow_table, - mode="overwrite", - predicate=f"project_id = '{PROJECT_ID}' AND = '{VALUE}'", - partition_by=["project_id"], -) +new_dataitems = [DataItem(id=cid, name=cid, project_id=PROJECT_ID) for cid in new_ids] +written = write_models(new_dataitems, output_root=OUTPUT_ROOT).rows_written +print(f"DataItems appended: {written}") ``` -A **two-level predicate** is required. One level (`project_id`) is not enough when two notebooks share a `project_id` but write different rows to the same table. 
The second level pins the predicate to exactly the rows this notebook owns. - -| Table | Second predicate field | Example value | -|---|---|---| -| `dataset/` | `id` | `dataset_id = 'visp_inh_patchseq'` | -| `dataitem_dataset_association/` | `dataset_id` | `dataset_id = 'visp_inh_patchseq'` | -| `cellfeaturedefinition/` | `feature_set_id` | `feature_set_id = 'inh_visp_morph_features'` | -| `cellfeatureset/` | `id` | `id = 'inh_visp_morph_features'` | -| `cellfeaturematrix/` | `feature_set_id` | `feature_set_id = 'inh_visp_morph_features'` | -| `clustermembership/` | `hierarchy_id` | `hierarchy_id = 'visp_met_types_taxonomy'` | -| `celltoclustermapping/` | `mapping_set` | `mapping_set = 'visp_exc_wnm_mettype_mapping'` | -| `projectionmeasurementmatrix/` | `id` | `id = 'wnm_exc_proj_ipsi'` | -| `cellcellconnectivitylong//` | (folder scopes the example) | — | - -`cellfeaturedefinition/` should also use `partition_by=["project_id", "feature_set_id"]` for query performance. +- `DataItem` dispatches to `append_new_by_id` (id dedupe within one `project_id` per call). +- Re-running with the same ids appends nothing. +- **Do not** use scoped overwrite for `dataitem/`. ### 5c. Wide-form feature parquet — `mode="overwrite"` with predicate on `project_id` @@ -171,37 +157,34 @@ If a feature CSV contains cell ids not present in the `_01` DataItems, register 1. Read `dataitem_dataset_association/` filtered to `project_id AND dataset_id` → collect existing ids. 2. Identify new ids (`set(csv_ids) - set(existing_ids)`). -3. Call `append_new_dataitems` for new `DataItem` rows. -4. Plain `mode="append"` for new `DataItemDataSetAssociation` rows — but only after deduplicating against existing association rows: - ```python - existing_assoc_ids = set(pl.read_delta(...).filter(...)[\"dataitem_id\"]) - truly_new = [a for a in new_assoc if a.dataitem_id not in existing_assoc_ids] - if truly_new: - write_deltalake(..., mode="append", ...) - ``` +3. 
Call `write_models([...DataItem(...)...], output_root=OUTPUT_ROOT)` for any new cells. +4. Re-assert the full `(project_id, dataset_id)` association scope with `write_models([...DataItemDataSetAssociation(...)...])` (pass the full intended set, not append-only deltas). ### 5e. Cluster taxonomy tables (global) -`cluster/`, `clusterhierarchy/`, `algorithmrun/` have **no `project_id`**. Multiple taxonomies coexist in the same Delta table; scope by `hierarchy_id` (or `id` for the hierarchy/run rows themselves). +`cluster/`, `clusterhierarchy/`, `algorithmrun/`, and `hierarchycategory/` have **no `project_id`**. Write through `write_models`; registry scopes are: + +- `Cluster`: `hierarchy_id` +- `ClusterHierarchy`: `id` +- `AlgorithmRun`: `id` +- `HierarchyCategory`: `id` ```python -write_deltalake( - OUTPUT_ROOT + "cluster/", arrow_table, - mode="overwrite", - predicate=f"hierarchy_id = '{HIERARCHY_ID}'", - partition_by=["hierarchy_id"], -) +write_models(cluster_rows, output_root=OUTPUT_ROOT) +write_models([hierarchy_row], output_root=OUTPUT_ROOT) +write_models([run_row], output_root=OUTPUT_ROOT) +write_models(category_rows, output_root=OUTPUT_ROOT) ``` -Use `predicate=f"id = '{HIERARCHY_ID}'"` for the single `clusterhierarchy/` row and `predicate=f"id = '{RUN_ID}'"` for the single `algorithmrun/` row. See `etl_tasic_01_cluster.ipynb` and `etl_visp_met_types_01_cluster.ipynb`. +See `etl_tasic_01_cluster.ipynb` and `etl_visp_met_types_01_cluster.ipynb`. ### 5f. Membership and mapping (project-scoped, per-hierarchy) -- `clustermembership/` — predicate `project_id AND hierarchy_id`, `partition_by=["project_id", "hierarchy_id"]`. -- `celltoclustermapping/` — predicate `project_id AND mapping_set`, `partition_by=["project_id", "mapping_set"]`. -- `mappingset/` — predicate by `id` (one row per named mapping). +- `ClusterMembership` is scoped by `project_id AND hierarchy_id`. +- `CellToClusterMapping` is scoped by `project_id AND mapping_set`. 
+- `MappingSet` is scoped by `project_id AND id`. -When two notebooks merge into the same `(project_id, hierarchy_id)` slice (e.g. exc + inh patch-seq both writing memberships into `(visp_patchseq, visp_met_types_taxonomy)`), each must read the existing slice back, union with the new rows, then overwrite. Re-running either notebook is then idempotent. +When two notebooks merge into the same scoped slice (for example, both patch-seq `_03` notebooks writing memberships for the same `(project_id, hierarchy_id)`), each notebook should write the full intended slice via `write_models(...)`. Re-running either notebook remains idempotent. ### 5g. Cell-cell connectivity (`cellcellconnectivitylong/`) @@ -216,7 +199,7 @@ Predicate `project_id` only; the folder scopes the example. See `etl_minnie_04_c ### 5h. Projection matrix (`projectionmeasurementmatrix/` + wide-form parquet) -One Delta row per matrix; underlying wide table in `projection_/`. Predicate `project_id AND id` for the registry row; predicate `project_id` for the wide-form folder (the folder already scopes to one matrix). See `etl_wnm_exc_04_projection_matrix.ipynb`. +Use `write_projection_matrix(pmm_row, dense_matrix, output_root=OUTPUT_ROOT)` for `ProjectionMeasurementMatrix` rows; it computes `region_coverage` from the dense matrix and delegates to the registry-backed writer. Keep direct `write_deltalake` only for the underlying wide-form `projectionmeasurementmatrix//` parquet folders. See `etl_wnm_exc_04_projection_matrix.ipynb`. ### 5i. Membership vs mapping @@ -229,10 +212,10 @@ If the cells were not in the cohort that defined the taxonomy, write `CellToClus ### 5j. Parent propagation (`walk_ancestors`) -Every membership and mapping is parent-propagated: one row per (cell × ancestor) all the way up to the root. Use `walk_ancestors` from `write_utils.py`: +Every membership and mapping is parent-propagated: one row per (cell × ancestor) all the way up to the root. 
Use `walk_ancestors` from `io.write_utils`: ```python -from connects_common_connectivity.write_utils import walk_ancestors +from connects_common_connectivity.io.write_utils import walk_ancestors for ancestor_id, is_leaf in walk_ancestors(leaf_id, parent_by_child): ... # build one row, set probability/membership_score on the leaf only @@ -245,7 +228,7 @@ for ancestor_id, is_leaf in walk_ancestors(leaf_id, parent_by_child): ## 6. Building arrow tables ```python -from connects_common_connectivity.arrow_utils import ( +from connects_common_connectivity.io.arrow_utils import ( build_arrow_schema, models_to_table, attach_linkml_metadata, @@ -317,10 +300,10 @@ When two projects (different `project_id`) share a feature set (same `feature_se | Mistake | What goes wrong | Correct approach | |---|---|---| -| `write_deltalake(dataitem/, mode="overwrite", predicate="project_id=...")` | Wipes the entire partition, deleting the other dataset's cells | Use `append_new_dataitems` | -| Single-level predicate `project_id` on shared tables | Second notebook wipes first notebook's rows | Always use two-level predicate | -| `mode="append"` on registry tables (dataset, cellfeatureset, etc.) 
| Accumulates duplicate rows on every re-run | Use `mode="overwrite"` with predicate | -| `mode="append"` on association table without dedup check | Accumulates duplicate association rows | Check existing ids before appending | +| Calling `write_deltalake` directly for a registry-backed model table | Notebook-level predicate/partition drift from `io/write_spec.py` | Use `write_models(...)` | +| Hand-building `predicate=` / `partition_by=` for model writes | Scope bugs (row loss or accidental clobber) | Let `write_models` apply the registered scope | +| Writing `DataItem` with overwrite or plain append | Clobbers or duplicates within a project partition | Use `write_models([DataItem(...), ...])` (append_new_by_id) | +| Appending only delta associations in `_02`/`_03` notebooks | Partial reruns can leave missing links | Re-write the full `(project_id, dataset_id)` scoped association slice with `write_models` | | Raw string for enum slot (`modality="MORPHOLOGY"`) | Pydantic validation error | Use `Modality.MORPHOLOGY.value` | | Casting or reformatting id values | Ids won't match across tables | Use ids as-is from the source file | | Editing `models.py` directly | Changes lost on next schema regen | Edit the schema YAML, then regenerate | @@ -328,12 +311,12 @@ When two projects (different `project_id`) share a feature set (same `feature_se | Verifying with `project_id` filter only on a shared table | Asserts pass but row count is wrong (includes other dataset) | Always filter by both `project_id` and `dataset_id` (or `feature_set_id`) | | Positional `models_to_table(rows, ModelClass)` or `attach_linkml_metadata(table, "Cluster")` | Silent schema-construction error, opaque message | Use `schema=` and `linkml_class=` kwargs | | Setting `AlgorithmRun.produced_hierarchies = [hierarchy]` | Pydantic expects an inlined dict, not a list — validation error | Omit it; `ClusterHierarchy.run` carries the inverse link | -| `mode="overwrite"` on `clustermembership/` with predicate on 
`project_id` only | Wipes other hierarchies' rows for the same project | Use two-level predicate: `project_id AND hierarchy_id` | +| Manual overwrite on `clustermembership/` scoped only by `project_id` | Wipes other hierarchies' rows for the same project | Use `write_models` (`ClusterMembership` scope is `project_id AND hierarchy_id`) | | Writing `ClusterMembership` for cells not in the cohort that defined the taxonomy | Misrepresents provenance — they were classified, not members | Use `CellToClusterMapping` + a `MappingSet` row instead | --- ## 11. Known limitations -- **`HierarchyCategory` has no safe global write pattern today.** The table has no `project_id` and no `hierarchy_id` discriminator, and category ids (`class`, `subclass`, `cluster`) are intentionally shared across taxonomies. Predicate-scoped overwrite would clobber sibling taxonomies' rows; plain append collides on `id`. Current `_03` notebooks (`etl_minnie_03`, `etl_visp_met_types_01_cluster`) skip this write and flag a TODO. A global-dedup append helper is the planned fix. +- **`HierarchyCategory` rows are id-scoped global vocabulary rows.** Because ids like `class`, `subclass`, and `cluster` are shared across taxonomies, only write canonical shared definitions (same ids/meaning) via `write_models`. Do not invent taxonomy-specific category ids without a schema-level discriminator. - **`CellCellConnectivityLong` has no `connectome_id` discriminator.** Two example connectomes for the same project must live in separate folders (see §5g). Schema addition would let them share a folder. diff --git a/planning/20260623/ARCHITECTURE.md b/planning/20260623/ARCHITECTURE.md new file mode 100644 index 0000000..beb2651 --- /dev/null +++ b/planning/20260623/ARCHITECTURE.md @@ -0,0 +1,310 @@ +# IO Layer Architecture — write / read / validation + +Status: design agreed 2026-06-01. Implementation to be done by follow-up agents. +This document is the source of truth for the design. 
The runnable agent prompts +live in `planning/prompts/`. The task breakdown lives in `planning/TODO.md`. + +## Hard constraints (read before any work) + +The non-negotiable rules live in `prompts/00_shared_context.md` and are not restated here: +never edit `models.py` (generated) or `schemas/*.yaml` (ask YY first); the LinkML schema is +the single source of truth; all IO code lives under `src/connects_common_connectivity/io/`. +This document assumes those and adds the design on top. + +## What exists today (do not rebuild) + +- `models.py` — LinkML-generated pydantic v2 models. Key classes: + `DataSet`, `DataItem`, `DataItemDataSetAssociation`, `Cluster`, `ClusterHierarchy`, + `ClusterMembership`, `CellFeatureSet`, `CellFeatureDefinition`, `CellFeatureMatrix`, + `CellFeatureMeasurement`, `MappingSet`, `CellToCellMapping`, `CellToClusterMapping`, + `ClusterToClusterMapping`, `ProjectionMeasurementMatrix`, `BrainRegionAssociation`, + `ZarrDataset`, `ParquetDataset`. A `ProjectScoped` mixin supplies `project_id`. +- `arrow_utils.py` — `build_arrow_schema(model_cls)`, `models_to_table(models, schema)`, + `attach_linkml_metadata(table, linkml_class=...)`, `build_cell_feature_matrix_schema(...)`. + These already convert pydantic models → Arrow tables with LinkML metadata. **Reuse.** +- `write_utils.py` — `append_new_dataitems(path, table, project_id=...)` (id-deduped + append) and `walk_ancestors(leaf_id, parent_of)` (hierarchy denormalization). **Reuse; + the new writers wrap/generalize these rather than replacing them.** +- `parquet_loader.py` — `load_parquet_to_models(...)` (Parquet → models with a report). +- `cli.py` — LinkML `SchemaView`-based full validation (the `ccc` command). Kept as the + occasional heavyweight conformance check, **not** on the hot write path. +- `io/io_plans.md` — historical: two pre-existing design notes (now superseded). Both + are referenced below so the design history stays linkable. 
The source-tree file has + been deleted; what remained relevant moved into `planning/`: + - `populate_region_coverage(pmm, matrix)` — derives `region_coverage` from the dense + values **before** a matrix is written → a **write-side transform**. **Shipped** in + `io/write_utils.py`; the file's docstring is now the source of truth. + - `compare_region_coverage(pmms)` — summarizes overlap across already-written matrices → + **read/analysis**. **Deferred** to the read-side work; full spec moved to + `planning/prompts/_deferred/09_analysis.md`. + +## Target `io/` structure (clean is the goal) + +The existing IO files are scattered at the package root. The target is a single tidy `io/` +package; the existing modules are **relocated into it and become backends** the new files +call. "Do not rebuild" means *move and wrap, never reimplement*. + +``` +src/connects_common_connectivity/ + models.py # generated, UNTOUCHED, stays at root + cli.py # CLI entry point, stays at root; full LinkML conformance check + config.py # NEW package-wide Settings (output_root, dry_run, ...) — see below + io/ + __init__.py # NEW curated public API (what users import); __all__ + docstring + write_spec.py # NEW registry — source of truth + write_validation.py# NEW auto-derived strict submodels (write-safety validation) + arrow_utils.py # MOVED from root (no rename) (models <-> Arrow conversion) + writers.py # NEW write_models() + write_projection_matrix() + write_utils.py # MOVED from root (append-by-id backend, walk_ancestors, + # populate_region_coverage) + # --- deferred (see "Later — elaborations"; designs kept, not built yet) --- + parquet_loader.py # MOVED from root (PURE MOVE, not folded into readers) + readers.py # NEW predicate-based + cross-dataset reads +``` + +`config.py` lives at the **package root**, not in `io/`: configuration is package-wide +(`cli.py` and future plotting/analysis code read it too), so the general name belongs in the +general namespace next to `models.py`. 
Conversely the io validator is named +`write_validation.py`, not `validation.py`: it is specifically write-safety validation +coupled to `write_spec`, and the bare word "validation" is already claimed by `cli.py`'s +LinkML conformance check — two different validations, so neither owns the generic name. + +Seed-stage modules are NOT split out prematurely. `populate_region_coverage` is **not** a +separate "transforms" module — it lives in `write_utils.py` as a helper the projection +writer calls (it's write plumbing, like `append_new_dataitems`). Read-side +`compare_region_coverage` is deferred entirely (see "Later — elaborations"). + +Module placement summary (the operational "how to move them" lives in +`prompts/03_writers.md` so it is not restated in three places): +- `arrow_utils.py`, `write_utils.py` → `io/` as backends to `writers.py` (W3). +- `parquet_loader.py` → `io/parquet_loader.py` is a **pure move, deferred** with the + read-side work; do NOT move it now. +- `cli.py` stays at the package root as the `ccc` entry point; it owns the occasional full + LinkML conformance check (separate from `io/write_validation.py`, which is the fast + write-path check). +- `config.py` is NEW at the package root (package-wide settings; see structure note above). +- `models.py` stays at root, generated, never edited. + +Migration safety: while notebooks are being migrated, the moved modules may keep one-line +re-export shims at their old import paths (e.g. `from .io.arrow_utils import *`) so nothing breaks +mid-transition. Shim removal is a tracked task (TODO W6), gated by a test that asserts no +old import path is referenced anywhere once migration is complete — otherwise the two import +paths linger and become exactly the clutter this redesign removes. 
+ +## The bug this design fixes + +In every `_01_dataset_dataitem` notebook the DataSet is written with: + +```python +write_deltalake(root+"dataset/", table_ds, mode="overwrite", + predicate=f"project_id = '{PROJECT_ID}'", partition_by=["project_id"]) +``` + +`visp_exc_patchseq` and `visp_inh_patchseq` **share** `project_id = 'visp_patchseq'` but +have different `dataset_id`. So writing the inhibitory dataset overwrites the excitatory +dataset's row (and vice versa). The association write already does the right thing +(`predicate = "project_id = '...' AND dataset_id = '...'"`). The fix is structural: the +correct scope columns must come from a **per-class registry**, not be retyped by hand in +each notebook. `DataSet`'s scope is `(project_id, id)`; the association's scope is +`(project_id, dataset_id)`; `DataItem` is append-by-id; etc. + +## Design overview + +One registry entry per class is the hub. It drives four things so they can never drift +apart: partitioning, the overwrite predicate (scope columns), which slots are required +for a safe write, and the auto-derived strict validator. 
+ +``` + ┌─────────────────────────────┐ +LinkML schema ──▶│ models.py (generated) │ + └─────────────────────────────┘ + │ read-only + ▼ + ┌───────────────────────────────────────────────┐ + │ write_spec registry (one entry per class) │ + │ partition_by · scope_columns · write_mode · │ + │ required_for_write · cross_field_rules │ + └───────────────────────────────────────────────┘ + │ │ │ + ▼ ▼ ▼ + validation write module read module + (strict submodel (write_models + (predicate-based + + derived per write_projection flexible cross-dataset + class) _matrix) reads) + │ + ▼ + Settings (global output_root) +``` + +## Module 1 — `config.py` (package root; discovered config file) + +Decision: settings live in a **declarative, version-controlled `ccc_config.yaml`** at the +repo root, discovered by walking up from the working directory (the `pyproject.toml` / +`ruff` / `pytest` pattern) and loaded into a validated pydantic `Settings`. No `%run`, no +process-global mutation, no per-notebook setup. No new dependency (pydantic + PyYAML, the +latter already in the tree via LinkML). + +```yaml +# ccc_config.yaml (repo root — the ONE place values live) +output_root: ../scratch/em_patchseq_wnm_v1/ +dry_run: false +``` + +```python +class Settings(BaseModel): + output_root: Path # required, no default + dry_run: bool = False + # room for more knobs (schema_version_pin, ...) 
later + +@lru_cache +def get_settings() -> Settings: + path = find_config_file("ccc_config.yaml") # walk cwd → parents + if path is None: + raise RuntimeError("No ccc_config.yaml found — create one at the repo root " + "with output_root: ...") + data = yaml.safe_load(path.read_text()) + if env := os.environ.get("CCC_OUTPUT_ROOT"): # developer escape hatch, path only + data["output_root"] = env + return Settings(**data) +``` + +Resolution precedence: **explicit `settings=` arg (per call) > `CCC_OUTPUT_ROOT` env > +`ccc_config.yaml` > error.** The file is the source of truth and is validated by pydantic on +load; the env var is a subordinate developer override for `output_root` only (it cannot +express structured knobs like `dry_run`). There is no built-in default path — a missing file +fails loudly rather than writing somewhere arbitrary. `get_settings()` is a pure, cached +function of the filesystem (clearable in tests), not a mutable global. + +How the ETL uses it (kills the per-notebook setup): there is no config cell at all. A +notebook just imports and calls `write_models(...)` / `read_dataset(...)`; the library +discovers `ccc_config.yaml` on its own. Writers/readers do `settings = settings or +get_settings()`. To repoint local vs CodeOcean, edit the one file (or set `CCC_OUTPUT_ROOT`). +A `table_path(settings, "dataset")` helper resolves per-table subdirectories so nothing +concatenates path strings. + +## Module 2 — `write_spec.py` (the registry) + +An explicit, hand-maintained lookup, one entry per writable class. **Build it like a +prototype, not a derivation.** Do not assume every class is scoped-overwrite-with-predicate; +that pattern fits DataSet/Association, but `append_new_by_id` already exists for DataItem +because append was the right behavior there, and other classes may want append or modes we +haven't named yet. 
For each class, write a small real example in a notebook *first*, see how +it actually wants to be written, and let that experience set the entry. The registry is then +the source of truth, cross-checked against the schema for drift (class names and +`project_id`/identifier slots must exist in the generated models). + +Each entry declares: + +- `subdir` — Delta table subdirectory under `output_root` (e.g. `"dataset"`). +- `partition_by` — Delta partition columns (e.g. `["project_id"]`). +- `scope_columns` — for scoped-overwrite classes, the columns that define the predicate (the + identity within the shared table). DataSet → `["project_id", "id"]`; + DataItemDataSetAssociation → `["project_id", "dataset_id"]`. May be empty/N-A for + append-mode classes. +- `write_mode` — a small open vocabulary, not a fixed binary: `"overwrite_scoped"`, + `"append_new_by_id"` (the `append_new_dataitems` behavior), and whatever else the + prototyping surfaces. New modes are added when a class's example shows the existing ones + don't fit — `write_mode` is a `Literal` we extend, not a constraint to force classes into. +- `required_for_write` — slots that must be present/non-null for a safe write (may be + stricter than the schema's own `required`). +- `cross_field_rules` — names of cross-field checks to attach to the strict validator + (validation is layered in after the write path works; see ordering). + +For `overwrite_scoped`, the predicate is built from `scope_columns` + the row values, e.g. +`"project_id = 'visp_patchseq' AND id = 'visp_exc_patchseq'"`. This is exactly the bug +fix: DataSet now carries `id` in its scope. + +## Module 3 — `io/write_validation.py` (auto-derived strict submodels) + +Built **after** the write path works (priority order: config → write IO → validation). The +writers ship first with a pass-through validation hook; this module swaps the real validator +into that hook. + +Decision: **auto-derived** strict submodels — single source of truth. 
+ +`strict_model_for(cls)` takes the generated pydantic model and returns a subclass that +(a) flips each slot in the registry's `required_for_write` to required, and (b) attaches +the registry's `cross_field_rules` as pydantic `model_validator`s. No field definitions +are restated; everything is read from `models.py` + the registry. `models.py` is never +touched. Validation runs on the hot write path (fast, pydantic-only, **no I/O**). The +LinkML/`cli.py` validator remains the separate, occasional full-conformance check. + +Hot-path validation is purely structural: required-slot enforcement plus pure cross-field +rules that only inspect the model in hand. **Referential checks that read other tables do +NOT belong on the hot path.** Example: "an association's `dataset_id` must refer to a +DataSet already present for that `project_id`" requires a reader, so it is an opt-in check +(`write_models(..., check_refs=True)`) deferred with the read-side work (it needs a reader), +not a strict-submodel validator. This keeps validation free of any dependency on readers. + +## Module 4 — `writers.py` (+ `io/write_utils.py`, `io/arrow_utils.py`) + +A single dispatch core, no per-class wrappers: + +- `write_models(models, *, settings=None) -> WriteResult` — infers the class, looks up + the registry, converts via `io/arrow_utils.py`, attaches LinkML metadata, then writes + per `write_mode` (scoped overwrite with the registry-built predicate, `append_new_by_id` + via the backend, `wide_parquet` for `CellFeatureMatrix`). It calls a **validation hook** + before writing; in the write-IO phase that hook is a pass-through, and Module 3 (built + afterward) swaps in the real strict validator with no restructuring. +- **No `write_dataset` / `write_dataitem` / `write_association` / etc. wrappers.** + `write_models()` infers the class from its argument; renaming it per class adds no + behavior, only drift surface. 
Discoverability is provided by + `WRITABLE_CLASSES = tuple(s.model_cls for s in REGISTRY.values())` plus + `write_models`'s docstring. +- `write_projection_matrix(pmm, matrix, *, settings=None) -> WriteResult` is the **one** + non-`write_models` public writer, justified because its signature is non-uniform (it + takes the dense matrix for `populate_region_coverage` enrichment before delegating to + `write_models`). No other exceptions — if a future class needs pre-write enrichment, the + caller does the enrichment and then calls `write_models`. +- `io/write_utils.py` (moved from root): `append_new_dataitems` is the `append_new_by_id` + backend; `walk_ancestors` is used by membership/mapping writers; `populate_region_coverage` + (ported from the now-deleted `io_plans.md`) is the pre-write projection helper. `write_projection_matrix` + calls `populate_region_coverage` (or accepts an already-enriched matrix). Keep it a pure + function (no IO, no mutation of input). Generalize `append_new_dataitems` only if needed + (e.g. parametrize the partition column) without breaking callers. Rationale: this is write + plumbing the projection writer needs — same shelf as `append_new_dataitems` — not a + separate "transforms" concern. + +Wide feature matrices (`CellFeatureMatrix`) stay inside the registry under +`write_mode = "wide_parquet"`; `write_models` dispatches them through +`build_cell_feature_matrix_schema` (in `io/arrow_utils.py`) and a Parquet write. + +## Later — elaborations (deferred; design kept, not built yet) + +These are **not actionable in this round.** Priority now is config → write IO → validation → +notebook migration. 
Once the write path is solid and notebooks are migrated, revisit: + +- **Readers** (`io/readers.py`): predicate-based readers mirroring the write spec + (`read_dataset`, `read_dataitem`, `read_features` scoped by `project_id`/`dataset_id`), + plus flexible cross-dataset reads now that datasets share tables — flagship: "all DataItems + with either a ClusterMembership or a CellToClusterMapping to a given cluster set." Users can + always drop to raw `polars.read_delta`; readers are conveniences, not a wall. When this + starts, `parquet_loader.py` is **moved** to `io/parquet_loader.py` (pure move, not folded) + and used as the typed-read backend. +- **Read-side analysis**: `compare_region_coverage(pmms)` summarizes shared vs + exclusive region coverage across already-written matrices; it only reads finished data + (spec in `planning/prompts/_deferred/09_analysis.md`). +- **Opt-in referential check** (`write_models(..., check_refs=True)`): needs a reader, so it + rides with the read-side work. + +## Notebook migration (no logic, no schema, no models.py changes) + +For each ETL notebook: delete hardcoded `OUTPUT_ROOT` (no config cell — the library +discovers `ccc_config.yaml`), replace +direct `write_deltalake(...)` calls with the typed writers, and delete the per-cell +`mode`/`predicate`/`partition_by` bookkeeping (now owned by the registry). Verification +cells stay. The `visp_*_patchseq` bug is fixed automatically once DataSet writes go +through the registry (scope = project_id + id). Confirm exc + inh DataSet rows coexist +after a re-run as the migration's acceptance test. + +## Testing + +- Registry-vs-schema drift test (class names + scope/identifier slots exist in models). +- Idempotency: writing the same models twice yields no duplicates and no row loss. +- Shared-partition safety: writing dataset B does not remove dataset A's rows when they + share a `project_id` (the patchseq regression test). 
+- Per-class write example: every writable class has a small notebook example exercising its + registry entry (the prototyping evidence behind its `write_mode`/`scope_columns`). +- Strict-validator tests: missing `required_for_write` slot or failing cross-field rule + raises before any write touches disk (added with Module 3). +- Round-trip (write → read back → equality on scope columns): deferred with readers. diff --git a/planning/20260623/PR_message.md b/planning/20260623/PR_message.md new file mode 100644 index 0000000..5575892 --- /dev/null +++ b/planning/20260623/PR_message.md @@ -0,0 +1,58 @@ +# IO layer: write path + validation + +Ships the curated `connects_common_connectivity.io` write path end-to-end: package-wide configuration, a registry-driven write API, write-time validation derived from that same registry, ETL notebook migration to the new API, and the test suite to back it. + +## Design: WriteSpec as the single source of truth + +The `WriteSpec` registered per writable class is one declaration that drives both Delta dispatch (subdir, partitioning, scope columns, write mode) and write-time validation (`required_for_write` slots are flipped non-optional in auto-derived strict submodels and re-validated before any IO). Generated `models.py` is never touched. + +## Configuration + +- New `connects_common_connectivity.config`: pydantic `Settings`, cached `get_settings()`, walk-up discovery of `ccc_config.yaml`, plus `output_root()` / `table_path()` helpers. Relative values anchor at the config file's directory via `os.path.abspath` (avoids Code Ocean's `scratch -> /scratch` symlink). +- Precedence: explicit arg > `CCC_OUTPUT_ROOT` env > `ccc_config.yaml` > error. +- Repo-root `ccc_config.yaml` seeded. + +## Write registry and dispatch + +- `io/write_spec.py`: `WriteSpec`, `REGISTRY` (14 entries), `get_spec()`. 
+- `io/writers.py`: `write_models()` single-dispatch over the registry (no per-class wrappers), frozen `WriteResult` dataclass, `WRITABLE_CLASSES` tuple. `write_projection_matrix()` is the only non-`write_models` writer, justified by its non-uniform signature (dense matrix + model). +- `populate_region_coverage()` added in `io/write_utils.py`; derives `region_coverage` from the dense values before write. +- `DataSet` scope widened to `(project_id, id)` so patchseq exc/inh `DataSet` rows coexist (today's predicate-only-on-`project_id` behavior would overwrite one with the other). + +## Write-time validation + +- `io/write_validation.py`: `strict_model_for(cls)` flips `WriteSpec.required_for_write` slots to non-optional and strips `Optional` from those annotations (cached per class, no mutation of generated `models.py`). `validate_for_write()` re-validates instances and raises `ValueError` naming the missing slots before any IO. Wired into `write_models`. +- `required_for_write` populated for `Cluster`, `ClusterMembership`, `CellFeatureDefinition`. + +## Public API surface + +- Curated `io/__init__.py` re-exports pinned by `__all__`: `get_settings`, `Settings`, `table_path`, `write_models`, `write_projection_matrix`, `WriteResult`, `WRITABLE_CLASSES`. +- Per-call `output_root=` keyword on `write_models()` / `write_projection_matrix()` (mutually exclusive with `settings=`) so a single notebook can redirect its writes without mutating process-global config. +- `Modality.CALCIUM_IMAGING` added (for functional correlations in microns or v1dd-like datasets with EM + CI experiments). +- Removed `connects_common_connectivity.arrow_utils` / `connects_common_connectivity.write_utils` re-export shims; `arrow_utils.py` and `write_utils.py` now live exclusively under `io/`. + +## ETL notebook migration + +- Every registry-backed class is now exclusively written through `write_models` / `write_projection_matrix` in the ETL notebooks. Hand-rolled `write_deltalake` calls for those classes were migrated. 
Per-notebook imports trimmed. +- Hardcoded `OUTPUT_ROOT = "../scratch/..."` strings replaced with `output_root()`. +- Patchseq exc/inh regression covered (see `DataSet` scope fix above). + +## Tests + +- Shared `tests/conftest.py` foundations (settings/cache/cwd isolation + shared fixtures); duplicated helpers removed. +- Tightened exception assertions to specific classes with meaningful `match=` checks. +- High-signal regression assertion messages where failures are otherwise hard to diagnose; list-validation failures now include row context. +- Per-class smoke parametrized over `WRITABLE_CLASSES`; registry-drift guard; no-shim regression (`test_shim_modules_deleted`, `_not_importable`, `_no_source_references_shim_paths`). +- Closed coverage gaps: CLI behavior, parquet loader contract, predicate escaping edge cases, relocation scan roots, dry-run semantics. +- Patchseq regression, idempotency, append-new-by-id, predicate construction, `output_root=` override, strict-validation failures, public-API surface. + +## Not in this PR + +- Wide cell-feature / projection-matrix parquet writes (still use `write_deltalake` directly). +- `CellCellConnectivityLong` — no registry entry yet; the `write_cellcellconnectivitylong` stub in `io/writers.py` documents the migration plan. +- The new `etl_v1dd_01` dataset-ingestion prototype, which is being developed in parallel. +- A `merge_by_id` (read-existing → union → overwrite) write mode for shared scopes like `(visp_patchseq, visp_inh_patchseq)` where multiple notebooks contribute disjoint subsets. The union is currently inlined in patch-seq / WNM notebooks; see `planning/multi_writer_scope_design.md` for the draft design discussion. + +## Verification + +`uv run pytest -q` → 160 passed. 
diff --git a/planning/20260623/README.md b/planning/20260623/README.md new file mode 100644 index 0000000..d947d1e --- /dev/null +++ b/planning/20260623/README.md @@ -0,0 +1,29 @@ +# planning/ — IO layer design & agent prompts + +How we're building the user-friendly IO layer (write / read / validation) for +ConnectsCommonConnectivity. Created 2026-06-01. + +- `ARCHITECTURE.md` — the design (source of truth). +- `TODO.md` — ordered, flat task list (W1–W8). +- `prompts/` — one prompt per work item. `00_shared_context.md` is prepended to every other + prompt and holds the **hard rules** (don't edit `models.py` or `schemas/*.yaml`). +- `prompts/_deferred/` — designs kept for reference; not actionable this round. + +## TODO ↔ prompt map + +| TODO | Prompt | What it owns | +|------|-------------------------------------|-------------------------------------------------| +| W1 | `01_config.md` | `config.py` + `ccc_config.yaml` discovery | +| W2 | `02_write_spec.md` | Registry (seed 3 classes) + drift test | +| W3 | `03_writers.md` | Relocation, writers, per-class prototyping | +| W4 | `04_public_api.md` | `io/__init__.py` curated surface | +| W5 | `05_validation.md` | Strict submodels + hook swap | +| W6 | `06_notebook_migration.md` | Migrate notebooks, regression, shim removal | +| W7 | `07_tests.md` | Write-side suite gaps | +| W8 | (no prompt) | README / usage docs update | +| L1 | `_deferred/08_readers.md` | Readers (deferred) | +| L2 | `_deferred/09_analysis.md` | Read-side analysis + `check_refs` (deferred) | + +## How to run an item +Hand the implementing agent `00_shared_context.md` + the specific prompt, point it at +`ARCHITECTURE.md`, and follow the order in `TODO.md`. diff --git a/planning/20260623/TODO.md b/planning/20260623/TODO.md new file mode 100644 index 0000000..17e96e0 --- /dev/null +++ b/planning/20260623/TODO.md @@ -0,0 +1,109 @@ +# IO Layer — TODO + +Flat, ordered list. One row per prompt; sub-tasks live in the prompts. 
Design lives in +`ARCHITECTURE.md`. Hard rules: see `prompts/00_shared_context.md`. + +**Priority for this round: W1 → W7. Readers and analysis are deferred.** + +## This round (write path → migration → tests) + +- [x] **W1 — Config** (`prompts/01_config.md`) — `config.py` at the package root, pydantic + `Settings` loaded from a discovered `ccc_config.yaml` (walk-up like `pyproject.toml`), + cached `get_settings()`, `table_path()` helper, plus `output_root()` convenience that + returns the path relative to cwd (notebooks in `code/` see `../scratch/...`). Relative + values in the file are anchored at the config file's directory using `os.path.abspath` + (not `Path.resolve`, so Code Ocean's `scratch -> /scratch` symlink isn't followed). + Precedence: explicit arg > `CCC_OUTPUT_ROOT` env > `ccc_config.yaml` > error. No + `configure()` global, no `%run`. Re-exported from `io/__init__.py`. + `ccc_config.yaml` seeded at repo root with `output_root: scratch/em_patchseq_wnm_v1/`. + Tests: `tests/test_config.py` (14 tests, all passing). +- [x] **W2 — Write spec registry (seed only)** (`prompts/02_write_spec.md`) — + `io/write_spec.py`: `WriteSpec` pydantic model, `REGISTRY` seeded with **exactly three** + entries (`DataSet`, `DataItem`, `DataItemDataSetAssociation`), `get_spec()` lookup, and + the drift test (`tests/test_write_spec.py`). `required_for_write` and + `cross_field_rules` left empty — W5 owns those. The remaining classes are W3's job. +- [x] **W3 — Writers + relocation + registry expansion** (`prompts/03_writers.md`) — + Moved `arrow_utils.py`/`write_utils.py` into `io/` (re-export shims at old paths, to + be removed in W6). Built `io/writers.py`: `write_models()` dispatch, `WriteResult` + frozen dataclass, `WRITABLE_CLASSES` discovery tuple, pass-through `_validation_hook` + for W5 to swap, plus `write_projection_matrix()` (the one non-`write_models` public + writer, justified by its non-uniform signature). 
**No per-class wrappers** — + `write_models` infers the class. `populate_region_coverage` landed in + `io/write_utils.py`. Registry expanded to 12 entries (added `Cluster`, + `ClusterHierarchy`, `ClusterMembership`, `MappingSet`, `CellToClusterMapping`, + `CellFeatureSet`, `CellFeatureDefinition`, `CellFeatureMatrix`, + `ProjectionMeasurementMatrix`); `CellToCellMapping` / `ClusterToClusterMapping` / + `AlgorithmRun` deferred (no notebook writes them this round). **Deviation:** did + **not** add `wide_parquet` mode — the wide cell-feature Parquet is built from raw + dataframes that don't fit `WriteSpec`'s shape; `CellFeatureMatrix` stays as + `overwrite_scoped` for its metadata-pointer rows. Revisit when the wide-matrix + contract is clarified. Tests: `tests/test_writers.py`, `tests/test_write_relocation.py` + (full suite 119 passing). +- [x] **W4 — Public API** (`prompts/04_public_api.md`) — `io/__init__.py`: curated + re-exports + `__all__` (`get_settings`, `Settings`, `table_path`, `write_models`, + `write_projection_matrix`, `WriteResult`, `WRITABLE_CLASSES`). Module docstring + with usage example, `# TODO(W8): reader exports` placeholder. Test: + `tests/test_public_api.py`. +- [x] **W5 — Write validation** (`prompts/05_validation.md`) — `io/write_validation.py`: + `strict_model_for(cls)` flips `required_for_write` to required + strips `Optional` + from those annotations (cached per class, no `models.py` mutation); + `validate_for_write()` re-validates instances and raises `ValueError` naming the + missing slots before any IO. Wired into `write_models` (replaces the W3 + pass-through hook). Populated `required_for_write` for `Cluster`, + `ClusterMembership`, and `CellFeatureDefinition` (the only entries whose + predicate / partition columns are `Optional` in the generated schema). Tests: + `tests/test_write_validation.py`. Cross-field rules deferred (still empty list + on every spec). 
+- [x] **W6 — Notebook migration** (`prompts/06_notebook_migration.md`) — Every ETL + notebook now routes registry-backed writes through `write_models()` / + `write_projection_matrix()`; hardcoded `OUTPUT_ROOT = "../scratch/..."` strings + replaced with `output_root()` from `config`. Patchseq regression covered (exc and + inh `DataSet` rows coexist via `scope=["project_id", "id"]`). W3 re-export shims + removed; nothing imports the old `connects_common_connectivity.arrow_utils` / + `write_utils` paths. **Carve-outs (deferred, tracked elsewhere):** + (a) Wide cell-feature and wide projection-matrix parquets in + `etl_minnie_02`, `etl_visp_exc_patchseq_02`, `etl_visp_inh_patchseq_02`, + `etl_wnm_exc_02`, and `etl_wnm_exc_04` (ipsi/contra) still call + `write_deltalake` directly — `wide_parquet` mode not yet in the registry + (W3 deviation; revisit when the wide-matrix contract is clarified). + (b) `CellCellConnectivityLong` writes in `etl_minnie_04` (cells 19, 25) — + class not in the registry; `writers.py` has a `write_cellcellconnectivitylong` + stub documenting the migration plan. + (c) `etl_v1dd_01_v1196` cell 12 wide-parquet stub (still a `# TODO` placeholder). 
+- [x] **W7 — Write-side test suite** (`prompts/07_tests.md`) — Coverage verified against + the prompt's gap list: (1) per-class smoke for every `WRITABLE_CLASSES` entry via + `tests/test_writers.py::test_round_trip_each_writable_class` (parametrized, auto-covers + the 14 registered classes including post-hoc `AlgorithmRun` / `HierarchyCategory`); + (2) no-shim regression in `tests/test_write_relocation.py` + (`test_shim_modules_deleted`, `test_shim_modules_not_importable`, + `test_no_source_references_shim_paths`); (3) registry drift in + `tests/test_write_spec.py`; (4) patchseq regression / idempotency / append-new-by-id / + predicate construction / per-call `output_root=` override in `tests/test_writers.py`; + (5) strict-validation failures in `tests/test_write_validation.py`; (6) public-API + surface in `tests/test_public_api.py`. Full suite green: `uv run pytest -q` → 160 + passed. +- [x] **W8 — README / usage docs** — Update README for the write API. No prompt; small task. + Ask before large edits. + +## Deferred (do not start; design kept for reference) + +Designs live in `ARCHITECTURE.md` and `prompts/_deferred/`. Pick up only after W1–W7 land. + +- **L1 — Readers** (`prompts/_deferred/08_readers.md`) — `io/readers.py` (predicate-based + + cross-dataset). `parquet_loader.py` is **moved** to `io/parquet_loader.py` (pure move, + not folded) when this starts. +- **L2 — Read-side analysis + opt-in referential check** + (`prompts/_deferred/09_analysis.md`) — `compare_region_coverage` and + `write_models(..., check_refs=True)`. + +## Decisions locked (2026-06-01) +- Config: declarative `ccc_config.yaml` at repo root, discovered by walk-up, validated by + pydantic; no per-notebook setup, no `%run`, no global. Precedence: explicit arg > env + (escape hatch) > file > error (no default path). `config.py` at package root, not `io/`. 
+- Write spec: explicit registry, prototyped per class via notebook examples; `write_mode` is + an open vocabulary, not a forced overwrite assumption. +- `populate_region_coverage` lives in `write_utils.py` (write plumbing), not a transforms module. +- Validation: built after the write path; auto-derived strict submodels; structural-only, no I/O. + Named `io/write_validation.py` (cli owns the generic LinkML conformance check). +- Readers, analysis, referential check: deferred. `parquet_loader.py` is a pure move, not a fold. +- Public surface is `io/__init__.py`. Scope of this session: planning docs + prompts only. diff --git a/planning/20260623/prompts/00_shared_context.md b/planning/20260623/prompts/00_shared_context.md new file mode 100644 index 0000000..fc6da43 --- /dev/null +++ b/planning/20260623/prompts/00_shared_context.md @@ -0,0 +1,30 @@ +# Shared context — prepend to every IO-layer agent prompt + +You are working in the `ConnectsCommonConnectivity` repo (LinkML+pydantic schema for +multi-scale connectomics). **Read `planning/20260623/ARCHITECTURE.md` before starting** — it owns +the design, the existing-file inventory, the target `io/` layout, and the motivating bug. +This file is only the rules of the room. + +## Non-negotiable rules +1. **Never edit `src/connects_common_connectivity/models.py`** — auto-generated from + `schemas/*.yaml`. Read-only. +2. **Never edit `schemas/*.yaml`** without explicit written permission from YY. If your + task seems to need a new slot, STOP and report what you need and why. +3. **Single source of truth = the LinkML schema / generated models.** Read field + definitions from `models.py`; never restate them. +4. **IO code lives under `src/connects_common_connectivity/io/`.** Write-side root + modules (`arrow_utils.py`, `write_utils.py`) are MOVED there and wrapped as backends in + W3 — never reimplemented. `parquet_loader.py` is a **deferred move** that rides with the + read-side work; do NOT relocate it during W1–W7.
`cli.py` and `models.py` stay at root; + so does `config.py` (package-wide settings, not IO-specific) and plotting stays in + `code/utils.py`. Exact layout: ARCHITECTURE.md → "Target io/ structure". +5. When you move a module, leave a one-line re-export shim at its old path until notebook + migration is done, so nothing breaks mid-transition. + +## Conventions +- Python 3.10+, pydantic v2; polars + pyarrow + deltalake (already deps). No new deps + without asking. +- Match existing style (ruff, line-length 100); docstring like the existing modules. +- Add `pytest` tests under `tests/` for anything you implement. +- Run the relevant tests and report results. Never mark work done with failing tests or a + partial implementation. diff --git a/planning/20260623/prompts/01_config.md b/planning/20260623/prompts/01_config.md new file mode 100644 index 0000000..83a9d86 --- /dev/null +++ b/planning/20260623/prompts/01_config.md @@ -0,0 +1,56 @@ +# Agent prompt — Config module (discovered config file) + +> Prepend `00_shared_context.md`. + +## Goal +Create `src/connects_common_connectivity/config.py` — at the **package root**, next to +`models.py` and `cli.py`, NOT in `io/`. Configuration is package-wide (cli and future +plotting/analysis read it too), so the general name belongs in the general namespace. +Settings live in **one declarative, version-controlled file** (`ccc_config.yaml`) that every +entry point discovers automatically — no per-notebook setup, no `%run`, no process-global +mutation. The library holds the *mechanism* and validates the file via pydantic; the +*values* live in `ccc_config.yaml` at the repo root. **No new dependency** — plain pydantic +`BaseModel` + PyYAML (already in the tree via LinkML). + +## Requirements +1. A `Settings(BaseModel)`: + - `output_root: Path` (required, no default). + - `dry_run: bool = False`, and room for more knobs (`schema_version_pin`, ...) later. + - **No built-in default output path.** The value comes from the config file. 
+ - `describe()` / `__repr__` printing the resolved config. +2. **File discovery + typed load (the key piece):** + - `find_config_file(start: Path | None = None) -> Path | None` walks up from `cwd` + (or `start`) to the filesystem root looking for `ccc_config.yaml` — same pattern as + `pyproject.toml`/`ruff`/`pytest`. This is what lets a notebook in `code/` find the + repo-root config with zero config code. + - `get_settings() -> Settings` (cache with `functools.lru_cache`): + 1. find `ccc_config.yaml`; if none found, **raise a clear, actionable error** + (`"No ccc_config.yaml found — create one at the repo root with output_root: ..."`). + 2. `yaml.safe_load` it and construct `Settings(**data)` (pydantic validates here). + 3. **Developer escape hatch:** if `CCC_OUTPUT_ROOT` env is set, override + `output_root` with it (env wins over the file, for the path only — it cannot + express other knobs). Document it as override-only, not the primary path. + - Precedence overall: **explicit `settings=` arg (handled by callers) > `CCC_OUTPUT_ROOT` + env > `ccc_config.yaml` > error.** + - Provide a way to clear the cache for tests (e.g. expose `get_settings.cache_clear`). +3. `table_path(settings: Settings, table: str) -> Path` joins `output_root / table` (e.g. + `"dataset"`, `"dataitem"`, `"dataitem_dataset_association"`) using the exact subdir names + in the notebooks, so nothing concatenates path strings. +4. Export `Settings`, `get_settings`, `table_path` from `config.py` (and re-exported from + `io/__init__.py` for convenience). `io/` imports them via `from ..config import ...`. + Do NOT add a `configure()` process-global setter — discovery replaces it. + +## Tests (`tests/test_config.py`) +- `get_settings()` raises the actionable error when no `ccc_config.yaml` is discoverable. +- A `ccc_config.yaml` in a tmp dir is discovered from a nested cwd and loaded/validated. +- `CCC_OUTPUT_ROOT` env overrides only `output_root`; `dry_run` still comes from the file. 
+- An explicit `settings=` passed to a caller wins over both. +- `table_path` joins correctly and returns a `Path`. + +## Do not +- Add a built-in default output path, a `configure()` global, or `%run`-style coupling. Add + any dependency beyond pydantic + PyYAML. Touch `models.py` or schemas. + +## Report +List the subdir names found in the notebooks (for `table_path`) and the `output_root` +value you put in `ccc_config.yaml`. diff --git a/planning/20260623/prompts/02_write_spec.md b/planning/20260623/prompts/02_write_spec.md new file mode 100644 index 0000000..750fcb1 --- /dev/null +++ b/planning/20260623/prompts/02_write_spec.md @@ -0,0 +1,70 @@ +# Agent prompt — Write spec registry (seed only) + +> Prepend `00_shared_context.md`. Depends on nothing (reads generated models). + +## Goal +Create `src/connects_common_connectivity/io/write_spec.py` with the `WriteSpec` shape, a +`REGISTRY` seeded with **exactly three** entries (DataSet, DataItem, +DataItemDataSetAssociation), a `get_spec()` lookup, and a drift test. + +This prompt is the **minimum** needed to unblock W3. The remaining classes are added during +W3, where the writer exists to prototype against — see `03_writers.md` for that loop. 
+ +## `WriteSpec` shape +Pydantic v2 `BaseModel` (the rest of the codebase uses pydantic — match it): + +```python +class WriteSpec(BaseModel): + model_cls: type # the generated pydantic class + subdir: str # Delta subdir under output_root + partition_by: list[str] # Delta partition columns + scope_columns: list[str] # columns defining the predicate + # (or the id column for append_new_by_id) + write_mode: Literal["overwrite_scoped", "append_new_by_id"] # extend in W3 if needed + required_for_write: list[str] = [] # leave empty here; W5 owns this + cross_field_rules: list[str] = [] # leave empty here; W5 owns this +``` + +Notes: +- `scope_columns` does double duty: for `overwrite_scoped` it's the predicate; for + `append_new_by_id` it's the id column(s) the backend dedupes on. One field, two + interpretations dispatched on `write_mode`. +- `required_for_write` and `cross_field_rules` are owned by W5 (validation). Leave them as + empty lists for the seed entries; do not guess. + +Expose: +- `REGISTRY: dict[str, WriteSpec]` keyed by class name (`"DataSet"`, etc.). +- `get_spec(model_or_cls) -> WriteSpec` — accepts a class or an instance. + +## Seed exactly these three + +| class | subdir | partition_by | scope_columns | write_mode | +|---|---|---|---|---| +| `DataSet` | `dataset` | `["project_id"]` | `["project_id", "id"]` ← patchseq fix | `overwrite_scoped` | +| `DataItem` | `dataitem` | `["project_id"]` | `["id"]` | `append_new_by_id` | +| `DataItemDataSetAssociation` | `dataitem_dataset_association` | `["project_id"]` | `["project_id", "dataset_id"]` | `overwrite_scoped` | + +The subdir names must match the existing notebook paths (grep +`code/etl_*_01_dataset_dataitem.ipynb` for `write_deltalake(` to confirm). The DataSet +scope is the patchseq fix — today's notebooks predicate only on `project_id`, which is why +`visp_inh_patchseq` overwrites `visp_exc_patchseq`. 
+ +**Do NOT add any other classes here.** `Cluster`, `ClusterMembership`, `MappingSet`, +`CellFeatureSet`, `CellToClusterMapping`, `ProjectionMeasurementMatrix`, `CellFeatureMatrix`, +etc. are W3's responsibility, added one at a time as their write examples are prototyped. + +## Drift test (`tests/test_write_spec.py`) +- Every `REGISTRY` key resolves to a real class in `models.py` (importable, `model_cls` + matches the key). +- For each entry, every name in `scope_columns + partition_by + required_for_write` is a + field on `model_cls` (check `model_cls.model_fields`). Fail with the offending + class/field name. +- `get_spec(SomeClass)` and `get_spec(SomeClass(...))` return the same entry. + +## Report +- The three subdir names you wrote, and the matching paths grep'd from the notebooks. +- Confirmation that `tests/test_write_spec.py` passes (`pytest tests/test_write_spec.py -q`). + +## Do not +- Add a fourth class. Edit `models.py` or schemas. Touch any notebook. Populate + `required_for_write` or `cross_field_rules` (those are W5's job). diff --git a/planning/20260623/prompts/03_writers.md b/planning/20260623/prompts/03_writers.md new file mode 100644 index 0000000..7426912 --- /dev/null +++ b/planning/20260623/prompts/03_writers.md @@ -0,0 +1,204 @@ +# Agent prompt — Writers (dispatch core + registry expansion) + +> Prepend `00_shared_context.md`. Depends on `config.py` (W1), `write_spec.py` (W2). +> Validation (W5) slots into the pass-through hook below — not a dependency here. + +## What W3 ships +1. The dispatch core `write_models()` and a `WriteResult` value object. +2. The remaining `WriteSpec` entries (everything except the three W2 seeded), each driven + by a small write example so the entry reflects how the class actually wants to be + written. +3. `write_projection_matrix()` — the **only** standalone writer function, because it + needs a non-uniform signature (the dense matrix for `populate_region_coverage`). +4. 
The relocation of `arrow_utils.py` and `write_utils.py` into `io/`, plus the + `populate_region_coverage` helper. + +## No per-class wrapper functions +Decision: there are NO `write_dataset`, `write_dataitem`, `write_association`, etc. +wrappers. `write_models()` infers the class from its argument; renaming it eight times +adds no behavior, only drift surface. The single exception is `write_projection_matrix()` +because its signature is genuinely different (it accepts a dense matrix). Discoverability +is provided by `WRITABLE_CLASSES` (a tuple of `model_cls`) plus `write_models`'s docstring +listing them. + +## Relocation first +Before writing new code, MOVE the existing backends into `io/` with one-line re-export +shims at the old paths (deleted in W6): +- `arrow_utils.py` → `io/arrow_utils.py` +- `write_utils.py` → `io/write_utils.py` + +All new code imports from the `io/` locations. The shims look like: +```python +# src/connects_common_connectivity/arrow_utils.py +from .io.arrow_utils import * # noqa: F401,F403 (deprecated; removed in W6) +``` + +Add a quick smoke test (`tests/test_write_relocation.py`) that asserts the public names +(`build_arrow_schema`, `models_to_table`, `attach_linkml_metadata`, +`build_cell_feature_matrix_schema`, `append_new_dataitems`, `walk_ancestors`) are +importable from BOTH the new and the shim path. + +## Core: `write_models` +```python +def write_models(models, *, settings: Settings | None = None) -> WriteResult: ... +``` + +1. Accept a single model or an iterable; require homogeneous type; infer the class. +2. `settings = settings or get_settings()`. Explicit `settings=` always wins. +3. `spec = get_spec(cls)`. +4. **Validation hook** — call `_validation_hook(models, spec)` before any IO. In W3 this + is a pass-through (identity) function defined at module top: + ```python + _validation_hook = lambda models, spec: models # replaced in W5 + ``` + Wire the call site now; W5 monkey-patches the real validator in. +5. 
Convert via `arrow_utils.models_to_table` + `build_arrow_schema`; attach metadata with + `attach_linkml_metadata(linkml_class=cls.__name__)`. +6. Resolve path with `table_path(settings, spec.subdir)`. +7. Dispatch on `spec.write_mode` (factor each branch into a private helper so the tests + below can target each in isolation): + - `_dispatch_overwrite_scoped`: group rows by their `scope_columns` tuple via + `_group_by_scope`. **Write each group with its own predicate** — never widen a + predicate to cover rows it shouldn't. Predicate built by `_build_predicate`, format + `col1 = 'val1' AND col2 = 'val2'` (single quotes, AND-joined). One + `write_deltalake(... mode="overwrite", predicate=..., partition_by=spec.partition_by)` + call per group. + - `_dispatch_append_new_by_id`: delegate to `write_utils.append_new_dataitems`. The + existing signature (`output_path, table, *, project_id, id_column="id"`) already + covers the seed entries; if a new `append_new_by_id` entry needs a different + partition column, generalize then. Pull `id_column` from `spec.scope_columns[0]` + and `project_id` from the row values. +8. Return a `WriteResult`. + +`write_models` should know nothing class-specific; everything class-specific lives in the +registry. The only places that mention specific model classes are `write_spec.py` (the +registry) and `write_projection_matrix` (the one signature exception). + +## `WriteResult` +A frozen dataclass — this is a return value, not validated data: + +```python +from dataclasses import dataclass +from pathlib import Path + +@dataclass(frozen=True) +class WriteResult: + class_name: str + path: Path + mode: str + predicates: tuple[str, ...] # one per group; () for append_new_by_id / wide_parquet + rows_written: int +``` + +Co-locate in `writers.py`. + +## Discovery: `WRITABLE_CLASSES` +Replaces the per-class wrappers. One line in `writers.py`: + +```python +WRITABLE_CLASSES: tuple[type, ...] 
= tuple(spec.model_cls for spec in REGISTRY.values()) +``` + +`write_models`'s docstring should list `WRITABLE_CLASSES` (or instruct the reader to print +it) so users can see what's writable without reading the registry source. + +## Registry expansion (the prototype loop — the main intellectual work of W3) +W2 only seeded `DataSet`, `DataItem`, `DataItemDataSetAssociation`. Add the rest now, +one at a time, each driven by a real write example. Do NOT batch them up front. + +For each class below: +1. **Read the existing notebook write.** `grep -n 'write_deltalake' code/etl_*.ipynb` to + find the call(s); note the current `mode`, `predicate`, and `partition_by`. +2. **Decide the mode.** If neither `overwrite_scoped` nor `append_new_by_id` fits, extend + the `Literal` in `write_spec.py` with a new value, document it in one comment line, + and add the dispatch branch in `write_models`. Don't force a class into a mode that + doesn't fit. +3. **Add the entry to `REGISTRY`.** If a current notebook predicate looks wrong (like the + DataSet case), use the correct scope and note it in a comment. +4. **Write a smoke test** in `tests/test_writers.py` (NOT a production notebook — + notebooks are W6) that constructs one or two instances and round-trips them through + `write_models`. +5. **Update the drift test** if the new entry exposes a column the test doesn't already + check. + +Classes to add this round, roughly grouped: +- Cluster side: `Cluster`, `ClusterHierarchy`, `ClusterMembership`. +- Mapping side: `MappingSet`, `CellToClusterMapping`. (`CellToCellMapping` and + `ClusterToClusterMapping` only if a notebook actually writes them this round; otherwise + defer.) +- Feature side: `CellFeatureSet`, `CellFeatureDefinition`. (`CellFeatureMatrix` is wide + Parquet — see "Wide feature matrices" below.) +- Projection: `ProjectionMeasurementMatrix`. See "Projection pre-write helper" below. 
+ +If a class isn't written by any current notebook, skip it — adding an entry no caller +exercises violates "prototype, don't assume." + +## Wide feature matrices +`CellFeatureMatrix` is wide Parquet, not row-Delta. It doesn't fit `overwrite_scoped` / +`append_new_by_id`. Keep it inside the registry by adding `write_mode = "wide_parquet"` +and routing it through `build_cell_feature_matrix_schema` + a Parquet write inside +`write_models`. Same registry, same dispatch, different branch — no separate wrapper. +(`write_models(cell_feature_matrix)` is the call.) If during prototyping the wide-Parquet +path turns out to need invariants that don't fit `WriteSpec` cleanly, stop and report +before adding a separate function. + +## Projection pre-write helper + `write_projection_matrix` +Port `populate_region_coverage(pmm, matrix)` from the (now-deleted) `io/io_plans.md` into +`io/write_utils.py` (write plumbing — same shelf as `append_new_dataitems`, NOT a separate +`transforms` module). Pure function: derive `region_coverage` from the dense values array, +return a NEW `ProjectionMeasurementMatrix` instance (no mutation, no IO). + +`write_projection_matrix` is the **one** non-`write_models` public writer: +```python +def write_projection_matrix(pmm, matrix, *, settings=None) -> WriteResult: + enriched = populate_region_coverage(pmm, matrix) + return write_models(enriched, settings=settings) +``` +It exists because its signature is non-uniform (takes the dense matrix). Don't introduce +a second exception — if some other class needs pre-write enrichment, route it through +`write_models` with the enrichment done by the caller, not via a new wrapper. + +Do NOT port `compare_region_coverage` — read-side, deferred (`_deferred/09_analysis.md`). 
+ +## Private helpers (factor these out for testability) +- `_build_predicate(scope_columns, row_values) -> str` +- `_group_by_scope(table, scope_columns) -> list[tuple[tuple, Table]]` +- `_dispatch_overwrite_scoped(table, spec, path) -> WriteResult` +- `_dispatch_append_new_by_id(table, spec, path) -> WriteResult` +- `_validation_hook(models, spec) -> models` — pass-through; replaced by W5 + +These are private (underscore-prefixed). Tests import them directly to exercise their +units without going through Delta. + +## Tests (`tests/test_writers.py`) +- **Patchseq regression** (the headline): `write_models(DataSet(A))`, then + `write_models(DataSet(B))` with the same `project_id` but different `id`, read the + table back, assert both rows exist. +- **Idempotency**: writing the same models twice yields the same row count. +- **Append-new-by-id**: writing a batch with one new + one existing id appends only the + new one. +- **Multi-scope-group dispatch**: a batch with two distinct scope tuples produces two + predicates and two rows in the table; neither overwrites the other. Inspect + `WriteResult.predicates` to assert the count. +- **Predicate construction**: call `_build_predicate` directly and verify the format + (`col = 'val' AND col = 'val'`) by string match. +- **Per-class smoke**: iterate `WRITABLE_CLASSES` and round-trip a small instance of each + through `write_models` — every registry entry exercised. +- **`write_projection_matrix`**: enriches the PMM (sets `region_coverage`) and writes + successfully; the input is unmutated. + +## Reporting +- The full list of registry entries at the end of W3 (table: class / subdir / + partition_by / scope_columns / write_mode). +- Any class you skipped because no notebook writes it, and why. +- Any new `write_mode` you added beyond `overwrite_scoped` / `append_new_by_id` / + `wide_parquet`, with a one-sentence justification. 
+- Any current notebook predicate you believe is wrong (do not fix the notebook here — + W6 owns that). +- `pytest tests/ -q` summary (full suite, not just `test_writers.py`). + +## Do not +- Add per-class `write_*` wrapper functions. Hardcode any predicate. Skip the prototype + loop and bulk-add registry entries from intuition. Touch `models.py`, schemas, or any + notebook. Re-export internal backends from `io/__init__.py` (W4 owns the public + surface). diff --git a/planning/20260623/prompts/04_public_api.md b/planning/20260623/prompts/04_public_api.md new file mode 100644 index 0000000..149975c --- /dev/null +++ b/planning/20260623/prompts/04_public_api.md @@ -0,0 +1,29 @@ +# Agent prompt — Public API (`io/__init__.py`) + +> Prepend `00_shared_context.md`. Depends on writers (W3). + +## Why +`io/__init__.py` defines what users type after `from connects_common_connectivity.io +import …` and what shows up in autocomplete. It is the file that decides whether the +package feels curated or sprawling. + +## Requirements +1. Re-export, and only re-export, the curated names below. Source paths in parentheses. + - `get_settings`, `Settings`, `table_path` — from `..config` + - `write_models`, `write_projection_matrix`, `WriteResult`, `WRITABLE_CLASSES` + — from `.writers` +2. Define `__all__` to exactly that list (no more, no less). +3. Add a module docstring: one short paragraph on the IO layer + a 3–5 line usage + example using `write_models(...)` and `write_projection_matrix(...)`. No config + ceremony in the example — `get_settings()` is implicit. +4. Leave a single `# TODO(W8): reader exports` comment at the bottom of the imports + block, so the reader slot is obvious when the readers land (deferred item L1 in + `TODO.md`; W8 there is the README update). + +## Test (`tests/test_public_api.py`) +- Import every name in `__all__` from `connects_common_connectivity.io` and assert it + resolves to a non-`None` object. +- Assert no name in `__all__` starts with `_`. + +## Do not +- Re-export `arrow_utils`, `write_utils`, or any private helper.
+- Touch `models.py` or schemas. \ No newline at end of file diff --git a/planning/20260623/prompts/05_validation.md b/planning/20260623/prompts/05_validation.md new file mode 100644 index 0000000..1aba4d8 --- /dev/null +++ b/planning/20260623/prompts/05_validation.md @@ -0,0 +1,47 @@ +# Agent prompt — Write-validation (auto-derived strict submodels) + +> Prepend `00_shared_context.md`. Depends on `write_spec.py` (W2) and `writers.py` (W3). +> Wires into the pass-through `_validation_hook(models, spec) -> models` left in +> `write_models`. + +## Naming +File is `io/write_validation.py` — write-time, pydantic-only, registry-coupled. The +generic word "validation" is already used by `cli.py`'s LinkML conformance check; the +two are intentionally distinct. + +## What W5 ships +1. **Populate `required_for_write`** on the registry entries that need it. Driven by + the same prototype loop as W3: read the corresponding notebook's write call, identify + the slots the predicate / partition / append-id depend on, and list them. Empty list + is a valid answer — only add slots a real write actually relies on. +2. `strict_model_for(model_cls) -> type[BaseModel]`: + - Subclass the generated model at runtime; do NOT mutate `models.py` classes. + - For each name in `spec.required_for_write`, override the field to be required + (no default, not Optional). Use any pydantic v2 mechanism that doesn't touch the + parent class. + - Cache by class so the derived type is built once. +3. `validate_for_write(models, spec) -> models` — accepts the same shape `_validation_hook` + already does (single instance OR iterable, returns the same shape). Runs each instance + through the strict submodel; on failure, raise an error naming the class and the + failing slot. Pydantic-only, no I/O. +4. **Wire it in.** In `write_models`, replace the pass-through `_validation_hook` with + `validate_for_write`. This is the only edit to `writers.py`. 
+ +## Out of scope (deferred, not skipped) +- Cross-field rules. `WriteSpec.cross_field_rules` exists as an empty list; until a real + invariant needs one, do not introduce a rule registry. Add the dict + `model_validator` + scaffolding when the first rule is actually written, not before. +- Referential checks (e.g. "association.dataset_id exists in DataSet"). These read other + tables and belong with the read-side opt-in `check_refs` (`_deferred/09_analysis.md`), + not on the write path. + +## Tests (`tests/test_write_validation.py`) +- A model with a missing `required_for_write` slot fails before any IO. +- A model with all slots present passes and is returned unchanged (field-by-field equal). +- The class object in `models.py` has the same `model_fields` after `strict_model_for` + runs as before — proving no in-place mutation. +- `validate_for_write([m1, m2], spec)` accepts a list (same shape contract as the hook). + +## Do not +- Edit `models.py` or schemas. Restate field types from the schema. Call the LinkML + validator on the write path. Add cross-field rules speculatively. \ No newline at end of file diff --git a/planning/20260623/prompts/06_notebook_migration.md b/planning/20260623/prompts/06_notebook_migration.md new file mode 100644 index 0000000..951de01 --- /dev/null +++ b/planning/20260623/prompts/06_notebook_migration.md @@ -0,0 +1,95 @@ +# Agent prompt — Notebook migration + +> Prepend `00_shared_context.md`. Depends on writers (and readers for verification cells). + +## Goal +Migrate the ETL notebooks in `code/etl_*.ipynb` to use the new IO API. Replace the +hand-rolled `write_deltalake(... mode/predicate/partition_by ...)` calls with +`write_models` / `write_projection_matrix`, and replace the hardcoded +`OUTPUT_ROOT = "../scratch/..."` constant with a call to +`connects_common_connectivity.config.output_root()` — a cwd-aware helper that returns +the path string with trailing `/`, so it's a literal drop-in for the old constant. 
+Notebooks keep their per-dataset config cell (input paths, dataset/project ids, +versions, feature-set ids, etc.); only the output root and the manual write +bookkeeping move into the library. + +## Required reading before touching any notebook +1. `etl_example_prompt.md` (repo root) — describes the **pre-migration** notebook patterns: + write predicates, two-level overwrite rules, `append_new_dataitems`, the patchseq + shared-partition bug, parent propagation, etc. Read this so you understand WHAT each + notebook is doing scientifically and WHY the old write patterns were shaped that way. + Treat its rules about ids, enums, schemas, and verification cells as still binding. +2. `src/connects_common_connectivity/io/` — the **post-migration** target. The functions + `write_models`, `write_projection_matrix`, `get_settings`, `table_path` (re-exported + from `connects_common_connectivity.io`) now own everything `etl_example_prompt.md` + spelled out by hand: mode, predicate, partition_by, append-new-by-id, two-level scoping + per class. Migration is the act of replacing those manual rules with these calls. +3. The config file `ccc_config.yaml` already exists at repo root — do NOT recreate it. + Migration only edits notebooks. + +## What changes between old and new +| Old (per `etl_example_prompt.md`) | New (this migration) | +|---|---| +| `OUTPUT_ROOT = "../scratch/..."` constant in cell 3 | `OUTPUT_ROOT = output_root()` — same string shape, sourced from `ccc_config.yaml` | +| `write_deltalake(path, table, mode="overwrite", predicate=..., partition_by=...)` | `write_models(instance_or_list)` — registry owns mode/predicate/partition | +| `append_new_dataitems(...)` for `dataitem/` | `write_models(dataitem_list)` — append-new-by-id is the registered mode | +| Manual two-level predicate strings | None in notebooks; the `WriteSpec` for each class encodes them | +| Verification cell hardcoded path string | `output_root() + "
<subdir>/"` (or `table_path(get_settings(), "<subdir>
")` for a typed `Path`) | +| `write_deltalake(...)` for projection matrix wide form | `write_projection_matrix(pmm, dense_matrix)` | + +The model construction, ETL transforms, and verification assertions do not change. + +## Per ETL notebook +1. Replace the hardcoded `OUTPUT_ROOT = "../scratch/..."` with + `OUTPUT_ROOT = output_root()` (imported from + `connects_common_connectivity.config`). The helper returns a cwd-relative path + string with trailing `/`, so existing string concatenations like + `OUTPUT_ROOT + "dataitem/"` keep working. `write_models(...)` calls need neither a + path nor `settings=` — the library discovers `ccc_config.yaml` on its own. +2. Replace each direct `write_deltalake(... mode=... predicate=... partition_by=...)` call + with `write_models(my_instance)` (or `write_models([inst1, inst2])`). The class is + inferred from the argument; the registry owns mode / predicate / partition. Use + `write_projection_matrix(pmm, matrix)` for the one projection notebook — it's the + single non-`write_models` writer. Delete the now-redundant `mode`/`predicate`/ + `partition_by` arguments and their explanatory comments. +3. Keep verification cells; their `OUTPUT_ROOT + "<subdir>
/"` reads continue to work + unchanged once `OUTPUT_ROOT` is sourced from `output_root()`. + +## Pilot first — do not fan out +Migrate ONE notebook end-to-end before touching any others. Pick +`etl_visp_inh_patchseq_01_dataset_dataitem.ipynb` as the pilot (small, exercises the +patchseq bug, uses both `DataSet` and `DataItem` writes). For the pilot: + +1. Run the pre-migration version once and record the output Delta tables (row counts and + `(project_id, id)` sets for `dataset/`, `dataitem/`, `dataitem_dataset_association/`). +2. Migrate the notebook per the rules above and run it against a **fresh** output root + (point `ccc_config.yaml` or `CCC_OUTPUT_ROOT` somewhere new so the pre-migration data + is preserved for comparison). +3. Diff: assert the post-migration tables match the pre-migration ones in row count and + `(project_id, id)` set equality. Any drift is a registry/spec bug — STOP and report + before migrating further notebooks. +4. Only after the pilot passes the diff, proceed in the order below. + +## Migrate in this order +1. `etl_*_01_dataset_dataitem.ipynb` (all of minnie, wnm, visp_exc/inh patchseq) — these + carry the DataSet overwrite bug. +2. feature notebooks (`_02_cell_features`). +3. cluster / membership / mapping notebooks (`_03`, cluster files). +4. projection (`etl_wnm_exc_04_projection_matrix.ipynb`). + +## Patchseq regression acceptance test (do this explicitly) +Run `etl_visp_exc_patchseq_01` then `etl_visp_inh_patchseq_01` (in that order), then read +the `dataset` table and assert BOTH `visp_exc_patchseq` and `visp_inh_patchseq` rows +exist under `project_id='visp_patchseq'`. Before the fix, the second run wiped the first. +Report the before/after row counts. + +## After migration — hand off shim removal +Once every notebook imports from the `io/` paths, the write-side re-export shims at +`arrow_utils.py` and `write_utils.py` are dead weight. Do TODO 3.4: delete them and confirm +the no-shim test (`07_tests.md`) passes. 
Report which old paths were still referenced, if any. +(`parquet_loader.py` is untouched this round — it moves with the deferred read-side work.) + +## Do not +- Change the science/ETL transformation logic. Fix the `etl_visp_inh_patchseq` data logic + beyond the write path — the maintainer said the writer fix is enough for now. +- Touch `models.py` or schemas. diff --git a/planning/20260623/prompts/07_tests.md b/planning/20260623/prompts/07_tests.md new file mode 100644 index 0000000..17fe774 --- /dev/null +++ b/planning/20260623/prompts/07_tests.md @@ -0,0 +1,37 @@ +# Agent prompt — Write-side test suite + +> Prepend `00_shared_context.md`. Run after the write path + validation exist. (Reader/ +> analysis tests are deferred with that work.) + +## Goal +This is the LAST write-side prompt that will run — prompts 02–05 will not be re-executed. +That means this prompt is responsible for both the gaps below AND any cleanup left over +from earlier prompts. Several cases are already specified in their owning prompts: +- Registry↔schema drift → `02_write_spec.md` (`tests/test_write_spec.py`). +- Patchseq shared-partition regression, idempotency, append-new-by-id, predicate + construction → `03_writers.md` (`tests/test_writers.py`). +- Strict-validation failures → `05_validation.md` (`tests/test_write_validation.py`). +- Public-API surface → `04_public_api.md` (`tests/test_public_api.py`). + +If any of those tests are missing, red, or do not actually assert what their prompt +claimed, **fix them here** — there is no second pass. When you patch a test owned by an +earlier prompt, list which prompt and which test in the report so the spec docs can be +updated later. + +Use small synthetic models written to a `tmp_path` Delta root (point `CCC_OUTPUT_ROOT` at +`tmp_path`, or a tmp `ccc_config.yaml`) so tests never touch real data. + +## Gaps this prompt owns (not covered elsewhere) +1. 
**Per-class write-example smoke:** every writable class in the registry has a tiny write + that round-trips through `write_models` without error (the prototyping evidence as a test). +2. **No-shim regression (TODO 3.4):** after migration, assert no module imports the old + write-side paths `arrow_utils`, `write_utils` (grep / import-scan); the shims must be gone. +3. Confirm the suite is collected and green together (no per-prompt drift). +4. Patch any 02–05 test gaps discovered while running the suite (see goal above). + +Round-trip and cross-dataset read tests are deferred to the read-side work. + +## Reporting +Run `uv run pytest -q` (this repo uses `uv` — plain `pytest` will not pick up the +project venv) and paste the summary. Do not mark complete with failures. Also list any +tests you patched on behalf of an earlier prompt and a one-line reason for each. diff --git a/planning/20260623/prompts/_deferred/08_readers.md b/planning/20260623/prompts/_deferred/08_readers.md new file mode 100644 index 0000000..c75f8d4 --- /dev/null +++ b/planning/20260623/prompts/_deferred/08_readers.md @@ -0,0 +1,54 @@ +# Agent prompt — Readers (predicate-based + cross-dataset) + +> **DEFERRED — not actionable this round.** Priority is config → write IO → validation → +> notebook migration. This design is kept for reference; do not start it until the write path +> is done and notebooks are migrated. +> +> Prepend `00_shared_context.md`. Depends on `write_spec.py` (+ `config.py`). + +## Relocation first (clean structure) +**Move** `parquet_loader.py` → `io/parquet_loader.py` as a PURE MOVE (re-export shim at the +old path). Do NOT fold it into `io/readers.py` — keep it a standalone module; `readers.py` +imports `load_parquet_to_models` from it where typed reads are wanted. + +## Goal +Create `src/connects_common_connectivity/io/readers.py`: convenient reads over the shared +Delta tables, scoped by the registry, plus flexible cross-dataset/cross-schema queries. 
+Readers are conveniences — users can always drop to raw `polars.read_delta`. + +## Layer 1 — predicate-based readers +- `read_dataset(*, project_id=None, dataset_id=None, settings=None)`, + `read_dataitem(...)`, `read_features(...)` etc. +- Resolve the path via the registry `subdir` + `table_path`; filter by the given scope + columns; return a polars DataFrame (offer `.to_pandas()` convenience). +- Reuse `parquet_loader.load_parquet_to_models` where returning typed models is wanted. + +## Layer 2 — cross-dataset / cross-schema reads +Flagship function (build this and design it to generalize): +`read_dataitems_for_clusters(cluster_ids, *, via=("membership","mapping"), project_id=None, +settings=None) -> DataFrame`: +- Returns the union of DataItems that have EITHER a `ClusterMembership` OR a + `CellToClusterMapping` to any cluster in `cluster_ids`. +- Join the membership and mapping Delta tables on cluster id; collect distinct DataItem + ids; optionally hydrate with DataItem rows. Cross-dataset and cross-modality by design — + do not assume a single source dataset. +- Use `walk_ancestors` semantics so a query for a parent cluster also matches descendants + if the membership/mapping tables are denormalized that way (check how the `_03`/cluster + notebooks write the hierarchy before assuming). + +## Read-side analysis (section in this file, not a new module) +`compare_region_coverage(pmms)` is read-side analysis and starts as a clearly-marked section +in `readers.py` — do NOT create `io/analysis.py` yet (single function = premature module). +Its implementation is specified in `09_analysis.md`; build it there. When a second analysis +function appears, relocate the section to `io/analysis.py` (pure move, no public-API change). +`populate_region_coverage` is a write-side transform and stays with the writers +(`03_writers.md`), not here. 
+ +## Tests (`tests/test_readers.py`) +- Round-trip: write models via the writers, read them back scoped, assert equality on + scope columns. +- `read_dataitems_for_clusters` returns the correct union for a small synthetic + membership + mapping fixture, including cross-dataset cases. + +## Do not +- Touch `models.py` or schemas. Lock users out of raw polars (readers are additive). diff --git a/planning/20260623/prompts/_deferred/09_analysis.md b/planning/20260623/prompts/_deferred/09_analysis.md new file mode 100644 index 0000000..88c266c --- /dev/null +++ b/planning/20260623/prompts/_deferred/09_analysis.md @@ -0,0 +1,62 @@ +# Agent prompt — Read-side analysis + referential check + +> **DEFERRED — not actionable this round.** Rides with the read-side work, after config → +> write IO → validation → notebook migration. Design kept for reference. +> +> Prepend `00_shared_context.md`. Depends on `readers.py` (uses read outputs). + +Two things land here, both requiring readers to exist: + +## A. Read-side analysis — `compare_region_coverage` +Add as a clearly-marked section in `io/readers.py` (NOT a new `io/analysis.py` yet — single +function = premature module; relocate to `io/analysis.py` only when a second analysis +function arrives, a pure move with no public-API change). It reads finished data and +summarizes; it never writes or mutates inputs. + +Spec for `compare_region_coverage(pmms) → dict` (moved here from the old +`src/connects_common_connectivity/io/io_plans.md`; source-tree file deleted): + +- **Input:** `pmms` — list of `ProjectionMeasurementMatrix` instances, each with + `region_index` and `region_coverage` populated. (`region_coverage` is produced by + `populate_region_coverage`, already shipped in `io/write_utils.py`.) +- **Computes:** + - `shared_regions`: intersection of all `region_index` across inputs (what regions can + we compare at all?). 
+ - `shared_coverage`: intersection of all `region_coverage` across inputs (where do all + datasets have signal?). + - For every non-empty subset of the input PMMs (powerset, size 1 through N): count of + regions that are in that subset's `region_coverage` intersection but **not** in any + other PMM's `region_coverage` (exclusive to that combination). +- **Prints:** A summary table showing, for each subset combination, how many regions are + exclusively covered by that combination. Example for 3 datasets A, B, C: + ``` + Only in A: 12 + Only in B: 5 + Only in C: 8 + Only in A ∩ B: 3 + Only in A ∩ C: 2 + Only in B ∩ C: 1 + In all (A ∩ B ∩ C): 45 + ``` +- **Returns:** dict with keys `shared_regions`, `shared_coverage`, and + `exclusive_counts` (mapping subset labels to region counts). +- **Properties:** Pure function, no side effects. Does not modify inputs. + +## B. Opt-in referential check — `check_refs` +This is the home for the referential rule deliberately kept off the hot path in +`05_validation.md`. Implement it as an opt-in step invoked by writers: +- `write_models(..., check_refs=False)` — when True, before writing a + `DataItemDataSetAssociation`, read the `dataset` table (via the readers) and assert each + `dataset_id` exists for that `project_id`; raise a clear error naming the missing id. +- It reads other tables, so it is NOT a strict-submodel validator and never runs on the + default write path. Default `check_refs=False` keeps writes fast. + +## Tests +- `compare_region_coverage`: small synthetic PMM set gives expected shared/exclusive counts; + inputs are not mutated. +- `check_refs`: writing an association whose `dataset_id` is absent raises with + `check_refs=True`, and succeeds (no check) with the default. + +## Do not +- Write to disk in the analysis function. Put referential checks on the default write path. + Touch `models.py` or schemas. 
diff --git a/planning/20260623/tests_review/README.md b/planning/20260623/tests_review/README.md new file mode 100644 index 0000000..20542f9 --- /dev/null +++ b/planning/20260623/tests_review/README.md @@ -0,0 +1,20 @@ +# Tests Review — Findings & Implementation Plan + +Review of `tests/` (12 files, ~1,540 LOC) on branch `ingestion-v2`. + +## Documents + +- [`findings.md`](./findings.md) — Numbered review report: high / medium / low priority issues, plus what's working well. +- [`plan.md`](./plan.md) — Sequential implementation plan (5 work packages) for an agent to execute end-to-end in one go, with code snippets and per-package guardrails. + +## TL;DR + +Suite is solid (good docstrings, parametrization, regression tests named after the bug). Main gaps: + +1. No `conftest.py` → duplicated helpers, cache-pollution risk. +2. `pytest.raises(Exception)` used in several places → too broad. +3. Regression assertions lack failure messages. +4. `WRITABLE_CLASSES` ↔ `_make_instance` drift is silent. +5. Missing coverage for `cli.py`, `parquet_loader.py`, and `dry_run` semantics. + +Five sequential work packages proposed for end-to-end agent execution. WPs 1–4 are pure test refactors; WP 5 is the only one likely to surface production bugs (fix in-place). diff --git a/planning/20260623/tests_review/findings.md b/planning/20260623/tests_review/findings.md new file mode 100644 index 0000000..0b358f9 --- /dev/null +++ b/planning/20260623/tests_review/findings.md @@ -0,0 +1,60 @@ +# Tests Review — Findings + +Review of `tests/` on branch `ingestion-v2` (12 files, ~1,540 LOC). + +## 🔴 High priority + +1. **No `conftest.py`.** Shared helpers are duplicated across files: `_models()` is redefined in 4 schema files; `_make_table`, `_read`, the `settings` / `tmp_path` fixtures appear ad-hoc. Promote `settings`, `_read`, `_models`, `_make_instance` to `tests/conftest.py` as fixtures. Will shrink the suite and stop drift. + +2. 
**`pytest.raises(Exception)` is too broad** in `test_basic.py` (lines 22, 37, 69) and `test_config.py` (108, 115). It will pass on completely unrelated failures (ImportError, TypeError from a refactor). Use `ValidationError` / `RuntimeError` with `match=` like the other schema tests already do. + +3. **Cross-test cache pollution risk.** `get_settings` is `lru_cache`d but only `test_config.py` clears it (via autouse fixture). If any other test imports `get_settings` first, later config tests can flake. Move the `_reset_cache_and_env` autouse fixture into `conftest.py` so it runs for every test. + +4. **`test_no_source_references_shim_paths` walks `REPO_ROOT.rglob("*")`** including `data/`, `results/`, `scratch/`, `metadata/`, `.venv` siblings, etc. It's slow and brittle. Either restrict to `{src, tests, code, scripts, planning}` or add those large dirs to `EXCLUDED_DIRS`. Also worth caching the file list. + +5. **`test_round_trip_each_writable_class` silently skips coverage drift.** If someone adds a class to `WRITABLE_CLASSES` and forgets to extend `_make_instance`, the test raises `AssertionError("no fixture for …")` — which *looks* like a test failure but doesn't tell you the spec is missing. Convert the `raise AssertionError` to `pytest.fail("…add a fixture in _make_instance")`, or better, register per-class fixtures in a dict and assert `set(fixtures) == set(WRITABLE_CLASSES)` as its own test. + +## 🟡 Medium priority + +6. **Assertion failure messages are mostly bare.** Examples: + - `test_patchseq_regression_two_datasets_same_project` → `assert ids == [...]` with no `, f"…"` message. When this fails in CI, you'll get `AssertionError: assert ['x'] == ['visp_exc_patchseq','visp_inh_patchseq']` and nothing about which write was lost. Add messages like `f"second write wiped first; remaining ids={ids}"`. + - `test_first_write_appends_all`, `test_idempotent_partial_rerun`: same — a custom message naming the scenario would save substantial debugging time over the life of the suite. + +7. 
**Lots of inline `import` statements** (`test_basic.py` imports `pytest` and `ccc` inside every test, `test_write_validation.test_write_models_calls_validation_before_io` imports `Settings` and `write_models` inside the function). Lift to module top for consistency with the rest of the suite. + +8. **`test_enum_validation` and `test_projection_measurement_matrix_laterality` use overly permissive assertions:** `assert str(ds.modality) in {Modality.TRACER.value, Modality.TRACER.name, str(Modality.TRACER)}`. That comment says "depending on dynamic generation" — pin it. If the schema can return three things, the schema isn't deterministic and *that* is the bug; if it's deterministic, assert exactly one. + +9. **No negative test for `validate_for_write` with a `list` containing a bad row.** `test_validate_for_write_accepts_a_list` covers the happy path; add a counterpart that passes `[good, bad]` and asserts the error names *which row* failed. + +10. **`test_write_models_rejects_unregistered_class`** uses `pytest.raises(TypeError)` without `match=`. Add `match="WRITABLE_CLASSES"` or similar so a misleading TypeError from elsewhere doesn't false-positive. + +11. **`test_describe_includes_resolved_values`** asserts substring `"root"` which trivially matches the path. Strengthen: assert `str(settings.output_root)` is in the output verbatim. + +12. **Idempotency assertion in `test_overwrite_scoped_is_idempotent`** checks only row count, not row equality. If the writer silently overwrites with wrong content, the test passes. Read back and assert the row matches `ds`. + +## 🟢 Low priority / polish + +13. **Naming consistency.** Some files use `def _models()` factory, others import directly from `connects_common_connectivity.models`. Pick one — preferably the direct import, since `generate_pydantic_models()` is re-invoked on every test and is presumably expensive. + +14. **`test_basic.py` is a grab bag** (imports, model generation, enum, required field, multivalued, bounds). 
Split into `test_import.py` + fold the rest into the topical schema files that already exist. + +15. **No markers / no test plan.** Consider `pytest.mark.slow` for the full per-class round-trip and the repo-walk shim test. Speeds local TDD. + +16. **`test_write_relocation.py` test name is misleading** — it's about shim removal, not relocation. Rename to `test_no_shim_imports.py`. + +17. **Missing coverage:** + - No tests for `cli.py` (the `ccc` entry point). + - No tests for `parquet_loader.py`. + - No tests for `dry_run=True` actually being honored by `write_models` (config has the flag; writer behavior under it is untested). + - No concurrent-write / locking behavior for delta tables, even a basic sanity test. + - `_build_predicate_escapes_single_quotes` covers `'` — also test backslash, empty string, and unicode. + +18. **`test_io_reexports_settings_helpers`** asserts identity (`is`) which is fine, but the same pattern in `test_public_api` uses `hasattr`. Pick one approach for re-export tests. + +## ✅ What's working well + +- **Excellent module-level docstrings** stating *why* the test exists (`test_writers.py`, `test_write_validation.py`, `test_write_relocation.py`, `test_public_api.py`). Keep doing this. +- **Headline regression test** (`test_patchseq_regression_two_datasets_same_project`) is exactly right — named for the bug, documents the prior failure mode in its docstring. +- **Parametrization over the registry** in `test_write_spec.py` is the right shape — it auto-grows with new entries. +- **`extra="forbid"` enforcement test** (`test_cluster_rejects_project_id`) prevents silent schema breakage. Good. +- Strong **regex `match=` usage** in schema tests catches the right error *and* the right field. 
diff --git a/planning/20260623/tests_review/plan.md b/planning/20260623/tests_review/plan.md new file mode 100644 index 0000000..8594f53 --- /dev/null +++ b/planning/20260623/tests_review/plan.md @@ -0,0 +1,279 @@ +# Tests Review — Implementation Plan + +Five sequential work packages, implemented in order on the same execution track (no PR slicing). + +--- + +## Work Package 1 — `conftest.py` foundation (enables everything else) + +**Goal:** kill duplication and enforce stable test isolation (fresh cwd/env + cleared `get_settings` cache) in one shot. + +```python +# tests/conftest.py +from __future__ import annotations +import pytest +from pathlib import Path +import polars as pl + +import connects_common_connectivity as ccc +from connects_common_connectivity.config import Settings, get_settings +from connects_common_connectivity import models as _models_mod + + +@pytest.fixture(autouse=True) +def _isolate_settings(monkeypatch, tmp_path): + """Every test gets a clean cwd, no CCC_OUTPUT_ROOT, and a cleared cache.""" + monkeypatch.delenv("CCC_OUTPUT_ROOT", raising=False) + monkeypatch.chdir(tmp_path) + get_settings.cache_clear() + yield + get_settings.cache_clear() + + +@pytest.fixture(scope="session") +def models() -> dict: + """Generate pydantic models once per session (expensive).""" + return ccc.generate_pydantic_models() + + +@pytest.fixture +def settings(tmp_path) -> Settings: + return Settings(output_root=tmp_path) + + +@pytest.fixture +def read_delta(): + def _read(path) -> pl.DataFrame: + return pl.read_delta(str(path)) + return _read +``` + +Then: +- delete the duplicated `_models()` from 4 schema files; switch tests to `def test_x(models):` +- delete the duplicated `settings` / `_read` from `test_writers.py` +- delete `_reset_cache_and_env` from `test_config.py` (now autouse globally) + +**Decision:** keep autouse `chdir(tmp_path)` globally. 
In this package it is a feature, not a risk: config discovery is cwd-based and cached, so per-test cwd isolation prevents cross-test bleed. `test_write_relocation.py` is safe because `REPO_ROOT` is anchored from `__file__`, not cwd. + +--- + +## Work Package 2 — Tighten exception assertions + +**Pattern:** prefer the narrowest exception + a `match=` that names the *field or condition*, not the generic word. + +`pytest.raises` signature reminder: +```python +with pytest.raises(ExpectedException, match=r"regex against str(exc)"): + ... +``` + +**Concrete replacements:** + +| File:line | Before | After | +|---|---|---| +| `test_basic.py:22` | `pytest.raises(Exception)` | `pytest.raises(ValidationError, match=r"project_id.*[Ff]ield required")` | +| `test_basic.py:37` | `pytest.raises(Exception)` | `pytest.raises(ValidationError, match=r"modality.*Input should be")` | +| `test_basic.py:69` | `pytest.raises(Exception)` | `pytest.raises(ValidationError, match=r"probability.*less than or equal to 1")` | +| `test_config.py:108` | `pytest.raises(Exception)` | `pytest.raises(ValidationError, match=r"output_root.*[Ff]ield required")` | +| `test_config.py:115` | `pytest.raises(Exception)` | `pytest.raises(ValidationError, match=r"[Ee]xtra inputs are not permitted")` *(verify what Settings raises first)* | +| `test_writers.py:328` | `pytest.raises(TypeError)` | `pytest.raises(TypeError, match=r"pydantic model or iterable")` | + +Also add a **new** test for true registry rejection (different code path): + +```python +from pydantic import BaseModel + +class UnregisteredModel(BaseModel): + id: str + +with pytest.raises(KeyError, match=r"UnregisteredModel"): + write_models(UnregisteredModel(id="u1"), settings=settings) +``` + +Note: this test needs `from pydantic import BaseModel` at the top of `test_writers.py`. Match on the class name rather than the exact error string — it's a more durable contract than the message text. 
+ +**Rule of thumb to leave in the package notes:** +> Never `pytest.raises(Exception)`. Always pick the narrowest class the production code raises, and always include `match=` naming the field or condition. If you don't know which exception the code raises, that's the first thing to find out — that's the contract. + +For the dynamically-generated pydantic models in `test_basic.py`, import `from pydantic import ValidationError` at module top — it's the same exception class the dynamic models will raise. + +--- + +## Work Package 3 — Failure messages on regression-critical asserts + +Only add custom messages where the failure mode is non-obvious. Don't litter every assert. + +**Targets:** + +```python +# test_writers.py — patchseq regression +ids = sorted(rows["id"].to_list()) +assert ids == ["visp_exc_patchseq", "visp_inh_patchseq"], ( + f"patchseq regression: second write wiped first. " + f"Expected both datasets, got {ids}" +) +``` + +```python +# test_writers.py — idempotency, also strengthen content equality +rows = _read(settings.output_root / "dataset") +assert rows.shape[0] == 1, f"idempotent rewrite produced {rows.shape[0]} rows" +assert rows["id"].to_list() == ["d1"], "row identity changed across rewrites" +assert rows["name"].to_list() == ["example"], "row content drifted across rewrites" +``` + +```python +# test_write_utils.py — partial rerun +assert n == 1, f"expected only 'c' to be new; appended {n} rows" +``` + +```python +# test_write_validation.py — IO-never-happened check +assert not (tmp_path / "cluster").exists(), ( + "validation failure should short-circuit before any IO; " + "cluster/ directory was created anyway" +) +``` + +Skip messages on simple positive assertions like `assert cfd.range_max is None` — pytest's introspection already shows the value. + +--- + +## Work Package 4 — Coverage drift guards & list-failure tests + +### 4a. 
`WRITABLE_CLASSES` ↔ fixture drift + +Replace the if/elif tower in `_make_instance` with a registry dict + drift test: + +```python +# tests/_fixtures.py (or in conftest) +INSTANCE_FACTORIES = { + DataSet: lambda: DataSet(id="ds1", name="ds", project_id="p1"), + DataItem: lambda: DataItem(id="di1", name="di1", project_id="p1"), + # ... +} + +def make_instance(cls): + try: + return INSTANCE_FACTORIES[cls]() + except KeyError: + pytest.fail( + f"No fixture for {cls.__name__}. Add an entry to " + f"INSTANCE_FACTORIES in tests/_fixtures.py." + ) +``` + +```python +def test_every_writable_class_has_a_fixture(): + missing = set(WRITABLE_CLASSES) - set(INSTANCE_FACTORIES) + assert not missing, ( + f"WRITABLE_CLASSES added entries without fixtures: " + f"{sorted(c.__name__ for c in missing)}" + ) + stale = set(INSTANCE_FACTORIES) - set(WRITABLE_CLASSES) + assert not stale, ( + f"INSTANCE_FACTORIES has stale entries not in WRITABLE_CLASSES: " + f"{sorted(c.__name__ for c in stale)}" + ) +``` + +This makes the drift visible as a dedicated test failure instead of a parametrized round-trip error. + +### 4b. Negative-path coverage for `validate_for_write` with a list + +```python +def test_validate_for_write_list_reports_failing_row(): + spec = REGISTRY["Cluster"] + items = [ + Cluster(id="c1", hierarchy_id="h1"), + Cluster(id="c2"), # missing hierarchy_id + ] + with pytest.raises(ValueError, match=r"hierarchy_id") as ei: + validate_for_write(items, spec) + # row identity should appear in the error to make debugging tractable + assert "c2" in str(ei.value), ( + f"error should name failing row; got: {ei.value}" + ) +``` + +(If the production code doesn't currently name the row, that's a real finding to file — the test documents the desired contract.) + +--- + +## Work Package 5 — Plug the real coverage gaps + +This is the only work package that may touch behavior beyond test infra. Split per module to keep diffs small. + +### 5a. 
`dry_run` semantics +```python +def test_dry_run_does_not_write(tmp_path): + settings = Settings(output_root=tmp_path, dry_run=True) + ds = DataSet(id="d1", name="d", project_id="p1") + result = write_models(ds, settings=settings) + assert result.rows_written == 0, "dry_run must report 0 rows written" + assert not (tmp_path / "dataset").exists(), "dry_run must not create tables" +``` +If this fails, you've found a bug — `dry_run` exists in `Settings` but nothing checks it's honored. + +### 5b. `cli.py` +This CLI is `argparse`, not Click. Use `subprocess.run([sys.executable, "-m", "connects_common_connectivity.cli", ...])`. +Cover: top-level `--help`, `info` (assert package version text appears), one happy-path command (`bundle`), one error path (bad subcommand/args → nonzero exit). + +Skip `cmd_validate` and `etl-brain-regions` — both are marked `# pragma: no cover` in `cli.py` as runtime smoke commands; respect the existing exclusion. + +### 5c. `parquet_loader.py` +Test the public contract of `load_parquet_to_models(...)`: write a tiny parquet, load into a concrete class (e.g. `DataItem`), assert instance count + key field values + report counts/mapping. Add one negative test where required data is missing and assert the failure is surfaced in `report["errors"]`. + +### 5d. Extra escapes in `_build_predicate` (1-line additions to existing test) +```python +@pytest.mark.parametrize("value,expected_literal", [ + ("O'Hara", "'O''Hara'"), + ("", "''"), + ("a\\b", "'a\\b'"), # backslash is not special in SQL string literals + ("café", "'café'"), +]) +def test_build_predicate_escapes(value, expected_literal): + assert _build_predicate(["name"], [value]) == f"name = {expected_literal}" +``` + +### 5e. 
Repo-walk hardening in `test_write_relocation.py` +```python +SEARCH_ROOTS = ["src", "tests", "code", "scripts", "planning"] + +def _iter_source_files(): + for root in SEARCH_ROOTS: + base = REPO_ROOT / root + if not base.exists(): + continue + for path in base.rglob("*"): + if path.is_file() and path.suffix in {".py", ".ipynb"}: + if not any(p in EXCLUDED_DIRS for p in path.parts): + yield path +``` +Drops `data/`, `results/`, `scratch/`, `metadata/`, `environment/` from the walk. + +--- + +## Sequential execution guardrails (hard stops between packages) + +Do **not** start the next work package until the current package meets its guardrail. + +1. **WP1 → WP2:** `conftest.py` is in place, duplicated helper fixtures are removed from target files, and settings/cache isolation behavior remains intact. +2. **WP2 → WP3:** broad `pytest.raises(Exception)` uses targeted in this plan are replaced with narrow exception types and meaningful `match=` checks. +3. **WP3 → WP4:** custom assertion messages were added only to regression-critical/non-obvious assertions (no blanket message churn). +4. **WP4 → WP5:** fixture drift guard(s) are in place and list-failure validation coverage is added; registry/fixture mismatch now fails with explicit guidance. +5. **WP5 completion:** coverage-gap tests are in place; if `dry_run` exposes a real bug, fix behavior in the same package before declaring completion. + +--- + +## Sequencing & rollout + +| Work package | Effort | Risk | Blocks | +|---|---|---|---| +| 1. conftest | 1h | low | 2, 3 | +| 2. exceptions | 30m | low | — | +| 3. messages | 30m | none | — | +| 4. drift guards | 1h | low | — | +| 5. coverage gaps | 2–4h | medium (may surface real bugs) | — | + +Work packages 1–4 are pure test refactors. Work package 5 is where you'll likely find a `dry_run` bug; budget time for an immediate behavior fix in the same execution sequence. 
diff --git a/planning/etl_v1dd_01_v1196_temp_prompt.md b/planning/etl_v1dd_01_v1196_temp_prompt.md new file mode 100644 index 0000000..ff66bc3 --- /dev/null +++ b/planning/etl_v1dd_01_v1196_temp_prompt.md @@ -0,0 +1,119 @@ +# Handoff prompt — continue building `etl_v1dd_01_v1196.ipynb` + +You are picking up an in-progress ETL notebook that ingests the V1DD release 1196 dataset into the Common-Connectivity (CCC) Delta-lake schemas. One previous agent built the skeleton + section 1 (DataSets). The user wants the remaining sections filled in **one at a time, together** — finish a section, show the result, wait for the user to review before moving on. + +--- + +## Read first (in this order) + +### Authoritative conventions +- `/root/capsule/etl_example_prompt.md` — full ETL conventions guide. **Read end-to-end before writing any code.** Pay special attention to: + - §2 hard rules (never edit `src/` or `models.py`; never cast ids; use enum `.value`; every write has a verification cell). + - §4 canonical notebook structure. + - §5a–§5j write patterns per table family. + - §10 common mistakes table. +- `/root/capsule/CHANGELOG.md` — only relevant if you end up changing schemas (don't unless the user asks). + +### The notebook in progress (the one you'll be editing) +- `/root/capsule/code/etl_v1dd_01_v1196.ipynb` + - Cells 0–6: title, imports, constants, prereq check, **master id decision**, §1 DataSets (DONE, written + verified). + - Cells 7+: §2…§10 are skeletons with markdown plans + `# TODO` code stubs + per-section open questions. + - **`OUTPUT_ROOT = "../scratch/v1dd_1196_v1/"`** — relative to `code/`. The §1 outputs are already there under `dataset/`. + - Re-execute the whole notebook with `cd /root/capsule/code && uv run jupyter nbconvert --to notebook --execute --inplace etl_v1dd_01_v1196.ipynb` after every change. 
+ +### Exploration / scratch reference +- `/root/capsule/code/etl_v1dd_00_explore.ipynb` — initial exploration of every input file with schema-fit notes. Useful for sanity-checking shapes/columns. + +### Example notebooks to mirror (same modality as V1DD = MICrONS Minnie) +- `/root/capsule/code/etl_minnie_01_dataset_dataitem.ipynb` — DataSet + DataItem + association pattern. +- `/root/capsule/code/etl_minnie_02_cell_features.ipynb` — CellFeatureSet/Definition/Matrix + wide parquet. +- `/root/capsule/code/etl_minnie_03_cluster_and_cluster_membership.ipynb` — Cluster taxonomy + parent-propagated memberships. +- `/root/capsule/code/etl_minnie_04_cell_cell.ipynb` — CellCellConnectivityLong with per-example folder convention. +- `/root/capsule/code/etl_wnm_exc_04_projection_matrix.ipynb` — for the `SingleCellReconstruction` + `SpatialLocation` pattern if needed. + +### Schemas (source of truth — do not modify without explicit user request) +- `/root/capsule/schemas/base_schema.yaml` +- `/root/capsule/schemas/core_schema.yaml` — `DataSet`, `DataItem`, `DataItemDataSetAssociation`, `SpatialLocation`, `Modality`. +- `/root/capsule/schemas/cell_features_schema.yaml` +- `/root/capsule/schemas/clustering_schema.yaml` +- `/root/capsule/schemas/mappings_schema.yaml` — `MappingSet`, `CellToCellMapping`, `CellToClusterMapping`. +- `/root/capsule/schemas/cell_cell_schema.yaml` — `CellCellConnectivityLong`, `SynapticMeasurementType` enum. +- `/root/capsule/schemas/single_cell_schema.yaml` — `SingleCellReconstruction`. +- The user has already added `Modality.CALCIUM_IMAGING` and regenerated `src/connects_common_connectivity/models.py`. Trust this. + +### Package utilities (read-only) +- `/root/capsule/src/connects_common_connectivity/models.py` — auto-generated pydantic models; read to confirm field names and enum values. +- `/root/capsule/src/connects_common_connectivity/io/writers.py` — `write_models(models, *, output_root=...)` dispatches by class. 
Use `output_root=OUTPUT_ROOT` because we are NOT writing to the shared `ccc_config.yaml` location. +- `/root/capsule/src/connects_common_connectivity/io/write_spec.py` — per-class WriteSpec (predicates, partition keys); consult before any write that isn't already in the writer registry. +- `/root/capsule/src/connects_common_connectivity/write_utils.py` — `append_new_dataitems`, `walk_ancestors`. +- `/root/capsule/src/connects_common_connectivity/arrow_utils.py` — `build_arrow_schema`, `models_to_table`, `attach_linkml_metadata`, `build_cell_feature_matrix_schema` (kwarg-only — see §10 of the prompt guide). + +--- + +## Raw V1DD data — `/data/v1dd_1196/` + +| File | Shape | Notes | +|---|---|---| +| `data_description.json`, `subject.json`, `metadata.nd.json` | aind-data-schema records | provenance; `name`, `project_name`, modalities, S3 location | +| `soma_and_cell_type_1196.feather` | (207 455, 11) | soma centroids + `cell_type_coarse` ∈ {E,I} + `cell_type` (12 leaves) | +| `proofread_axon_list_1196.npy` | (1 210,) int64 | `pt_root_id`s with proofread axons; 1164/1210 are in soma catalog | +| `proofread_dendrite_list_1196.npy` | (63 986,) int64 | `pt_root_id`s with proofread dendrites; all in soma catalog | +| `snr_by_cell.feather` | (4 458, 5) | functional ROI `(volume, column, plane, roi)` + `snr` | +| `coregistration_1196.feather` | (571, 5) | EM↔functional mapping; pre/post not unique on either side | +| `syn_df_all_to_proofread_to_all_1196.feather` | (8 204 497, 13) | per-synapse rows; `pre_pt_root_id`/`post_pt_root_id` + positions + `size` | +| `syn_label_df_all_to_proofread_to_all_1196.feather` | (6 706 286, 1) | per-synapse `tag` (`spine`, …), indexed by synapse `id` | +| `cell_cell_correlations_by_stimulus.feather` | (8 846 260, 13) | all-ROI functional Pearson corr per stimulus, ROI-tuple-keyed, tuples unique | +| `cell_cell_correlations_by_stimulus_coregistered.feather` | (148 728, 9) | same but EM-rootid-keyed; 142 410 unique pairs (≈4 % repeat), 12 
self-pairs | + +--- + +## Master decisions already made (do not relitigate) + +- **One notebook for all of V1DD 1196**, no `_02`/`_03` follow-ups. +- **`OUTPUT_ROOT = "../scratch/v1dd_1196_v1/"`.** +- **`PROJECT_ID = "v1dd"`.** +- **Five DataSets** (`v1dd_1196_em`, `v1dd_1196_proofread_axons`, `v1dd_1196_proofread_dendrites`, `v1dd_1196_func`, `v1dd_1196_func_coregistered`). Already written, do not rewrite. +- **EM `DataItem.id = str(pt_root_id)`** — single source of truth for EM cells. See the `master-id-decision` markdown cell for the table + numbers + collapse policy (drop `pt_root_id==0`, keep largest-`volume` soma when multiple rows share a root). +- **Functional `DataItem.id = f"{volume}-{column}-{plane}-{roi}"`** (planned in §6 skeleton). +- **`publication = "https://github.com/AllenInstitute/v1dd_physiology"`** for every DataSet. + +--- + +## Sections still to fill (in order) + +| § | Title | Status | +|---|---|---| +| 1 | DataSet rows | ✅ DONE | +| 2 | EM DataItems + cohort associations | TODO — next up | +| 3 | EM soma `CellFeatureMatrix` (`v1dd_em_soma_geometry`) | TODO | +| 4 | `SingleCellReconstruction` + `SpatialLocation` (CCF) | TODO | +| 5 | V1DD cell-type taxonomy (Cluster + ClusterMembership) | TODO; includes a v1dd↔minnie taxonomy comparison table already verified | +| 6 | Functional DataItems + coregistered cohort | TODO | +| 7 | Functional feature sets (`v1dd_func_qc`, `v1dd_func_imaging_position`) | TODO | +| 8 | `CellToCellMapping` for EM↔functional coregistration | TODO | +| 9 | Synapse aggregation → `CellCellConnectivityLong` | TODO (per-synapse schema question is intentionally OPEN) | +| 10 | Functional correlations → `CellCellConnectivityLong` × 7 stimuli × 2 tables | TODO | + +Each section has open questions in its markdown cell. **Ask the user before answering them yourself** — they want to review each section before you wire the writes. + +--- + +## Working agreement (per user instruction) + +1. 
**Build one section at a time.** Do not jump ahead. After each section: re-execute the full notebook, show the verification cell output to the user, then stop and wait. +2. **Update the markdown of each section as decisions are resolved** — remove answered open questions, keep unresolved ones, keep the section concise. +3. **Don't touch `src/` or `schemas/`** unless the user explicitly asks for a schema change. +4. **Don't relitigate the master id decision.** If a section's open question is rendered moot by it, just delete the question. +5. **Verification cell after every write** — read back with `pl.read_delta`, print shape + head(3), assert at least one invariant (row count, unique ids, expected categorical value). +6. **Use `write_models(..., output_root=OUTPUT_ROOT)`** for everything that has a WriteSpec. For things without one (wide-form feature parquets, cell-cell folders), fall back to `deltalake.write_deltalake` with the patterns in §5b–§5g of the prompt guide. +7. **Run the notebook headless to validate**: `cd /root/capsule/code && uv run jupyter nbconvert --to notebook --execute --inplace etl_v1dd_01_v1196.ipynb`. + +--- + +## Next action when the user returns + +Start with **§2 (EM DataItems + cohort associations)**. The skeleton is in place and the master id decision is documented. The two remaining open questions in that section are: +1. Proofread axon roots missing from the soma catalog (46/1210) — skip silently or log + skip? +2. `neuroglancer_link` — populate, or leave null? + +Ask the user, then implement, write, verify, and stop. 
diff --git a/planning/multi_writer_scope_design.md b/planning/multi_writer_scope_design.md new file mode 100644 index 0000000..eede026 --- /dev/null +++ b/planning/multi_writer_scope_design.md @@ -0,0 +1,201 @@ +# Multi-writer Delta scopes: bug, contrast with minnie, design options + +Captured 2026-06-23 from a debugging session that started with an +`AssertionError` in `etl_visp_inh_patchseq_03_cluster_membership_and_mapping.ipynb`. + +## The problem + +`write_models` dispatches `overwrite_scoped` writes that **replace every row** in +the scope defined by a spec's `scope_columns`. For example +(`src/connects_common_connectivity/io/write_spec.py`): + +| Class | `scope_columns` | +|---|---| +| `DataItemDataSetAssociation` | `(project_id, dataset_id)` | +| `ClusterMembership` | `(project_id, hierarchy_id)` | +| `CellToClusterMapping` | `(project_id, mapping_set)` | + +When **multiple notebooks contribute disjoint row subsets to the same scope**, +any one of them issuing `write_models([...own_rows...])` deletes the other +notebooks' rows. The latest writer wins, silently. + +This bit us concretely in the patch-seq pipeline. After running +`etl_visp_inh_patchseq_01/02/03`: + +``` +visp_patchseq / visp_inh_patchseq associations: 495 (expected ≥ 2759) +``` + +`_03` Section 1's assertion surfaced it: + +``` +AssertionError: 2367 T-type CSV cells are not associated with visp_inh_patchseq +``` + +Root cause: `_01` writes 2759 association rows from the ttype CSV → `_02` +overwrites with 520 rows from the wide CSV → `_03` overwrites with 495 rows from +the MET CSV. Every step is a valid `overwrite_scoped` call; together they shrink +the scope monotonically. The same bug existed (silently) for `ClusterMembership` +under `(visp_patchseq, visp_met_types_taxonomy)`, where `etl_visp_exc_patchseq_03`'s +1152 rows were being wiped by `etl_visp_inh_patchseq_03`'s 1485-row overwrite. 
+ +### Numbers from patch-seq + +| Source CSV (input) | Notebook | Rows in CSV | Scope written | +|---|---|---:|---| +| `patchseq_tx_cell_ttype_labels.csv` | `inh_01` | 2759 | `(visp_patchseq, visp_inh_patchseq)` | +| `inh_ivscc_features_wide_unnormalized.csv` | `inh_02` | 520 | `(visp_patchseq, visp_inh_patchseq)` | +| `visp_met_cell_assignments_text_names.csv` | `inh_03` § 0 | 495 | `(visp_patchseq, visp_inh_patchseq)` | +| `visp_met_cell_assignments_text_names.csv` | `inh_03` § 2 | 495 cells × 3 ancestors = 1485 | `(visp_patchseq, visp_met_types_taxonomy)` | +| `inferred_met_types.csv` | `exc_03` § 1 | 384 cells × 3 ancestors = 1152 | `(visp_patchseq, visp_met_types_taxonomy)` | + +After the fix (read-existing → union → re-write the full scope): + +``` +visp_patchseq / visp_inh_patchseq associations: 2879 +visp_patchseq / visp_met_types_taxonomy clustermembership rows: 2637 (=1152 exc + 1485 inh) +visp_patchseq / visp_met_types_taxonomy clustermembership items: 879 (=384 exc + 495 inh) +``` + +### Origin: migration regression + +This is a migration regression, not an original design flaw. The pre-migration +notebooks did the merge manually with raw +`write_deltalake(..., mode="overwrite", predicate=...)`: + +```python +existing_cm = pl.read_delta(...).filter(predicate) +other_cm = existing_cm.filter(~pl.col("item").is_in(our_cell_ids)) +all_memberships = [ClusterMembership(**r) for r in other_cm.to_dicts()] + new +write_deltalake(..., mode="overwrite", predicate=..., partition_by=...) +``` + +When that pattern was migrated to `write_models([...])`, the read-and-union step +was dropped (replaced with a stub `other_cm = pl.DataFrame({"item": []})`) and +the assertion `others_present.shape[0] == other_cm.shape[0]` continued to +"pass" because both sides became 0 — the verification was no longer +load-bearing. 
+ +## How minnie avoids the problem entirely + +Minnie uses a **sub-dataset (cohort) pattern**: each notebook writes into its +own unique `(project_id, dataset_id)` scope, so `overwrite_scoped` calls never +collide. + +| Notebook | `DATASET_ID` | +|---|---| +| `etl_minnie_01_dataset_dataitem` | `minnie65_v1300_nuclei` (the universe) | +| `etl_minnie_02_cell_features` | `minnie65_v1300_csm_cluster` (CSM cohort) | +| `etl_minnie_03_cluster_and_cluster_membership` | reuses `minnie65_v1300_csm_cluster`, but writes `ClusterMembership` under `hierarchy_id="minnie65_csm_cell_types"` — a hierarchy no other minnie notebook writes to | +| `etl_minnie_04_cell_cell` | proofread cohorts (`minnie65_v1300_proofread*`) | + +For every `overwrite_scoped` write minnie issues, the **scope owner is exactly +one notebook**. No merge, no surprises. + +Patch-seq took the opposite philosophy: one `DataSet` +(`visp_inh_patchseq`) is treated as a single coherent cohort, and multiple +notebooks add rows of different kinds to the **same** `(project, dataset)` and +`(project, hierarchy)` scopes. That's what creates the multi-writer hazard. + +There's a meta-question buried here: should patch-seq follow minnie's cohort +pattern? E.g. `visp_inh_patchseq_ttype`, `visp_inh_patchseq_morph`, +`visp_inh_patchseq_met` as sibling sub-datasets. It would remove the merge +problem entirely but would also fragment what is currently a clean +"inh-cohort" abstraction. Not obvious which is better. + +## Considered solutions + +### Option A — Add a merging write mode to `write_models` + +Add a new `WriteSpec.write_mode` value, e.g. `"merge_by_id"` or +`"overwrite_scoped_by_id"`, that: + +1. Requires the spec to declare an **identity column** within the scope + (`dataitem_id` for `DataItemDataSetAssociation`, `item` for + `ClusterMembership`, …). +2. On write: reads existing rows in scope, replaces rows whose identity is in + the incoming batch, keeps the rest. 
+ +**Pros** +- Eliminates the boilerplate currently duplicated in every patch-seq notebook. +- Makes the multi-writer contract explicit in the spec (it's *declared* that + this scope is multi-writer and merged on column X). +- Closes the regression class that bit us — a future migration cannot + accidentally strip the merge logic because the merge lives in the library. + +**Cons** +- Silently merging vs. overwriting is a semantically distinct contract; a + caller who actually wanted to *clear* sibling rows would have to opt out. +- Requires a read per write (negligible at current data sizes). +- The library implicitly trusts that the caller's batch is the authoritative + subset for the ids it contains. + +### Option B — Keep `write_models` overwrite-only, add a sibling helper + +```python +write_models_merging_on(items, id_column="item", output_root=...) +``` + +**Pros** +- No change to existing call sites or `write_models` semantics. +- Explicit at the call site: a reader sees "this notebook merges into a shared + scope" without having to look up the spec. +- Matches how minnie sidesteps the issue (use distinct scopes whenever + possible; reach for the merging helper only when you can't). + +**Cons** +- Still requires every shared-scope notebook to remember to use the merge + variant; the next migration can still regress this. + +### Option C — Status quo (don't change the library) + +Document the convention; every notebook touching a shared scope does its own +read-and-union before `write_models`. + +**Pros** +- Library stays minimal and explicit. + +**Cons** +- This is exactly the trap the recent migration walked into. There is no + structural mechanism preventing a recurrence. + +### Option D — Forbid shared scopes (push patch-seq toward minnie's pattern) + +Refactor patch-seq notebooks so each `(project, dataset)` and +`(project, hierarchy)` scope has a single owner — possibly by introducing +sub-datasets (`visp_inh_patchseq_ttype`, `_morph`, `_met`). 
+ +**Pros** +- Removes the multi-writer hazard at the data-model level rather than papering + over it in the library. +- Brings patch-seq into stylistic alignment with minnie / V1DD. + +**Cons** +- Larger change. Downstream queries that group rows by "the inh cohort" now + need to union sub-datasets. May lose a useful natural grouping. +- Doesn't solve the `ClusterMembership` case (different MET-types + contributors *do* share a hierarchy — that's the taxonomy's whole point). + So a merge mechanism is probably still needed somewhere. + +## Suggested next step (for discussion, not yet decided) + +Lean toward **A + a scope-ownership audit**: + +1. For every `overwrite_scoped` spec, decide whether the scope is + single-writer (minnie-style) or multi-writer (patch-seq-style). +2. Single-writer specs stay as-is. +3. Multi-writer specs declare a merge key (Option A). +4. Bonus: `write_models` could detect "a write that would shrink the scope it + targets" (i.e. incoming rows form a strict subset of the existing scope by + the merge key) and warn/error when the spec is not marked multi-writer. + That would have caught the regression at runtime. + +Open questions: + +- Should `WriteSpec` gain a `merge_on: list[str] | None` field? +- Is the implicit "incoming batch is the truth for these ids" contract + acceptable for every multi-writer class, or do we need a more general + "upsert by composite key" mode? +- Do we want to keep patch-seq as multi-writer at all, or migrate to + sub-datasets and reserve the merge mechanism only for `ClusterMembership` + (where taxonomy-sharing makes single-ownership impossible)? 
diff --git a/pyproject.toml b/pyproject.toml index 223a69b..e1341a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "connects-common-connectivity" -version = "0.1.0" +version = "0.2.0" description = "Common connectivity data models and utilities (LinkML + Pydantic) for BRAIN CONNECTS pilot" authors = [ { name = "Forrest Collman" } ] license = { text = "MIT" } diff --git a/schemas/base_schema.yaml b/schemas/base_schema.yaml index a72221c..91fe769 100644 --- a/schemas/base_schema.yaml +++ b/schemas/base_schema.yaml @@ -26,6 +26,8 @@ enums: description: X-ray microscopy based connectivity mapping. EXPANSION_MICROSCOPY: description: Expansion microscopy based connectivity mapping. + CALCIUM_IMAGING: + description: Calcium imaging based functional correlations. OTHER: description: Other modality. ProjectionMeasurementType: diff --git a/src/connects_common_connectivity/config.py b/src/connects_common_connectivity/config.py new file mode 100644 index 0000000..da5c9e4 --- /dev/null +++ b/src/connects_common_connectivity/config.py @@ -0,0 +1,180 @@ +"""Package-wide settings discovered from a repo-root ``ccc_config.yaml``. + +Configuration is a *mechanism* here; the *values* live in a single +version-controlled ``ccc_config.yaml`` at the repo root. Every entry point +(CLI, writers/readers, notebooks, future plotting/analysis) calls +:func:`get_settings`, which walks up from ``cwd`` to find that file, +validates it with pydantic, and returns a cached :class:`Settings`. + +No notebook setup cell, no ``%run``, no process-global mutation. + +Resolution precedence (highest wins): + +1. An explicit ``settings=`` argument passed by a caller. +2. ``CCC_OUTPUT_ROOT`` environment variable (overrides ``output_root`` only; + it cannot express structured knobs like ``dry_run``). +3. The discovered ``ccc_config.yaml``. +4. Otherwise: a clear, actionable error. 
+""" + +from __future__ import annotations + +import os +from functools import lru_cache +from pathlib import Path +from typing import Optional + +import yaml +from pydantic import BaseModel, Field + +CONFIG_FILENAME = "ccc_config.yaml" + + +class Settings(BaseModel): + """Validated, package-wide settings loaded from ``ccc_config.yaml``.""" + + output_root: Path = Field( + ..., + description="Root directory under which Delta/Parquet tables are written.", + ) + dry_run: bool = Field( + default=False, + description="If True, callers should log intended writes instead of executing them.", + ) + + model_config = {"extra": "forbid"} + + def describe(self) -> str: + """Return a human-readable summary of the resolved settings.""" + return ( + f"Settings(output_root={self.output_root!s}, dry_run={self.dry_run})" + ) + + def __repr__(self) -> str: # pragma: no cover - trivial + return self.describe() + + +def find_config_file( + start: Optional[Path] = None, + filename: str = CONFIG_FILENAME, +) -> Optional[Path]: + """Walk up from ``start`` (default: ``cwd``) to the filesystem root looking + for ``filename``. + + Returns the resolved path to the first match, or ``None`` if not found. + Mirrors the discovery pattern used by ``pyproject.toml``, ``ruff``, and + ``pytest`` — a notebook in ``code/`` finds the repo-root config with zero + config code. + """ + here = (start or Path.cwd()).resolve() + for candidate in (here, *here.parents): + path = candidate / filename + if path.is_file(): + return path + return None + + +@lru_cache(maxsize=1) +def get_settings() -> Settings: + """Discover ``ccc_config.yaml``, validate it, and return cached settings. + + Raises ``RuntimeError`` with an actionable message if no config file is + discoverable from the current working directory. + + Tests can call ``get_settings.cache_clear()`` to force re-discovery. 
+ """ + path = find_config_file() + if path is None: + raise RuntimeError( + f"No {CONFIG_FILENAME} found — create one at the repo root with " + "output_root: <path>. Discovery walks up from the current working " + "directory, like pyproject.toml/ruff/pytest." + ) + + raw = yaml.safe_load(path.read_text()) or {} + if not isinstance(raw, dict): + raise RuntimeError( + f"{path}: expected a YAML mapping at the top level, got {type(raw).__name__}." + ) + + config_dir = path.parent + + if "output_root" in raw and raw["output_root"] is not None: + raw["output_root"] = _anchor_path(raw["output_root"], config_dir) + + env_override = os.environ.get("CCC_OUTPUT_ROOT") + if env_override: + # Env values come from the user's shell; anchor to cwd so they are + # cwd-independent thereafter (matches shell intuition). + raw["output_root"] = _anchor_path(env_override, Path.cwd()) + + return Settings(**raw) + + +def _anchor_path(value, base: Path) -> Path: + """Return ``value`` as an absolute :class:`Path`, anchored at ``base`` if relative. + + Uses :func:`os.path.abspath` rather than :meth:`Path.resolve`: abspath + normalizes the path without following symlinks, so a symlinked + ``scratch -> /scratch`` doesn't suddenly point outside the repo and + relative-path output stays sensible (e.g. ``../scratch/x`` from ``code/``). + """ + p = Path(value) + if not p.is_absolute(): + p = base / p + return Path(os.path.abspath(p)) + + +def table_path(settings: Settings, table: str) -> Path: + """Resolve the on-disk path for a named Delta/Parquet table subdir. + + ``table`` should be one of the canonical subdir names used by the + notebooks (e.g. ``"dataset"``, ``"dataitem"``, + ``"dataitem_dataset_association"``, ``"cellfeatureset"``, + ``"cellfeaturematrix"``, ``"cluster"``, ``"clusterhierarchy"``, + ``"clustermembership"``, ``"mappingset"``, ``"celltoclustermapping"``, + ``"projectionmeasurementmatrix"``). Callers pass the exact name so + nothing concatenates path strings ad hoc. 
+ """ + return Path(settings.output_root) / table + + +def output_root(settings: Optional[Settings] = None, *, absolute: bool = False) -> str: + """Return ``output_root`` as a string with a trailing ``/``. + + Resolution rule (the bit that makes notebooks Just Work): a relative + ``output_root`` in ``ccc_config.yaml`` is anchored at the config file's + directory (the repo root), not at ``cwd``. So a notebook running in + ``code/`` and a script running at the repo root both point at the same + place. By default this function then returns the path **relative to the + current working directory**, so a notebook in ``code/`` sees + ``"../scratch/<name>/"`` while a process at the repo root sees + ``"scratch/<name>/"``. Pass ``absolute=True`` to get the fully + resolved absolute path instead. + + Prefer :func:`table_path` for new code — it returns a typed :class:`Path` + for a named table subdir and is cwd-independent. + """ + s = settings if settings is not None else get_settings() + abs_path = Path(s.output_root) + if not abs_path.is_absolute(): + abs_path = Path(os.path.abspath(abs_path)) + if absolute: + text = str(abs_path) + else: + try: + text = os.path.relpath(abs_path, Path.cwd()) + except ValueError: + # Different drives on Windows — fall back to absolute. + text = str(abs_path) + return text if text.endswith("/") else text + "/" + + +__all__ = [ + "CONFIG_FILENAME", + "Settings", + "find_config_file", + "get_settings", + "output_root", + "table_path", +] diff --git a/src/connects_common_connectivity/io/__init__.py b/src/connects_common_connectivity/io/__init__.py index e69de29..1ee5ba6 100644 --- a/src/connects_common_connectivity/io/__init__.py +++ b/src/connects_common_connectivity/io/__init__.py @@ -0,0 +1,35 @@ +"""IO layer for ConnectsCommonConnectivity. + +The IO layer owns the write/read path between generated pydantic models +and the shared Delta lake. 
This module is the curated public surface: +import from here for stable user code; everything else under ``io/`` is +internal plumbing. + +Example:: + + from connects_common_connectivity.io import write_models, write_projection_matrix + from connects_common_connectivity.models import DataSet + + write_models(DataSet(id="ds1", name="example", project_id="p1")) + write_projection_matrix(pmm, dense_matrix) +""" + +from __future__ import annotations + +from ..config import Settings, get_settings, table_path +from .writers import ( + WRITABLE_CLASSES, + WriteResult, + write_models, + write_projection_matrix, +) + +__all__ = [ + "get_settings", + "Settings", + "table_path", + "write_models", + "write_projection_matrix", + "WriteResult", + "WRITABLE_CLASSES", +] diff --git a/src/connects_common_connectivity/arrow_utils.py b/src/connects_common_connectivity/io/arrow_utils.py similarity index 100% rename from src/connects_common_connectivity/arrow_utils.py rename to src/connects_common_connectivity/io/arrow_utils.py diff --git a/src/connects_common_connectivity/io/io_plans.md b/src/connects_common_connectivity/io/io_plans.md deleted file mode 100644 index a5dee65..0000000 --- a/src/connects_common_connectivity/io/io_plans.md +++ /dev/null @@ -1,36 +0,0 @@ -# IO Utility Functions — Plans - -## `populate_region_coverage(pmm, matrix) → ProjectionMeasurementMatrix` - -Automatically populates the `region_coverage` field on a `ProjectionMeasurementMatrix` from the dense values array. - -- **Input:** - - `pmm`: a `ProjectionMeasurementMatrix` instance with `region_index` already set. - - `matrix`: dense numeric array of shape `(len(data_item_index), len(region_index))` — numpy ndarray or similar. -- **Logic:** For each column index `i`, check `any(matrix[:, i] != 0)`. Collect the corresponding `pmm.region_index[i]` entries where the column has at least one non-zero value. 
-- **Output:** Returns a copy of `pmm` with `region_coverage` set to the non-zero-column subset of `region_index`. -- **Properties:** Pure function, no side effects. Does not modify the input `pmm`. - ---- - -## `compare_region_coverage(pmms) → dict` - -Compares region index and region coverage across multiple `ProjectionMeasurementMatrix` instances. Answers: "which regions are shared, and which are exclusive to specific dataset combinations?" - -- **Input:** - - `pmms`: list of `ProjectionMeasurementMatrix` instances, each with `region_index` and `region_coverage` populated. -- **Computes:** - - `shared_regions`: intersection of all `region_index` across inputs (what regions can we compare at all?). - - `shared_coverage`: intersection of all `region_coverage` across inputs (where do all datasets have signal?). - - For every non-empty subset of the input PMMs (powerset, size 1 through N): count of regions that are in that subset's `region_coverage` intersection but **not** in any other PMM's `region_coverage` (exclusive to that combination). -- **Prints:** A summary table showing, for each subset combination, how many regions are exclusively covered by that combination. Example for 3 datasets A, B, C: - ``` - Only in A: 12 - Only in B: 5 - Only in C: 8 - Only in A ∩ B: 3 - Only in A ∩ C: 2 - Only in B ∩ C: 1 - In all (A ∩ B ∩ C): 45 - ``` -- **Returns:** dict with keys `shared_regions`, `shared_coverage`, and `exclusive_counts` (mapping subset labels to region counts). diff --git a/src/connects_common_connectivity/io/write_spec.py b/src/connects_common_connectivity/io/write_spec.py new file mode 100644 index 0000000..5a325b5 --- /dev/null +++ b/src/connects_common_connectivity/io/write_spec.py @@ -0,0 +1,189 @@ +"""Write-spec registry for IO-layer Delta writers. 
+ +A :class:`WriteSpec` describes how a generated pydantic model is persisted into +the shared Delta lake: which subdirectory, which partition columns, which scope +columns, and which write mode the backend should dispatch on. :data:`REGISTRY` +is the source of truth for which classes are writable; add an entry here to +make a new class writable through :func:`write_models`. +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, ConfigDict + +from ..models import ( + AlgorithmRun, + CellFeatureDefinition, + CellFeatureMatrix, + CellFeatureSet, + CellToClusterMapping, + Cluster, + ClusterHierarchy, + ClusterMembership, + DataItem, + DataItemDataSetAssociation, + DataSet, + HierarchyCategory, + MappingSet, + ProjectionMeasurementMatrix, +) + + +class WriteSpec(BaseModel): + """Declarative description of how a model class is written to Delta.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + model_cls: type + subdir: str + partition_by: list[str] + scope_columns: list[str] + write_mode: Literal["overwrite_scoped", "append_new_by_id"] + required_for_write: list[str] = [] + cross_field_rules: list[str] = [] + + +REGISTRY: dict[str, WriteSpec] = { + "DataSet": WriteSpec( + model_cls=DataSet, + subdir="dataset", + partition_by=["project_id"], + # Scoped on (project_id, id) so DataSet rows from sibling notebooks + # sharing a project_id (e.g. patchseq exc/inh) do not overwrite each + # other. 
+ scope_columns=["project_id", "id"], + write_mode="overwrite_scoped", + ), + "DataItem": WriteSpec( + model_cls=DataItem, + subdir="dataitem", + partition_by=["project_id"], + scope_columns=["id"], + write_mode="append_new_by_id", + ), + "DataItemDataSetAssociation": WriteSpec( + model_cls=DataItemDataSetAssociation, + subdir="dataitem_dataset_association", + partition_by=["project_id"], + scope_columns=["project_id", "dataset_id"], + write_mode="overwrite_scoped", + ), + # Cluster taxonomy is project-agnostic in the schema — Cluster and + # ClusterHierarchy do not carry project_id. Scope is the hierarchy id + # (Cluster) or the row id (ClusterHierarchy), matching the existing + # cluster ETL notebooks. + "Cluster": WriteSpec( + model_cls=Cluster, + subdir="cluster", + partition_by=["hierarchy_id"], + scope_columns=["hierarchy_id"], + write_mode="overwrite_scoped", + required_for_write=["hierarchy_id"], + ), + "ClusterHierarchy": WriteSpec( + model_cls=ClusterHierarchy, + subdir="clusterhierarchy", + partition_by=[], + scope_columns=["id"], + write_mode="overwrite_scoped", + ), + "ClusterMembership": WriteSpec( + model_cls=ClusterMembership, + subdir="clustermembership", + partition_by=["project_id", "hierarchy_id"], + scope_columns=["project_id", "hierarchy_id"], + write_mode="overwrite_scoped", + required_for_write=["hierarchy_id"], + ), + "MappingSet": WriteSpec( + model_cls=MappingSet, + subdir="mappingset", + partition_by=["project_id"], + scope_columns=["project_id", "id"], + write_mode="overwrite_scoped", + ), + "CellToClusterMapping": WriteSpec( + model_cls=CellToClusterMapping, + subdir="celltoclustermapping", + partition_by=["project_id"], + # Notebooks predicate on (project_id, mapping_set), which is the + # mapping-set foreign key on the row. 
+ scope_columns=["project_id", "mapping_set"], + write_mode="overwrite_scoped", + ), + "CellFeatureSet": WriteSpec( + model_cls=CellFeatureSet, + subdir="cellfeatureset", + partition_by=["project_id"], + scope_columns=["project_id", "id"], + write_mode="overwrite_scoped", + ), + "CellFeatureDefinition": WriteSpec( + model_cls=CellFeatureDefinition, + subdir="cellfeaturedefinition", + partition_by=["project_id", "feature_set_id"], + scope_columns=["project_id", "feature_set_id"], + write_mode="overwrite_scoped", + required_for_write=["feature_set_id"], + ), + "CellFeatureMatrix": WriteSpec( + model_cls=CellFeatureMatrix, + subdir="cellfeaturematrix", + partition_by=["project_id"], + scope_columns=["project_id", "feature_set_id"], + # CellFeatureMatrix rows are metadata pointers (one row per matrix); + # the wide-form numeric Parquet at ``cellfeatures/{feature_set_id}/`` + # is built from raw dataframes in the notebook, not from a model + # instance, so it does not flow through ``write_models`` and stays + # outside the registry. + write_mode="overwrite_scoped", + ), + "ProjectionMeasurementMatrix": WriteSpec( + model_cls=ProjectionMeasurementMatrix, + subdir="projectionmeasurementmatrix", + # ProjectionMeasurementMatrix is not ProjectScoped (schema gap noted + # in etl_wnm_exc_04). The notebook predicate is therefore ``id IN (...)`` + # only, with no partition columns. Once the schema gains + # ``ProjectScoped``, partition_by/scope_columns should be widened. + partition_by=[], + scope_columns=["id"], + write_mode="overwrite_scoped", + ), + # AlgorithmRun and HierarchyCategory are project-agnostic taxonomy metadata + # (no project_id slot). Notebook predicates are id-only, matching scope=["id"]. 
def populate_region_coverage(pmm: Any, matrix: Any) -> Any:
    """Derive ``region_coverage`` from ``matrix`` and return an updated copy of ``pmm``.

    A region counts as covered when its column in the dense ``matrix``
    holds at least one entry that compares unequal to zero. The input
    ``pmm`` is left untouched; the result comes from ``pmm.model_copy``.

    Parameters
    ----------
    pmm:
        A :class:`ProjectionMeasurementMatrix` instance with ``region_index``
        already populated.
    matrix:
        Anything ``numpy.asarray`` turns into a 2D array of shape
        ``(len(pmm.data_item_index), len(pmm.region_index))``.

    Returns
    -------
    ProjectionMeasurementMatrix
        A new instance equal to ``pmm`` except that ``region_coverage`` is
        the list of region ids with at least one non-zero entry, in the
        order they appear in ``region_index``.

    Raises
    ------
    ValueError
        If ``pmm.region_index`` is missing, if ``matrix`` is not 2D, or if
        the number of columns does not match ``len(pmm.region_index)``.
    """
    regions = getattr(pmm, "region_index", None)
    if regions is None:
        raise ValueError("pmm.region_index must be set before populating region_coverage")

    # Imported lazily so that importing this module does not require NumPy.
    import numpy as np

    dense = np.asarray(matrix)
    if dense.ndim != 2:
        raise ValueError(
            f"matrix must be 2D (cells x regions); got shape {dense.shape!r}"
        )

    n_columns = dense.shape[1]
    n_regions = len(regions)
    if n_columns != n_regions:
        raise ValueError(
            f"matrix.shape[1] ({n_columns}) must equal len(region_index) "
            f"({n_regions})"
        )

    # One boolean per column: True when any row is non-zero in that column.
    column_flags = np.any(dense != 0, axis=0).tolist()
    covered = []
    for region, is_covered in zip(regions, column_flags):
        if is_covered:
            covered.append(region)
    return pmm.model_copy(update={"region_coverage": covered})
+ +The IO layer should never blindly trust that a model carries every slot +the write actually depends on. Many generated fields are ``Optional`` in +``models.py`` because the schema permits them to be missing in some +contexts, but the *write* path needs them concretely (e.g. the predicate +columns, the partition columns, the id used for dedupe). + +The :class:`WriteSpec` for each writable class records this in +``required_for_write``. This module turns that list into a real check by +deriving a strict pydantic subclass of the generated model — +runtime-only, never mutating ``models.py`` — and re-validating each +instance through it before any IO. +""" + +from __future__ import annotations + +from functools import lru_cache +from types import UnionType +from typing import Any, Iterable, Sequence, Union, get_args, get_origin + +from pydantic import BaseModel, Field, ValidationError, create_model + +from .write_spec import WriteSpec + + +__all__ = ["strict_model_for", "validate_for_write"] + + +def _strip_optional(annotation: Any) -> Any: + """Return ``annotation`` with ``None`` removed from any top-level Union. + + A field annotated ``Optional[str]`` (``str | None``) accepts ``None`` as + a valid value even when ``Field(...)`` makes it required. For write-time + enforcement we want ``None`` to be a validation error, so we strip the + ``NoneType`` arm of any top-level union. + """ + origin = get_origin(annotation) + if origin is Union or origin is UnionType: + args = tuple(a for a in get_args(annotation) if a is not type(None)) + if not args: + return annotation + if len(args) == 1: + return args[0] + return Union[args] # type: ignore[return-value] + return annotation + + +@lru_cache(maxsize=None) +def strict_model_for(model_cls: type) -> type[BaseModel]: + """Return a pydantic subclass of ``model_cls`` with write-required slots forced. 
def validate_for_write(models: Any, spec: WriteSpec) -> Any:
    """Re-validate ``models`` through the strict submodel for ``spec.model_cls``.

    Single instance in returns a single instance out; an iterable in
    returns a list out. No I/O. Pydantic-only.

    Every element must be an instance of exactly ``spec.model_cls``; the
    whole batch is checked, not just its first element, so a stray row of
    another class can neither slip through unvalidated nor be coerced
    through the wrong strict model.

    Parameters
    ----------
    models:
        One model instance or an iterable of instances.
    spec:
        The :class:`WriteSpec` whose ``model_cls`` the batch must match.

    Returns
    -------
    The input shape mirrored back: one instance for one instance, a list
    for an iterable (an empty iterable yields an empty list). When the
    class has write-required slots the returned objects are instances of
    the derived strict model, because they are rebuilt via
    ``strict.model_validate``.

    Raises
    ------
    TypeError
        If ``models`` is neither a model nor an iterable, or if any element
        is not an instance of exactly ``spec.model_cls``.
    ValueError
        If an element fails strict validation; the message names the class,
        the failing slot(s), and the row.
    """
    was_iter, items = _coerce_iterable(models)
    if not items:
        return items if was_iter else None

    cls = spec.model_cls
    for idx, m in enumerate(items):
        if type(m) is not cls:
            # Row 0 keeps the historical message; later rows add their index.
            where = "" if idx == 0 else f" at row {idx}"
            raise TypeError(
                f"validate_for_write: spec.model_cls is {cls.__name__!r} "
                f"but received {type(m).__name__!r}{where}"
            )

    strict = strict_model_for(cls)
    if strict is cls:
        # No write-required slots registered: nothing to tighten.
        return items if was_iter else items[0]

    revalidated: list[BaseModel] = []
    for idx, m in enumerate(items):
        try:
            revalidated.append(strict.model_validate(m.model_dump()))
        except ValidationError as err:
            # Collect the dotted location of every error that looks like a
            # missing or None-valued slot, for a compact summary up front.
            missing = sorted(
                {
                    ".".join(str(p) for p in e.get("loc", ()))
                    for e in err.errors()
                    if e.get("type")
                    in ("missing", "none_not_allowed", "string_type", "value_error")
                }
            )
            slot_text = ", ".join(missing) if missing else "(see below)"
            row_id = getattr(m, "id", None)
            row_hint = f"row {idx}" if row_id is None else f"row {idx} (id={row_id})"
            raise ValueError(
                f"{cls.__name__}: missing required_for_write slot(s): "
                f"{slot_text} at {row_hint}. {err}"
            ) from err

    return revalidated if was_iter else revalidated[0]
@dataclass(frozen=True)
class WriteResult:
    """Return value of a single :func:`write_models` invocation.

    ``predicates`` is one entry per scope group for ``overwrite_scoped``
    writes; an empty tuple for ``append_new_by_id`` (no predicate is
    issued — Delta append + id-dedupe handles idempotency).

    Instances are immutable (``frozen=True``). A dry-run call to
    :func:`write_models` reports empty ``predicates`` and
    ``rows_written == 0``.
    """

    # ``__name__`` of the registered model class the batch was written as.
    class_name: str
    # Table location: the resolved output root joined with the spec's subdir.
    path: Path
    # Write mode string, e.g. "overwrite_scoped" or "append_new_by_id".
    mode: str
    # Predicates issued, in the order the writes were made.
    predicates: tuple[str, ...]
    # Number of rows handed to Delta by this call.
    rows_written: int
+ """ + return validate_for_write(list(models), spec) + + +# --------------------------------------------------------------------------- +# Helpers (private; tested directly) +# --------------------------------------------------------------------------- + + +def _normalize_models(models: Any) -> list[BaseModel]: + """Coerce ``models`` to a list, accepting a single model or any iterable. + + Requires homogeneous type. Empty input is rejected — callers always + know which class they are writing. + """ + if isinstance(models, BaseModel): + return [models] + if isinstance(models, (str, bytes)) or not isinstance(models, Iterable): + raise TypeError( + f"write_models expected a pydantic model or iterable of models; " + f"got {type(models).__name__}" + ) + items = list(models) + if not items: + raise ValueError("write_models received an empty batch") + cls = type(items[0]) + for m in items: + if type(m) is not cls: + raise TypeError( + f"write_models requires homogeneous types; got " + f"{cls.__name__} and {type(m).__name__}" + ) + return items + + +def _format_value(v: Any) -> str: + """Render ``v`` as a single-quoted SQL literal for the Delta predicate.""" + if v is None: + return "NULL" + return "'" + str(v).replace("'", "''") + "'" + + +def _build_predicate(scope_columns: Sequence[str], row_values: Sequence[Any]) -> str: + """Build an AND-joined ``col = 'val'`` predicate for ``write_deltalake``. + + The format is exactly ``col1 = 'val1' AND col2 = 'val2'`` — single + quotes, AND-joined, no extra whitespace beyond the single space around + each operator. Notebooks that compose predicates by hand use the same + format; this helper is the canonical implementation. 
+ """ + if len(scope_columns) != len(row_values): + raise ValueError( + f"scope_columns ({len(scope_columns)}) and row_values " + f"({len(row_values)}) length mismatch" + ) + parts = [f"{c} = {_format_value(v)}" for c, v in zip(scope_columns, row_values)] + return " AND ".join(parts) + + +def _group_by_scope( + table: pa.Table, scope_columns: Sequence[str] +) -> list[tuple[tuple, pa.Table]]: + """Partition ``table`` into one ``(scope_tuple, sub_table)`` per scope group. + + Scope groups preserve row order within each group. Two rows belong to + the same group iff they have equal values across every column in + ``scope_columns``. Order of groups is the order of first appearance. + """ + if not scope_columns: + raise ValueError("scope_columns must be non-empty for overwrite_scoped writes") + + cols = [table.column(c).to_pylist() for c in scope_columns] + keys: list[tuple] = list(zip(*cols)) if cols else [] + + seen: dict[tuple, list[int]] = {} + for i, key in enumerate(keys): + seen.setdefault(key, []).append(i) + + return [(key, table.take(pa.array(idxs))) for key, idxs in seen.items()] + + +# --------------------------------------------------------------------------- +# Dispatch branches +# --------------------------------------------------------------------------- + + +def _dispatch_overwrite_scoped( + table: pa.Table, spec: WriteSpec, path: Path +) -> WriteResult: + """Group by scope, issue one predicated overwrite per group.""" + groups = _group_by_scope(table, spec.scope_columns) + predicates: list[str] = [] + rows_written = 0 + partition_by = spec.partition_by or None + for key, sub in groups: + predicate = _build_predicate(spec.scope_columns, key) + write_deltalake( + str(path), + sub, + mode="overwrite", + predicate=predicate, + partition_by=partition_by, + ) + predicates.append(predicate) + rows_written += sub.num_rows + return WriteResult( + class_name=spec.model_cls.__name__, + path=path, + mode="overwrite_scoped", + predicates=tuple(predicates), + 
rows_written=rows_written, + ) + + +def _dispatch_append_new_by_id( + table: pa.Table, spec: WriteSpec, path: Path +) -> WriteResult: + """Append only rows whose id is new, scoped to a single ``project_id``.""" + if not spec.scope_columns: + raise ValueError( + f"{spec.model_cls.__name__}: scope_columns is empty for append_new_by_id " + f"(expected the id column at index 0)" + ) + id_column = spec.scope_columns[0] + + if "project_id" not in table.column_names: + raise ValueError( + f"{spec.model_cls.__name__}: append_new_by_id requires a 'project_id' " + f"column on every row (got columns {table.column_names!r})" + ) + project_ids = set(table.column("project_id").to_pylist()) + if len(project_ids) != 1: + raise ValueError( + f"{spec.model_cls.__name__}: append_new_by_id requires a single " + f"project_id per call (got {sorted(project_ids)!r}). Split the " + f"batch upstream." + ) + (project_id,) = project_ids + + rows_written = append_new_dataitems( + str(path), table, project_id=project_id, id_column=id_column + ) + return WriteResult( + class_name=spec.model_cls.__name__, + path=path, + mode="append_new_by_id", + predicates=(), + rows_written=rows_written, + ) + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def _resolve_output_root( + settings: Settings | None, + output_root: str | Path | None, +) -> tuple[Path, Settings | None]: + """Resolve the effective on-disk root for a single write call. + + Precedence (highest first): + + 1. Explicit ``output_root=`` (str or :class:`Path`). Used verbatim; + passing both ``settings=`` and ``output_root=`` is an error so callers + never have to remember a precedence rule. + 2. Explicit ``settings=`` → ``settings.output_root``. + 3. :func:`get_settings` → the discovered ``ccc_config.yaml``. 
def write_models(
    models: Any,
    *,
    settings: Settings | None = None,
    output_root: str | Path | None = None,
) -> WriteResult:
    """Write a batch of generated pydantic models to the shared Delta lake.

    The class is inferred from ``models`` and dispatched through its
    :class:`WriteSpec` (see :mod:`connects_common_connectivity.io.write_spec`).
    No per-class wrapper functions exist; one wrapper per writable class
    would add no behavior, only drift surface.

    Parameters
    ----------
    models:
        A single model instance or a non-empty iterable of instances of the
        same class. The class must be one of :data:`WRITABLE_CLASSES`.
    settings:
        Optional explicit settings. Falls back to :func:`get_settings` when
        omitted; an explicit ``settings=`` always wins over the discovered
        config (matches the precedence documented in
        :mod:`connects_common_connectivity.config`).
    output_root:
        Optional per-call override of the on-disk root under which the
        canonical ``spec.subdir`` is written. Use this when a single
        notebook/dataset should write to a different location than the
        shared ``ccc_config.yaml`` ``output_root`` (e.g. an isolated test
        dataset). Mutually exclusive with ``settings=`` — passing both
        raises ``TypeError``.

    Returns
    -------
    WriteResult
        Class name, on-disk path, dispatch mode, the predicates issued (one
        per scope group for ``overwrite_scoped``; empty for
        ``append_new_by_id``), and the number of rows written. When the
        resolved settings have ``dry_run`` set, nothing is written and the
        result carries empty ``predicates`` and ``rows_written == 0``.

    Raises
    ------
    TypeError
        If ``models`` is not a model or iterable of models, if the batch
        mixes classes, or if both ``settings=`` and ``output_root=`` are
        passed.
    ValueError
        If the batch is empty, if strict write-time validation fails, or if
        the registered ``write_mode`` has no dispatch branch.
    KeyError
        If no :class:`WriteSpec` is registered for the class.

    Notes
    -----
    Writable classes (the registry, in order):
    ``DataSet``, ``DataItem``, ``DataItemDataSetAssociation``,
    ``Cluster``, ``ClusterHierarchy``, ``ClusterMembership``,
    ``MappingSet``, ``CellToClusterMapping``,
    ``CellFeatureSet``, ``CellFeatureDefinition``, ``CellFeatureMatrix``,
    ``ProjectionMeasurementMatrix``, ``AlgorithmRun``, ``HierarchyCategory``.
    Use ``WRITABLE_CLASSES`` to enumerate at runtime.

    Validation runs before the output root is resolved, so an invalid batch
    fails even on a dry run.
    """
    items = _normalize_models(models)
    # Capture the registered class before validation: the hook may hand back
    # instances of the derived strict-write subclass, while the registry
    # lookup, Arrow schema and LinkML metadata are keyed on the original.
    cls = type(items[0])
    spec = get_spec(cls)

    items = list(_validation_hook(items, spec))

    root, resolved_settings = _resolve_output_root(settings, output_root)
    path = root / spec.subdir

    # NOTE(review): ``_resolve_output_root`` returns no Settings object when
    # ``output_root=`` is passed, so ``dry_run`` is not consulted on that
    # path — confirm this is intended.
    if resolved_settings is not None and resolved_settings.dry_run:
        return WriteResult(
            class_name=spec.model_cls.__name__,
            path=path,
            mode=spec.write_mode,
            predicates=(),
            rows_written=0,
        )

    schema = build_arrow_schema(cls)
    table = models_to_table(items, schema=schema)
    table = attach_linkml_metadata(table, linkml_class=cls.__name__)

    if spec.write_mode == "overwrite_scoped":
        return _dispatch_overwrite_scoped(table, spec, path)
    if spec.write_mode == "append_new_by_id":
        return _dispatch_append_new_by_id(table, spec, path)
    raise ValueError(
        f"{cls.__name__}: unsupported write_mode {spec.write_mode!r}. "
        f"Add a dispatch branch in writers.py."
    )
def write_cellcellconnectivitylong(*args: Any, **kwargs: Any) -> WriteResult:
    """Stub writer for ``CellCellConnectivityLong`` rows; always raises.

    There is no ``WriteSpec`` entry for ``CellCellConnectivityLong`` yet.
    The existing ETL notebooks (``etl_minnie_04_cell_cell.ipynb``,
    ``parse_minnie_clustering.ipynb``) write to non-canonical, run-specific
    subdirs (e.g. ``cellcellconnectivitylong_proofread_pre_to_csm_post/``)
    instead of the canonical ``cellcellconnectivitylong/`` subdir that
    ``write_models`` would resolve. Those notebooks keep calling
    ``write_deltalake`` directly until one of two things happens:

    (a) the callers consolidate onto the canonical subdir and a
        ``WriteSpec`` is added, or
    (b) dispatch is extended to accept a per-call subdir override.

    This function is kept as a reminder of that open work. All positional
    and keyword arguments are accepted and ignored.

    Raises
    ------
    NotImplementedError
        Always.
    """
    reason = (
        "write_cellcellconnectivitylong is not implemented yet; "
        "see the docstring for the migration plan."
    )
    raise NotImplementedError(reason)
diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..12b3378 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from pathlib import Path + +import polars as pl +import pytest + +import connects_common_connectivity as ccc +from connects_common_connectivity.config import Settings, get_settings + + +@pytest.fixture(autouse=True) +def _isolate_settings(monkeypatch: pytest.MonkeyPatch, tmp_path: Path): + """Each test gets isolated cwd/env and a fresh get_settings cache.""" + monkeypatch.delenv("CCC_OUTPUT_ROOT", raising=False) + monkeypatch.chdir(tmp_path) + get_settings.cache_clear() + yield + get_settings.cache_clear() + + +@pytest.fixture(scope="session") +def models() -> dict: + """Generate pydantic models once per session (expensive).""" + return ccc.generate_pydantic_models() + + +@pytest.fixture +def settings(tmp_path: Path) -> Settings: + return Settings(output_root=tmp_path) + + +@pytest.fixture +def read_delta(): + def _read(path: str | Path) -> pl.DataFrame: + return pl.read_delta(str(path)) + + return _read diff --git a/tests/test_basic.py b/tests/test_basic.py index bd1e600..c4220be 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -1,3 +1,7 @@ +import pytest +from pydantic import ValidationError + + def test_import(): import connects_common_connectivity as ccc assert ccc.__version__ @@ -14,17 +18,15 @@ def test_model_generation(): def test_required_field_enforcement(): - import pytest import connects_common_connectivity as ccc models = ccc.generate_pydantic_models() DataItem = models["DataItem"] # project_id is required; omitting should raise a validation error - with pytest.raises(Exception): + with pytest.raises(ValidationError, match=r"(?s)project_id.*[Ff]ield required"): DataItem(id="D1", name="Item 1") def test_enum_validation(): - import pytest import connects_common_connectivity as ccc models = ccc.generate_pydantic_models() Modality = models["Modality"] # Enum 
@@ -34,7 +36,7 @@ def test_enum_validation(): # Depending on dynamic generation, modality may be stored as enum value or raw string assert str(ds.modality) in {Modality.TRACER.value, Modality.TRACER.name, str(Modality.TRACER)} # Invalid modality should raise error now that slot has enum range - with pytest.raises(Exception): + with pytest.raises(ValidationError, match=r"(?s)modality.*Input should be"): DataSet(id="DS2", name="Dataset 2", modality="NOT_A_VALID_MODALITY", project_id="P1") @@ -51,7 +53,6 @@ def test_multivalued_slot_list_type(): def test_probability_bounds_and_pattern(): - import pytest import connects_common_connectivity as ccc models = ccc.generate_pydantic_models() MappingSet = models["MappingSet"] @@ -66,9 +67,8 @@ def test_probability_bounds_and_pattern(): mapping = CellToCellMapping(id="M1", mapping_set=ms.id, source_cell=cell1.id, target_cell=cell2.id, probability=0.5, project_id="P1") assert 0 <= mapping.probability <= 1 # Invalid probability > 1 - with pytest.raises(Exception): + with pytest.raises(ValidationError, match=r"(?s)probability.*less than or equal to 1"): CellToCellMapping(id="M2", mapping_set=ms.id, source_cell=cell1.id, target_cell=cell2.id, probability=1.5, project_id="P1") - diff --git a/tests/test_cell_features_schema.py b/tests/test_cell_features_schema.py index 51bae1d..28135b6 100644 --- a/tests/test_cell_features_schema.py +++ b/tests/test_cell_features_schema.py @@ -1,26 +1,19 @@ import pytest from pydantic import ValidationError -import connects_common_connectivity as ccc - - -def _models(): - return ccc.generate_pydantic_models() - - # --------------------------------------------------------------------------- # CellFeatureDefinition # --------------------------------------------------------------------------- -def test_cell_feature_definition_project_id_required(): - CellFeatureDefinition = _models()["CellFeatureDefinition"] +def test_cell_feature_definition_project_id_required(models): + CellFeatureDefinition = 
models["CellFeatureDefinition"] with pytest.raises(ValidationError, match=r"(?s)project_id.*Field required"): CellFeatureDefinition(id="nucleus_volume_um", description="Nucleus volume") -def test_cell_feature_definition_valid(): - CellFeatureDefinition = _models()["CellFeatureDefinition"] +def test_cell_feature_definition_valid(models): + CellFeatureDefinition = models["CellFeatureDefinition"] cfd = CellFeatureDefinition( id="nucleus_volume_um", description="Nucleus volume in cubic microns", @@ -34,37 +27,37 @@ def test_cell_feature_definition_valid(): assert cfd.range_max is None # optional -def test_cell_feature_definition_range_min_max_optional(): - CellFeatureDefinition = _models()["CellFeatureDefinition"] +def test_cell_feature_definition_range_min_max_optional(models): + CellFeatureDefinition = models["CellFeatureDefinition"] # Both range fields absent — should not raise cfd = CellFeatureDefinition(id="some_feature", project_id="minnie65") assert cfd.range_min is None assert cfd.range_max is None -def test_cell_feature_definition_data_type_pattern_valid(): - CellFeatureDefinition = _models()["CellFeatureDefinition"] +def test_cell_feature_definition_data_type_pattern_valid(models): + CellFeatureDefinition = models["CellFeatureDefinition"] for dt in ["f8", "=i4"]: cfd = CellFeatureDefinition(id="feat", data_type=dt, project_id="p1") assert cfd.data_type == dt -def test_cell_feature_definition_data_type_pattern_invalid(): - CellFeatureDefinition = _models()["CellFeatureDefinition"] +def test_cell_feature_definition_data_type_pattern_invalid(models): + CellFeatureDefinition = models["CellFeatureDefinition"] for bad in ["float32", "f4", " subprocess.CompletedProcess[str]: + return subprocess.run( + [sys.executable, "-m", "connects_common_connectivity.cli", *args], + cwd=str(cwd) if cwd else None, + capture_output=True, + text=True, + ) + + +def test_cli_help(): + result = _run_cli("--help") + assert result.returncode == 0 + assert "usage:" in 
result.stdout.lower() + + +def test_cli_info_shows_version(): + result = _run_cli("info") + assert result.returncode == 0 + assert "Package version:" in result.stdout + + +def test_cli_bundle_happy_path(tmp_path): + out = tmp_path / "connectivity_bundle.tar.gz" + result = _run_cli("bundle", "--output", str(out), cwd=tmp_path) + assert result.returncode == 0 + assert out.exists() + with tarfile.open(out, "r:gz") as tf: + names = tf.getnames() + assert any(name.startswith("schemas/") for name in names) + + +def test_cli_bad_subcommand_exits_nonzero(): + result = _run_cli("not-a-command") + assert result.returncode != 0 + assert "invalid choice" in result.stderr.lower() diff --git a/tests/test_clustering_schema.py b/tests/test_clustering_schema.py index ec30630..0b56ff5 100644 --- a/tests/test_clustering_schema.py +++ b/tests/test_clustering_schema.py @@ -1,32 +1,25 @@ import pytest from pydantic import ValidationError -import connects_common_connectivity as ccc - - -def _models(): - return ccc.generate_pydantic_models() - - # --------------------------------------------------------------------------- # Cluster — no longer ProjectScoped (taxonomies are global reference artifacts) # --------------------------------------------------------------------------- -def test_cluster_has_no_project_id_field(): - Cluster = _models()["Cluster"] +def test_cluster_has_no_project_id_field(models): + Cluster = models["Cluster"] assert "project_id" not in Cluster.model_fields -def test_cluster_constructs_without_project_id(): - Cluster = _models()["Cluster"] +def test_cluster_constructs_without_project_id(models): + Cluster = models["Cluster"] cluster = Cluster(id="c1") assert cluster.id == "c1" -def test_cluster_rejects_project_id(): +def test_cluster_rejects_project_id(models): # Pydantic config is extra='forbid', so passing project_id raises rather than silently dropping. 
- Cluster = _models()["Cluster"] + Cluster = models["Cluster"] with pytest.raises(ValidationError, match=r"(?s)project_id.*Extra inputs are not permitted"): Cluster(id="c1", project_id="visp_patchseq") @@ -36,20 +29,20 @@ def test_cluster_rejects_project_id(): # --------------------------------------------------------------------------- -def test_cluster_membership_project_id_required(): - ClusterMembership = _models()["ClusterMembership"] +def test_cluster_membership_project_id_required(models): + ClusterMembership = models["ClusterMembership"] with pytest.raises(ValidationError, match=r"(?s)project_id.*Field required"): ClusterMembership(item="cell_1", cluster="c1") -def test_cluster_membership_hierarchy_id_optional(): - ClusterMembership = _models()["ClusterMembership"] +def test_cluster_membership_hierarchy_id_optional(models): + ClusterMembership = models["ClusterMembership"] cm = ClusterMembership(item="cell_1", cluster="c1", project_id="visp_patchseq") assert cm.hierarchy_id is None -def test_cluster_membership_hierarchy_id_round_trip(): - ClusterMembership = _models()["ClusterMembership"] +def test_cluster_membership_hierarchy_id_round_trip(models): + ClusterMembership = models["ClusterMembership"] cm = ClusterMembership( item="cell_1", cluster="c1", @@ -59,8 +52,8 @@ def test_cluster_membership_hierarchy_id_round_trip(): assert cm.hierarchy_id == "visp_met_types_v1" -def test_cluster_membership_hierarchy_id_must_be_string(): - ClusterMembership = _models()["ClusterMembership"] +def test_cluster_membership_hierarchy_id_must_be_string(models): + ClusterMembership = models["ClusterMembership"] with pytest.raises(ValidationError, match=r"(?s)hierarchy_id.*Input should be a valid string"): ClusterMembership( item="cell_1", @@ -75,27 +68,27 @@ def test_cluster_membership_hierarchy_id_must_be_string(): # --------------------------------------------------------------------------- -def test_cluster_hierarchy_id_optional(): - Cluster = _models()["Cluster"] +def 
test_cluster_hierarchy_id_optional(models): + Cluster = models["Cluster"] cluster = Cluster(id="c1") assert cluster.hierarchy_id is None -def test_cluster_hierarchy_id_round_trip(): - Cluster = _models()["Cluster"] +def test_cluster_hierarchy_id_round_trip(models): + Cluster = models["Cluster"] cluster = Cluster(id="c1", hierarchy_id="visp_met_types_v1") assert cluster.hierarchy_id == "visp_met_types_v1" -def test_cluster_hierarchy_id_must_be_string(): - Cluster = _models()["Cluster"] +def test_cluster_hierarchy_id_must_be_string(models): + Cluster = models["Cluster"] with pytest.raises(ValidationError, match=r"(?s)hierarchy_id.*Input should be a valid string"): Cluster(id="c1", hierarchy_id=123) -def test_cluster_still_has_no_project_id_after_hierarchy_id_added(): +def test_cluster_still_has_no_project_id_after_hierarchy_id_added(models): # Regression guard: hierarchy_id was added without re-introducing ProjectScoped on Cluster. - Cluster = _models()["Cluster"] + Cluster = models["Cluster"] assert "project_id" not in Cluster.model_fields with pytest.raises(ValidationError, match=r"(?s)project_id.*Extra inputs are not permitted"): Cluster(id="c1", project_id="visp_patchseq") @@ -106,39 +99,39 @@ def test_cluster_still_has_no_project_id_after_hierarchy_id_added(): # --------------------------------------------------------------------------- -def test_cluster_hierarchy_constructs_with_id_run_root_clusters(): - ClusterHierarchy = _models()["ClusterHierarchy"] +def test_cluster_hierarchy_constructs_with_id_run_root_clusters(models): + ClusterHierarchy = models["ClusterHierarchy"] h = ClusterHierarchy(id="h1", run="run1", root="root", clusters=["root", "c1"]) assert h.id == "h1" assert h.root == "root" assert h.clusters == ["root", "c1"] -def test_cluster_hierarchy_requires_id(): - ClusterHierarchy = _models()["ClusterHierarchy"] +def test_cluster_hierarchy_requires_id(models): + ClusterHierarchy = models["ClusterHierarchy"] with pytest.raises(ValidationError, 
match=r"(?s)id.*Field required"): ClusterHierarchy(run="run1", root="root", clusters=["root"]) -def test_algorithm_run_requires_algorithm_name(): - AlgorithmRun = _models()["AlgorithmRun"] +def test_algorithm_run_requires_algorithm_name(models): + AlgorithmRun = models["AlgorithmRun"] with pytest.raises(ValidationError, match=r"(?s)algorithm_name.*Field required"): AlgorithmRun(id="run1") -def test_algorithm_run_constructs_without_input_dataset(): - AlgorithmRun = _models()["AlgorithmRun"] +def test_algorithm_run_constructs_without_input_dataset(models): + AlgorithmRun = models["AlgorithmRun"] run = AlgorithmRun(id="run1", algorithm_name="hierarchical") assert run.input_dataset is None -def test_hierarchy_category_requires_id(): - HierarchyCategory = _models()["HierarchyCategory"] +def test_hierarchy_category_requires_id(models): + HierarchyCategory = models["HierarchyCategory"] with pytest.raises(ValidationError, match=r"(?s)id.*Field required"): HierarchyCategory(description="leaf") -def test_hierarchy_category_level_optional(): - HierarchyCategory = _models()["HierarchyCategory"] +def test_hierarchy_category_level_optional(models): + HierarchyCategory = models["HierarchyCategory"] cat = HierarchyCategory(id="cluster") assert cat.level is None diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..e65970e --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,184 @@ +"""Tests for the package-wide config module.""" + +from __future__ import annotations + +import os +from pathlib import Path + +import pytest +from pydantic import ValidationError +from connects_common_connectivity.config import ( + CONFIG_FILENAME, + Settings, + find_config_file, + get_settings, + output_root, + table_path, +) + + +def _write_config(dir_: Path, **values) -> Path: + import yaml + + path = dir_ / CONFIG_FILENAME + path.write_text(yaml.safe_dump(values)) + return path + + +def test_get_settings_raises_actionable_error_when_missing(tmp_path): + # 
tmp_path has no ccc_config.yaml anywhere up the tree (we chdir'd into it). + with pytest.raises(RuntimeError, match=CONFIG_FILENAME): + get_settings() + + +def test_find_and_load_from_nested_cwd(tmp_path, monkeypatch): + _write_config(tmp_path, output_root=str(tmp_path / "out"), dry_run=True) + nested = tmp_path / "a" / "b" / "c" + nested.mkdir(parents=True) + monkeypatch.chdir(nested) + get_settings.cache_clear() + + found = find_config_file() + assert found == (tmp_path / CONFIG_FILENAME).resolve() + + settings = get_settings() + assert isinstance(settings, Settings) + assert settings.output_root == Path(str(tmp_path / "out")) + assert settings.dry_run is True + + +def test_env_overrides_only_output_root(tmp_path, monkeypatch): + _write_config(tmp_path, output_root=str(tmp_path / "from_file"), dry_run=True) + monkeypatch.setenv("CCC_OUTPUT_ROOT", str(tmp_path / "from_env")) + get_settings.cache_clear() + + settings = get_settings() + assert settings.output_root == Path(str(tmp_path / "from_env")) + # dry_run still comes from the file; env cannot express it. + assert settings.dry_run is True + + +def test_explicit_settings_wins_over_env_and_file(tmp_path, monkeypatch): + _write_config(tmp_path, output_root=str(tmp_path / "from_file"), dry_run=True) + monkeypatch.setenv("CCC_OUTPUT_ROOT", str(tmp_path / "from_env")) + get_settings.cache_clear() + + explicit = Settings(output_root=tmp_path / "explicit", dry_run=False) + + # Simulate the caller-side precedence pattern documented for writers/readers. 
+ def writer(settings=None): + return settings or get_settings() + + resolved = writer(settings=explicit) + assert resolved is explicit + assert resolved.output_root == tmp_path / "explicit" + assert resolved.dry_run is False + + +def test_table_path_joins_and_returns_path(tmp_path): + settings = Settings(output_root=tmp_path / "root") + p = table_path(settings, "dataset") + assert isinstance(p, Path) + assert p == tmp_path / "root" / "dataset" + # A few of the canonical subdir names used by the notebooks. + for name in ( + "dataitem", + "dataitem_dataset_association", + "cellfeatureset", + "cellfeaturematrix", + "cluster", + "clustermembership", + "projectionmeasurementmatrix", + ): + assert table_path(settings, name) == tmp_path / "root" / name + + +def test_output_root_is_required(tmp_path): + _write_config(tmp_path, dry_run=False) # missing output_root + get_settings.cache_clear() + with pytest.raises(ValidationError, match=r"(?s)output_root.*[Ff]ield required"): + get_settings() + + +def test_unknown_keys_rejected(tmp_path): + _write_config(tmp_path, output_root=str(tmp_path), nonsense_key=1) + get_settings.cache_clear() + with pytest.raises(ValidationError, match=r"(?s)[Ee]xtra inputs are not permitted"): + get_settings() + + +def test_io_reexports_settings_helpers(): + from connects_common_connectivity.io import ( + Settings as IOSettings, + get_settings as io_get_settings, + table_path as io_table_path, + ) + + assert IOSettings is Settings + assert io_get_settings is get_settings + assert io_table_path is table_path + + +def test_get_settings_is_cached(tmp_path, monkeypatch): + _write_config(tmp_path, output_root=str(tmp_path / "out")) + get_settings.cache_clear() + first = get_settings() + # Mutating the file should not change the cached result. + _write_config(tmp_path, output_root=str(tmp_path / "changed")) + second = get_settings() + assert first is second + # After clearing, discovery re-runs. 
+ get_settings.cache_clear() + third = get_settings() + assert third.output_root == Path(str(tmp_path / "changed")) + + +def test_describe_includes_resolved_values(tmp_path): + settings = Settings(output_root=tmp_path / "root", dry_run=True) + text = settings.describe() + assert "root" in text + assert "dry_run=True" in text + + +def test_output_root_helper_appends_trailing_slash(tmp_path, monkeypatch): + _write_config(tmp_path, output_root=str(tmp_path / "out")) + get_settings.cache_clear() + # cwd is tmp_path (autouse fixture), so relpath of tmp_path/out is "out". + root = output_root() + assert isinstance(root, str) + assert root.endswith("/") + assert root == "out/" + + +def test_output_root_helper_absolute_flag(tmp_path): + settings = Settings(output_root=tmp_path / "explicit") + assert output_root(settings, absolute=True) == str(tmp_path / "explicit") + "/" + + +def test_output_root_helper_accepts_explicit_settings(tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + explicit = Settings(output_root=tmp_path / "explicit") + # Default returns path relative to cwd (tmp_path). + assert output_root(explicit) == "explicit/" + + +def test_relative_output_root_in_config_is_anchored_at_config_dir(tmp_path, monkeypatch): + # Config sits at tmp_path; output_root is relative ("scratch/x/"). + _write_config(tmp_path, output_root="scratch/x/") + nested = tmp_path / "code" + nested.mkdir() + monkeypatch.chdir(nested) + get_settings.cache_clear() + + settings = get_settings() + # Settings.output_root is absolute, anchored at the config file's dir + # (abspath, not resolve — symlinks must not be followed). + assert settings.output_root == Path(os.path.abspath(tmp_path / "scratch" / "x")) + + # output_root() returns the path relative to cwd → "../scratch/x/". + assert output_root() == "../scratch/x/" + + # table_path joins to an absolute path that works regardless of cwd. 
+ tp = table_path(settings, "dataset") + assert tp.is_absolute() + assert tp == Path(os.path.abspath(tmp_path / "scratch" / "x" / "dataset")) diff --git a/tests/test_mappings_schema.py b/tests/test_mappings_schema.py index 1e70041..8e026b7 100644 --- a/tests/test_mappings_schema.py +++ b/tests/test_mappings_schema.py @@ -1,21 +1,14 @@ import pytest from pydantic import ValidationError -import connects_common_connectivity as ccc - - -def _models(): - return ccc.generate_pydantic_models() - - # --------------------------------------------------------------------------- # MappingSet — source/target endpoints can be DataSet or ClusterHierarchy # --------------------------------------------------------------------------- -def test_mapping_set_dataset_to_dataset(): +def test_mapping_set_dataset_to_dataset(models): # Cell-to-cell shape: source_dataset + target_dataset (back-compat). - MappingSet = _models()["MappingSet"] + MappingSet = models["MappingSet"] ms = MappingSet( id="ms_cell_cell", name="ms_cell_cell", @@ -30,9 +23,9 @@ def test_mapping_set_dataset_to_dataset(): assert ms.target_hierarchy is None -def test_mapping_set_dataset_to_hierarchy(): +def test_mapping_set_dataset_to_hierarchy(models): # Cell-to-cluster shape: source_dataset + target_hierarchy. - MappingSet = _models()["MappingSet"] + MappingSet = models["MappingSet"] ms = MappingSet( id="ms_cell_cluster", name="ms_cell_cluster", @@ -47,9 +40,9 @@ def test_mapping_set_dataset_to_hierarchy(): assert ms.source_hierarchy is None -def test_mapping_set_hierarchy_to_hierarchy(): +def test_mapping_set_hierarchy_to_hierarchy(models): # Cluster-to-cluster shape: source_hierarchy + target_hierarchy. 
- MappingSet = _models()["MappingSet"] + MappingSet = models["MappingSet"] ms = MappingSet( id="ms_cluster_cluster", name="ms_cluster_cluster", @@ -64,10 +57,10 @@ def test_mapping_set_hierarchy_to_hierarchy(): assert ms.target_hierarchy == "visp_met_types_v1" -def test_mapping_set_endpoints_optional(): +def test_mapping_set_endpoints_optional(models): # All four endpoint slots are optional at the schema level (LinkML can't enforce # "exactly one of"); convention is enforced per-mapping kind. - MappingSet = _models()["MappingSet"] + MappingSet = models["MappingSet"] ms = MappingSet( id="ms_minimal", name="ms_minimal", method_name="m", project_id="p1", @@ -78,20 +71,20 @@ def test_mapping_set_endpoints_optional(): assert ms.target_hierarchy is None -def test_mapping_set_method_name_still_required(): - MappingSet = _models()["MappingSet"] +def test_mapping_set_method_name_still_required(models): + MappingSet = models["MappingSet"] with pytest.raises(ValidationError, match=r"(?s)method_name.*Field required"): MappingSet(id="ms1", project_id="p1") -def test_mapping_set_project_id_still_required(): - MappingSet = _models()["MappingSet"] +def test_mapping_set_project_id_still_required(models): + MappingSet = models["MappingSet"] with pytest.raises(ValidationError, match=r"(?s)project_id.*Field required"): MappingSet(id="ms1", method_name="m") -def test_mapping_set_hierarchy_fields_must_be_strings(): - MappingSet = _models()["MappingSet"] +def test_mapping_set_hierarchy_fields_must_be_strings(models): + MappingSet = models["MappingSet"] with pytest.raises(ValidationError, match=r"(?s)target_hierarchy.*Input should be a valid string"): MappingSet( id="ms1", method_name="m", project_id="p1", @@ -104,8 +97,8 @@ def test_mapping_set_hierarchy_fields_must_be_strings(): # --------------------------------------------------------------------------- -def test_cell_to_cluster_mapping_round_trip(): - CellToClusterMapping = _models()["CellToClusterMapping"] +def 
test_cell_to_cluster_mapping_round_trip(models): + CellToClusterMapping = models["CellToClusterMapping"] m = CellToClusterMapping( id="map_001", mapping_set="ms_cell_cluster", @@ -122,8 +115,8 @@ def test_cell_to_cluster_mapping_round_trip(): assert m.probability == 0.91 -def test_cell_to_cluster_mapping_requires_target_cluster(): - CellToClusterMapping = _models()["CellToClusterMapping"] +def test_cell_to_cluster_mapping_requires_target_cluster(models): + CellToClusterMapping = models["CellToClusterMapping"] with pytest.raises(ValidationError, match=r"(?s)target_cluster.*Field required"): CellToClusterMapping( id="map_001", diff --git a/tests/test_parquet_loader.py b/tests/test_parquet_loader.py new file mode 100644 index 0000000..2522c90 --- /dev/null +++ b/tests/test_parquet_loader.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +import pyarrow as pa +import pyarrow.parquet as pq + +from connects_common_connectivity.parquet_loader import load_parquet_to_models + + +def _write_parquet(path, columns: dict[str, list[str]]) -> None: + table = pa.table({name: pa.array(values) for name, values in columns.items()}) + pq.write_table(table, path) + + +def test_load_parquet_to_models_happy_path_dataitem(tmp_path): + parquet_path = tmp_path / "dataitems.parquet" + _write_parquet( + parquet_path, + { + "id": ["d1", "d2"], + "name": ["item-1", "item-2"], + "project_id": ["p1", "p1"], + }, + ) + + instances, report = load_parquet_to_models( + "connectivity_schema.yaml", + "DataItem", + str(parquet_path), + ) + + assert len(instances) == 2 + assert [item.id for item in instances] == ["d1", "d2"] + assert [item.project_id for item in instances] == ["p1", "p1"] + assert report["mapping"]["id"] == "id" + assert report["mapping"]["project_id"] == "project_id" + assert report["counts"]["rows"] == 2 + assert report["counts"]["instances"] == 2 + assert report["counts"]["errors"] == 0 + + +def test_load_parquet_to_models_reports_missing_required_slot(tmp_path): + 
parquet_path = tmp_path / "missing_project_id.parquet" + _write_parquet( + parquet_path, + { + "id": ["d1"], + "name": ["item-1"], + }, + ) + + instances, report = load_parquet_to_models( + "connectivity_schema.yaml", + "DataItem", + str(parquet_path), + ) + + assert instances == [] + assert report["counts"]["errors"] == 1 + assert any("project_id" in err["message"] for err in report["errors"]) diff --git a/tests/test_projection_schema.py b/tests/test_projection_schema.py index 899d3ba..53c0fa3 100644 --- a/tests/test_projection_schema.py +++ b/tests/test_projection_schema.py @@ -1,11 +1,7 @@ import pytest from pydantic import ValidationError -import connects_common_connectivity as ccc - - -def test_laterality_enum(): - models = ccc.generate_pydantic_models() +def test_laterality_enum(models): Laterality = models["Laterality"] assert Laterality.IPSILATERAL.name == "IPSILATERAL" assert Laterality.CONTRALATERAL.name == "CONTRALATERAL" @@ -13,8 +9,7 @@ def test_laterality_enum(): assert Laterality.UNKNOWN.name == "UNKNOWN" -def test_projection_measurement_matrix_laterality(): - models = ccc.generate_pydantic_models() +def test_projection_measurement_matrix_laterality(models): PMM = models["ProjectionMeasurementMatrix"] Laterality = models["Laterality"] Modality = models["Modality"] @@ -32,8 +27,7 @@ def test_projection_measurement_matrix_laterality(): modality=Modality.MORPHOLOGY, laterality="NOT_VALID") -def test_region_coverage_on_pmm(): - models = ccc.generate_pydantic_models() +def test_region_coverage_on_pmm(models): PMM = models["ProjectionMeasurementMatrix"] Laterality = models["Laterality"] Modality = models["Modality"] diff --git a/tests/test_public_api.py b/tests/test_public_api.py new file mode 100644 index 0000000..f139261 --- /dev/null +++ b/tests/test_public_api.py @@ -0,0 +1,50 @@ +"""Lock the curated public surface of ``connects_common_connectivity.io``. + +The public API is whatever ``__all__`` says — nothing more, nothing less. 
+""" + +from __future__ import annotations + +import importlib + +import connects_common_connectivity.io as io_mod + + +EXPECTED = { + "get_settings", + "Settings", + "table_path", + "write_models", + "write_projection_matrix", + "WriteResult", + "WRITABLE_CLASSES", +} + + +def test_all_exact_set(): + assert set(io_mod.__all__) == EXPECTED + + +def test_all_resolves_to_non_none_objects(): + for name in io_mod.__all__: + obj = getattr(io_mod, name) + assert obj is not None, f"io.{name} resolved to None" + + +def test_no_private_names_in_all(): + for name in io_mod.__all__: + assert not name.startswith("_"), f"private name {name!r} in __all__" + + +def test_each_name_imports_cleanly(): + mod = importlib.reload(io_mod) + for name in EXPECTED: + assert hasattr(mod, name), f"io.{name} missing" + + +def test_internal_modules_not_re_exported(): + # arrow_utils / write_utils / write_spec / writers are accessible as + # submodules (they're real modules) but their names must not leak into + # io.__all__. + forbidden = {"arrow_utils", "write_utils", "write_spec", "writers"} + assert forbidden.isdisjoint(set(io_mod.__all__)) diff --git a/tests/test_write_relocation.py b/tests/test_write_relocation.py new file mode 100644 index 0000000..42d7779 --- /dev/null +++ b/tests/test_write_relocation.py @@ -0,0 +1,101 @@ +"""No-shim regression test. + +The deprecated shim modules at the package root +(``connects_common_connectivity.arrow_utils`` and ``connects_common_connectivity.write_utils``) +were removed after the W6 notebook migration. This test pins that contract: + +1. The shim modules no longer exist on disk or as importable modules. +2. No source file (package, tests, notebooks, scripts) imports from the old paths. +3. The canonical IO modules still expose the public names. 
+""" + +from __future__ import annotations + +import json +import re +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[1] +SEARCH_ROOTS = ["src", "tests", "code", "scripts", "planning"] + +EXCLUDED_DIRS = {".venv", ".git", ".pytest_cache", ".ruff_cache", + ".ipynb_checkpoints", ".Trash-0", "node_modules"} + +SHIM_IMPORT_PATTERN = re.compile( + r"connects_common_connectivity\.(?:arrow_utils|write_utils)\b" +) + + +def test_shim_modules_deleted(): + pkg = REPO_ROOT / "src" / "connects_common_connectivity" + assert not (pkg / "arrow_utils.py").exists(), "shim arrow_utils.py must be deleted" + assert not (pkg / "write_utils.py").exists(), "shim write_utils.py must be deleted" + + +def test_shim_modules_not_importable(): + with pytest.raises(ModuleNotFoundError): + import connects_common_connectivity.arrow_utils # noqa: F401 + with pytest.raises(ModuleNotFoundError): + import connects_common_connectivity.write_utils # noqa: F401 + + +def _iter_source_files(): + for root in SEARCH_ROOTS: + base = REPO_ROOT / root + if not base.exists(): + continue + for path in base.rglob("*"): + if path.is_file() and path.suffix in {".py", ".ipynb"}: + if not any(part in EXCLUDED_DIRS for part in path.parts): + yield path + + +def test_no_source_references_shim_paths(): + offenders: list[tuple[Path, list[str]]] = [] + for path in _iter_source_files(): + # Skip this test file itself (it intentionally mentions the names). + if path.resolve() == Path(__file__).resolve(): + continue + try: + text = path.read_text(encoding="utf-8", errors="ignore") + except OSError: + continue + if path.suffix == ".ipynb": + # Search only code-cell source to avoid false positives in markdown prose. 
+ try: + nb = json.loads(text) + except json.JSONDecodeError: + continue + lines: list[str] = [] + for cell in nb.get("cells", []): + if cell.get("cell_type") == "code": + src = cell.get("source", "") + if isinstance(src, list): + src = "".join(src) + lines.append(src) + haystack = "\n".join(lines) + else: + haystack = text + hits = [m.group(0) for m in SHIM_IMPORT_PATTERN.finditer(haystack)] + if hits: + offenders.append((path.relative_to(REPO_ROOT), hits)) + assert not offenders, ( + "Old shim paths still referenced:\n" + + "\n".join(f" {p}: {set(hs)}" for p, hs in offenders) + ) + + +def test_public_names_from_io_paths(): + from connects_common_connectivity.io.arrow_utils import ( # noqa: F401 + attach_linkml_metadata, + build_arrow_schema, + build_cell_feature_matrix_schema, + models_to_table, + ) + from connects_common_connectivity.io.write_utils import ( # noqa: F401 + append_new_dataitems, + populate_region_coverage, + walk_ancestors, + ) diff --git a/tests/test_write_spec.py b/tests/test_write_spec.py new file mode 100644 index 0000000..e7219e9 --- /dev/null +++ b/tests/test_write_spec.py @@ -0,0 +1,55 @@ +"""Drift tests for the WriteSpec registry. + +These tests guard against the registry getting out of sync with +``models.py`` — e.g., a renamed field silently breaking a writer's +predicate. 
+""" + +from __future__ import annotations + +import pytest + +from connects_common_connectivity import models as models_module +from connects_common_connectivity.io.write_spec import REGISTRY, WriteSpec, get_spec + + +def test_registry_contains_seed_entries(): + seed = {"DataSet", "DataItem", "DataItemDataSetAssociation"} + assert seed.issubset(set(REGISTRY)) + + +@pytest.mark.parametrize("key", list(REGISTRY)) +def test_registry_key_matches_model_cls(key): + spec = REGISTRY[key] + cls = getattr(models_module, key, None) + assert cls is not None, f"models.py has no class named {key!r}" + assert spec.model_cls is cls, ( + f"REGISTRY[{key!r}].model_cls is {spec.model_cls!r}, expected {cls!r}" + ) + assert spec.model_cls.__name__ == key + + +@pytest.mark.parametrize("key", list(REGISTRY)) +def test_spec_columns_exist_on_model(key): + spec: WriteSpec = REGISTRY[key] + fields = set(spec.model_cls.model_fields) + for col in spec.scope_columns + spec.partition_by + spec.required_for_write: + assert col in fields, ( + f"{spec.model_cls.__name__}: column {col!r} is not a field " + f"(have: {sorted(fields)})" + ) + + +def test_get_spec_accepts_class_and_instance(): + ds_cls = REGISTRY["DataSet"].model_cls + instance = ds_cls(id="d1", name="example", project_id="p1") + assert get_spec(ds_cls) is REGISTRY["DataSet"] + assert get_spec(instance) is REGISTRY["DataSet"] + + +def test_get_spec_unknown_class_raises(): + class NotRegistered: + pass + + with pytest.raises(KeyError): + get_spec(NotRegistered) diff --git a/tests/test_write_utils.py b/tests/test_write_utils.py index 075e56b..29625e4 100644 --- a/tests/test_write_utils.py +++ b/tests/test_write_utils.py @@ -2,7 +2,7 @@ import pyarrow as pa import pytest -from connects_common_connectivity.write_utils import append_new_dataitems +from connects_common_connectivity.io.write_utils import append_new_dataitems def _make_table(ids: list[str], project_id: str = "proj_a") -> pa.Table: @@ -51,7 +51,7 @@ def 
test_idempotent_partial_rerun(tmp_path): path = str(tmp_path / "dataitem") append_new_dataitems(path, _make_table(["a", "b"]), project_id="proj_a") n = append_new_dataitems(path, _make_table(["a", "b", "c"]), project_id="proj_a") - assert n == 1 # only "c" is new + assert n == 1, f"expected only 'c' to be new; appended {n} rows" # --------------------------------------------------------------------------- diff --git a/tests/test_write_validation.py b/tests/test_write_validation.py new file mode 100644 index 0000000..74a6727 --- /dev/null +++ b/tests/test_write_validation.py @@ -0,0 +1,145 @@ +"""Tests for write-time validation (auto-derived strict submodels).""" + +from __future__ import annotations + +import pytest + +from connects_common_connectivity.io.write_spec import REGISTRY, WriteSpec +from connects_common_connectivity.io.write_validation import ( + strict_model_for, + validate_for_write, +) +from connects_common_connectivity.models import ( + CellFeatureDefinition, + Cluster, + DataSet, +) + + +# --------------------------------------------------------------------------- +# strict_model_for +# --------------------------------------------------------------------------- + + +def test_strict_model_subclasses_parent_without_mutating_it(): + before = dict(Cluster.model_fields) + strict = strict_model_for(Cluster) + after = dict(Cluster.model_fields) + + assert before.keys() == after.keys() + for k in before: + assert before[k].is_required() == after[k].is_required(), ( + f"Cluster.model_fields[{k!r}] was mutated" + ) + assert issubclass(strict, Cluster) + assert strict is not Cluster + + +def test_strict_model_for_is_cached(): + a = strict_model_for(Cluster) + b = strict_model_for(Cluster) + assert a is b + + +def test_strict_model_returns_parent_when_no_required_for_write(): + # DataSet has empty required_for_write; the strict subclass is just the parent. 
+ assert REGISTRY["DataSet"].required_for_write == [] + assert strict_model_for(DataSet) is DataSet + + +def test_strict_model_flips_optional_field_to_required(): + strict = strict_model_for(Cluster) + # On the parent, hierarchy_id is optional. + assert not Cluster.model_fields["hierarchy_id"].is_required() + # On the strict subclass, hierarchy_id is required. + assert strict.model_fields["hierarchy_id"].is_required() + + +# --------------------------------------------------------------------------- +# validate_for_write — failure path +# --------------------------------------------------------------------------- + + +def test_missing_required_for_write_slot_raises_before_io(): + spec = REGISTRY["Cluster"] + bad = Cluster(id="c1") # hierarchy_id missing + with pytest.raises(ValueError, match="hierarchy_id"): + validate_for_write(bad, spec) + + +def test_missing_slot_names_class_in_error(): + spec = REGISTRY["CellFeatureDefinition"] + bad = CellFeatureDefinition(id="f1", project_id="p1") # feature_set_id missing + with pytest.raises(ValueError, match="CellFeatureDefinition"): + validate_for_write(bad, spec) + + +# --------------------------------------------------------------------------- +# validate_for_write — happy path +# --------------------------------------------------------------------------- + + +def test_valid_model_passes_and_round_trips_field_by_field(): + spec = REGISTRY["Cluster"] + good = Cluster(id="c1", hierarchy_id="h1", level=2) + result = validate_for_write(good, spec) + # Field-by-field equality with the input. 
+ for name in Cluster.model_fields: + assert getattr(result, name) == getattr(good, name) + + +def test_validate_for_write_accepts_a_list(): + spec = REGISTRY["Cluster"] + items = [ + Cluster(id="c1", hierarchy_id="h1"), + Cluster(id="c2", hierarchy_id="h1"), + ] + result = validate_for_write(items, spec) + assert isinstance(result, list) + assert [m.id for m in result] == ["c1", "c2"] + + +def test_validate_for_write_list_reports_failing_row(): + spec = REGISTRY["Cluster"] + items = [ + Cluster(id="c1", hierarchy_id="h1"), + Cluster(id="c2"), # missing hierarchy_id + ] + with pytest.raises(ValueError, match="hierarchy_id") as ei: + validate_for_write(items, spec) + assert "c2" in str(ei.value), f"error should name failing row; got: {ei.value}" + + +def test_validate_for_write_passthrough_when_required_is_empty(): + spec = REGISTRY["DataSet"] + ds = DataSet(id="d1", name="d", project_id="p1") + result = validate_for_write(ds, spec) + # No revalidation needed; identity-equal. + assert result is ds + + +def test_validate_for_write_rejects_class_mismatch(): + spec = REGISTRY["Cluster"] + not_a_cluster = DataSet(id="d1", name="d", project_id="p1") + with pytest.raises(TypeError, match="Cluster"): + validate_for_write(not_a_cluster, spec) + + +# --------------------------------------------------------------------------- +# Wired into write_models +# --------------------------------------------------------------------------- + + +def test_write_models_calls_validation_before_io(tmp_path): + from connects_common_connectivity.config import Settings + from connects_common_connectivity.io.writers import write_models + + settings = Settings(output_root=tmp_path) + bad = Cluster(id="c1") # hierarchy_id missing + with pytest.raises(ValueError, match="hierarchy_id"): + write_models(bad, settings=settings) + # No table directory created — IO never happened. 
+ assert not (tmp_path / "cluster").exists(), ( + "validation failure should short-circuit before any IO; " + "cluster/ directory was created anyway" + ) diff --git a/tests/test_writers.py b/tests/test_writers.py new file mode 100644 index 0000000..2f94a30 --- /dev/null +++ b/tests/test_writers.py @@ -0,0 +1,429 @@ +"""Tests for the IO writer dispatch core. + +Covers: + +* The patchseq regression — overlapping ``project_id`` writes do not wipe + each other (the original motivating bug). +* Idempotency, multi-scope-group dispatch, predicate construction. +* Append-new-by-id semantics. +* A per-class round-trip smoke test for every entry in ``WRITABLE_CLASSES``. +* ``write_projection_matrix`` enrichment + write. +""" + +from __future__ import annotations + +import numpy as np +import polars as pl +import pyarrow as pa +import pytest +from pydantic import BaseModel + +from connects_common_connectivity.config import Settings +from connects_common_connectivity.io.write_spec import REGISTRY +from connects_common_connectivity.io.writers import ( + WRITABLE_CLASSES, + WriteResult, + _build_predicate, + _group_by_scope, + write_models, + write_projection_matrix, +) +from connects_common_connectivity.models import ( + AlgorithmRun, + CellFeatureDefinition, + CellFeatureMatrix, + CellFeatureSet, + CellToClusterMapping, + Cluster, + ClusterHierarchy, + ClusterMembership, + DataItem, + DataItemDataSetAssociation, + DataSet, + HierarchyCategory, + Laterality, + MappingSet, + Modality, + ProjectionMeasurementMatrix, + ProjectionMeasurementType, + Unit, +) + +# --------------------------------------------------------------------------- +# Predicate construction +# --------------------------------------------------------------------------- + + +def test_build_predicate_format(): + assert ( + _build_predicate(["project_id"], ["minnie65"]) + == "project_id = 'minnie65'" + ) + assert ( + _build_predicate(["project_id", "id"], ["minnie65", "ds_a"]) + == "project_id = 'minnie65' AND id 
= 'ds_a'" + ) + + +@pytest.mark.parametrize( + "value,expected_literal", + [ + ("O'Hara", "'O''Hara'"), + ("", "''"), + ("a\\b", "'a\\b'"), + ("café", "'café'"), + ], +) +def test_build_predicate_escapes(value, expected_literal): + assert _build_predicate(["name"], [value]) == f"name = {expected_literal}" + + +# --------------------------------------------------------------------------- +# _group_by_scope +# --------------------------------------------------------------------------- + + +def test_group_by_scope_preserves_first_appearance_order(): + table = pa.table( + { + "project_id": ["p", "p", "p"], + "id": ["b", "a", "b"], + "value": [1, 2, 3], + } + ) + groups = _group_by_scope(table, ["project_id", "id"]) + keys = [k for k, _ in groups] + assert keys == [("p", "b"), ("p", "a")] + # The first 'b' group should hold rows 0 and 2 (preserved order). + first_sub = groups[0][1] + assert first_sub.column("value").to_pylist() == [1, 3] + + +# --------------------------------------------------------------------------- +# Patchseq regression: the headline test +# --------------------------------------------------------------------------- + + +def test_patchseq_regression_two_datasets_same_project(settings, read_delta): + """Two DataSet rows with the same ``project_id`` but different ``id`` must coexist. + + Before W2/W3 the notebooks predicated on ``project_id`` only, so a + second write wiped the first. The new ``scope_columns=[project_id, id]`` + keeps each row independent. + """ + ds_a = DataSet(id="visp_exc_patchseq", name="exc", project_id="visp_patchseq") + ds_b = DataSet(id="visp_inh_patchseq", name="inh", project_id="visp_patchseq") + write_models(ds_a, settings=settings) + write_models(ds_b, settings=settings) + + rows = read_delta(settings.output_root / "dataset") + ids = sorted(rows["id"].to_list()) + assert ids == ["visp_exc_patchseq", "visp_inh_patchseq"], ( + f"patchseq regression: second write wiped first. 
" + f"Expected both datasets, got {ids}" + ) + + +def test_overwrite_scoped_is_idempotent(settings, read_delta): + ds = DataSet(id="d1", name="example", project_id="p1") + write_models(ds, settings=settings) + write_models(ds, settings=settings) + rows = read_delta(settings.output_root / "dataset") + assert rows.shape[0] == 1, f"idempotent rewrite produced {rows.shape[0]} rows" + assert rows["id"].to_list() == ["d1"], "row identity changed across rewrites" + assert rows["name"].to_list() == ["example"], "row content drifted across rewrites" + + +def test_dry_run_does_not_write(tmp_path): + settings = Settings(output_root=tmp_path, dry_run=True) + ds = DataSet(id="d1", name="d", project_id="p1") + + result = write_models(ds, settings=settings) + + assert result.rows_written == 0, "dry_run must report 0 rows written" + assert not (tmp_path / "dataset").exists(), "dry_run must not create tables" + + +def test_multi_scope_group_dispatch_yields_one_predicate_per_group(settings, read_delta): + rows_in = [ + DataSet(id="a", name="A", project_id="p1"), + DataSet(id="b", name="B", project_id="p1"), + ] + result = write_models(rows_in, settings=settings) + assert isinstance(result, WriteResult) + assert len(result.predicates) == 2 + assert result.rows_written == 2 + # Both end up in the table. 
+ rows = read_delta(settings.output_root / "dataset") + assert sorted(rows["id"].to_list()) == ["a", "b"] + + +# --------------------------------------------------------------------------- +# append_new_by_id semantics (DataItem) +# --------------------------------------------------------------------------- + + +def test_append_new_by_id_only_appends_unseen(settings, read_delta): + items_first = [ + DataItem(id="cell_1", name="cell_1", project_id="p1"), + DataItem(id="cell_2", name="cell_2", project_id="p1"), + ] + r1 = write_models(items_first, settings=settings) + assert r1.mode == "append_new_by_id" + assert r1.predicates == () + assert r1.rows_written == 2 + + items_second = [ + DataItem(id="cell_2", name="cell_2", project_id="p1"), # already there + DataItem(id="cell_3", name="cell_3", project_id="p1"), # new + ] + r2 = write_models(items_second, settings=settings) + assert r2.rows_written == 1 + + rows = read_delta(settings.output_root / "dataitem") + assert sorted(rows["id"].to_list()) == ["cell_1", "cell_2", "cell_3"] + + +def test_append_new_by_id_rejects_mixed_project_ids(settings): + bad = [ + DataItem(id="x", name="x", project_id="p1"), + DataItem(id="y", name="y", project_id="p2"), + ] + with pytest.raises(ValueError, match="single project_id"): + write_models(bad, settings=settings) + + +# --------------------------------------------------------------------------- +# Per-class smoke (every entry in WRITABLE_CLASSES exercised) +# --------------------------------------------------------------------------- + + +INSTANCE_FACTORIES = { + DataSet: lambda: DataSet(id="ds1", name="ds", project_id="p1"), + DataItem: lambda: DataItem(id="di1", name="di1", project_id="p1"), + DataItemDataSetAssociation: lambda: DataItemDataSetAssociation( + dataitem_id="di1", dataset_id="ds1", project_id="p1" + ), + Cluster: lambda: Cluster(id="c1", hierarchy_id="h1", level=0), + ClusterHierarchy: lambda: ClusterHierarchy(id="h1", root="c1", clusters=["c1"]), + 
ClusterMembership: lambda: ClusterMembership( + item="cell_1", cluster="c1", hierarchy_id="h1", project_id="p1" + ), + MappingSet: lambda: MappingSet(id="m1", project_id="p1", name="m", method_name="dummy"), + CellToClusterMapping: lambda: CellToClusterMapping( + id="ctc1", + project_id="p1", + mapping_set="m1", + source_cell="cell_1", + target_cluster="c1", + ), + CellFeatureSet: lambda: CellFeatureSet(id="fs1", project_id="p1"), + CellFeatureDefinition: lambda: CellFeatureDefinition( + id="feat_a", + project_id="p1", + feature_set_id="fs1", + data_type="= 1 + + +# --------------------------------------------------------------------------- +# write_projection_matrix +# --------------------------------------------------------------------------- + + +def test_write_projection_matrix_enriches_and_does_not_mutate_input(settings, read_delta): + pmm = ProjectionMeasurementMatrix( + id="pmm_test", + measurement_type=ProjectionMeasurementType.MICRONS_OF_AXON, + modality=Modality.MORPHOLOGY, + laterality=Laterality.IPSILATERAL, + unit=Unit.MICRONS_LENGTH, + data_item_index=["c1", "c2"], + region_index=["VISp", "ACA", "MOB"], + values="file:///tmp/pmm.delta", + ) + matrix = np.array( + [ + [1.0, 0.0, 0.0], + [0.0, 0.0, 2.0], + ] + ) + assert pmm.region_coverage in (None, []) + + result = write_projection_matrix(pmm, matrix, settings=settings) + assert result.class_name == "ProjectionMeasurementMatrix" + assert pmm.region_coverage in (None, []) # input not mutated + + rows = read_delta(settings.output_root / "projectionmeasurementmatrix") + coverage = rows.filter(pl.col("id") == "pmm_test")["region_coverage"].to_list()[0] + assert list(coverage) == ["VISp", "MOB"] + + +# --------------------------------------------------------------------------- +# Input validation +# --------------------------------------------------------------------------- + + +def test_write_models_rejects_empty(settings): + with pytest.raises(ValueError, match="empty"): + write_models([], 
settings=settings) + + +def test_write_models_rejects_heterogeneous(settings): + with pytest.raises(TypeError, match="homogeneous"): + write_models( + [ + DataSet(id="d1", name="d", project_id="p1"), + DataItem(id="x", name="x", project_id="p1"), + ], + settings=settings, + ) + + +def test_write_models_rejects_unregistered_class(settings): + class NotInRegistry: + pass + + with pytest.raises(TypeError, match="pydantic model or iterable"): + write_models(NotInRegistry(), settings=settings) + + +def test_write_models_rejects_unregistered_pydantic_model(settings): + class UnregisteredModel(BaseModel): + id: str + + with pytest.raises(KeyError, match="UnregisteredModel"): + write_models(UnregisteredModel(id="u1"), settings=settings) + + +# --------------------------------------------------------------------------- +# Per-call output_root override +# --------------------------------------------------------------------------- + + +def test_write_models_output_root_override_writes_to_given_path(tmp_path): + """Passing output_root= writes under that root, bypassing get_settings().""" + alt_root = tmp_path / "alt_dataset" + ds = DataSet(id="d_alt", name="alt", project_id="p_alt") + + result = write_models(ds, output_root=alt_root) + + assert result.path == alt_root / "dataset" + rows = pl.read_delta(str(alt_root / "dataset")).filter( + pl.col("id") == "d_alt" + ) + assert rows.shape[0] == 1 + + +def test_write_models_output_root_accepts_string(tmp_path): + """str and Path are both accepted for output_root.""" + alt_root = tmp_path / "string_root" + ds = DataSet(id="d_str", name="s", project_id="p_str") + + result = write_models(ds, output_root=str(alt_root)) + + assert result.path == alt_root / "dataset" + + +def test_write_models_rejects_both_settings_and_output_root(settings, tmp_path): + """Passing both settings= and output_root= raises (no precedence to memorize).""" + ds = DataSet(id="d_x", name="x", project_id="p_x") + with pytest.raises(TypeError, match="either 
settings= or output_root="): + write_models(ds, settings=settings, output_root=tmp_path / "other") + + +def test_write_projection_matrix_output_root_override(tmp_path): + """write_projection_matrix forwards output_root through write_models.""" + alt_root = tmp_path / "pmm_alt" + pmm = ProjectionMeasurementMatrix( + id="pmm_alt", + measurement_type=ProjectionMeasurementType.MICRONS_OF_AXON, + modality=Modality.MORPHOLOGY, + laterality=Laterality.IPSILATERAL, + unit=Unit.MICRONS_LENGTH, + data_item_index=["c1", "c2"], + region_index=["r1", "r2"], + values="file:///tmp/pmm_alt.delta", + ) + matrix = np.array([[1.0, 0.0], [0.0, 2.0]]) + + result = write_projection_matrix(pmm, matrix, output_root=alt_root) + + assert result.path == alt_root / "projectionmeasurementmatrix" + + +def test_write_projection_matrix_rejects_both_settings_and_output_root( + settings, tmp_path +): + pmm = ProjectionMeasurementMatrix( + id="pmm_x", + measurement_type=ProjectionMeasurementType.MICRONS_OF_AXON, + modality=Modality.MORPHOLOGY, + laterality=Laterality.IPSILATERAL, + unit=Unit.MICRONS_LENGTH, + data_item_index=["c1"], + region_index=["r1"], + values="file:///tmp/pmm_x.delta", + ) + matrix = np.array([[1.0]]) + with pytest.raises(TypeError, match="either settings= or output_root="): + write_projection_matrix( + pmm, matrix, settings=settings, output_root=tmp_path / "other" + ) diff --git a/uv.lock b/uv.lock index 0f39a21..f3926a0 100644 --- a/uv.lock +++ b/uv.lock @@ -9,7 +9,7 @@ resolution-markers = [ ] [options] -exclude-newer = "2026-04-15T22:36:33.389267Z" +exclude-newer = "0001-01-01T00:00:00Z" # This has no effect and is included for backwards compatibility when using relative exclude-newer values. exclude-newer-span = "P7D" [[package]] @@ -505,7 +505,7 @@ wheels = [ [[package]] name = "connects-common-connectivity" -version = "0.1.0" +version = "0.2.0" source = { editable = "." } dependencies = [ { name = "caveclient" },