diff --git a/CHANGELOG.md b/CHANGELOG.md index ffc26d3b..d4e61b70 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +## [0.18.0] - 2026-06-15 + +### Added + +- Entries are now searchable by the **human-readable name** of the Benchling objects they reference. `entry.json` gains a `links` array — one `{type, id, name, slug}` entry per referenced object — promoted into the package metadata so `links.name` queries match real names (e.g. find every experiment that references `QB-2743.1`). `name` is resolved from the Benchling API (best-effort `get_by_id`; `null` when the app lacks registry access or the type is unsupported); `slug` is a lossy token parsed from the `webURL` for display/debugging only and is never used as a name + +### Changed + +- `references.json` renamed to **`links.json`** and reduced to raw discovery facts only — `id`/`type`/`web_url` per link, plus `entities` and `results_tables`. Derived classifications (`category`/`fetchable`/`eventable`/`disposition`) are no longer persisted; they are recomputed from `type` in code at runtime, so the raw archive stays reprocessable and a future classification change needs no re-fetch. `schema_version` bumped to `2` + ## [0.17.2] - 2026-04-15 ### Fixed diff --git a/docker/app-manifest.yaml b/docker/app-manifest.yaml index daf47db8..9459ad3d 100644 --- a/docker/app-manifest.yaml +++ b/docker/app-manifest.yaml @@ -2,7 +2,7 @@ manifestVersion: 1 info: name: nightly-quilttest-com description: Packaging Benchling Notebooks as Quilt packages - version: 0.17.2 + version: 0.18.0 features: - name: Quilt Connector id: quilt_entry diff --git a/docker/pyproject.toml b/docker/pyproject.toml index 1a7d3cd4..f2c8b894 100644 --- a/docker/pyproject.toml +++ b/docker/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "benchling-quilt-integration" -version = "0.17.2" +version = "0.18.0" description = "Benchling-Quilt Integration Webhook Service" license = {text = "Apache-2.0"} authors = [ diff --git a/docker/src/entry_packager.py b/docker/src/entry_packager.py index 3fc18d83..73eb683e 100644 --- a/docker/src/entry_packager.py +++ b/docker/src/entry_packager.py @@ -20,11 +20,29 @@ from .auth import RoleManager from .config import get_config +from .entry_references import link_metadata, summarize_references from .payload import Payload from .retry_utils import LAMBDA_INVOKE_RETRY, REST_API_RETRY logger = structlog.get_logger(__name__) +# EntryLink ``type`` -> Benchling SDK service attribute used to resolve the +# authoritative human-readable name via GET-by-id. Only types with a stable +# get_by_id are listed (entities first, then inventory and entries -- the types +# John flagged on the 2026-06-15 call); any other type keeps ``name=None`` and +# relies on its slug. Name resolution requires the app to be a registry/project +# collaborator, so every lookup is best-effort (see _enrich_link_names). +LINK_TYPE_TO_SERVICE: Dict[str, str] = { + "custom_entity": "custom_entities", + "dna_sequence": "dna_sequences", + "aa_sequence": "aa_sequences", + "container": "containers", + "box": "boxes", + "plate": "plates", + "location": "locations", + "entry": "entries", +} + class DateTimeEncoder(json.JSONEncoder): """Custom JSON encoder that converts datetime objects to ISO format strings.""" @@ -629,6 +647,13 @@ def _create_metadata_files( if canvas_id is not None: entry_json["canvas_id"] = canvas_id + # links - curated, searchable view of the objects this entry references. + # Promoted into entry.json (the package's metadata_uri) so `links.name` is + # queryable. Names are resolved best-effort against the Benchling API. + links = link_metadata(entry_data) + self._enrich_link_names(links) + entry_json["links"] = links + # input.json - Processing metadata input_json = { "source": "benchling_webhook", @@ -684,13 +709,20 @@ def _create_metadata_files( """ for file_info in uploaded_files: - if file_info["filename"] not in ["entry.json", "entry_data.json", "input.json", "README.md"]: + if file_info["filename"] not in [ + "entry.json", + "entry_data.json", + "input.json", + "links.json", + "README.md", + ]: readme_content += f"- `{file_info['filename']}` ({file_info['size']} bytes)\n" readme_content += """ ## Metadata Files - `entry.json`: Key entry metadata (display_id, name, creator, authors, timestamps) - `entry_data.json`: Complete entry data from Benchling API +- `links.json`: Raw Benchling objects this entry links to (entities, inventory, tables); the searchable, name-enriched summary is the `links` field of `entry.json` - `input.json`: Export processing metadata - `README.md`: This documentation file @@ -699,13 +731,57 @@ def _create_metadata_files( For questions about the data, refer to the original Benchling entry. """ + # links.json - raw discovery of the entities/resources this entry points at, + # from the entry's note links and fields (no Benchling records are fetched + # here; inferences are not persisted). The searchable view is entry.json.links. + links_json = summarize_references(entry_data) + return { "entry.json": entry_json, "entry_data.json": entry_data, + "links.json": links_json, "input.json": input_json, "README.md": readme_content, } + def _enrich_link_names(self, links: list[Dict[str, Any]]) -> None: + """Fill each link's ``name`` with its authoritative Benchling display name. + + Mutates ``links`` in place (the curated entries from ``link_metadata``). + Best-effort and never raises: a missing client, an unsupported type, or a + forbidden/failed GET-by-id leaves ``name`` as ``None`` -- the slug stays for + eyeballing, and the slug is never promoted into ``name``. Name resolution + needs the app to be a registry/project collaborator (see AGENTS / setup). + """ + if not self.benchling: + return + for link in links: + service_attr = LINK_TYPE_TO_SERVICE.get(link.get("type") or "") + link_id = link.get("id") + if not service_attr or not link_id: + continue + service = getattr(self.benchling, service_attr, None) + if service is None: + continue + record = None + try: + # ``returning`` trims the payload to just the name where supported. + record = service.get_by_id(link_id, returning=["name"]) + except Exception: + try: + record = service.get_by_id(link_id) + except Exception as exc: + self.logger.debug( + "Link name lookup failed", + link_id=link_id, + link_type=link.get("type"), + error=str(exc), + ) + continue + record_name = getattr(record, "name", None) + if isinstance(record_name, str) and record_name: + link["name"] = record_name + def _load_existing_canvas_id_from_entry_json(self, s3_client: Any, package_name: str) -> Optional[str]: """Read canvas_id from a previously-written ``entry.json``, if any. diff --git a/docker/src/entry_references.py b/docker/src/entry_references.py new file mode 100644 index 00000000..56768c8a --- /dev/null +++ b/docker/src/entry_references.py @@ -0,0 +1,423 @@ +"""Extract typed references out of a Benchling entry's structured data. + +A Benchling entry points at other Benchling objects in three places: + +1. Note links -- ``days[].notes[].links[]``, each carrying a ``type`` plus an + optional ``id`` and ``webURL`` (external ``link`` has no ``id``; some types + such as ``location`` have no ``webURL``). ``type`` is a closed enum + (``EntryLink.type`` in the Benchling OpenAPI spec) of 18 tokens spanning + entities, inventory, references, dashboards, and plain external hyperlinks -- + see :data:`LINK_TYPE_CATEGORY`. +2. Entity-link fields -- ``fields[name]`` whose ``type`` mentions ``entity``, + carrying one or more entity IDs directly in ``value``. +3. Results tables -- ``results_table`` notes carrying an ``assayResultSchemaId`` + (the discovery site for assay results, issues #68/#69). + +This module is pure: it operates on the entry dict already fetched by +``EntryPackager`` and makes no Benchling API calls. Resolving each reference to a +full record (``get_by_id`` / ``bulk_get``) is the caller's job; this layer only +discovers and classifies what an entry points at. + +Shared discovery layer for entity packaging (#143), the full entry-linked +resource map (#389), and assay results (#68/#69). +""" + +from dataclasses import dataclass +from enum import Enum +from typing import Any, Iterator, Optional + + +class LinkCategory(str, Enum): + """How a note-link type relates to packaging (per #389). + + Two orthogonal axes drive what we do with a referenced object (see + FETCHABLE_CATEGORIES / EVENTABLE_CATEGORIES and CATEGORY_DISPOSITION): + - fetchable: a record can be retrieved via GET-by-id. + - eventable: it emits its own create/update webhooks, so it can arrive + independent of an entry (a candidate for its own package, #143 behavior 2). + """ + + ENTITY = "entity" # registry entity; GET-by-id + v2.entity.registered event + INVENTORY = "inventory" # packageable via GET-by-id; no webhook events + REFERENCE = "reference" # packageable; has its own create/update events + METADATA = "metadata" # GET works but low value to package as a record + NOT_PACKAGEABLE = "not_packageable" # no read API (dashboards, protocol) + UNCERTAIN = "uncertain" # endpoint depends on tenant API version; verify first + EXTERNAL = "external" # plain http(s) hyperlink ("link"); no Benchling ID + UNKNOWN = "unknown" # type not in the known enum -- surfaced, not dropped + + +# EntryLink.type -> category. Covers all 18 enum tokens from test/openapi.yaml. +# Unknown/future tokens fall through to LinkCategory.UNKNOWN via classify_link_type. +LINK_TYPE_CATEGORY: dict[str, LinkCategory] = { + # entities (eventable via v2.entity.registered) + "custom_entity": LinkCategory.ENTITY, + "dna_sequence": LinkCategory.ENTITY, + "aa_sequence": LinkCategory.ENTITY, + "batch": LinkCategory.ENTITY, + # inventory (packageable on reference, no events) + "container": LinkCategory.INVENTORY, + "box": LinkCategory.INVENTORY, + "plate": LinkCategory.INVENTORY, + "location": LinkCategory.INVENTORY, + # references (packageable, own events) + "entry": LinkCategory.REFERENCE, + "request": LinkCategory.REFERENCE, + "workflow": LinkCategory.REFERENCE, + # metadata-only pointers + "user": LinkCategory.METADATA, + "folder": LinkCategory.METADATA, + # no read API exists -- keep the webURL as a reference only + "sql_dashboard": LinkCategory.NOT_PACKAGEABLE, + "insights_dashboard": LinkCategory.NOT_PACKAGEABLE, + "protocol": LinkCategory.NOT_PACKAGEABLE, + # endpoint depends on tenant API version (v2-alpha) -- verify before relying + "stage_entry": LinkCategory.UNCERTAIN, + # plain external hyperlink (no Benchling id) + "link": LinkCategory.EXTERNAL, +} + +# Note: dna_oligo / rna_oligo / mixture / assay_run / assay_result / workflow_task +# are NOT EntryLink types -- they cannot appear as note links. They reach an entry +# via structured note parts / inventory tables, not links[]. + +# Categories whose resources can be fetched as a record via GET-by-id. +FETCHABLE_CATEGORIES = frozenset({LinkCategory.ENTITY, LinkCategory.INVENTORY, LinkCategory.REFERENCE}) + +# Categories that also emit their own create/update webhooks, so they can arrive +# independent of an entry (candidates for their own package, #143 behavior 2). +EVENTABLE_CATEGORIES = frozenset({LinkCategory.ENTITY, LinkCategory.REFERENCE}) + +# What to do with a referenced object when packaging the entry that links to it. +# "packageable" alone was ambiguous -- fetchable-as-a-record vs. worthy-of-its-own +# package. disposition is the actionable answer, derived from fetchable + eventable: +# nest_or_standalone -- entity: fetchable AND eventable. Nest inside the entry +# and/or package standalone -- the open #143 decision. +# nest -- fetchable, no events: only capturable via an entry. +# link -- fetchable primary object with its own events: reference +# its own package, don't embed a copy. +# pointer -- fetchable but low value: keep id + webURL only. +# verify -- endpoint depends on tenant API version / unknown type. +# skip -- nothing to fetch; the entry already keeps the webURL. +CATEGORY_DISPOSITION: dict[LinkCategory, str] = { + LinkCategory.ENTITY: "nest_or_standalone", + LinkCategory.INVENTORY: "nest", + LinkCategory.REFERENCE: "link", + LinkCategory.METADATA: "pointer", + LinkCategory.NOT_PACKAGEABLE: "skip", + LinkCategory.UNCERTAIN: "verify", + LinkCategory.EXTERNAL: "skip", + LinkCategory.UNKNOWN: "verify", +} + +# Linkable entity types (subset of LINK_TYPE_CATEGORY that are LinkCategory.ENTITY). +ENTITY_LINK_TYPES = frozenset(t for t, cat in LINK_TYPE_CATEGORY.items() if cat is LinkCategory.ENTITY) + +# Note ``type`` value that carries tabular assay results. Kept as a set for +# extensibility, but scoped to ``results_table`` only: that is the note type that +# carries an ``assayResultSchemaId`` (#68/#69). A generic ``table`` or a +# ``registration_table`` is a different mechanism and must not be swept in here. +RESULTS_TABLE_NOTE_TYPES = frozenset({"results_table"}) + + +@dataclass(frozen=True) +class LinkRef: + """A classified note link. ``id``/``web_url`` are absent for some types + (``link`` has no id; ``location`` has no webURL).""" + + type: str + category: LinkCategory + id: Optional[str] = None + web_url: Optional[str] = None + + @property + def is_fetchable(self) -> bool: + """Whether a record can be retrieved via GET-by-id.""" + return self.category in FETCHABLE_CATEGORIES + + @property + def is_eventable(self) -> bool: + """Whether it emits its own webhooks (can arrive independent of an entry).""" + return self.category in EVENTABLE_CATEGORIES + + @property + def disposition(self) -> str: + """How to treat this reference when packaging the entry (see CATEGORY_DISPOSITION).""" + return CATEGORY_DISPOSITION[self.category] + + +@dataclass(frozen=True) +class EntityReference: + """A reference to a Benchling entity discovered inside an entry. + + ``type`` is the discovery type as seen in the entry (e.g. ``custom_entity`` + from a note link, or ``entity_link`` from a field) -- not necessarily the + entity's own schema type. ``source`` records where the reference was found. + """ + + id: str + type: str + web_url: Optional[str] = None + source: str = "note_link" # "note_link" | "entity_field" + + +@dataclass(frozen=True) +class ResultsTableReference: + """A reference to an assay-results table embedded in an entry note.""" + + assay_result_schema_id: str + api_id: Optional[str] = None + name: Optional[str] = None + + +def _iter_notes(entry_data: dict[str, Any]) -> Iterator[dict[str, Any]]: + """Yield every note across all days, defensively skipping malformed shapes.""" + for day in entry_data.get("days") or []: + if not isinstance(day, dict): + continue + for note in day.get("notes") or []: + if isinstance(note, dict): + yield note + + +def _iter_fields(entry_data: dict[str, Any]) -> Iterator[tuple[Optional[str], dict[str, Any]]]: + """Yield ``(name, field)`` pairs. + + Benchling renders entry ``fields`` as a name-keyed dict; some payloads use a + list of field objects instead, so both are accepted. + """ + fields = entry_data.get("fields") + if isinstance(fields, dict): + for name, fval in fields.items(): + if isinstance(fval, dict): + yield name, fval + elif isinstance(fields, list): + for fval in fields: + if isinstance(fval, dict): + yield fval.get("name"), fval + + +def _field_value_ids(fval: dict[str, Any]) -> list[str]: + """Pull entity ID(s) out of a field value (single value or ``isMulti`` list). + + Empty strings are dropped, mirroring the ``if not link_id`` guard on note links. + """ + val = fval.get("value") + if isinstance(val, str): + return [val] if val else [] + if isinstance(val, list): + return [v for v in val if isinstance(v, str) and v] + return [] + + +def _link_web_url(link: dict[str, Any]) -> Optional[str]: + return link.get("webURL") or link.get("web_url") + + +def classify_link_type(link_type: Optional[str]) -> LinkCategory: + """Map an EntryLink ``type`` token to its :class:`LinkCategory`. + + Unknown/future tokens map to ``UNKNOWN`` rather than being silently dropped. + """ + if not link_type: + return LinkCategory.UNKNOWN + return LINK_TYPE_CATEGORY.get(link_type, LinkCategory.UNKNOWN) + + +def extract_note_links(entry_data: dict[str, Any]) -> list[dict[str, Any]]: + """Return every link object across all note bodies, unfiltered and untyped. + + Lowest-level primitive; most callers want :func:`classify_links` or + :func:`extract_entity_references`. + """ + links: list[dict[str, Any]] = [] + for note in _iter_notes(entry_data): + for link in note.get("links") or []: + if isinstance(link, dict): + links.append(link) + return links + + +def classify_links(entry_data: dict[str, Any]) -> list[LinkRef]: + """Return every note link, classified by category and deduped. + + Surfaces the *full* set of objects an entry points at -- entities, inventory, + references, metadata pointers, dashboards, and external URLs -- so callers can + decide what to fetch (e.g. ``[r for r in classify_links(e) if r.is_fetchable]``). + Deduped by Benchling ID when present, else by URL; first-seen order preserved. + """ + seen: set[str] = set() + refs: list[LinkRef] = [] + for link in extract_note_links(entry_data): + link_type = link.get("type") + link_id = link.get("id") + web_url = _link_web_url(link) + dedup_key = link_id or web_url + if dedup_key is not None: + if dedup_key in seen: + continue + seen.add(dedup_key) + refs.append( + LinkRef( + type=str(link_type) if link_type is not None else "", + category=classify_link_type(link_type), + id=link_id, + web_url=web_url, + ) + ) + return refs + + +def extract_entity_references( + entry_data: dict[str, Any], + *, + types: "frozenset[str] | set[str]" = ENTITY_LINK_TYPES, +) -> list[EntityReference]: + """Return deduped entity references from note links and entity-link fields. + + Note links are filtered to ``types`` (default: all linkable entity types). + Entity-link fields are detected by an ``entity`` substring in the field + ``type`` and are included regardless of ``types``. References are deduped by + ID, preserving first-seen order (note links before fields). + """ + seen: set[str] = set() + refs: list[EntityReference] = [] + + for link in extract_note_links(entry_data): + link_id = link.get("id") + link_type = link.get("type") + if not link_id or link_type not in types or link_id in seen: + continue + seen.add(link_id) + refs.append( + EntityReference( + id=str(link_id), + type=str(link_type), + web_url=_link_web_url(link), + source="note_link", + ) + ) + + for _name, fval in _iter_fields(entry_data): + ftype = fval.get("type") + if not ftype or "entity" not in str(ftype).lower(): + continue + for value_id in _field_value_ids(fval): + if value_id in seen: + continue + seen.add(value_id) + refs.append(EntityReference(id=value_id, type=str(ftype), source="entity_field")) + + return refs + + +def extract_results_tables(entry_data: dict[str, Any]) -> list[ResultsTableReference]: + """Return deduped assay-results-table references from entry notes. + + Only notes whose ``type`` is a results-table type *and* that carry an + ``assayResultSchemaId`` are returned. Deduped by ``(api_id, schema_id)``. + """ + seen: set[tuple[Optional[str], str]] = set() + tables: list[ResultsTableReference] = [] + for note in _iter_notes(entry_data): + if note.get("type") not in RESULTS_TABLE_NOTE_TYPES: + continue + schema_id = note.get("assayResultSchemaId") + if not schema_id: + continue + api_id = note.get("apiId") + key = (api_id, schema_id) + if key in seen: + continue + seen.add(key) + tables.append( + ResultsTableReference( + assay_result_schema_id=schema_id, + api_id=api_id, + name=note.get("name"), + ) + ) + return tables + + +def slug_from_web_url(web_url: Optional[str], link_id: Optional[str] = None) -> Optional[str]: + """Best-effort, human-ish slug parsed from a Benchling object's webURL. + + Benchling user-facing URLs end in ``.../{id}-{name-slug}/edit``. This returns + the trailing slug -- lowercased, punctuation-flattened, possibly truncated by + Benchling. It is **not** the authoritative display name (case and punctuation + are lost; e.g. ``QB-2743.1`` -> ``qb-2743-1``) and must never be used as one. + For the real name, fetch the record (the caller's job). Returns ``None`` when + no slug can be isolated. + """ + if not web_url or not link_id: + return None + path = web_url.split("?", 1)[0].split("#", 1)[0].rstrip("/") + segments = [p for p in path.split("/") if p and p != "edit"] + # The id-bearing segment is ``{id}-{slug}``; the id appears in either ``_`` or + # ``-`` form across tenants. Match it normalized, then slice off the ``{id}-`` + # prefix. A segment that is exactly the id (no trailing slug) yields None, and + # a URL with no id-bearing segment yields None -- never a stray path fragment. + norm_prefix = f"{link_id}-".replace("_", "-") + for candidate in segments: + if candidate.replace("_", "-").startswith(norm_prefix): + return candidate[len(norm_prefix) :] or None + return None + + +def link_metadata(entry_data: dict[str, Any]) -> list[dict[str, Any]]: + """Curated, searchable view of an entry's links: ``{type, id, name, slug}``. + + One flat entry per classified note link (deduped by :func:`classify_links`), + regardless of type -- the searchable summary promoted into ``entry.json`` so + ``links.name`` is queryable. ``name`` is left ``None`` here: the authoritative + display name needs an API fetch, which is the caller's job (this module is + pure). ``slug`` is the lossy webURL token from :func:`slug_from_web_url` -- + a debugging/eyeball aid only, never a substitute for ``name``. + """ + return [ + { + "type": r.type, + "id": r.id, + "name": None, + "slug": slug_from_web_url(r.web_url, r.id), + } + for r in classify_links(entry_data) + ] + + +# Bump when the links.json shape changes in a way consumers must notice. +# v2: links.json holds raw facts only -- derived classifications (category, +# fetchable, eventable, disposition) are no longer persisted; they are recomputed +# in code at runtime. The searchable, name-enriched view lives in entry.json +# ``links`` (see link_metadata), not here. +REFERENCES_SCHEMA_VERSION = 2 + + +def summarize_references(entry_data: dict[str, Any]) -> dict[str, Any]: + """Build the JSON-serializable raw-discovery payload written as ``links.json``. + + Records *raw facts* about what an entry points at -- ids, types, webURLs -- + so a consumer (or a future, different classification) can reprocess without + re-fetching. Deliberately excludes our inferences (``category``/``fetchable``/ + ``eventable``/``disposition``): those are derived from ``type`` in code, not + frozen to disk. No Benchling records are fetched here. The curated, searchable + view with human-readable names is the ``links`` field of ``entry.json`` (see + :func:`link_metadata`). + """ + return { + "schema_version": REFERENCES_SCHEMA_VERSION, + "entities": [ + {"id": e.id, "type": e.type, "web_url": e.web_url, "source": e.source} + for e in extract_entity_references(entry_data) + ], + "links": [{"id": link.id, "type": link.type, "web_url": link.web_url} for link in classify_links(entry_data)], + "results_tables": [ + { + "assay_result_schema_id": t.assay_result_schema_id, + "api_id": t.api_id, + "name": t.name, + } + for t in extract_results_tables(entry_data) + ], + } diff --git a/docker/tests/test_entry_packager.py b/docker/tests/test_entry_packager.py index f9c9187f..e2bb38f2 100644 --- a/docker/tests/test_entry_packager.py +++ b/docker/tests/test_entry_packager.py @@ -648,6 +648,61 @@ def test_create_metadata_files_dict_format(self, orchestrator): assert "filename" not in entry_json["files"]["file2.csv"] assert "filename" not in entry_json["files"]["data.json"] + def test_create_metadata_files_includes_links_json(self, orchestrator): + """links.json captures raw discovery; entry.json.links is the searchable view.""" + entry_data = { + "id": "etr_123", + "display_id": "EXP-001", + "name": "Test Entry", + "web_url": "https://demo.benchling.com/entry/etr_123", + "created_at": "2025-10-01T10:00:00Z", + "modified_at": "2025-10-02T10:00:00Z", + "days": [ + { + "notes": [ + { + "type": "text", + "links": [ + { + "id": "bfi_1", + "type": "custom_entity", + "webURL": "https://demo.benchling.com/benchling/f/lib_1-reg/bfi-1-qb-2743-1/edit", + }, + {"id": "axdash_1", "type": "sql_dashboard", "webURL": "u2"}, + ], + } + ] + } + ], + } + + result = orchestrator._create_metadata_files( + package_name="benchling/EXP-001", + entry_id="etr_123", + timestamp="2025-10-02T10:00:00Z", + base_url="https://demo.benchling.com", + webhook_data={}, + uploaded_files=[], + download_url="https://example.com/export.zip", + entry_data=entry_data, + ) + + # Raw discovery file (no references.json anymore). + assert "references.json" not in result + links_file = result["links.json"] + assert links_file["schema_version"] == 2 + assert [e["id"] for e in links_file["entities"]] == ["bfi_1"] # dashboard filtered out + assert {link["type"] for link in links_file["links"]} == {"custom_entity", "sql_dashboard"} + assert all(set(link) == {"id", "type", "web_url"} for link in links_file["links"]) + + # Searchable, curated view promoted into entry.json metadata. + meta_links = result["entry.json"]["links"] + assert all(set(link) == {"type", "id", "name", "slug"} for link in meta_links) + by_id = {link["id"]: link for link in meta_links} + # slug parsed from webURL; name left None (mock client returns no real name). + assert by_id["bfi_1"]["slug"] == "qb-2743-1" + assert by_id["bfi_1"]["name"] is None + def test_create_metadata_files_includes_canvas_id_when_present(self, orchestrator): """Test entry.json stores canvas_id for canvas-initiated exports.""" result = orchestrator._create_metadata_files( diff --git a/docker/tests/test_entry_references.py b/docker/tests/test_entry_references.py new file mode 100644 index 00000000..dd586a46 --- /dev/null +++ b/docker/tests/test_entry_references.py @@ -0,0 +1,362 @@ +"""Tests for entry_references extractor (shared discovery for #143 + #68/#69).""" + +import json + +from src.entry_references import ( + ENTITY_LINK_TYPES, + EVENTABLE_CATEGORIES, + FETCHABLE_CATEGORIES, + LINK_TYPE_CATEGORY, + EntityReference, + LinkCategory, + LinkRef, + ResultsTableReference, + classify_link_type, + classify_links, + extract_entity_references, + extract_note_links, + extract_results_tables, + link_metadata, + slug_from_web_url, + summarize_references, +) + + +def _entry(*notes, fields=None): + """Build a minimal entry dict with one day holding the given notes.""" + return { + "id": "etr_test", + "days": [{"date": "2026-06-15", "title": "Day 1", "notes": list(notes)}], + "fields": fields or {}, + } + + +def _link_note(*links): + return {"type": "text", "text": "", "links": list(links)} + + +class TestExtractNoteLinks: + def test_collects_links_across_notes(self): + entry = _entry( + _link_note({"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}), + _link_note({"id": "seq_1", "type": "dna_sequence", "webURL": "u2"}), + ) + ids = [link["id"] for link in extract_note_links(entry)] + assert ids == ["bfi_1", "seq_1"] + + def test_empty_when_no_links(self): + assert extract_note_links(_entry(_link_note())) == [] + + def test_tolerates_missing_days_notes_links(self): + assert extract_note_links({}) == [] + assert extract_note_links({"days": [{}]}) == [] + assert extract_note_links({"days": [{"notes": [{}]}]}) == [] + + +class TestExtractEntityReferences: + def test_returns_entity_links_only(self): + entry = _entry( + _link_note( + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"id": "axdash_1", "type": "sql_dashboard", "webURL": "u2"}, + ) + ) + refs = extract_entity_references(entry) + assert refs == [EntityReference(id="bfi_1", type="custom_entity", web_url="u1", source="note_link")] + + def test_dedupes_repeated_mentions(self): + entry = _entry( + _link_note({"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}), + _link_note({"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}), + ) + refs = extract_entity_references(entry) + assert [r.id for r in refs] == ["bfi_1"] + + def test_all_known_entity_types_pass_filter(self): + notes = [_link_note({"id": f"id_{t}", "type": t, "webURL": "u"}) for t in sorted(ENTITY_LINK_TYPES)] + refs = extract_entity_references(_entry(*notes)) + assert {r.type for r in refs} == set(ENTITY_LINK_TYPES) + + def test_pulls_entity_link_fields_single_and_multi(self): + entry = _entry( + fields={ + "Cell Line": {"type": "entity_link", "value": "bfi_field"}, + "Plasmids": {"type": "entity_link", "isMulti": True, "value": ["seq_a", "seq_b"]}, + "Project": {"type": "text", "value": "ignored"}, + } + ) + refs = extract_entity_references(entry) + assert [(r.id, r.source) for r in refs] == [ + ("bfi_field", "entity_field"), + ("seq_a", "entity_field"), + ("seq_b", "entity_field"), + ] + + def test_field_ids_deduped_against_note_links(self): + entry = _entry( + _link_note({"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}), + fields={"Cell Line": {"type": "entity_link", "value": "bfi_1"}}, + ) + refs = extract_entity_references(entry) + assert [r.id for r in refs] == ["bfi_1"] + assert refs[0].source == "note_link" # first-seen wins + + def test_custom_type_filter(self): + entry = _entry( + _link_note( + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"id": "seq_1", "type": "dna_sequence", "webURL": "u2"}, + ) + ) + refs = extract_entity_references(entry, types={"dna_sequence"}) + assert [r.id for r in refs] == ["seq_1"] + + def test_fields_as_list_shape(self): + entry = { + "days": [], + "fields": [{"name": "Cell Line", "type": "entity_link", "value": "bfi_x"}], + } + refs = extract_entity_references(entry) + assert [r.id for r in refs] == ["bfi_x"] + + def test_empty_string_field_values_ignored(self): + # Mirrors the `if not link_id` guard on note links: no EntityReference(id=""). + entry = _entry( + fields={ + "Empty": {"type": "entity_link", "value": ""}, + "EmptyMulti": {"type": "entity_link", "value": ["", "bfi_ok", ""]}, + } + ) + refs = extract_entity_references(entry) + assert [r.id for r in refs] == ["bfi_ok"] + + +class TestExtractResultsTables: + def test_returns_tables_with_schema_id(self): + entry = _entry( + {"type": "results_table", "apiId": "tbl_1", "assayResultSchemaId": "assaysch_1", "name": "T1"}, + {"type": "text", "links": []}, + ) + assert extract_results_tables(entry) == [ + ResultsTableReference(assay_result_schema_id="assaysch_1", api_id="tbl_1", name="T1") + ] + + def test_skips_tables_without_schema_id(self): + entry = _entry({"type": "results_table", "apiId": "tbl_1"}) + assert extract_results_tables(entry) == [] + + def test_dedupes_by_api_and_schema(self): + note = {"type": "results_table", "apiId": "tbl_1", "assayResultSchemaId": "assaysch_1"} + entry = _entry(dict(note), dict(note)) + assert len(extract_results_tables(entry)) == 1 + + +class TestEntityLinkTypes: + """Lock the linkable-entity set to the EntryLink enum (#389).""" + + def test_set_matches_entrylink_entities(self): + # custom_entity, dna_sequence, aa_sequence, batch -- and nothing else. + assert ENTITY_LINK_TYPES == {"custom_entity", "dna_sequence", "aa_sequence", "batch"} + + def test_batch_is_an_entity_reference(self): + entry = _entry(_link_note({"id": "bat_1", "type": "batch", "webURL": "u"})) + assert [r.id for r in extract_entity_references(entry)] == ["bat_1"] + + def test_oligos_are_not_link_types(self): + # dna_oligo / rna_oligo cannot appear as note links -- not in the enum. + for t in ("dna_oligo", "rna_oligo"): + assert t not in ENTITY_LINK_TYPES + assert t not in LINK_TYPE_CATEGORY + + +class TestClassifyLinkType: + def test_full_enum_is_mapped(self): + # All 18 EntryLink.type tokens from test/openapi.yaml. + enum = { + "link", "user", "request", "entry", "stage_entry", "protocol", "workflow", + "custom_entity", "aa_sequence", "dna_sequence", "batch", "box", "container", + "location", "plate", "insights_dashboard", "folder", "sql_dashboard", + } # fmt: skip + assert set(LINK_TYPE_CATEGORY) == enum + # none fall through to UNKNOWN + assert all(classify_link_type(t) is not LinkCategory.UNKNOWN for t in enum) + + def test_category_assignments(self): + assert classify_link_type("custom_entity") is LinkCategory.ENTITY + assert classify_link_type("batch") is LinkCategory.ENTITY + assert classify_link_type("container") is LinkCategory.INVENTORY + assert classify_link_type("entry") is LinkCategory.REFERENCE + assert classify_link_type("user") is LinkCategory.METADATA + assert classify_link_type("sql_dashboard") is LinkCategory.NOT_PACKAGEABLE + assert classify_link_type("stage_entry") is LinkCategory.UNCERTAIN + assert classify_link_type("link") is LinkCategory.EXTERNAL + + def test_unknown_and_empty_fall_through(self): + assert classify_link_type("future_type") is LinkCategory.UNKNOWN + assert classify_link_type(None) is LinkCategory.UNKNOWN + assert classify_link_type("") is LinkCategory.UNKNOWN + + +class TestClassifyLinks: + def test_surfaces_all_types_classified(self): + entry = _entry( + _link_note( + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"id": "con_1", "type": "container", "webURL": "u2"}, + {"id": "axdash_1", "type": "sql_dashboard", "webURL": "u3"}, + {"type": "link", "webURL": "https://example.com"}, + ) + ) + refs = classify_links(entry) + assert refs == [ + LinkRef(type="custom_entity", category=LinkCategory.ENTITY, id="bfi_1", web_url="u1"), + LinkRef(type="container", category=LinkCategory.INVENTORY, id="con_1", web_url="u2"), + LinkRef( + type="sql_dashboard", + category=LinkCategory.NOT_PACKAGEABLE, + id="axdash_1", + web_url="u3", + ), + LinkRef(type="link", category=LinkCategory.EXTERNAL, id=None, web_url="https://example.com"), + ] + + def test_is_fetchable_filter(self): + entry = _entry( + _link_note( + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"id": "axdash_1", "type": "sql_dashboard", "webURL": "u3"}, + {"type": "link", "webURL": "https://example.com"}, + ) + ) + fetchable = [r.id for r in classify_links(entry) if r.is_fetchable] + assert fetchable == ["bfi_1"] + + def test_disposition_and_eventable_by_category(self): + def link_of(t, i): + return _link_note({"id": i, "type": t, "webURL": "u"}) + + entry = _entry( + link_of("custom_entity", "bfi_1"), # entity + link_of("container", "con_1"), # inventory + link_of("entry", "etr_1"), # reference + link_of("user", "ent_1"), # metadata + link_of("sql_dashboard", "axdash_1"), # not_packageable + ) + by_id = {r.id: r for r in classify_links(entry)} + assert by_id["bfi_1"].disposition == "nest_or_standalone" + assert by_id["bfi_1"].is_eventable is True + assert by_id["con_1"].disposition == "nest" + assert by_id["con_1"].is_eventable is False # inventory: nest-only + assert by_id["etr_1"].disposition == "link" + assert by_id["ent_1"].disposition == "pointer" + assert by_id["axdash_1"].disposition == "skip" + + def test_dedupes_by_id_then_url(self): + entry = _entry( + _link_note( + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"type": "link", "webURL": "https://dup.com"}, + {"type": "link", "webURL": "https://dup.com"}, + ) + ) + refs = classify_links(entry) + assert [(r.type, r.id or r.web_url) for r in refs] == [ + ("custom_entity", "bfi_1"), + ("link", "https://dup.com"), + ] + + def test_fetchable_and_eventable_category_membership(self): + assert FETCHABLE_CATEGORIES == { + LinkCategory.ENTITY, + LinkCategory.INVENTORY, + LinkCategory.REFERENCE, + } + assert EVENTABLE_CATEGORIES == {LinkCategory.ENTITY, LinkCategory.REFERENCE} + + +class TestSummarizeReferences: + def test_serializable_summary_of_all_discovered(self): + entry = _entry( + _link_note( + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"id": "axdash_1", "type": "sql_dashboard", "webURL": "u2"}, + ), + {"type": "results_table", "apiId": "tbl_1", "assayResultSchemaId": "assaysch_1", "name": "T1"}, + fields={"Cell Line": {"type": "entity_link", "value": "seq_field"}}, + ) + summary = summarize_references(entry) + # JSON-serializable end to end. + assert json.loads(json.dumps(summary)) == summary + assert summary["schema_version"] == 2 + assert [e["id"] for e in summary["entities"]] == ["bfi_1", "seq_field"] + # links.json holds raw facts only -- no derived classifications persisted. + assert {link["type"] for link in summary["links"]} == {"custom_entity", "sql_dashboard"} + assert summary["links"][0] == {"id": "bfi_1", "type": "custom_entity", "web_url": "u1"} + assert all( + set(link) == {"id", "type", "web_url"} for link in summary["links"] + ), "links.json must not persist category/fetchable/eventable/disposition" + assert summary["results_tables"] == [{"assay_result_schema_id": "assaysch_1", "api_id": "tbl_1", "name": "T1"}] + + def test_empty_entry_yields_empty_arrays(self): + summary = summarize_references({}) + assert summary == { + "schema_version": 2, + "entities": [], + "links": [], + "results_tables": [], + } + + +class TestSlugFromWebUrl: + def test_strips_known_id_prefix_dash_and_underscore_forms(self): + # id appears in dash form in the URL segment + assert ( + slug_from_web_url( + "https://benchling.com/benchling/f/R8KcsjhW-academic-registry/bfi-xCUXNVyG-sbn000/edit", + "bfi_xCUXNVyG", + ) + == "sbn000" + ) + # id appears in underscore form + assert ( + slug_from_web_url( + "https://benchling.com/benchling/f/lib_55UxcIps-registry/bfi_YtegMKkT-batch-test/edit", + "bfi_YtegMKkT", + ) + == "batch-test" + ) + + def test_drops_query_and_fragment(self): + assert ( + slug_from_web_url( + "https://benchling.com/benchling/f/lib_Fy2C0HOl-example/seq_UpzwvUug-consensus/edit?alignment=seqanl_1", + "seq_UpzwvUug", + ) + == "consensus" + ) + + def test_none_and_empty(self): + assert slug_from_web_url(None) is None + assert slug_from_web_url("") is None + assert slug_from_web_url("https://benchling.com/", "bfi_1") is None + + +class TestLinkMetadata: + def test_curated_four_field_view_name_left_for_caller(self): + entry = _entry( + _link_note( + { + "id": "bfi_xCUXNVyG", + "type": "custom_entity", + "webURL": "https://benchling.com/benchling/f/lib_1-reg/bfi-xCUXNVyG-sbn000/edit", + }, + {"id": "axdash_1", "type": "sql_dashboard", "webURL": "https://benchling.com/sql/axdash_1"}, + ) + ) + links = link_metadata(entry) + assert all(set(link) == {"type", "id", "name", "slug"} for link in links) + # name is the caller's job (pure module makes no API call) + assert all(link["name"] is None for link in links) + by_id = {link["id"]: link for link in links} + assert by_id["bfi_xCUXNVyG"]["slug"] == "sbn000" diff --git a/docker/uv.lock b/docker/uv.lock index 7f351d09..2abcb96d 100644 --- a/docker/uv.lock +++ b/docker/uv.lock @@ -75,7 +75,7 @@ wheels = [ [[package]] name = "benchling-quilt-integration" -version = "0.17.2" +version = "0.18.0" source = { editable = "." } dependencies = [ { name = "benchling-sdk" }, diff --git a/package.json b/package.json index f98400d4..83eba0a8 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@quiltdata/benchling-webhook", - "version": "0.17.2", + "version": "0.18.0", "description": "AWS CDK deployment for Benchling webhook processing using Fargate - Deploy directly with npx", "main": "dist/lib/index.js", "types": "dist/lib/index.d.ts",