From 1ce7f2468c461040020cf4d82e33944a373f40f9 Mon Sep 17 00:00:00 2001 From: "Dr. Ernie Prabhakar" Date: Mon, 15 Jun 2026 10:29:22 -0700 Subject: [PATCH 01/11] feat: add entry-reference extractor (shared discovery for #143 + #68/#69) Pure parser over a Benchling entry dict that surfaces the objects an entry points at, in one place for both upcoming features: - extract_entity_references(): entity IDs from days[].notes[].links[] (filtered to entity types, dropping non-entity links like sql_dashboard) and from entity-link fields; deduped by ID. -> #143 entity packaging. - extract_results_tables(): results_table notes carrying assayResultSchemaId. -> #68/#69 assay results. - extract_note_links(): low-level all-links primitive. No Benchling API calls and no behavior change -- nothing consumes it yet, so it lands independently of either feature. 13 unit tests; black/isort/pyright clean. Refs #143 #68 Co-Authored-By: Claude Opus 4.8 (1M context) --- docker/src/entry_references.py | 190 ++++++++++++++++++++++++++ docker/tests/test_entry_references.py | 128 +++++++++++++++++ 2 files changed, 318 insertions(+) create mode 100644 docker/src/entry_references.py create mode 100644 docker/tests/test_entry_references.py diff --git a/docker/src/entry_references.py b/docker/src/entry_references.py new file mode 100644 index 00000000..375f3bdf --- /dev/null +++ b/docker/src/entry_references.py @@ -0,0 +1,190 @@ +"""Extract typed references out of a Benchling entry's structured data. + +A Benchling entry can point at other Benchling objects in three places: + +1. Note links -- ``days[].notes[].links[]``, each ``{id, type, webURL}``. Entity + mentions appear as ``custom_entity`` / ``dna_sequence`` / ``aa_sequence`` / + ``dna_oligo`` / ``rna_oligo``; the same list also carries non-entity types + (e.g. ``sql_dashboard``), so callers must filter by ``type``. +2. Entity-link fields -- ``fields[name]`` whose ``type`` mentions ``entity``, + carrying one or more entity IDs directly in ``value``. +3. Results tables -- ``results_table`` notes carrying an ``assayResultSchemaId`` + (the discovery site for assay results, issues #68/#69). + +This module is pure: it operates on the entry dict already fetched by +``EntryPackager`` and makes no Benchling API calls. Resolving each reference to a +full record (``get_by_id`` / ``bulk_get``) is the caller's job. + +Shared discovery layer for entity packaging (#143) and assay results (#68/#69). +""" + +from dataclasses import dataclass +from typing import Any, Dict, Iterator, List, Optional, Tuple + +# Note-link / field ``type`` values that denote a registry entity. Used to keep +# entity references out of the non-entity links (dashboards, etc.) that share +# the same ``links[]`` array. +ENTITY_LINK_TYPES = frozenset( + { + "custom_entity", + "dna_sequence", + "aa_sequence", + "dna_oligo", + "rna_oligo", + } +) + +# Note ``type`` values that carry tabular assay results. +RESULTS_TABLE_NOTE_TYPES = frozenset( + { + "results_table", + "registration_table", + "table", + } +) + + +@dataclass(frozen=True) +class EntityReference: + """A reference to a Benchling entity discovered inside an entry. + + ``type`` is the discovery type as seen in the entry (e.g. ``custom_entity`` + from a note link, or ``entity_link`` from a field) -- not necessarily the + entity's own schema type. ``source`` records where the reference was found. + """ + + id: str + type: str + web_url: Optional[str] = None + source: str = "note_link" # "note_link" | "entity_field" + + +@dataclass(frozen=True) +class ResultsTableReference: + """A reference to an assay-results table embedded in an entry note.""" + + assay_result_schema_id: str + api_id: Optional[str] = None + name: Optional[str] = None + + +def _iter_notes(entry_data: Dict[str, Any]) -> Iterator[Dict[str, Any]]: + """Yield every note across all days, defensively skipping malformed shapes.""" + for day in entry_data.get("days") or []: + if not isinstance(day, dict): + continue + for note in day.get("notes") or []: + if isinstance(note, dict): + yield note + + +def _iter_fields(entry_data: Dict[str, Any]) -> Iterator[Tuple[Optional[str], Dict[str, Any]]]: + """Yield ``(name, field)`` pairs. + + Benchling renders entry ``fields`` as a name-keyed dict; some payloads use a + list of field objects instead, so both are accepted. + """ + fields = entry_data.get("fields") + if isinstance(fields, dict): + for name, fval in fields.items(): + if isinstance(fval, dict): + yield name, fval + elif isinstance(fields, list): + for fval in fields: + if isinstance(fval, dict): + yield fval.get("name"), fval + + +def _field_value_ids(fval: Dict[str, Any]) -> List[str]: + """Pull entity ID(s) out of a field value (single value or ``isMulti`` list).""" + val = fval.get("value") + if isinstance(val, str): + return [val] + if isinstance(val, list): + return [v for v in val if isinstance(v, str)] + return [] + + +def extract_note_links(entry_data: Dict[str, Any]) -> List[Dict[str, Any]]: + """Return every link object across all note bodies, unfiltered. + + Lower-level primitive; most callers want :func:`extract_entity_references`. + """ + links: List[Dict[str, Any]] = [] + for note in _iter_notes(entry_data): + for link in note.get("links") or []: + if isinstance(link, dict): + links.append(link) + return links + + +def extract_entity_references( + entry_data: Dict[str, Any], + *, + types: "frozenset[str] | set[str]" = ENTITY_LINK_TYPES, +) -> List[EntityReference]: + """Return deduped entity references from note links and entity-link fields. + + Note links are filtered to ``types`` (default: all known entity types). + Entity-link fields are detected by an ``entity`` substring in the field + ``type`` and are included regardless of ``types``. References are deduped by + ID, preserving first-seen order (note links before fields). + """ + seen: set[str] = set() + refs: List[EntityReference] = [] + + for link in extract_note_links(entry_data): + link_id = link.get("id") + link_type = link.get("type") + if not link_id or link_type not in types or link_id in seen: + continue + seen.add(link_id) + refs.append( + EntityReference( + id=str(link_id), + type=str(link_type), + web_url=link.get("webURL") or link.get("web_url"), + source="note_link", + ) + ) + + for _name, fval in _iter_fields(entry_data): + ftype = fval.get("type") + if not ftype or "entity" not in str(ftype).lower(): + continue + for value_id in _field_value_ids(fval): + if value_id in seen: + continue + seen.add(value_id) + refs.append(EntityReference(id=value_id, type=str(ftype), source="entity_field")) + + return refs + + +def extract_results_tables(entry_data: Dict[str, Any]) -> List[ResultsTableReference]: + """Return deduped assay-results-table references from entry notes. + + Only notes whose ``type`` is a results-table type *and* that carry an + ``assayResultSchemaId`` are returned. Deduped by ``(api_id, schema_id)``. + """ + seen: set[Tuple[Optional[str], str]] = set() + tables: List[ResultsTableReference] = [] + for note in _iter_notes(entry_data): + if note.get("type") not in RESULTS_TABLE_NOTE_TYPES: + continue + schema_id = note.get("assayResultSchemaId") + if not schema_id: + continue + api_id = note.get("apiId") + key = (api_id, schema_id) + if key in seen: + continue + seen.add(key) + tables.append( + ResultsTableReference( + assay_result_schema_id=schema_id, + api_id=api_id, + name=note.get("name"), + ) + ) + return tables diff --git a/docker/tests/test_entry_references.py b/docker/tests/test_entry_references.py new file mode 100644 index 00000000..5bb75e2b --- /dev/null +++ b/docker/tests/test_entry_references.py @@ -0,0 +1,128 @@ +"""Tests for entry_references extractor (shared discovery for #143 + #68/#69).""" + +from src.entry_references import ( + ENTITY_LINK_TYPES, + EntityReference, + ResultsTableReference, + extract_entity_references, + extract_note_links, + extract_results_tables, +) + + +def _entry(*notes, fields=None): + """Build a minimal entry dict with one day holding the given notes.""" + return { + "id": "etr_test", + "days": [{"date": "2026-06-15", "title": "Day 1", "notes": list(notes)}], + "fields": fields or {}, + } + + +def _link_note(*links): + return {"type": "text", "text": "", "links": list(links)} + + +class TestExtractNoteLinks: + def test_collects_links_across_notes(self): + entry = _entry( + _link_note({"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}), + _link_note({"id": "seq_1", "type": "dna_sequence", "webURL": "u2"}), + ) + ids = [link["id"] for link in extract_note_links(entry)] + assert ids == ["bfi_1", "seq_1"] + + def test_empty_when_no_links(self): + assert extract_note_links(_entry(_link_note())) == [] + + def test_tolerates_missing_days_notes_links(self): + assert extract_note_links({}) == [] + assert extract_note_links({"days": [{}]}) == [] + assert extract_note_links({"days": [{"notes": [{}]}]}) == [] + + +class TestExtractEntityReferences: + def test_returns_entity_links_only(self): + entry = _entry( + _link_note( + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"id": "axdash_1", "type": "sql_dashboard", "webURL": "u2"}, + ) + ) + refs = extract_entity_references(entry) + assert refs == [EntityReference(id="bfi_1", type="custom_entity", web_url="u1", source="note_link")] + + def test_dedupes_repeated_mentions(self): + entry = _entry( + _link_note({"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}), + _link_note({"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}), + ) + refs = extract_entity_references(entry) + assert [r.id for r in refs] == ["bfi_1"] + + def test_all_known_entity_types_pass_filter(self): + notes = [_link_note({"id": f"id_{t}", "type": t, "webURL": "u"}) for t in sorted(ENTITY_LINK_TYPES)] + refs = extract_entity_references(_entry(*notes)) + assert {r.type for r in refs} == set(ENTITY_LINK_TYPES) + + def test_pulls_entity_link_fields_single_and_multi(self): + entry = _entry( + fields={ + "Cell Line": {"type": "entity_link", "value": "bfi_field"}, + "Plasmids": {"type": "entity_link", "isMulti": True, "value": ["seq_a", "seq_b"]}, + "Project": {"type": "text", "value": "ignored"}, + } + ) + refs = extract_entity_references(entry) + assert [(r.id, r.source) for r in refs] == [ + ("bfi_field", "entity_field"), + ("seq_a", "entity_field"), + ("seq_b", "entity_field"), + ] + + def test_field_ids_deduped_against_note_links(self): + entry = _entry( + _link_note({"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}), + fields={"Cell Line": {"type": "entity_link", "value": "bfi_1"}}, + ) + refs = extract_entity_references(entry) + assert [r.id for r in refs] == ["bfi_1"] + assert refs[0].source == "note_link" # first-seen wins + + def test_custom_type_filter(self): + entry = _entry( + _link_note( + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"id": "seq_1", "type": "dna_sequence", "webURL": "u2"}, + ) + ) + refs = extract_entity_references(entry, types={"dna_sequence"}) + assert [r.id for r in refs] == ["seq_1"] + + def test_fields_as_list_shape(self): + entry = { + "days": [], + "fields": [{"name": "Cell Line", "type": "entity_link", "value": "bfi_x"}], + } + refs = extract_entity_references(entry) + assert [r.id for r in refs] == ["bfi_x"] + + +class TestExtractResultsTables: + def test_returns_tables_with_schema_id(self): + entry = _entry( + {"type": "results_table", "apiId": "tbl_1", "assayResultSchemaId": "assaysch_1", "name": "T1"}, + {"type": "text", "links": []}, + ) + assert extract_results_tables(entry) == [ + ResultsTableReference(assay_result_schema_id="assaysch_1", api_id="tbl_1", name="T1") + ] + + def test_skips_tables_without_schema_id(self): + entry = _entry({"type": "results_table", "apiId": "tbl_1"}) + assert extract_results_tables(entry) == [] + + def test_dedupes_by_api_and_schema(self): + note = {"type": "results_table", "apiId": "tbl_1", "assayResultSchemaId": "assaysch_1"} + entry = _entry(dict(note), dict(note)) + assert len(extract_results_tables(entry)) == 1 From ad0c7a000167dcb6f2e5634374fb91bb491588b6 Mon Sep 17 00:00:00 2001 From: "Dr. Ernie Prabhakar" Date: Mon, 15 Jun 2026 10:37:28 -0700 Subject: [PATCH 02/11] feat: full EntryLink type map + classify_links; fix entity set (#389) Generalize the discovery layer from entities-only to the full EntryLink enum (18 types from test/openapi.yaml), per #389. - classify_links(entry): surfaces ALL note links, each labeled with a LinkCategory (entity/inventory/reference/metadata/not_packageable/uncertain/ external/unknown). Consumers filter, e.g. `r.is_packageable`. Unknown/future types surface as UNKNOWN rather than being silently dropped. - LINK_TYPE_CATEGORY: type -> category for all 18 tokens; PACKAGEABLE_CATEGORIES. - Fix entity set: ENTITY_LINK_TYPES now {custom_entity, dna_sequence, aa_sequence, batch}. Adds `batch` (a real registry entity, was missed); drops dna_oligo/rna_oligo (NOT EntryLink types -- can't appear as note links). - spec/entry-link-types.json: human-facing reference map (category, packageable, id prefix, GET endpoint, webhook events) for all 18 types, plus the not-inline-linkable resources. A test asserts its categories match the module so it can't drift. 26 unit tests; black/isort/pyright clean. Refs #143 #389 #68 Co-Authored-By: Claude Opus 4.8 (1M context) --- docker/src/entry_references.py | 150 ++++++++++++++++++++++---- docker/tests/test_entry_references.py | 136 +++++++++++++++++++++++ spec/entry-link-types.json | 38 +++++++ 3 files changed, 301 insertions(+), 23 deletions(-) create mode 100644 spec/entry-link-types.json diff --git a/docker/src/entry_references.py b/docker/src/entry_references.py index 375f3bdf..d0e989d4 100644 --- a/docker/src/entry_references.py +++ b/docker/src/entry_references.py @@ -1,11 +1,11 @@ """Extract typed references out of a Benchling entry's structured data. -A Benchling entry can point at other Benchling objects in three places: +A Benchling entry points at other Benchling objects in three places: -1. Note links -- ``days[].notes[].links[]``, each ``{id, type, webURL}``. Entity - mentions appear as ``custom_entity`` / ``dna_sequence`` / ``aa_sequence`` / - ``dna_oligo`` / ``rna_oligo``; the same list also carries non-entity types - (e.g. ``sql_dashboard``), so callers must filter by ``type``. +1. Note links -- ``days[].notes[].links[]``, each ``{id, type, webURL}``. ``type`` + is a closed enum (``EntryLink.type`` in the Benchling OpenAPI spec) of 18 + tokens spanning entities, inventory, references, dashboards, and plain + external hyperlinks -- see :data:`LINK_TYPE_CATEGORY`. 2. Entity-link fields -- ``fields[name]`` whose ``type`` mentions ``entity``, carrying one or more entity IDs directly in ``value``. 3. Results tables -- ``results_table`` notes carrying an ``assayResultSchemaId`` @@ -13,26 +13,70 @@ This module is pure: it operates on the entry dict already fetched by ``EntryPackager`` and makes no Benchling API calls. Resolving each reference to a -full record (``get_by_id`` / ``bulk_get``) is the caller's job. +full record (``get_by_id`` / ``bulk_get``) is the caller's job; this layer only +discovers and classifies what an entry points at. -Shared discovery layer for entity packaging (#143) and assay results (#68/#69). +Shared discovery layer for entity packaging (#143), the full entry-linked +resource map (#389), and assay results (#68/#69). """ from dataclasses import dataclass +from enum import Enum from typing import Any, Dict, Iterator, List, Optional, Tuple -# Note-link / field ``type`` values that denote a registry entity. Used to keep -# entity references out of the non-entity links (dashboards, etc.) that share -# the same ``links[]`` array. -ENTITY_LINK_TYPES = frozenset( - { - "custom_entity", - "dna_sequence", - "aa_sequence", - "dna_oligo", - "rna_oligo", - } -) + +class LinkCategory(str, Enum): + """How a note-link type relates to packaging (per #389 conclusions).""" + + ENTITY = "entity" # registry entity; GET-by-id + v2.entity.registered event + INVENTORY = "inventory" # packageable via GET-by-id; no webhook events + REFERENCE = "reference" # packageable; has its own create/update events + METADATA = "metadata" # GET works but low value to package as a record + NOT_PACKAGEABLE = "not_packageable" # no read API (dashboards, protocol) + UNCERTAIN = "uncertain" # endpoint depends on tenant API version; verify first + EXTERNAL = "external" # plain http(s) hyperlink ("link"); no Benchling ID + UNKNOWN = "unknown" # type not in the known enum -- surfaced, not dropped + + +# EntryLink.type -> category. Covers all 18 enum tokens from test/openapi.yaml. +# Unknown/future tokens fall through to LinkCategory.UNKNOWN via classify_link_type. +LINK_TYPE_CATEGORY: Dict[str, LinkCategory] = { + # entities (eventable via v2.entity.registered) + "custom_entity": LinkCategory.ENTITY, + "dna_sequence": LinkCategory.ENTITY, + "aa_sequence": LinkCategory.ENTITY, + "batch": LinkCategory.ENTITY, + # inventory (packageable on reference, no events) + "container": LinkCategory.INVENTORY, + "box": LinkCategory.INVENTORY, + "plate": LinkCategory.INVENTORY, + "location": LinkCategory.INVENTORY, + # references (packageable, own events) + "entry": LinkCategory.REFERENCE, + "request": LinkCategory.REFERENCE, + "workflow": LinkCategory.REFERENCE, + # metadata-only pointers + "user": LinkCategory.METADATA, + "folder": LinkCategory.METADATA, + # no read API exists -- keep the webURL as a reference only + "sql_dashboard": LinkCategory.NOT_PACKAGEABLE, + "insights_dashboard": LinkCategory.NOT_PACKAGEABLE, + "protocol": LinkCategory.NOT_PACKAGEABLE, + # endpoint depends on tenant API version (v2-alpha) -- verify before relying + "stage_entry": LinkCategory.UNCERTAIN, + # plain external hyperlink (no Benchling id) + "link": LinkCategory.EXTERNAL, +} + +# Note: dna_oligo / rna_oligo / mixture / assay_run / assay_result / workflow_task +# are NOT EntryLink types -- they cannot appear as note links. They reach an entry +# via structured note parts / inventory tables, not links[]. + +# Categories whose resources can be fetched as a record via GET-by-id. +PACKAGEABLE_CATEGORIES = frozenset({LinkCategory.ENTITY, LinkCategory.INVENTORY, LinkCategory.REFERENCE}) + +# Linkable entity types (subset of LINK_TYPE_CATEGORY that are LinkCategory.ENTITY). +ENTITY_LINK_TYPES = frozenset(t for t, cat in LINK_TYPE_CATEGORY.items() if cat is LinkCategory.ENTITY) # Note ``type`` values that carry tabular assay results. RESULTS_TABLE_NOTE_TYPES = frozenset( @@ -44,6 +88,21 @@ ) +@dataclass(frozen=True) +class LinkRef: + """A classified note link. ``id``/``web_url`` are absent for some types + (``link`` has no id; ``location`` has no webURL).""" + + type: str + category: LinkCategory + id: Optional[str] = None + web_url: Optional[str] = None + + @property + def is_packageable(self) -> bool: + return self.category in PACKAGEABLE_CATEGORIES + + @dataclass(frozen=True) class EntityReference: """A reference to a Benchling entity discovered inside an entry. @@ -105,10 +164,25 @@ def _field_value_ids(fval: Dict[str, Any]) -> List[str]: return [] +def _link_web_url(link: Dict[str, Any]) -> Optional[str]: + return link.get("webURL") or link.get("web_url") + + +def classify_link_type(link_type: Optional[str]) -> LinkCategory: + """Map an EntryLink ``type`` token to its :class:`LinkCategory`. + + Unknown/future tokens map to ``UNKNOWN`` rather than being silently dropped. + """ + if not link_type: + return LinkCategory.UNKNOWN + return LINK_TYPE_CATEGORY.get(link_type, LinkCategory.UNKNOWN) + + def extract_note_links(entry_data: Dict[str, Any]) -> List[Dict[str, Any]]: - """Return every link object across all note bodies, unfiltered. + """Return every link object across all note bodies, unfiltered and untyped. - Lower-level primitive; most callers want :func:`extract_entity_references`. + Lowest-level primitive; most callers want :func:`classify_links` or + :func:`extract_entity_references`. """ links: List[Dict[str, Any]] = [] for note in _iter_notes(entry_data): @@ -118,6 +192,36 @@ def extract_note_links(entry_data: Dict[str, Any]) -> List[Dict[str, Any]]: return links +def classify_links(entry_data: Dict[str, Any]) -> List[LinkRef]: + """Return every note link, classified by category and deduped. + + Surfaces the *full* set of objects an entry points at -- entities, inventory, + references, metadata pointers, dashboards, and external URLs -- so callers can + decide what to fetch (e.g. ``[r for r in classify_links(e) if r.is_packageable]``). + Deduped by Benchling ID when present, else by URL; first-seen order preserved. + """ + seen: set[str] = set() + refs: List[LinkRef] = [] + for link in extract_note_links(entry_data): + link_type = link.get("type") + link_id = link.get("id") + web_url = _link_web_url(link) + dedup_key = link_id or web_url + if dedup_key is not None: + if dedup_key in seen: + continue + seen.add(dedup_key) + refs.append( + LinkRef( + type=str(link_type) if link_type is not None else "", + category=classify_link_type(link_type), + id=link_id, + web_url=web_url, + ) + ) + return refs + + def extract_entity_references( entry_data: Dict[str, Any], *, @@ -125,7 +229,7 @@ def extract_entity_references( ) -> List[EntityReference]: """Return deduped entity references from note links and entity-link fields. - Note links are filtered to ``types`` (default: all known entity types). + Note links are filtered to ``types`` (default: all linkable entity types). Entity-link fields are detected by an ``entity`` substring in the field ``type`` and are included regardless of ``types``. References are deduped by ID, preserving first-seen order (note links before fields). @@ -143,7 +247,7 @@ def extract_entity_references( EntityReference( id=str(link_id), type=str(link_type), - web_url=link.get("webURL") or link.get("web_url"), + web_url=_link_web_url(link), source="note_link", ) ) diff --git a/docker/tests/test_entry_references.py b/docker/tests/test_entry_references.py index 5bb75e2b..7861aed0 100644 --- a/docker/tests/test_entry_references.py +++ b/docker/tests/test_entry_references.py @@ -1,9 +1,18 @@ """Tests for entry_references extractor (shared discovery for #143 + #68/#69).""" +import json +from pathlib import Path + from src.entry_references import ( ENTITY_LINK_TYPES, + LINK_TYPE_CATEGORY, + PACKAGEABLE_CATEGORIES, EntityReference, + LinkCategory, + LinkRef, ResultsTableReference, + classify_link_type, + classify_links, extract_entity_references, extract_note_links, extract_results_tables, @@ -126,3 +135,130 @@ def test_dedupes_by_api_and_schema(self): note = {"type": "results_table", "apiId": "tbl_1", "assayResultSchemaId": "assaysch_1"} entry = _entry(dict(note), dict(note)) assert len(extract_results_tables(entry)) == 1 + + +class TestEntityLinkTypes: + """Lock the linkable-entity set to the EntryLink enum (#389).""" + + def test_set_matches_entrylink_entities(self): + # custom_entity, dna_sequence, aa_sequence, batch -- and nothing else. + assert ENTITY_LINK_TYPES == {"custom_entity", "dna_sequence", "aa_sequence", "batch"} + + def test_batch_is_an_entity_reference(self): + entry = _entry(_link_note({"id": "bat_1", "type": "batch", "webURL": "u"})) + assert [r.id for r in extract_entity_references(entry)] == ["bat_1"] + + def test_oligos_are_not_link_types(self): + # dna_oligo / rna_oligo cannot appear as note links -- not in the enum. + for t in ("dna_oligo", "rna_oligo"): + assert t not in ENTITY_LINK_TYPES + assert t not in LINK_TYPE_CATEGORY + + +class TestClassifyLinkType: + def test_full_enum_is_mapped(self): + # All 18 EntryLink.type tokens from test/openapi.yaml. + enum = { + "link", "user", "request", "entry", "stage_entry", "protocol", "workflow", + "custom_entity", "aa_sequence", "dna_sequence", "batch", "box", "container", + "location", "plate", "insights_dashboard", "folder", "sql_dashboard", + } # fmt: skip + assert set(LINK_TYPE_CATEGORY) == enum + # none fall through to UNKNOWN + assert all(classify_link_type(t) is not LinkCategory.UNKNOWN for t in enum) + + def test_category_assignments(self): + assert classify_link_type("custom_entity") is LinkCategory.ENTITY + assert classify_link_type("batch") is LinkCategory.ENTITY + assert classify_link_type("container") is LinkCategory.INVENTORY + assert classify_link_type("entry") is LinkCategory.REFERENCE + assert classify_link_type("user") is LinkCategory.METADATA + assert classify_link_type("sql_dashboard") is LinkCategory.NOT_PACKAGEABLE + assert classify_link_type("stage_entry") is LinkCategory.UNCERTAIN + assert classify_link_type("link") is LinkCategory.EXTERNAL + + def test_unknown_and_empty_fall_through(self): + assert classify_link_type("future_type") is LinkCategory.UNKNOWN + assert classify_link_type(None) is LinkCategory.UNKNOWN + assert classify_link_type("") is LinkCategory.UNKNOWN + + +class TestClassifyLinks: + def test_surfaces_all_types_classified(self): + entry = _entry( + _link_note( + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"id": "con_1", "type": "container", "webURL": "u2"}, + {"id": "axdash_1", "type": "sql_dashboard", "webURL": "u3"}, + {"type": "link", "webURL": "https://example.com"}, + ) + ) + refs = classify_links(entry) + assert refs == [ + LinkRef(type="custom_entity", category=LinkCategory.ENTITY, id="bfi_1", web_url="u1"), + LinkRef(type="container", category=LinkCategory.INVENTORY, id="con_1", web_url="u2"), + LinkRef( + type="sql_dashboard", + category=LinkCategory.NOT_PACKAGEABLE, + id="axdash_1", + web_url="u3", + ), + LinkRef(type="link", category=LinkCategory.EXTERNAL, id=None, web_url="https://example.com"), + ] + + def test_is_packageable_filter(self): + entry = _entry( + _link_note( + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"id": "axdash_1", "type": "sql_dashboard", "webURL": "u3"}, + {"type": "link", "webURL": "https://example.com"}, + ) + ) + packageable = [r.id for r in classify_links(entry) if r.is_packageable] + assert packageable == ["bfi_1"] + + def test_dedupes_by_id_then_url(self): + entry = _entry( + _link_note( + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"type": "link", "webURL": "https://dup.com"}, + {"type": "link", "webURL": "https://dup.com"}, + ) + ) + refs = classify_links(entry) + assert [(r.type, r.id or r.web_url) for r in refs] == [ + ("custom_entity", "bfi_1"), + ("link", "https://dup.com"), + ] + + def test_packageable_categories_membership(self): + assert PACKAGEABLE_CATEGORIES == { + LinkCategory.ENTITY, + LinkCategory.INVENTORY, + LinkCategory.REFERENCE, + } + + +class TestEntryLinkTypesJson: + """Guard spec/entry-link-types.json against drift from the module.""" + + def _load(self): + path = Path(__file__).resolve().parents[2] / "spec" / "entry-link-types.json" + return json.loads(path.read_text()) + + def test_json_covers_same_types(self): + ref = self._load() + json_types = {row["type"] for row in ref["link_types"]} + assert json_types == set(LINK_TYPE_CATEGORY) + + def test_json_categories_match_module(self): + ref = self._load() + for row in ref["link_types"]: + module_cat = LINK_TYPE_CATEGORY[row["type"]] + assert row["category"] == module_cat.value, row["type"] + assert row["packageable"] == (module_cat in PACKAGEABLE_CATEGORIES), row["type"] + + def test_json_packageable_categories_match_module(self): + ref = self._load() + assert set(ref["packageable_categories"]) == {c.value for c in PACKAGEABLE_CATEGORIES} diff --git a/spec/entry-link-types.json b/spec/entry-link-types.json new file mode 100644 index 00000000..7200af4b --- /dev/null +++ b/spec/entry-link-types.json @@ -0,0 +1,38 @@ +{ + "$comment": "Reference map of Benchling EntryLink.type tokens -> how the packager treats them. Source of truth for `type` enum: test/openapi.yaml (EntryLink schema, 18 tokens). Categories mirror docker/src/entry_references.py (LINK_TYPE_CATEGORY); test_entry_references.py asserts they agree. Endpoints / id prefixes / events per issue #389. See issues #143, #389, #68.", + "categories": { + "entity": "Registry entity; GET-by-id and v2.entity.registered event. Packageable.", + "inventory": "Packageable via GET-by-id; no webhook events.", + "reference": "Packageable; has its own create/update webhook events.", + "metadata": "GET works but low value to package as a record.", + "not_packageable": "No read API exists; keep the webURL as a reference only.", + "uncertain": "Endpoint depends on tenant API version (e.g. v2-alpha); verify before relying.", + "external": "Plain http(s) hyperlink ('link' type); no Benchling ID.", + "unknown": "Type not in the known enum; surfaced rather than dropped." + }, + "packageable_categories": ["entity", "inventory", "reference"], + "link_types": [ + {"type": "custom_entity", "category": "entity", "packageable": true, "id_prefix": "bfi_", "get_endpoint": "GET /custom-entities/{id}", "webhook_events": ["v2.entity.registered"]}, + {"type": "dna_sequence", "category": "entity", "packageable": true, "id_prefix": "seq_", "get_endpoint": "GET /dna-sequences/{id}", "webhook_events": ["v2.entity.registered"]}, + {"type": "aa_sequence", "category": "entity", "packageable": true, "id_prefix": "prtn_", "get_endpoint": "GET /aa-sequences/{id}", "webhook_events": ["v2.entity.registered"]}, + {"type": "batch", "category": "entity", "packageable": true, "id_prefix": "bat_", "get_endpoint": "GET /batches/{id}", "webhook_events": ["v2.entity.registered"]}, + {"type": "container", "category": "inventory", "packageable": true, "id_prefix": "con_", "get_endpoint": "GET /containers/{id}", "webhook_events": []}, + {"type": "box", "category": "inventory", "packageable": true, "id_prefix": "box_", "get_endpoint": "GET /boxes/{id}", "webhook_events": []}, + {"type": "plate", "category": "inventory", "packageable": true, "id_prefix": "plt_", "get_endpoint": "GET /plates/{id}", "webhook_events": []}, + {"type": "location", "category": "inventory", "packageable": true, "id_prefix": "loc_", "get_endpoint": "GET /locations/{id}", "webhook_events": [], "notes": "Locations do not have a webURL."}, + {"type": "entry", "category": "reference", "packageable": true, "id_prefix": "etr_", "get_endpoint": "GET /entries/{id}", "webhook_events": ["v2.entry.created", "v2.entry.updated.fields", "v2.entry.updated.reviewRecord"]}, + {"type": "request", "category": "reference", "packageable": true, "id_prefix": "req_", "get_endpoint": "GET /requests/{id}", "webhook_events": ["v2.request.created", "v2.request.updated.fields", "v2.request.updated.status"]}, + {"type": "workflow", "category": "reference", "packageable": true, "id_prefix": "wfgrp_", "get_endpoint": "GET /workflow-task-groups/{id}", "webhook_events": ["v2.workflowTaskGroup.created", "v2.workflowTaskGroup.mappingCompleted", "v2.workflowTaskGroup.updated.watchers"]}, + {"type": "stage_entry", "category": "uncertain", "packageable": false, "id_prefix": "etr_", "get_endpoint": "v2-alpha only", "webhook_events": ["v2-alpha.stageEntry.*"], "notes": "Verify against the deployed tenant's API version before relying on it."}, + {"type": "protocol", "category": "not_packageable", "packageable": false, "id_prefix": null, "get_endpoint": null, "webhook_events": [], "notes": "Linkable but no GET-by-id endpoint in the stable spec."}, + {"type": "user", "category": "metadata", "packageable": false, "id_prefix": "ent_", "get_endpoint": "GET /users/{id}", "webhook_events": [], "notes": "Metadata only."}, + {"type": "folder", "category": "metadata", "packageable": false, "id_prefix": "lib_", "get_endpoint": "GET /folders/{id}", "webhook_events": [], "notes": "Legacy; metadata only."}, + {"type": "sql_dashboard", "category": "not_packageable", "packageable": false, "id_prefix": "axdash_", "get_endpoint": null, "webhook_events": [], "notes": "No read API; Insights/analytics artifact reachable only via the Warehouse. Keep webURL deep link."}, + {"type": "insights_dashboard", "category": "not_packageable", "packageable": false, "id_prefix": "axdash_", "get_endpoint": null, "webhook_events": [], "notes": "Legacy alias of sql_dashboard; no read API."}, + {"type": "link", "category": "external", "packageable": false, "id_prefix": null, "get_endpoint": null, "webhook_events": [], "notes": "Plain external hyperlink; id omitted, webURL is the literal URL."} + ], + "not_inline_linkable": { + "$comment": "Resources that are NOT EntryLink types -- they cannot appear in links[]. They reach an entry via structured note parts / inventory tables instead.", + "types": ["dna_oligo", "rna_oligo", "mixture", "assay_run", "assay_result", "workflow_task", "workflow_output"] + } +} From 26bbfb115ffe8c5c14f32b15ffd99f37de29c136 Mon Sep 17 00:00:00 2001 From: "Dr. Ernie Prabhakar" Date: Mon, 15 Jun 2026 10:45:21 -0700 Subject: [PATCH 03/11] feat: write references.json into each package; address review When packaging an entry, write a references.json alongside entry.json listing the Benchling objects the entry points at (entities, classified links, results tables), discovered from the entry's note links and fields. No records are fetched -- discovery only. - entry_references.summarize_references(entry): JSON-serializable payload ({schema_version, entities, links, results_tables}); REFERENCES_SCHEMA_VERSION. - entry_packager._create_metadata_files: emit references.json + document it in the package README. Review fixes (Greptile + Copilot): - Drop empty-string entity IDs in _field_value_ids, matching the note-link guard. - Narrow RESULTS_TABLE_NOTE_TYPES to {"results_table"} -- the only type carrying assayResultSchemaId; avoids latently capturing generic/registration tables. - Modernize typing (dict/list/tuple) on the 3.11+ codebase. - Remove committed spec/entry-link-types.json (relocated to the project's scripts/ as a research artifact) and its drift-guard test. Full suite green (437). Refs #143 #389 #68 Co-Authored-By: Claude Opus 4.8 (1M context) --- docker/src/entry_packager.py | 15 ++++- docker/src/entry_references.py | 93 +++++++++++++++++++-------- docker/tests/test_entry_packager.py | 40 ++++++++++++ docker/tests/test_entry_references.py | 52 ++++++++------- spec/entry-link-types.json | 38 ----------- 5 files changed, 149 insertions(+), 89 deletions(-) delete mode 100644 spec/entry-link-types.json diff --git a/docker/src/entry_packager.py b/docker/src/entry_packager.py index 3fc18d83..81d6a779 100644 --- a/docker/src/entry_packager.py +++ b/docker/src/entry_packager.py @@ -20,6 +20,7 @@ from .auth import RoleManager from .config import get_config +from .entry_references import summarize_references from .payload import Payload from .retry_utils import LAMBDA_INVOKE_RETRY, REST_API_RETRY @@ -684,13 +685,20 @@ def _create_metadata_files( """ for file_info in uploaded_files: - if file_info["filename"] not in ["entry.json", "entry_data.json", "input.json", "README.md"]: + if file_info["filename"] not in [ + "entry.json", + "entry_data.json", + "input.json", + "references.json", + "README.md", + ]: readme_content += f"- `{file_info['filename']}` ({file_info['size']} bytes)\n" readme_content += """ ## Metadata Files - `entry.json`: Key entry metadata (display_id, name, creator, authors, timestamps) - `entry_data.json`: Complete entry data from Benchling API +- `references.json`: Benchling objects this entry links to (entities, inventory, tables) - `input.json`: Export processing metadata - `README.md`: This documentation file @@ -699,9 +707,14 @@ def _create_metadata_files( For questions about the data, refer to the original Benchling entry. """ + # references.json - the entities/resources this entry points at, discovered + # from the entry's note links and fields (no Benchling records are fetched). + references_json = summarize_references(entry_data) + return { "entry.json": entry_json, "entry_data.json": entry_data, + "references.json": references_json, "input.json": input_json, "README.md": readme_content, } diff --git a/docker/src/entry_references.py b/docker/src/entry_references.py index d0e989d4..e503e613 100644 --- a/docker/src/entry_references.py +++ b/docker/src/entry_references.py @@ -22,7 +22,7 @@ from dataclasses import dataclass from enum import Enum -from typing import Any, Dict, Iterator, List, Optional, Tuple +from typing import Any, Iterator, Optional class LinkCategory(str, Enum): @@ -40,7 +40,7 @@ class LinkCategory(str, Enum): # EntryLink.type -> category. Covers all 18 enum tokens from test/openapi.yaml. # Unknown/future tokens fall through to LinkCategory.UNKNOWN via classify_link_type. -LINK_TYPE_CATEGORY: Dict[str, LinkCategory] = { +LINK_TYPE_CATEGORY: dict[str, LinkCategory] = { # entities (eventable via v2.entity.registered) "custom_entity": LinkCategory.ENTITY, "dna_sequence": LinkCategory.ENTITY, @@ -78,14 +78,11 @@ class LinkCategory(str, Enum): # Linkable entity types (subset of LINK_TYPE_CATEGORY that are LinkCategory.ENTITY). ENTITY_LINK_TYPES = frozenset(t for t, cat in LINK_TYPE_CATEGORY.items() if cat is LinkCategory.ENTITY) -# Note ``type`` values that carry tabular assay results. -RESULTS_TABLE_NOTE_TYPES = frozenset( - { - "results_table", - "registration_table", - "table", - } -) +# Note ``type`` value that carries tabular assay results. Kept as a set for +# extensibility, but scoped to ``results_table`` only: that is the note type that +# carries an ``assayResultSchemaId`` (#68/#69). A generic ``table`` or a +# ``registration_table`` is a different mechanism and must not be swept in here. +RESULTS_TABLE_NOTE_TYPES = frozenset({"results_table"}) @dataclass(frozen=True) @@ -127,7 +124,7 @@ class ResultsTableReference: name: Optional[str] = None -def _iter_notes(entry_data: Dict[str, Any]) -> Iterator[Dict[str, Any]]: +def _iter_notes(entry_data: dict[str, Any]) -> Iterator[dict[str, Any]]: """Yield every note across all days, defensively skipping malformed shapes.""" for day in entry_data.get("days") or []: if not isinstance(day, dict): @@ -137,7 +134,7 @@ def _iter_notes(entry_data: Dict[str, Any]) -> Iterator[Dict[str, Any]]: yield note -def _iter_fields(entry_data: Dict[str, Any]) -> Iterator[Tuple[Optional[str], Dict[str, Any]]]: +def _iter_fields(entry_data: dict[str, Any]) -> Iterator[tuple[Optional[str], dict[str, Any]]]: """Yield ``(name, field)`` pairs. Benchling renders entry ``fields`` as a name-keyed dict; some payloads use a @@ -154,17 +151,20 @@ def _iter_fields(entry_data: Dict[str, Any]) -> Iterator[Tuple[Optional[str], Di yield fval.get("name"), fval -def _field_value_ids(fval: Dict[str, Any]) -> List[str]: - """Pull entity ID(s) out of a field value (single value or ``isMulti`` list).""" +def _field_value_ids(fval: dict[str, Any]) -> list[str]: + """Pull entity ID(s) out of a field value (single value or ``isMulti`` list). + + Empty strings are dropped, mirroring the ``if not link_id`` guard on note links. + """ val = fval.get("value") if isinstance(val, str): - return [val] + return [val] if val else [] if isinstance(val, list): - return [v for v in val if isinstance(v, str)] + return [v for v in val if isinstance(v, str) and v] return [] -def _link_web_url(link: Dict[str, Any]) -> Optional[str]: +def _link_web_url(link: dict[str, Any]) -> Optional[str]: return link.get("webURL") or link.get("web_url") @@ -178,13 +178,13 @@ def classify_link_type(link_type: Optional[str]) -> LinkCategory: return LINK_TYPE_CATEGORY.get(link_type, LinkCategory.UNKNOWN) -def extract_note_links(entry_data: Dict[str, Any]) -> List[Dict[str, Any]]: +def extract_note_links(entry_data: dict[str, Any]) -> list[dict[str, Any]]: """Return every link object across all note bodies, unfiltered and untyped. Lowest-level primitive; most callers want :func:`classify_links` or :func:`extract_entity_references`. """ - links: List[Dict[str, Any]] = [] + links: list[dict[str, Any]] = [] for note in _iter_notes(entry_data): for link in note.get("links") or []: if isinstance(link, dict): @@ -192,7 +192,7 @@ def extract_note_links(entry_data: Dict[str, Any]) -> List[Dict[str, Any]]: return links -def classify_links(entry_data: Dict[str, Any]) -> List[LinkRef]: +def classify_links(entry_data: dict[str, Any]) -> list[LinkRef]: """Return every note link, classified by category and deduped. Surfaces the *full* set of objects an entry points at -- entities, inventory, @@ -201,7 +201,7 @@ def classify_links(entry_data: Dict[str, Any]) -> List[LinkRef]: Deduped by Benchling ID when present, else by URL; first-seen order preserved. """ seen: set[str] = set() - refs: List[LinkRef] = [] + refs: list[LinkRef] = [] for link in extract_note_links(entry_data): link_type = link.get("type") link_id = link.get("id") @@ -223,10 +223,10 @@ def classify_links(entry_data: Dict[str, Any]) -> List[LinkRef]: def extract_entity_references( - entry_data: Dict[str, Any], + entry_data: dict[str, Any], *, types: "frozenset[str] | set[str]" = ENTITY_LINK_TYPES, -) -> List[EntityReference]: +) -> list[EntityReference]: """Return deduped entity references from note links and entity-link fields. Note links are filtered to ``types`` (default: all linkable entity types). @@ -235,7 +235,7 @@ def extract_entity_references( ID, preserving first-seen order (note links before fields). """ seen: set[str] = set() - refs: List[EntityReference] = [] + refs: list[EntityReference] = [] for link in extract_note_links(entry_data): link_id = link.get("id") @@ -265,14 +265,14 @@ def extract_entity_references( return refs -def extract_results_tables(entry_data: Dict[str, Any]) -> List[ResultsTableReference]: +def extract_results_tables(entry_data: dict[str, Any]) -> list[ResultsTableReference]: """Return deduped assay-results-table references from entry notes. Only notes whose ``type`` is a results-table type *and* that carry an ``assayResultSchemaId`` are returned. Deduped by ``(api_id, schema_id)``. """ - seen: set[Tuple[Optional[str], str]] = set() - tables: List[ResultsTableReference] = [] + seen: set[tuple[Optional[str], str]] = set() + tables: list[ResultsTableReference] = [] for note in _iter_notes(entry_data): if note.get("type") not in RESULTS_TABLE_NOTE_TYPES: continue @@ -292,3 +292,42 @@ def extract_results_tables(entry_data: Dict[str, Any]) -> List[ResultsTableRefer ) ) return tables + + +# Bump when the references.json shape changes in a way consumers must notice. +REFERENCES_SCHEMA_VERSION = 1 + + +def summarize_references(entry_data: dict[str, Any]) -> dict[str, Any]: + """Build a JSON-serializable summary of everything an entry points at. + + Intended to be written into the package alongside ``entry.json`` (as + ``references.json``) so downstream consumers see the discovered objects + without re-parsing the raw entry. Records discovery only -- no Benchling + records are fetched here. + """ + return { + "schema_version": REFERENCES_SCHEMA_VERSION, + "entities": [ + {"id": e.id, "type": e.type, "web_url": e.web_url, "source": e.source} + for e in extract_entity_references(entry_data) + ], + "links": [ + { + "id": link.id, + "type": link.type, + "category": link.category.value, + "web_url": link.web_url, + "packageable": link.is_packageable, + } + for link in classify_links(entry_data) + ], + "results_tables": [ + { + "assay_result_schema_id": t.assay_result_schema_id, + "api_id": t.api_id, + "name": t.name, + } + for t in extract_results_tables(entry_data) + ], + } diff --git a/docker/tests/test_entry_packager.py b/docker/tests/test_entry_packager.py index f9c9187f..a00ed595 100644 --- a/docker/tests/test_entry_packager.py +++ b/docker/tests/test_entry_packager.py @@ -648,6 +648,46 @@ def test_create_metadata_files_dict_format(self, orchestrator): assert "filename" not in entry_json["files"]["file2.csv"] assert "filename" not in entry_json["files"]["data.json"] + def test_create_metadata_files_includes_references_json(self, orchestrator): + """references.json captures the entities/resources the entry links to.""" + entry_data = { + "id": "etr_123", + "display_id": "EXP-001", + "name": "Test Entry", + "web_url": "https://demo.benchling.com/entry/etr_123", + "created_at": "2025-10-01T10:00:00Z", + "modified_at": "2025-10-02T10:00:00Z", + "days": [ + { + "notes": [ + { + "type": "text", + "links": [ + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"id": "axdash_1", "type": "sql_dashboard", "webURL": "u2"}, + ], + } + ] + } + ], + } + + result = orchestrator._create_metadata_files( + package_name="benchling/EXP-001", + entry_id="etr_123", + timestamp="2025-10-02T10:00:00Z", + base_url="https://demo.benchling.com", + webhook_data={}, + uploaded_files=[], + download_url="https://example.com/export.zip", + entry_data=entry_data, + ) + + refs = result["references.json"] + assert refs["schema_version"] == 1 + assert [e["id"] for e in refs["entities"]] == ["bfi_1"] # dashboard filtered out + assert {link["type"] for link in refs["links"]} == {"custom_entity", "sql_dashboard"} + def test_create_metadata_files_includes_canvas_id_when_present(self, orchestrator): """Test entry.json stores canvas_id for canvas-initiated exports.""" result = orchestrator._create_metadata_files( diff --git a/docker/tests/test_entry_references.py b/docker/tests/test_entry_references.py index 7861aed0..d04acadb 100644 --- a/docker/tests/test_entry_references.py +++ b/docker/tests/test_entry_references.py @@ -1,7 +1,6 @@ """Tests for entry_references extractor (shared discovery for #143 + #68/#69).""" import json -from pathlib import Path from src.entry_references import ( ENTITY_LINK_TYPES, @@ -16,6 +15,7 @@ extract_entity_references, extract_note_links, extract_results_tables, + summarize_references, ) @@ -240,25 +240,31 @@ def test_packageable_categories_membership(self): } -class TestEntryLinkTypesJson: - """Guard spec/entry-link-types.json against drift from the module.""" - - def _load(self): - path = Path(__file__).resolve().parents[2] / "spec" / "entry-link-types.json" - return json.loads(path.read_text()) - - def test_json_covers_same_types(self): - ref = self._load() - json_types = {row["type"] for row in ref["link_types"]} - assert json_types == set(LINK_TYPE_CATEGORY) - - def test_json_categories_match_module(self): - ref = self._load() - for row in ref["link_types"]: - module_cat = LINK_TYPE_CATEGORY[row["type"]] - assert row["category"] == module_cat.value, row["type"] - assert row["packageable"] == (module_cat in PACKAGEABLE_CATEGORIES), row["type"] - - def test_json_packageable_categories_match_module(self): - ref = self._load() - assert set(ref["packageable_categories"]) == {c.value for c in PACKAGEABLE_CATEGORIES} +class TestSummarizeReferences: + def test_serializable_summary_of_all_discovered(self): + entry = _entry( + _link_note( + {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + {"id": "axdash_1", "type": "sql_dashboard", "webURL": "u2"}, + ), + {"type": "results_table", "apiId": "tbl_1", "assayResultSchemaId": "assaysch_1", "name": "T1"}, + fields={"Cell Line": {"type": "entity_link", "value": "seq_field"}}, + ) + summary = summarize_references(entry) + # JSON-serializable end to end (categories are plain strings, etc.) + assert json.loads(json.dumps(summary)) == summary + assert summary["schema_version"] == 1 + assert [e["id"] for e in summary["entities"]] == ["bfi_1", "seq_field"] + assert {link["type"] for link in summary["links"]} == {"custom_entity", "sql_dashboard"} + assert summary["links"][0]["category"] == "entity" + assert summary["links"][1]["packageable"] is False + assert summary["results_tables"] == [{"assay_result_schema_id": "assaysch_1", "api_id": "tbl_1", "name": "T1"}] + + def test_empty_entry_yields_empty_arrays(self): + summary = summarize_references({}) + assert summary == { + "schema_version": 1, + "entities": [], + "links": [], + "results_tables": [], + } diff --git a/spec/entry-link-types.json b/spec/entry-link-types.json deleted file mode 100644 index 7200af4b..00000000 --- a/spec/entry-link-types.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "$comment": "Reference map of Benchling EntryLink.type tokens -> how the packager treats them. Source of truth for `type` enum: test/openapi.yaml (EntryLink schema, 18 tokens). Categories mirror docker/src/entry_references.py (LINK_TYPE_CATEGORY); test_entry_references.py asserts they agree. Endpoints / id prefixes / events per issue #389. See issues #143, #389, #68.", - "categories": { - "entity": "Registry entity; GET-by-id and v2.entity.registered event. Packageable.", - "inventory": "Packageable via GET-by-id; no webhook events.", - "reference": "Packageable; has its own create/update webhook events.", - "metadata": "GET works but low value to package as a record.", - "not_packageable": "No read API exists; keep the webURL as a reference only.", - "uncertain": "Endpoint depends on tenant API version (e.g. v2-alpha); verify before relying.", - "external": "Plain http(s) hyperlink ('link' type); no Benchling ID.", - "unknown": "Type not in the known enum; surfaced rather than dropped." - }, - "packageable_categories": ["entity", "inventory", "reference"], - "link_types": [ - {"type": "custom_entity", "category": "entity", "packageable": true, "id_prefix": "bfi_", "get_endpoint": "GET /custom-entities/{id}", "webhook_events": ["v2.entity.registered"]}, - {"type": "dna_sequence", "category": "entity", "packageable": true, "id_prefix": "seq_", "get_endpoint": "GET /dna-sequences/{id}", "webhook_events": ["v2.entity.registered"]}, - {"type": "aa_sequence", "category": "entity", "packageable": true, "id_prefix": "prtn_", "get_endpoint": "GET /aa-sequences/{id}", "webhook_events": ["v2.entity.registered"]}, - {"type": "batch", "category": "entity", "packageable": true, "id_prefix": "bat_", "get_endpoint": "GET /batches/{id}", "webhook_events": ["v2.entity.registered"]}, - {"type": "container", "category": "inventory", "packageable": true, "id_prefix": "con_", "get_endpoint": "GET /containers/{id}", "webhook_events": []}, - {"type": "box", "category": "inventory", "packageable": true, "id_prefix": "box_", "get_endpoint": "GET /boxes/{id}", "webhook_events": []}, - {"type": "plate", "category": "inventory", "packageable": true, "id_prefix": "plt_", "get_endpoint": "GET /plates/{id}", "webhook_events": []}, - {"type": "location", "category": "inventory", "packageable": true, "id_prefix": "loc_", "get_endpoint": "GET /locations/{id}", "webhook_events": [], "notes": "Locations do not have a webURL."}, - {"type": "entry", "category": "reference", "packageable": true, "id_prefix": "etr_", "get_endpoint": "GET /entries/{id}", "webhook_events": ["v2.entry.created", "v2.entry.updated.fields", "v2.entry.updated.reviewRecord"]}, - {"type": "request", "category": "reference", "packageable": true, "id_prefix": "req_", "get_endpoint": "GET /requests/{id}", "webhook_events": ["v2.request.created", "v2.request.updated.fields", "v2.request.updated.status"]}, - {"type": "workflow", "category": "reference", "packageable": true, "id_prefix": "wfgrp_", "get_endpoint": "GET /workflow-task-groups/{id}", "webhook_events": ["v2.workflowTaskGroup.created", "v2.workflowTaskGroup.mappingCompleted", "v2.workflowTaskGroup.updated.watchers"]}, - {"type": "stage_entry", "category": "uncertain", "packageable": false, "id_prefix": "etr_", "get_endpoint": "v2-alpha only", "webhook_events": ["v2-alpha.stageEntry.*"], "notes": "Verify against the deployed tenant's API version before relying on it."}, - {"type": "protocol", "category": "not_packageable", "packageable": false, "id_prefix": null, "get_endpoint": null, "webhook_events": [], "notes": "Linkable but no GET-by-id endpoint in the stable spec."}, - {"type": "user", "category": "metadata", "packageable": false, "id_prefix": "ent_", "get_endpoint": "GET /users/{id}", "webhook_events": [], "notes": "Metadata only."}, - {"type": "folder", "category": "metadata", "packageable": false, "id_prefix": "lib_", "get_endpoint": "GET /folders/{id}", "webhook_events": [], "notes": "Legacy; metadata only."}, - {"type": "sql_dashboard", "category": "not_packageable", "packageable": false, "id_prefix": "axdash_", "get_endpoint": null, "webhook_events": [], "notes": "No read API; Insights/analytics artifact reachable only via the Warehouse. Keep webURL deep link."}, - {"type": "insights_dashboard", "category": "not_packageable", "packageable": false, "id_prefix": "axdash_", "get_endpoint": null, "webhook_events": [], "notes": "Legacy alias of sql_dashboard; no read API."}, - {"type": "link", "category": "external", "packageable": false, "id_prefix": null, "get_endpoint": null, "webhook_events": [], "notes": "Plain external hyperlink; id omitted, webURL is the literal URL."} - ], - "not_inline_linkable": { - "$comment": "Resources that are NOT EntryLink types -- they cannot appear in links[]. They reach an entry via structured note parts / inventory tables instead.", - "types": ["dna_oligo", "rna_oligo", "mixture", "assay_run", "assay_result", "workflow_task", "workflow_output"] - } -} From 9136f51ffe67f1adeff5bd0687907790bb6bed4d Mon Sep 17 00:00:00 2001 From: "Dr. Ernie Prabhakar" Date: Mon, 15 Jun 2026 10:51:37 -0700 Subject: [PATCH 04/11] test: regression test for empty-string entity-link field values Confirms _field_value_ids drops empty strings (single + isMulti list), matching the note-link guard. Requested in review. Refs #143 Co-Authored-By: Claude Opus 4.8 (1M context) --- docker/tests/test_entry_references.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docker/tests/test_entry_references.py b/docker/tests/test_entry_references.py index d04acadb..5c854302 100644 --- a/docker/tests/test_entry_references.py +++ b/docker/tests/test_entry_references.py @@ -116,6 +116,17 @@ def test_fields_as_list_shape(self): refs = extract_entity_references(entry) assert [r.id for r in refs] == ["bfi_x"] + def test_empty_string_field_values_ignored(self): + # Mirrors the `if not link_id` guard on note links: no EntityReference(id=""). + entry = _entry( + fields={ + "Empty": {"type": "entity_link", "value": ""}, + "EmptyMulti": {"type": "entity_link", "value": ["", "bfi_ok", ""]}, + } + ) + refs = extract_entity_references(entry) + assert [r.id for r in refs] == ["bfi_ok"] + class TestExtractResultsTables: def test_returns_tables_with_schema_id(self): From c6e5c3d48df52352dbbee67cba4de65c897c945e Mon Sep 17 00:00:00 2001 From: "Dr. Ernie Prabhakar" Date: Mon, 15 Jun 2026 11:01:16 -0700 Subject: [PATCH 05/11] refactor: replace overloaded `packageable` with fetchable + eventable + disposition `packageable` conflated two questions: can a record be fetched, vs. should it live in its own package or nested in the entry. Split into orthogonal axes and a derived disposition. - FETCHABLE_CATEGORIES (GET-by-id exists) + EVENTABLE_CATEGORIES (own webhooks, can arrive independent of an entry) + CATEGORY_DISPOSITION. - LinkRef.is_fetchable / is_eventable / disposition (replaces is_packageable). - references.json links now carry {category, fetchable, eventable, disposition}. disposition makes explicit that nest-vs-standalone is a genuine product decision ONLY for entities (fetchable AND eventable -> nest_or_standalone). Non-entities are forced: inventory -> nest (no events); entry/request/workflow -> link (own package); metadata -> pointer; dashboards/external -> skip. Project artifact scripts/entry-link-types.json updated to match (not in repo). Refs #143 #389 Co-Authored-By: Claude Opus 4.8 (1M context) --- docker/src/entry_references.py | 58 ++++++++++++++++++++++++--- docker/tests/test_entry_references.py | 40 ++++++++++++++---- 2 files changed, 85 insertions(+), 13 deletions(-) diff --git a/docker/src/entry_references.py b/docker/src/entry_references.py index e503e613..7b3887df 100644 --- a/docker/src/entry_references.py +++ b/docker/src/entry_references.py @@ -26,7 +26,14 @@ class LinkCategory(str, Enum): - """How a note-link type relates to packaging (per #389 conclusions).""" + """How a note-link type relates to packaging (per #389). + + Two orthogonal axes drive what we do with a referenced object (see + FETCHABLE_CATEGORIES / EVENTABLE_CATEGORIES and CATEGORY_DISPOSITION): + - fetchable: a record can be retrieved via GET-by-id. + - eventable: it emits its own create/update webhooks, so it can arrive + independent of an entry (a candidate for its own package, #143 behavior 2). + """ ENTITY = "entity" # registry entity; GET-by-id + v2.entity.registered event INVENTORY = "inventory" # packageable via GET-by-id; no webhook events @@ -73,7 +80,33 @@ class LinkCategory(str, Enum): # via structured note parts / inventory tables, not links[]. # Categories whose resources can be fetched as a record via GET-by-id. -PACKAGEABLE_CATEGORIES = frozenset({LinkCategory.ENTITY, LinkCategory.INVENTORY, LinkCategory.REFERENCE}) +FETCHABLE_CATEGORIES = frozenset({LinkCategory.ENTITY, LinkCategory.INVENTORY, LinkCategory.REFERENCE}) + +# Categories that also emit their own create/update webhooks, so they can arrive +# independent of an entry (candidates for their own package, #143 behavior 2). +EVENTABLE_CATEGORIES = frozenset({LinkCategory.ENTITY, LinkCategory.REFERENCE}) + +# What to do with a referenced object when packaging the entry that links to it. +# "packageable" alone was ambiguous -- fetchable-as-a-record vs. worthy-of-its-own +# package. disposition is the actionable answer, derived from fetchable + eventable: +# nest_or_standalone -- entity: fetchable AND eventable. Nest inside the entry +# and/or package standalone -- the open #143 decision. +# nest -- fetchable, no events: only capturable via an entry. +# link -- fetchable primary object with its own events: reference +# its own package, don't embed a copy. +# pointer -- fetchable but low value: keep id + webURL only. +# verify -- endpoint depends on tenant API version / unknown type. +# skip -- nothing to fetch; the entry already keeps the webURL. +CATEGORY_DISPOSITION: dict[LinkCategory, str] = { + LinkCategory.ENTITY: "nest_or_standalone", + LinkCategory.INVENTORY: "nest", + LinkCategory.REFERENCE: "link", + LinkCategory.METADATA: "pointer", + LinkCategory.NOT_PACKAGEABLE: "skip", + LinkCategory.UNCERTAIN: "verify", + LinkCategory.EXTERNAL: "skip", + LinkCategory.UNKNOWN: "verify", +} # Linkable entity types (subset of LINK_TYPE_CATEGORY that are LinkCategory.ENTITY). ENTITY_LINK_TYPES = frozenset(t for t, cat in LINK_TYPE_CATEGORY.items() if cat is LinkCategory.ENTITY) @@ -96,8 +129,19 @@ class LinkRef: web_url: Optional[str] = None @property - def is_packageable(self) -> bool: - return self.category in PACKAGEABLE_CATEGORIES + def is_fetchable(self) -> bool: + """Whether a record can be retrieved via GET-by-id.""" + return self.category in FETCHABLE_CATEGORIES + + @property + def is_eventable(self) -> bool: + """Whether it emits its own webhooks (can arrive independent of an entry).""" + return self.category in EVENTABLE_CATEGORIES + + @property + def disposition(self) -> str: + """How to treat this reference when packaging the entry (see CATEGORY_DISPOSITION).""" + return CATEGORY_DISPOSITION[self.category] @dataclass(frozen=True) @@ -197,7 +241,7 @@ def classify_links(entry_data: dict[str, Any]) -> list[LinkRef]: Surfaces the *full* set of objects an entry points at -- entities, inventory, references, metadata pointers, dashboards, and external URLs -- so callers can - decide what to fetch (e.g. ``[r for r in classify_links(e) if r.is_packageable]``). + decide what to fetch (e.g. ``[r for r in classify_links(e) if r.is_fetchable]``). Deduped by Benchling ID when present, else by URL; first-seen order preserved. """ seen: set[str] = set() @@ -318,7 +362,9 @@ def summarize_references(entry_data: dict[str, Any]) -> dict[str, Any]: "type": link.type, "category": link.category.value, "web_url": link.web_url, - "packageable": link.is_packageable, + "fetchable": link.is_fetchable, + "eventable": link.is_eventable, + "disposition": link.disposition, } for link in classify_links(entry_data) ], diff --git a/docker/tests/test_entry_references.py b/docker/tests/test_entry_references.py index 5c854302..74fef581 100644 --- a/docker/tests/test_entry_references.py +++ b/docker/tests/test_entry_references.py @@ -4,8 +4,9 @@ from src.entry_references import ( ENTITY_LINK_TYPES, + EVENTABLE_CATEGORIES, + FETCHABLE_CATEGORIES, LINK_TYPE_CATEGORY, - PACKAGEABLE_CATEGORIES, EntityReference, LinkCategory, LinkRef, @@ -217,7 +218,7 @@ def test_surfaces_all_types_classified(self): LinkRef(type="link", category=LinkCategory.EXTERNAL, id=None, web_url="https://example.com"), ] - def test_is_packageable_filter(self): + def test_is_fetchable_filter(self): entry = _entry( _link_note( {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, @@ -225,8 +226,28 @@ def test_is_packageable_filter(self): {"type": "link", "webURL": "https://example.com"}, ) ) - packageable = [r.id for r in classify_links(entry) if r.is_packageable] - assert packageable == ["bfi_1"] + fetchable = [r.id for r in classify_links(entry) if r.is_fetchable] + assert fetchable == ["bfi_1"] + + def test_disposition_and_eventable_by_category(self): + def link_of(t, i): + return _link_note({"id": i, "type": t, "webURL": "u"}) + + entry = _entry( + link_of("custom_entity", "bfi_1"), # entity + link_of("container", "con_1"), # inventory + link_of("entry", "etr_1"), # reference + link_of("user", "ent_1"), # metadata + link_of("sql_dashboard", "axdash_1"), # not_packageable + ) + by_id = {r.id: r for r in classify_links(entry)} + assert by_id["bfi_1"].disposition == "nest_or_standalone" + assert by_id["bfi_1"].is_eventable is True + assert by_id["con_1"].disposition == "nest" + assert by_id["con_1"].is_eventable is False # inventory: nest-only + assert by_id["etr_1"].disposition == "link" + assert by_id["ent_1"].disposition == "pointer" + assert by_id["axdash_1"].disposition == "skip" def test_dedupes_by_id_then_url(self): entry = _entry( @@ -243,12 +264,13 @@ def test_dedupes_by_id_then_url(self): ("link", "https://dup.com"), ] - def test_packageable_categories_membership(self): - assert PACKAGEABLE_CATEGORIES == { + def test_fetchable_and_eventable_category_membership(self): + assert FETCHABLE_CATEGORIES == { LinkCategory.ENTITY, LinkCategory.INVENTORY, LinkCategory.REFERENCE, } + assert EVENTABLE_CATEGORIES == {LinkCategory.ENTITY, LinkCategory.REFERENCE} class TestSummarizeReferences: @@ -268,7 +290,11 @@ def test_serializable_summary_of_all_discovered(self): assert [e["id"] for e in summary["entities"]] == ["bfi_1", "seq_field"] assert {link["type"] for link in summary["links"]} == {"custom_entity", "sql_dashboard"} assert summary["links"][0]["category"] == "entity" - assert summary["links"][1]["packageable"] is False + assert summary["links"][0]["disposition"] == "nest_or_standalone" + assert summary["links"][0]["fetchable"] is True + assert summary["links"][0]["eventable"] is True + assert summary["links"][1]["disposition"] == "skip" + assert summary["links"][1]["fetchable"] is False assert summary["results_tables"] == [{"assay_result_schema_id": "assaysch_1", "api_id": "tbl_1", "name": "T1"}] def test_empty_entry_yields_empty_arrays(self): From f2af5e33a3cf816ab4ae6e2513c5662807eb7264 Mon Sep 17 00:00:00 2001 From: "Dr. Ernie Prabhakar" Date: Mon, 15 Jun 2026 12:02:13 -0700 Subject: [PATCH 06/11] feat: searchable links metadata + raw links.json (entity name enrichment) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Promote a curated `links` array into entry.json (the package's metadata_uri) so packages are searchable by the human-readable name of the entities/objects an entry references — the top-line use case from the 2026-06-15 call ("show me all experiments where QB-2743.1 was used"). Each curated link carries four fields, each with one job: - type, id — free; id supports downstream linking - name — authoritative Benchling display name via best-effort GET-by-id, or null when the lookup fails/isn't supported (never a slug) - slug — lossy token parsed from the webURL, for eyeballing/debugging only Verified the human name is NOT recoverable from the webURL: the trailing segment is a lowercased, punctuation-flattened slug (sBN000 -> sbn000), so the exact name must come from the API. Name resolution is best-effort and never raises — it requires the app to be a registry/project collaborator. Also rename references.json -> links.json and reduce it to raw facts only (id/type/web_url + entities + results_tables). Derived classifications (category/fetchable/eventable/disposition) are no longer persisted; they are recomputed from type in code, so the raw archive stays reprocessable and a future classification change needs no re-fetch. Schema bumped to v2. Refs #143 #389 Co-Authored-By: Claude Opus 4.8 (1M context) --- docker/src/entry_packager.py | 77 ++++++++++++++++++++++--- docker/src/entry_references.py | 82 ++++++++++++++++++++------- docker/tests/test_entry_packager.py | 29 +++++++--- docker/tests/test_entry_references.py | 73 +++++++++++++++++++++--- 4 files changed, 218 insertions(+), 43 deletions(-) diff --git a/docker/src/entry_packager.py b/docker/src/entry_packager.py index 81d6a779..73eb683e 100644 --- a/docker/src/entry_packager.py +++ b/docker/src/entry_packager.py @@ -20,12 +20,29 @@ from .auth import RoleManager from .config import get_config -from .entry_references import summarize_references +from .entry_references import link_metadata, summarize_references from .payload import Payload from .retry_utils import LAMBDA_INVOKE_RETRY, REST_API_RETRY logger = structlog.get_logger(__name__) +# EntryLink ``type`` -> Benchling SDK service attribute used to resolve the +# authoritative human-readable name via GET-by-id. Only types with a stable +# get_by_id are listed (entities first, then inventory and entries -- the types +# John flagged on the 2026-06-15 call); any other type keeps ``name=None`` and +# relies on its slug. Name resolution requires the app to be a registry/project +# collaborator, so every lookup is best-effort (see _enrich_link_names). +LINK_TYPE_TO_SERVICE: Dict[str, str] = { + "custom_entity": "custom_entities", + "dna_sequence": "dna_sequences", + "aa_sequence": "aa_sequences", + "container": "containers", + "box": "boxes", + "plate": "plates", + "location": "locations", + "entry": "entries", +} + class DateTimeEncoder(json.JSONEncoder): """Custom JSON encoder that converts datetime objects to ISO format strings.""" @@ -630,6 +647,13 @@ def _create_metadata_files( if canvas_id is not None: entry_json["canvas_id"] = canvas_id + # links - curated, searchable view of the objects this entry references. + # Promoted into entry.json (the package's metadata_uri) so `links.name` is + # queryable. Names are resolved best-effort against the Benchling API. + links = link_metadata(entry_data) + self._enrich_link_names(links) + entry_json["links"] = links + # input.json - Processing metadata input_json = { "source": "benchling_webhook", @@ -689,7 +713,7 @@ def _create_metadata_files( "entry.json", "entry_data.json", "input.json", - "references.json", + "links.json", "README.md", ]: readme_content += f"- `{file_info['filename']}` ({file_info['size']} bytes)\n" @@ -698,7 +722,7 @@ def _create_metadata_files( ## Metadata Files - `entry.json`: Key entry metadata (display_id, name, creator, authors, timestamps) - `entry_data.json`: Complete entry data from Benchling API -- `references.json`: Benchling objects this entry links to (entities, inventory, tables) +- `links.json`: Raw Benchling objects this entry links to (entities, inventory, tables); the searchable, name-enriched summary is the `links` field of `entry.json` - `input.json`: Export processing metadata - `README.md`: This documentation file @@ -707,18 +731,57 @@ def _create_metadata_files( For questions about the data, refer to the original Benchling entry. """ - # references.json - the entities/resources this entry points at, discovered - # from the entry's note links and fields (no Benchling records are fetched). - references_json = summarize_references(entry_data) + # links.json - raw discovery of the entities/resources this entry points at, + # from the entry's note links and fields (no Benchling records are fetched + # here; inferences are not persisted). The searchable view is entry.json.links. + links_json = summarize_references(entry_data) return { "entry.json": entry_json, "entry_data.json": entry_data, - "references.json": references_json, + "links.json": links_json, "input.json": input_json, "README.md": readme_content, } + def _enrich_link_names(self, links: list[Dict[str, Any]]) -> None: + """Fill each link's ``name`` with its authoritative Benchling display name. + + Mutates ``links`` in place (the curated entries from ``link_metadata``). + Best-effort and never raises: a missing client, an unsupported type, or a + forbidden/failed GET-by-id leaves ``name`` as ``None`` -- the slug stays for + eyeballing, and the slug is never promoted into ``name``. Name resolution + needs the app to be a registry/project collaborator (see AGENTS / setup). + """ + if not self.benchling: + return + for link in links: + service_attr = LINK_TYPE_TO_SERVICE.get(link.get("type") or "") + link_id = link.get("id") + if not service_attr or not link_id: + continue + service = getattr(self.benchling, service_attr, None) + if service is None: + continue + record = None + try: + # ``returning`` trims the payload to just the name where supported. + record = service.get_by_id(link_id, returning=["name"]) + except Exception: + try: + record = service.get_by_id(link_id) + except Exception as exc: + self.logger.debug( + "Link name lookup failed", + link_id=link_id, + link_type=link.get("type"), + error=str(exc), + ) + continue + record_name = getattr(record, "name", None) + if isinstance(record_name, str) and record_name: + link["name"] = record_name + def _load_existing_canvas_id_from_entry_json(self, s3_client: Any, package_name: str) -> Optional[str]: """Read canvas_id from a previously-written ``entry.json``, if any. diff --git a/docker/src/entry_references.py b/docker/src/entry_references.py index 7b3887df..c732c880 100644 --- a/docker/src/entry_references.py +++ b/docker/src/entry_references.py @@ -338,17 +338,70 @@ def extract_results_tables(entry_data: dict[str, Any]) -> list[ResultsTableRefer return tables -# Bump when the references.json shape changes in a way consumers must notice. -REFERENCES_SCHEMA_VERSION = 1 +def slug_from_web_url(web_url: Optional[str], link_id: Optional[str] = None) -> Optional[str]: + """Best-effort, human-ish slug parsed from a Benchling object's webURL. + + Benchling user-facing URLs end in ``.../{id}-{name-slug}/edit``. This returns + the trailing slug -- lowercased, punctuation-flattened, possibly truncated by + Benchling. It is **not** the authoritative display name (case and punctuation + are lost; e.g. ``QB-2743.1`` -> ``qb-2743-1``) and must never be used as one. + For the real name, fetch the record (the caller's job). Returns ``None`` when + no slug can be isolated. + """ + if not web_url or not link_id: + return None + path = web_url.split("?", 1)[0].split("#", 1)[0].rstrip("/") + segments = [p for p in path.split("/") if p and p != "edit"] + # The id-bearing segment is ``{id}-{slug}``; the id appears in either ``_`` or + # ``-`` form across tenants. Match it normalized, then slice off the ``{id}-`` + # prefix. A segment that is exactly the id (no trailing slug) yields None, and + # a URL with no id-bearing segment yields None -- never a stray path fragment. + norm_prefix = f"{link_id}-".replace("_", "-") + for candidate in segments: + if candidate.replace("_", "-").startswith(norm_prefix): + return candidate[len(norm_prefix) :] or None + return None + + +def link_metadata(entry_data: dict[str, Any]) -> list[dict[str, Any]]: + """Curated, searchable view of an entry's links: ``{type, id, name, slug}``. + + One flat entry per classified note link (deduped by :func:`classify_links`), + regardless of type -- the searchable summary promoted into ``entry.json`` so + ``links.name`` is queryable. ``name`` is left ``None`` here: the authoritative + display name needs an API fetch, which is the caller's job (this module is + pure). ``slug`` is the lossy webURL token from :func:`slug_from_web_url` -- + a debugging/eyeball aid only, never a substitute for ``name``. + """ + return [ + { + "type": r.type, + "id": r.id, + "name": None, + "slug": slug_from_web_url(r.web_url, r.id), + } + for r in classify_links(entry_data) + ] -def summarize_references(entry_data: dict[str, Any]) -> dict[str, Any]: - """Build a JSON-serializable summary of everything an entry points at. +# Bump when the links.json shape changes in a way consumers must notice. +# v2: links.json holds raw facts only -- derived classifications (category, +# fetchable, eventable, disposition) are no longer persisted; they are recomputed +# in code at runtime. The searchable, name-enriched view lives in entry.json +# ``links`` (see link_metadata), not here. +REFERENCES_SCHEMA_VERSION = 2 + - Intended to be written into the package alongside ``entry.json`` (as - ``references.json``) so downstream consumers see the discovered objects - without re-parsing the raw entry. Records discovery only -- no Benchling - records are fetched here. +def summarize_references(entry_data: dict[str, Any]) -> dict[str, Any]: + """Build the JSON-serializable raw-discovery payload written as ``links.json``. + + Records *raw facts* about what an entry points at -- ids, types, webURLs -- + so a consumer (or a future, different classification) can reprocess without + re-fetching. Deliberately excludes our inferences (``category``/``fetchable``/ + ``eventable``/``disposition``): those are derived from ``type`` in code, not + frozen to disk. No Benchling records are fetched here. The curated, searchable + view with human-readable names is the ``links`` field of ``entry.json`` (see + :func:`link_metadata`). """ return { "schema_version": REFERENCES_SCHEMA_VERSION, @@ -356,18 +409,7 @@ def summarize_references(entry_data: dict[str, Any]) -> dict[str, Any]: {"id": e.id, "type": e.type, "web_url": e.web_url, "source": e.source} for e in extract_entity_references(entry_data) ], - "links": [ - { - "id": link.id, - "type": link.type, - "category": link.category.value, - "web_url": link.web_url, - "fetchable": link.is_fetchable, - "eventable": link.is_eventable, - "disposition": link.disposition, - } - for link in classify_links(entry_data) - ], + "links": [{"id": link.id, "type": link.type, "web_url": link.web_url} for link in classify_links(entry_data)], "results_tables": [ { "assay_result_schema_id": t.assay_result_schema_id, diff --git a/docker/tests/test_entry_packager.py b/docker/tests/test_entry_packager.py index a00ed595..e2bb38f2 100644 --- a/docker/tests/test_entry_packager.py +++ b/docker/tests/test_entry_packager.py @@ -648,8 +648,8 @@ def test_create_metadata_files_dict_format(self, orchestrator): assert "filename" not in entry_json["files"]["file2.csv"] assert "filename" not in entry_json["files"]["data.json"] - def test_create_metadata_files_includes_references_json(self, orchestrator): - """references.json captures the entities/resources the entry links to.""" + def test_create_metadata_files_includes_links_json(self, orchestrator): + """links.json captures raw discovery; entry.json.links is the searchable view.""" entry_data = { "id": "etr_123", "display_id": "EXP-001", @@ -663,7 +663,11 @@ def test_create_metadata_files_includes_references_json(self, orchestrator): { "type": "text", "links": [ - {"id": "bfi_1", "type": "custom_entity", "webURL": "u1"}, + { + "id": "bfi_1", + "type": "custom_entity", + "webURL": "https://demo.benchling.com/benchling/f/lib_1-reg/bfi-1-qb-2743-1/edit", + }, {"id": "axdash_1", "type": "sql_dashboard", "webURL": "u2"}, ], } @@ -683,10 +687,21 @@ def test_create_metadata_files_includes_references_json(self, orchestrator): entry_data=entry_data, ) - refs = result["references.json"] - assert refs["schema_version"] == 1 - assert [e["id"] for e in refs["entities"]] == ["bfi_1"] # dashboard filtered out - assert {link["type"] for link in refs["links"]} == {"custom_entity", "sql_dashboard"} + # Raw discovery file (no references.json anymore). + assert "references.json" not in result + links_file = result["links.json"] + assert links_file["schema_version"] == 2 + assert [e["id"] for e in links_file["entities"]] == ["bfi_1"] # dashboard filtered out + assert {link["type"] for link in links_file["links"]} == {"custom_entity", "sql_dashboard"} + assert all(set(link) == {"id", "type", "web_url"} for link in links_file["links"]) + + # Searchable, curated view promoted into entry.json metadata. + meta_links = result["entry.json"]["links"] + assert all(set(link) == {"type", "id", "name", "slug"} for link in meta_links) + by_id = {link["id"]: link for link in meta_links} + # slug parsed from webURL; name left None (mock client returns no real name). + assert by_id["bfi_1"]["slug"] == "qb-2743-1" + assert by_id["bfi_1"]["name"] is None def test_create_metadata_files_includes_canvas_id_when_present(self, orchestrator): """Test entry.json stores canvas_id for canvas-initiated exports.""" diff --git a/docker/tests/test_entry_references.py b/docker/tests/test_entry_references.py index 74fef581..dd586a46 100644 --- a/docker/tests/test_entry_references.py +++ b/docker/tests/test_entry_references.py @@ -16,6 +16,8 @@ extract_entity_references, extract_note_links, extract_results_tables, + link_metadata, + slug_from_web_url, summarize_references, ) @@ -284,24 +286,77 @@ def test_serializable_summary_of_all_discovered(self): fields={"Cell Line": {"type": "entity_link", "value": "seq_field"}}, ) summary = summarize_references(entry) - # JSON-serializable end to end (categories are plain strings, etc.) + # JSON-serializable end to end. assert json.loads(json.dumps(summary)) == summary - assert summary["schema_version"] == 1 + assert summary["schema_version"] == 2 assert [e["id"] for e in summary["entities"]] == ["bfi_1", "seq_field"] + # links.json holds raw facts only -- no derived classifications persisted. assert {link["type"] for link in summary["links"]} == {"custom_entity", "sql_dashboard"} - assert summary["links"][0]["category"] == "entity" - assert summary["links"][0]["disposition"] == "nest_or_standalone" - assert summary["links"][0]["fetchable"] is True - assert summary["links"][0]["eventable"] is True - assert summary["links"][1]["disposition"] == "skip" - assert summary["links"][1]["fetchable"] is False + assert summary["links"][0] == {"id": "bfi_1", "type": "custom_entity", "web_url": "u1"} + assert all( + set(link) == {"id", "type", "web_url"} for link in summary["links"] + ), "links.json must not persist category/fetchable/eventable/disposition" assert summary["results_tables"] == [{"assay_result_schema_id": "assaysch_1", "api_id": "tbl_1", "name": "T1"}] def test_empty_entry_yields_empty_arrays(self): summary = summarize_references({}) assert summary == { - "schema_version": 1, + "schema_version": 2, "entities": [], "links": [], "results_tables": [], } + + +class TestSlugFromWebUrl: + def test_strips_known_id_prefix_dash_and_underscore_forms(self): + # id appears in dash form in the URL segment + assert ( + slug_from_web_url( + "https://benchling.com/benchling/f/R8KcsjhW-academic-registry/bfi-xCUXNVyG-sbn000/edit", + "bfi_xCUXNVyG", + ) + == "sbn000" + ) + # id appears in underscore form + assert ( + slug_from_web_url( + "https://benchling.com/benchling/f/lib_55UxcIps-registry/bfi_YtegMKkT-batch-test/edit", + "bfi_YtegMKkT", + ) + == "batch-test" + ) + + def test_drops_query_and_fragment(self): + assert ( + slug_from_web_url( + "https://benchling.com/benchling/f/lib_Fy2C0HOl-example/seq_UpzwvUug-consensus/edit?alignment=seqanl_1", + "seq_UpzwvUug", + ) + == "consensus" + ) + + def test_none_and_empty(self): + assert slug_from_web_url(None) is None + assert slug_from_web_url("") is None + assert slug_from_web_url("https://benchling.com/", "bfi_1") is None + + +class TestLinkMetadata: + def test_curated_four_field_view_name_left_for_caller(self): + entry = _entry( + _link_note( + { + "id": "bfi_xCUXNVyG", + "type": "custom_entity", + "webURL": "https://benchling.com/benchling/f/lib_1-reg/bfi-xCUXNVyG-sbn000/edit", + }, + {"id": "axdash_1", "type": "sql_dashboard", "webURL": "https://benchling.com/sql/axdash_1"}, + ) + ) + links = link_metadata(entry) + assert all(set(link) == {"type", "id", "name", "slug"} for link in links) + # name is the caller's job (pure module makes no API call) + assert all(link["name"] is None for link in links) + by_id = {link["id"]: link for link in links} + assert by_id["bfi_xCUXNVyG"]["slug"] == "sbn000" From 62fd7d21969fbbe5f837b0854c4c98f0428794a2 Mon Sep 17 00:00:00 2001 From: "Dr. Ernie Prabhakar" Date: Mon, 15 Jun 2026 12:18:29 -0700 Subject: [PATCH 07/11] chore: bump version to 0.17.3 --- docker/app-manifest.yaml | 2 +- docker/pyproject.toml | 2 +- docker/uv.lock | 2 +- package.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/app-manifest.yaml b/docker/app-manifest.yaml index daf47db8..5445ca59 100644 --- a/docker/app-manifest.yaml +++ b/docker/app-manifest.yaml @@ -2,7 +2,7 @@ manifestVersion: 1 info: name: nightly-quilttest-com description: Packaging Benchling Notebooks as Quilt packages - version: 0.17.2 + version: 0.17.3 features: - name: Quilt Connector id: quilt_entry diff --git a/docker/pyproject.toml b/docker/pyproject.toml index 1a7d3cd4..a9e667c6 100644 --- a/docker/pyproject.toml +++ b/docker/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "benchling-quilt-integration" -version = "0.17.2" +version = "0.17.3" description = "Benchling-Quilt Integration Webhook Service" license = {text = "Apache-2.0"} authors = [ diff --git a/docker/uv.lock b/docker/uv.lock index 7f351d09..b0baf329 100644 --- a/docker/uv.lock +++ b/docker/uv.lock @@ -75,7 +75,7 @@ wheels = [ [[package]] name = "benchling-quilt-integration" -version = "0.17.2" +version = "0.17.3" source = { editable = "." } dependencies = [ { name = "benchling-sdk" }, diff --git a/package.json b/package.json index f98400d4..10e89a8b 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@quiltdata/benchling-webhook", - "version": "0.17.2", + "version": "0.17.3", "description": "AWS CDK deployment for Benchling webhook processing using Fargate - Deploy directly with npx", "main": "dist/lib/index.js", "types": "dist/lib/index.d.ts", From a927ce3e5291713bae0ca4d38c54b43cbde899f2 Mon Sep 17 00:00:00 2001 From: "Dr. Ernie Prabhakar" Date: Mon, 15 Jun 2026 13:05:00 -0700 Subject: [PATCH 08/11] docs: changelog for 0.17.3 (searchable links metadata + links.json) Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ffc26d3b..4f481ba0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file. ## [Unreleased] +## [0.17.3] - 2026-06-15 + +### Added + +- Entries are now searchable by the **human-readable name** of the Benchling objects they reference. `entry.json` gains a `links` array — one `{type, id, name, slug}` entry per referenced object — promoted into the package metadata so `links.name` queries match real names (e.g. find every experiment that references `QB-2743.1`). `name` is resolved from the Benchling API (best-effort `get_by_id`; `null` when the app lacks registry access or the type is unsupported); `slug` is a lossy token parsed from the `webURL` for display/debugging only and is never used as a name + +### Changed + +- `references.json` renamed to **`links.json`** and reduced to raw discovery facts only — `id`/`type`/`web_url` per link, plus `entities` and `results_tables`. Derived classifications (`category`/`fetchable`/`eventable`/`disposition`) are no longer persisted; they are recomputed from `type` in code at runtime, so the raw archive stays reprocessable and a future classification change needs no re-fetch. `schema_version` bumped to `2` + ## [0.17.2] - 2026-04-15 ### Fixed From bc876d1976db7716243f5122cbf4a5ee2dad6fbd Mon Sep 17 00:00:00 2001 From: "Dr. Ernie Prabhakar" Date: Mon, 15 Jun 2026 13:09:08 -0700 Subject: [PATCH 09/11] chore: bump version to 0.18.0 --- docker/app-manifest.yaml | 2 +- docker/pyproject.toml | 2 +- docker/uv.lock | 2 +- package.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/app-manifest.yaml b/docker/app-manifest.yaml index 5445ca59..9459ad3d 100644 --- a/docker/app-manifest.yaml +++ b/docker/app-manifest.yaml @@ -2,7 +2,7 @@ manifestVersion: 1 info: name: nightly-quilttest-com description: Packaging Benchling Notebooks as Quilt packages - version: 0.17.3 + version: 0.18.0 features: - name: Quilt Connector id: quilt_entry diff --git a/docker/pyproject.toml b/docker/pyproject.toml index a9e667c6..f2c8b894 100644 --- a/docker/pyproject.toml +++ b/docker/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "benchling-quilt-integration" -version = "0.17.3" +version = "0.18.0" description = "Benchling-Quilt Integration Webhook Service" license = {text = "Apache-2.0"} authors = [ diff --git a/docker/uv.lock b/docker/uv.lock index b0baf329..2abcb96d 100644 --- a/docker/uv.lock +++ b/docker/uv.lock @@ -75,7 +75,7 @@ wheels = [ [[package]] name = "benchling-quilt-integration" -version = "0.17.3" +version = "0.18.0" source = { editable = "." } dependencies = [ { name = "benchling-sdk" }, diff --git a/package.json b/package.json index 10e89a8b..83eba0a8 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@quiltdata/benchling-webhook", - "version": "0.17.3", + "version": "0.18.0", "description": "AWS CDK deployment for Benchling webhook processing using Fargate - Deploy directly with npx", "main": "dist/lib/index.js", "types": "dist/lib/index.d.ts", From bbb22929f63aad5e5a9544e6e5029b815fa01142 Mon Sep 17 00:00:00 2001 From: "Dr. Ernie Prabhakar" Date: Mon, 15 Jun 2026 13:09:21 -0700 Subject: [PATCH 10/11] docs: align changelog heading to 0.18.0 Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f481ba0..d4e61b70 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. ## [Unreleased] -## [0.17.3] - 2026-06-15 +## [0.18.0] - 2026-06-15 ### Added From 8d2ccdc039f7c0e1248ac4bf8e381e2fec52b0ce Mon Sep 17 00:00:00 2001 From: "Dr. Ernie Prabhakar" Date: Mon, 15 Jun 2026 13:11:19 -0700 Subject: [PATCH 11/11] =?UTF-8?q?docs:=20clarify=20note-link=20shape=20?= =?UTF-8?q?=E2=80=94=20id=20and=20webURL=20are=20optional?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note links carry a required type plus optional id (external link has none) and optional webURL (e.g. location has none). Addresses PR #390 review. Co-Authored-By: Claude Opus 4.8 (1M context) --- docker/src/entry_references.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docker/src/entry_references.py b/docker/src/entry_references.py index c732c880..56768c8a 100644 --- a/docker/src/entry_references.py +++ b/docker/src/entry_references.py @@ -2,10 +2,12 @@ A Benchling entry points at other Benchling objects in three places: -1. Note links -- ``days[].notes[].links[]``, each ``{id, type, webURL}``. ``type`` - is a closed enum (``EntryLink.type`` in the Benchling OpenAPI spec) of 18 - tokens spanning entities, inventory, references, dashboards, and plain - external hyperlinks -- see :data:`LINK_TYPE_CATEGORY`. +1. Note links -- ``days[].notes[].links[]``, each carrying a ``type`` plus an + optional ``id`` and ``webURL`` (external ``link`` has no ``id``; some types + such as ``location`` have no ``webURL``). ``type`` is a closed enum + (``EntryLink.type`` in the Benchling OpenAPI spec) of 18 tokens spanning + entities, inventory, references, dashboards, and plain external hyperlinks -- + see :data:`LINK_TYPE_CATEGORY`. 2. Entity-link fields -- ``fields[name]`` whose ``type`` mentions ``entity``, carrying one or more entity IDs directly in ``value``. 3. Results tables -- ``results_table`` notes carrying an ``assayResultSchemaId``