test(fixtures): add bad-enum + missing-enum-variant reference workers

rustyconover · claude · rustyconover · commit 595481dfaa47 · 2026-06-18T22:11:04.000-04:00
Make the example catalog a conformance floor for other-language VGI
implementations by ensuring every wire-enum variant is emitted by at least
one fixture:

- bad_enum.py (vgi-fixture-bad-enum-worker): advertises an unrecognized
  null_handling value ("WEIRD") on the `double` scalar, to exercise the C++
  extension's strict wire-enum validation. It swaps in a real Enum member
  whose .name is bogus; ArrowSerializableDataclass serializes Enum fields by
  .name, so "WEIRD" lands on the wire.
- QuerySeedFunction: emits stability=CONSISTENT_WITHIN_QUERY (no fixture
  emitted this variant before).
- OverlappingRangePartitionedFunction: emits
  partition_kind=OVERLAPPING_PARTITIONS (no fixture emitted this before).

QuerySeedFunction and OverlappingRangePartitionedFunction are registered in
the example catalog; bad-enum is wired as a vgi-fixtures entry point
alongside bad-protocol.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/packages/vgi-fixtures/pyproject.toml b/packages/vgi-fixtures/pyproject.toml
@@ -43,6 +43,7 @@ vgi-fixture-versioned-worker = "vgi._test_fixtures.versioned:main"
 vgi-fixture-versioned-tables-worker = "vgi._test_fixtures.versioned_tables:main"
 vgi-fixture-attach-options-worker = "vgi._test_fixtures.attach_options:main"
 vgi-fixture-bad-protocol-worker = "vgi._test_fixtures.bad_protocol:main"
+vgi-fixture-bad-enum-worker = "vgi._test_fixtures.bad_enum:main"
 vgi-fixture-writable-worker = "vgi._test_fixtures.writable.worker:main"
 vgi-fixture-simple-writable-worker = "vgi._test_fixtures.simple_writable:main"
 
diff --git a/vgi/_test_fixtures/bad_enum.py b/vgi/_test_fixtures/bad_enum.py
@@ -0,0 +1,72 @@
+# Copyright 2025, 2026 Query Farm LLC - https://query.farm
+
+"""Fixture worker that deliberately advertises an unrecognized enum value.
+
+This fixture exercises the C++ extension's *wire-enum validation* end-to-end:
+the catalog-metadata parser (``ParseFunctionInfo`` in
+``vgi/src/vgi_catalog_api.cpp``) must reject an enum string it does not
+recognize with a loud ``IOException`` rather than silently falling back to a
+default. A silent fallback would run with behavior inconsistent with what the
+worker declared (e.g. treating a ``SPECIAL`` null-handling function as
+``DEFAULT``).
+
+The trick is entirely Python-side and needs no extension rebuild. The normal
+metadata path can only ever emit valid enum names because the values come from
+typed Python ``Enum`` members. To get a bogus string onto the wire we override
+:meth:`ExampleCatalog._function_to_info` for one scalar function (``double``)
+and swap its ``null_handling`` for :class:`_BogusNullHandling.WEIRD` — a real
+``Enum`` member whose ``.name`` is ``"WEIRD"``. The vgi-rpc serializer converts
+any ``Enum`` field to ``value.name`` (see ``ArrowSerializableDataclass``), so
+``"WEIRD"`` lands in the ``null_handling`` Arrow column and the C++ parser
+trips on it the moment the ``double`` function's metadata is loaded.
+
+Otherwise this is a drop-in replacement for ``vgi-fixture-worker``: every other
+function and the catalog are inherited unchanged from :class:`ExampleWorker`,
+so any function except ``double`` still resolves normally.
+
+Registered as the ``vgi-fixture-bad-enum-worker`` entry point.
+"""
+
+from __future__ import annotations
+
+from dataclasses import replace
+from enum import Enum
+
+from vgi._test_fixtures.worker import ExampleCatalog, ExampleWorker
+from vgi.catalog.catalog_interface import FunctionInfo
+
+# The scalar function whose null_handling we corrupt. Tests reference this name
+# to force the broken metadata onto the parse path.
+BAD_ENUM_FUNCTION = "double"
+
+
+class _BogusNullHandling(Enum):
+    """An enum member whose ``.name`` is a value the C++ parser cannot map."""
+
+    WEIRD = "WEIRD"
+
+
+class BadEnumCatalog(ExampleCatalog):
+    """ExampleCatalog that advertises a bogus null_handling for one function."""
+
+    def _function_to_info(self, func_cls: type, schema_name: str) -> FunctionInfo:
+        info = super()._function_to_info(func_cls, schema_name)
+        if info.name == BAD_ENUM_FUNCTION and info.null_handling is not None:
+            # FunctionInfo is frozen; replace() returns a corrupted copy.
+            return replace(info, null_handling=_BogusNullHandling.WEIRD)  # type: ignore[arg-type]
+        return info
+
+
+class BadEnumWorker(ExampleWorker):
+    """ExampleWorker that serves the example catalog with one bad enum value."""
+
+    catalog_interface = BadEnumCatalog
+
+
+def main() -> None:
+    """Run the bad-enum fixture worker process."""
+    BadEnumWorker.main()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vgi/_test_fixtures/scalar/__init__.py b/vgi/_test_fixtures/scalar/__init__.py
@@ -52,6 +52,7 @@
 from vgi._test_fixtures.scalar.random_demo import (
     BernoulliFunction,
     HashSeedFunction,
+    QuerySeedFunction,
     RandomBytesFunction,
     RandomIntFunction,
 )
@@ -102,6 +103,7 @@
     "PairTypeIntIntFunction",
     "PairTypeIntStrFunction",
     "PairTypeStrStrFunction",
+    "QuerySeedFunction",
     "RandomBytesFunction",
     "RandomIntFunction",
     "ReturnSecretValueFunction",
diff --git a/vgi/_test_fixtures/scalar/random_demo.py b/vgi/_test_fixtures/scalar/random_demo.py
@@ -136,6 +136,50 @@ def compute(
         return pa.array([seed + i for i in range(_length)], type=pa.int64())
 
 
+class QuerySeedFunction(ScalarFunction):
+    """Adds a per-query-stable seed to each input value.
+
+    Demonstrates ``FunctionStability.CONSISTENT_WITHIN_QUERY`` — the only
+    fixture that emits this stability variant. Semantically the value is fixed
+    for the duration of a single query but may differ across queries (like
+    ``now()``). This fixture asserts no DuckDB-side behavior for the flag; it
+    exists so the wire path for the third stability value stays exercised and
+    so other-language workers must specify it.
+
+    Example:
+        SQL:    SELECT query_seed(value) FROM data
+
+    """
+
+    class Meta:
+        """Function metadata."""
+
+        name = "query_seed"
+        description = "Add a per-query-stable seed to each value (demonstrates CONSISTENT_WITHIN_QUERY stability)"
+        stability = FunctionStability.CONSISTENT_WITHIN_QUERY
+        examples = [
+            FunctionExample(
+                sql="SELECT query_seed(value) FROM data",
+                description="Offset each value by a seed that is constant within a query",
+            ),
+        ]
+
+    @classmethod
+    def compute(
+        cls,
+        value: Annotated[pa.Int64Array, Param(doc="Value to offset")],
+    ) -> Annotated[pa.Int64Array, Returns()]:
+        """Add a fixed per-query offset to each value.
+
+        The offset is deterministic here (a constant) so SQL tests have a
+        stable expected output; the stability flag is what is under test, not
+        the numeric result.
+        """
+        import pyarrow.compute as pc
+
+        return pc.add(value, 1000)
+
+
 class RandomBytesFunction(ScalarFunction):
     """Generates deterministic pseudo-random binary blobs from a seed."""
 
diff --git a/vgi/_test_fixtures/table/__init__.py b/vgi/_test_fixtures/table/__init__.py
@@ -82,6 +82,7 @@
 from vgi._test_fixtures.table.partition_columns import (
     CountryPartitionedSalesFunction,
     DisjointRangePartitionedFunction,
+    OverlappingRangePartitionedFunction,
     PartitionedWithExplicitOverrideFunction,
     RegionYearPartitionedFunction,
 )
@@ -182,6 +183,7 @@
     "NestedSequenceFunction",
     "NonMonotoneBatchIndexFunction",
     "OrderEchoFunction",
+    "OverlappingRangePartitionedFunction",
     "PartitionedBatchIndexFunction",
     "PartitionedBatchIndexMarkedFunction",
     "PartitionedFixedOrderFunction",
diff --git a/vgi/_test_fixtures/table/partition_columns.py b/vgi/_test_fixtures/table/partition_columns.py
@@ -33,6 +33,11 @@
   disjoint integer range. Verifies the wire path; DuckDB falls back to
   ``HASH_GROUP_BY`` for GROUP BY queries against it.
 
+* :class:`OverlappingRangePartitionedFunction` — declares
+  ``OVERLAPPING_PARTITIONS`` (the only fixture that does). Consecutive
+  chunks share ``key`` values when ``rows_per_partition`` exceeds 500.
+  Wire-level only; DuckDB falls back to ``HASH_GROUP_BY``.
+
 All fixtures use the in-memory state pattern (no work-queue / no
 stream_state) — they're simpler than the v1 partitioned_batch_index
 since the v2 plan is about correctness of the partition contract,
@@ -470,3 +475,99 @@ def process(
         )
         out.emit(batch)
         state.current_idx = rpp
+
+
+# =============================================================================
+# OVERLAPPING_PARTITIONS — wire-level only
+# =============================================================================
+
+
+@dataclass(slots=True, frozen=True)
+class _OverlappingArgs:
+    """Arguments for ``overlapping_range_partitioned``."""
+
+    partitions: Annotated[int, Arg(0, doc="Number of overlapping partitions", ge=1)]
+    rows_per_partition: Annotated[int, Arg("rows_per_partition", default=10, doc="Rows per partition", ge=1)]
+
+
+@dataclass(kw_only=True)
+class _OverlappingState(ArrowSerializableDataclass):
+    current_partition_idx: int = -1
+    current_idx: int = 0
+    started: bool = False
+
+
+@bind_fixed_schema
+@_cardinality_from_count
+class OverlappingRangePartitionedFunction(TableFunctionGenerator[_OverlappingArgs, _OverlappingState]):
+    """Per-chunk *overlapping* integer ranges on ``key``.
+
+    Each chunk N emits ``key`` values in ``[N*500, N*500 + rows)``. With the
+    default ``rows_per_partition`` of 10 the ranges are disjoint, but callers
+    pass ``rows_per_partition > 500`` to make consecutive chunks overlap on
+    ``key`` — distinguishing this from
+    :class:`DisjointRangePartitionedFunction`.
+
+    Declares ``OVERLAPPING_PARTITIONS``. Like DISJOINT, DuckDB has no consumer
+    for OVERLAPPING today, so GROUP BY queries fall back to ``HASH_GROUP_BY``;
+    the value's purpose here is to keep the wire path (declaration, per-batch
+    min/max metadata, C++ extraction → ``get_partition_info``) exercised so
+    other-language workers must specify it. This is the only fixture that emits
+    ``OVERLAPPING_PARTITIONS``.
+    """
+
+    FIXED_SCHEMA: ClassVar[pa.Schema] = pa.schema(
+        [
+            partition_field("key", pa.int64()),
+            pa.field("value", pa.int64()),
+        ]
+    )
+
+    class Meta:
+        name = "overlapping_range_partitioned"
+        description = (
+            "Overlapping per-chunk integer ranges on ``key``. Declares "
+            "OVERLAPPING_PARTITIONS (wire-level only; DuckDB falls back to "
+            "HASH_GROUP_BY for now)."
+        )
+        categories = ["generator", "partitioning", "testing"]
+        partition_kind = PartitionKind.OVERLAPPING_PARTITIONS
+
+    @classmethod
+    def on_init(cls, params: InitParams[_OverlappingArgs]) -> GlobalInitResponse:
+        items = [struct.pack(_QUEUE_ITEM_FMT, i) for i in range(params.args.partitions)]
+        params.storage.queue_push(items)
+        return GlobalInitResponse()
+
+    @classmethod
+    def initial_state(cls, params: ProcessParams[_OverlappingArgs]) -> _OverlappingState:
+        return _OverlappingState()
+
+    @classmethod
+    def process(
+        cls,
+        params: ProcessParams[_OverlappingArgs],
+        state: _OverlappingState,
+        out: OutputCollector,
+    ) -> None:
+        if not state.started or state.current_idx >= params.args.rows_per_partition:
+            item = params.storage.queue_pop()
+            if item is None:
+                out.finish()
+                return
+            (state.current_partition_idx,) = struct.unpack(_QUEUE_ITEM_FMT, item)
+            state.current_idx = 0
+            state.started = True
+
+        rpp = params.args.rows_per_partition
+        # Stride of 500 (< rpp when callers want overlap) makes consecutive
+        # chunks share key values.
+        base = state.current_partition_idx * 500
+        keys = [base + i for i in range(rpp)]
+        values = [state.current_partition_idx * 10 + i for i in range(rpp)]
+        batch = pa.RecordBatch.from_pydict(
+            {"key": keys, "value": values},
+            schema=cls.FIXED_SCHEMA,
+        )
+        out.emit(batch)
+        state.current_idx = rpp
diff --git a/vgi/_test_fixtures/worker.py b/vgi/_test_fixtures/worker.py
@@ -86,6 +86,7 @@
     PairTypeIntIntFunction,
     PairTypeIntStrFunction,
     PairTypeStrStrFunction,
+    QuerySeedFunction,
     RandomBytesFunction,
     RandomIntFunction,
     ReturnSecretValueFunction,
@@ -146,6 +147,7 @@
     NestedSequenceFunction,
     NonMonotoneBatchIndexFunction,
     OrderEchoFunction,
+    OverlappingRangePartitionedFunction,
     PartitionedBatchIndexFunction,
     PartitionedBatchIndexMarkedFunction,
     PartitionedFixedOrderFunction,
@@ -385,6 +387,7 @@ def _build_enum_stats() -> dict[str, ColumnStatisticsInput]:
                 # — see vgi/_test_fixtures/table/partition_columns.py.
                 CountryPartitionedSalesFunction,
                 DisjointRangePartitionedFunction,
+                OverlappingRangePartitionedFunction,
                 PartitionedWithExplicitOverrideFunction,
                 RegionYearPartitionedFunction,
                 # Deliberately-broken batch_index fixtures (see
@@ -457,6 +460,7 @@ def _build_enum_stats() -> dict[str, ColumnStatisticsInput]:
                 PairTypeIntIntFunction,
                 PairTypeIntStrFunction,
                 PairTypeStrStrFunction,
+                QuerySeedFunction,
                 RandomBytesFunction,
                 RandomIntFunction,
                 ReturnSecretValueFunction,