diff --git a/ordvec-python/README.md b/ordvec-python/README.md index 8d19666..d1ca54d 100644 --- a/ordvec-python/README.md +++ b/ordvec-python/README.md @@ -24,6 +24,22 @@ scores, ids = q.search_asymmetric(np.random.randn(8, 1024).astype(np.float32), k | `Bitmap` | Constant-weight top-bucket bitmap per document; `popcount(Q AND D)` candidate scoring. | | `SignBitmap` | Sign bitmap for sign-cosine candidate generation; separate from the constant-weight bitmap theorem. | +## Two-stage retrieval (subset rerank) + +A `Bitmap` / `SignBitmap` probe yields a candidate shortlist that +`RankQuant.search_asymmetric_subset(query, candidates, k)` reranks exactly: + +```python +cands = bm.top_m_candidates(query, m=256) # uint32 shortlist +scores, ids = rq.search_asymmetric_subset(query, cands, k=10) +``` + +Both returned arrays have length **`min(k, len(candidates))`**, not `k`. When +`k > len(candidates)` the result is silently capped to the candidate count — the +subset path never pads with sentinel rows. If you assemble a fixed-width +`(n_q, k)` result buffer, size each row by the returned length rather than +assuming `k` rows back. + ## Theory and calibration `Bitmap` exposes the constant-weight top-bucket overlap statistic formalized in @@ -46,6 +62,13 @@ Wheels target CPython 3.10+ (abi3) and require `numpy>=2.2`. Building from source needs a Rust toolchain (MSRV 1.89) and [maturin](https://www.maturin.rs/). +## Type stubs + +The package ships hand-written type stubs (`_ordvec.pyi`) and a `py.typed` +marker, so editors and `mypy` get full signatures for the four index classes, +the module-level rank-math primitives, and the `MAX_*` constants — the abi3 +native module is otherwise opaque to static analysis. 
+ ## Provenance & license The `ordvec` Python bindings were developed within turbovec, factored out diff --git a/ordvec-python/python/ordvec/__init__.py b/ordvec-python/python/ordvec/__init__.py index 7bd55b5..18d6555 100644 --- a/ordvec-python/python/ordvec/__init__.py +++ b/ordvec-python/python/ordvec/__init__.py @@ -29,6 +29,12 @@ rank-mode classes; they are kept only to ease script migration and are not part of the documented surface — new code should use the OrdVec ontology names above. +Subset rerank result length: ``RankQuant.search_asymmetric_subset(query, +candidates, k)`` returns ``(scores, ids)`` of length ``min(k, len(candidates))``, +not ``k``. Passing ``k > len(candidates)`` yields arrays shorter than ``k`` (the +subset path does not pad with sentinel rows), so a caller building a fixed-width +``(n_q, k)`` buffer must size each row by the returned length. + On-disk persistence: each class's ``write(path)`` / ``load(path)`` passes ``path`` straight to the filesystem with no normalisation or ``..`` / traversal checks. Treat ``path`` as trusted input — in a service that derives it from diff --git a/ordvec-python/python/ordvec/_ordvec.pyi b/ordvec-python/python/ordvec/_ordvec.pyi new file mode 100644 index 0000000..de4dc7f --- /dev/null +++ b/ordvec-python/python/ordvec/_ordvec.pyi @@ -0,0 +1,191 @@ +"""Type stubs for the ``ordvec._ordvec`` native (abi3) extension module. + +Hand-written to mirror the PyO3 surface in ``ordvec-python/src/lib.rs`` exactly +— the four index classes (``Rank``, ``RankQuant``, ``Bitmap``, ``SignBitmap``), +the module-level rank-math primitives, the byte-LUT / eval scorers, and the +``MAX_*`` loader limit constants. abi3 wheels carry no embedded type +information, so without this stub (and the ``py.typed`` marker) editors and +``mypy`` see ``Any`` for the whole package. + +Drift policy: kept in sync with ``lib.rs`` by hand (the issue #32 trade-off — no +new build dependency in exchange for manual upkeep). 
When a binding signature +changes, update the matching entry here. + +Array conventions (matching the binding's runtime coercion / dtype contract): + +* Float embedding inputs (``vectors`` / ``queries`` / ``query`` / ``v`` / + ``corpus``) accept any floating NumPy array (float16/32/64) — coerced to + float32 at the boundary — so they are typed loosely as ``NDArray[Any]``. +* Candidate / doc-id inputs accept any integer dtype (coerced to ``uint32``), + also typed ``NDArray[Any]``. +* Returned arrays carry the binding's fixed output dtype: float32 scores, int64 + search indices, uint32 candidate ids / overlap scores, uint16 ranks, uint8 + buckets / packed bytes, uint64 bitmap words. +""" + +from typing import Any, final + +import numpy as np +from numpy.typing import NDArray + +# --------------------------------------------------------------------------- +# Loader / limit constants (parity with ``ordvec::rank_io::*``). +# --------------------------------------------------------------------------- +MAX_DIM: int +MAX_SIGN_BITMAP_DIM: int +MAX_VECTORS: int + +# --------------------------------------------------------------------------- +# Index classes +# --------------------------------------------------------------------------- + +@final +class Rank: + """Full-precision rank vectors (u16 per coordinate).""" + + def __new__(cls, dim: int) -> Rank: ... + def __repr__(self) -> str: ... + def __len__(self) -> int: ... + @property + def dim(self) -> int: ... + @property + def bytes_per_vec(self) -> int: ... + @property + def byte_size(self) -> int: ... + def is_empty(self) -> bool: ... + def add(self, vectors: NDArray[Any]) -> None: ... + def search( + self, queries: NDArray[Any], k: int + ) -> tuple[NDArray[np.float32], NDArray[np.int64]]: ... + def search_asymmetric( + self, queries: NDArray[Any], k: int + ) -> tuple[NDArray[np.float32], NDArray[np.int64]]: ... + def swap_remove(self, idx: int) -> int: ... + def write(self, path: str) -> None: ... 
+ @classmethod + def load(cls, path: str) -> Rank: ... + +@final +class RankQuant: + """Bucketed ranks, ``bits`` in {1, 2, 4}; symmetric + asymmetric scoring.""" + + def __new__(cls, dim: int, bits: int) -> RankQuant: ... + def __repr__(self) -> str: ... + def __len__(self) -> int: ... + @property + def dim(self) -> int: ... + @property + def bits(self) -> int: ... + @property + def bytes_per_vec(self) -> int: ... + @property + def byte_size(self) -> int: ... + def is_empty(self) -> bool: ... + def add(self, vectors: NDArray[Any]) -> None: ... + def search( + self, queries: NDArray[Any], k: int + ) -> tuple[NDArray[np.float32], NDArray[np.int64]]: ... + def search_asymmetric( + self, queries: NDArray[Any], k: int + ) -> tuple[NDArray[np.float32], NDArray[np.int64]]: ... + def search_asymmetric_subset( + self, query: NDArray[Any], candidates: NDArray[Any], k: int + ) -> tuple[NDArray[np.float32], NDArray[np.int64]]: + """Asymmetric rerank over a candidate subset. + + Returns ``(scores, global_ids)`` of length ``min(k, len(candidates))`` + — not ``k``. When ``k > len(candidates)`` the result is silently capped + to the candidate count; the subset path does not pad with sentinel rows + (see issue #14). + """ + ... + + def swap_remove(self, idx: int) -> int: ... + def write(self, path: str) -> None: ... + @classmethod + def load(cls, path: str) -> RankQuant: ... + +@final +class Bitmap: + """Constant-weight top-bucket bitmap per document; ``popcount(Q AND D)``.""" + + def __new__(cls, dim: int, n_top: int) -> Bitmap: ... + def __repr__(self) -> str: ... + def __len__(self) -> int: ... + @property + def dim(self) -> int: ... + @property + def n_top(self) -> int: ... + @property + def bytes_per_vec(self) -> int: ... + @property + def byte_size(self) -> int: ... + def is_empty(self) -> bool: ... + def add(self, vectors: NDArray[Any]) -> None: ... + def search( + self, queries: NDArray[Any], k: int + ) -> tuple[NDArray[np.float32], NDArray[np.int64]]: ... 
+ def search_subset( + self, query: NDArray[Any], doc_ids: NDArray[Any], k: int + ) -> tuple[NDArray[np.float32], NDArray[np.int64]]: ... + def top_m_candidates(self, query: NDArray[Any], m: int) -> NDArray[np.uint32]: ... + def top_m_candidates_batched( + self, queries: NDArray[Any], m: int + ) -> NDArray[np.uint32]: ... + def top_m_candidates_batched_chunked( + self, queries: NDArray[Any], m: int, batch_size: int + ) -> NDArray[np.uint32]: ... + def build_query_bitmap_fp32(self, query: NDArray[Any]) -> NDArray[np.uint64]: ... + def body_overlap_scores_subset( + self, q_bitmap: NDArray[np.uint64], doc_ids: NDArray[Any] + ) -> NDArray[np.uint32]: ... + def write(self, path: str) -> None: ... + @classmethod + def load(cls, path: str) -> Bitmap: ... + +@final +class SignBitmap: + """1-bit-per-coord sign-cosine retrieval substrate (no ``n_top``).""" + + def __new__(cls, dim: int) -> SignBitmap: ... + def __repr__(self) -> str: ... + def __len__(self) -> int: ... + @property + def dim(self) -> int: ... + @property + def bytes_per_vec(self) -> int: ... + @property + def byte_size(self) -> int: ... + def is_empty(self) -> bool: ... + def add(self, vectors: NDArray[Any]) -> None: ... + def top_m_candidates(self, query: NDArray[Any], m: int) -> NDArray[np.uint32]: ... + def top_m_candidates_batched( + self, queries: NDArray[Any], m: int + ) -> NDArray[np.uint32]: ... + def score_all(self, query: NDArray[Any]) -> NDArray[np.uint32]: ... + def score_all_batched(self, queries: NDArray[Any]) -> NDArray[np.uint32]: ... + def build_query_bitmap(self, query: NDArray[Any]) -> NDArray[np.uint64]: ... + def write(self, path: str) -> None: ... + @classmethod + def load(cls, path: str) -> SignBitmap: ... + +# --------------------------------------------------------------------------- +# Module-level rank-math primitives (parity with ``ordvec::rank::*``) and the +# byte-LUT / eval scoring helpers. 
+# --------------------------------------------------------------------------- + +def rank_transform(v: NDArray[Any]) -> NDArray[np.uint16]: ... +def rank_to_bucket(rank: int, d: int, bits: int) -> int: ... +def bucket_ranks(ranks: NDArray[np.uint16], bits: int) -> NDArray[np.uint8]: ... +def pack_buckets(buckets: NDArray[np.uint8], bits: int) -> NDArray[np.uint8]: ... +def unpack_buckets(packed: NDArray[np.uint8], d: int, bits: int) -> NDArray[np.uint8]: ... +def rankquant_bytes_per_vec(d: int, bits: int) -> int: ... +def bucket_centre(bucket: int, bits: int) -> float: ... +def rank_norm(d: int) -> float: ... +def rankquant_norm(d: int, bits: int) -> float: ... +def search_asymmetric_byte_lut( + index: RankQuant, queries: NDArray[Any], k: int +) -> tuple[NDArray[np.float32], NDArray[np.int64]]: ... +def rankquant_eval_search( + corpus: NDArray[Any], queries: NDArray[Any], bits: int, k: int +) -> tuple[NDArray[np.float32], NDArray[np.int64]]: ... diff --git a/ordvec-python/python/ordvec/py.typed b/ordvec-python/python/ordvec/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/ordvec-python/src/lib.rs b/ordvec-python/src/lib.rs index 984d8b1..6e75661 100644 --- a/ordvec-python/src/lib.rs +++ b/ordvec-python/src/lib.rs @@ -731,8 +731,14 @@ impl RankQuant { /// Asymmetric scoring restricted to a candidate subset (e.g. the top-M /// shortlist from a [`Bitmap`] or [`SignBitmap`] probe). Returns /// ``(scores, global_ids)`` where ``global_ids`` are the original doc - /// indices (mapped from the local candidate slot). ``k`` is capped to the - /// candidate-list length; the subset path does not add sentinel padding. + /// indices (mapped from the local candidate slot). + /// + /// Both returned arrays have length ``min(k, len(candidates))`` — **not** + /// ``k``. When ``k > len(candidates)`` the result is silently capped to the + /// candidate count; the subset path does not pad with sentinel rows. 
A + /// caller assembling a fixed-width ``(n_q, k)`` buffer must therefore size + /// each row by the returned length, not by ``k``. + /// /// Uses the same AVX-512 → AVX2 → scalar dispatch as ``search_asymmetric``. /// /// ``candidates`` may be unsorted and may contain duplicates. Duplicate