From c1df1175595edf8cdf187900327bf018765c7a13 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Tue, 26 May 2026 20:24:21 +0000 Subject: [PATCH 1/6] uint64 header support based on size --- python/cuvs_bench/cuvs_bench/_bin_format.py | 144 ++++++++++++++ .../cuvs_bench/cuvs_bench/backends/_utils.py | 30 ++- .../cuvs_bench/generate_groundtruth/utils.py | 85 ++++++--- .../cuvs_bench/get_dataset/fbin_to_f16bin.py | 26 ++- .../cuvs_bench/get_dataset/hdf5_to_fbin.py | 10 +- .../cuvs_bench/cuvs_bench/tests/test_utils.py | 180 +++++++++++++++++- 6 files changed, 420 insertions(+), 55 deletions(-) create mode 100644 python/cuvs_bench/cuvs_bench/_bin_format.py diff --git a/python/cuvs_bench/cuvs_bench/_bin_format.py b/python/cuvs_bench/cuvs_bench/_bin_format.py new file mode 100644 index 0000000000..2611020fc5 --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/_bin_format.py @@ -0,0 +1,144 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# + +""" +On-disk header helpers for the cuvs-bench binary file format. + +cuvs-bench inherits the big-ann-benchmarks binary layout: a small header +listing ``n_rows`` and ``n_cols`` followed by a dense ``n_rows * n_cols`` +array of the dtype implied by the file extension. Two layouts are supported: + +- **Legacy**: ``[uint32 n_rows, uint32 n_cols, data ...]`` (8-byte header). + This is what every existing ``.fbin`` / ``.ibin`` / ``.u8bin`` / ``.i8bin`` + / ``.f16bin`` / ``.hbin`` file on disk uses today. + +- **Extended**: ``[uint64 n_rows, uint64 n_cols, data ...]`` (16-byte header). + For datasets whose ``n_rows`` or ``n_cols`` exceeds ``UINT32_MAX`` (~4.29B). + +Detection is **size-based**: a well-formed cuvs-bench binary is exactly +``header_bytes + n_rows * n_cols * itemsize`` bytes long. :func:`read_bin_header` reads the first 16 bytes +of the file and: + +1. Tries the legacy layout (first 8 bytes as two ``uint32``s, 8-byte + header). The layout is accepted if ``8 + n_rows * n_cols * itemsize`` + matches the on-disk file size. +2. Otherwise tries the extended layout (first 16 bytes as two + ``uint64``s, 16-byte header). Accepted if + ``16 + n_rows * n_cols * itemsize`` matches the file size instead. +3. If neither layout matches, raises ``ValueError`` -- the file is + truncated, padded, or has a mismatched dtype extension. +""" + +from __future__ import annotations + +import os +import struct +from typing import BinaryIO, Tuple + +UINT32_MAX = (1 << 32) - 1 + +LEGACY_HEADER_BYTES = 8 +EXTENDED_HEADER_BYTES = 16 + + +def read_bin_header(path: str, itemsize: int) -> Tuple[int, int, int]: + """Read the header of a cuvs-bench binary file. + + Auto-detects the on-disk layout from the file size by checking which + of the two layouts (legacy 8-byte uint32 header, extended 16-byte uint64 + header) makes ``file_size == header_bytes + n_rows * n_cols * itemsize`` + balance. + + Parameters + ---------- + path : str + Path to the binary file. + itemsize : int + Per-element size in bytes (e.g. ``4`` for ``float32``, ``1`` for + ``int8``). Used purely for the size-equation check; the file + contents are not inspected. + + Returns + ------- + (n_rows, n_cols, header_bytes) : Tuple[int, int, int] + Row count, column count, and the number of bytes the header + occupies on disk (``8`` for legacy, ``16`` for extended). Callers + seeking to the data start should use ``header_bytes`` rather than + a hardcoded offset. + + Raises + ------ + ValueError + If neither the legacy nor the extended interpretation matches. + FileNotFoundError + If ``path`` does not exist. + """ + file_size = os.path.getsize(path) + with open(path, "rb") as f: + head = f.read(EXTENDED_HEADER_BYTES) + + if len(head) < LEGACY_HEADER_BYTES: + raise ValueError( + f"File too small to contain a valid header (expected at least " + f"{LEGACY_HEADER_BYTES} bytes, got {len(head)}): {path}" + ) + + n_rows_32, n_cols_32 = struct.unpack(" int: + """Write the canonical cuvs-bench binary header at the current position. + + The legacy 8-byte uint32 layout is used whenever both ``n_rows`` and + ``n_cols`` fit in a ``uint32``. The 16-byte uint64 layout is used + otherwise, or when explicitly requested via ``force_uint64=True``. + + Parameters + ---------- + f : BinaryIO + Open binary file handle, positioned where the header should go. + n_rows, n_cols : int + Header values to write. Must be non-negative. + force_uint64 : bool + If ``True``, always write the 16-byte uint64 layout regardless of + whether the values fit in ``uint32``. Defaults to ``False``. + + Returns + ------- + int + Number of bytes written (``8`` for legacy, ``16`` for extended). + """ + if n_rows < 0 or n_cols < 0: + raise ValueError( + f"n_rows and n_cols must be non-negative, got ({n_rows}, {n_cols})" + ) + if force_uint64 or n_rows > UINT32_MAX or n_cols > UINT32_MAX: + f.write(struct.pack(" np.ndarray: """ Read a binary vector file into a numpy array. - Supports the standard big-ann-bench binary format used by cuvs-bench - datasets: a 4-byte uint32 ``n_rows``, a 4-byte uint32 ``n_cols``, - followed by ``n_rows * n_cols`` elements of the dtype inferred from - the file extension via ``dtype_from_filename``. + Supports the cuvs-bench binary format with either the legacy 8-byte + ``[uint32 n_rows, uint32 n_cols]`` header or the extended 16-byte + ``[uint64 n_rows, uint64 n_cols]`` header used for datasets with more + than ``UINT32_MAX`` rows or columns. The layout is auto-detected from + the file size by :func:`cuvs_bench._bin_format.read_bin_header`. Parameters ---------- @@ -93,27 +96,22 @@ def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray: or the file is truncated. """ dtype = dtype_from_filename(path) + itemsize = np.dtype(dtype).itemsize if subset_size is not None and subset_size < 1: raise ValueError( f"subset_size must be a positive integer, got {subset_size}" ) + n_rows, n_cols, header_bytes = read_bin_header(path, itemsize) + if subset_size is not None: + n_rows = min(n_rows, subset_size) with open(path, "rb") as f: - header = f.read(8) - if len(header) < 8: - raise ValueError( - f"File too small to contain a valid header (expected 8 bytes, " - f"got {len(header)}): {path}" - ) - n_rows = int(np.frombuffer(header[:4], dtype=np.uint32)[0]) - n_cols = int(np.frombuffer(header[4:], dtype=np.uint32)[0]) - if subset_size is not None: - n_rows = min(n_rows, subset_size) - expected_bytes = n_rows * n_cols * np.dtype(dtype).itemsize + f.seek(header_bytes) + expected_bytes = n_rows * n_cols * itemsize raw = f.read(expected_bytes) if len(raw) < expected_bytes: raise ValueError( f"File is truncated: expected {expected_bytes} bytes of data " - f"({n_rows} rows x {n_cols} cols x {np.dtype(dtype).itemsize} bytes), " + f"({n_rows} rows x {n_cols} cols x {itemsize} bytes), " f"got {len(raw)}: {path}" ) data = np.frombuffer(raw, dtype=dtype) diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py index 72fb5b4a07..11f8ee1762 100644 --- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py +++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # @@ -7,6 +7,8 @@ import numpy as np +from cuvs_bench._bin_format import read_bin_header, write_bin_header + def dtype_from_filename(filename): ext = os.path.splitext(filename)[1] @@ -40,29 +42,56 @@ def suffix_from_dtype(dtype): def memmap_bin_file( - bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32 + bin_file, dtype, shape=None, mode="r", *, force_uint64=False ): - extent_itemsize = np.dtype(size_dtype).itemsize - offset = int(extent_itemsize) * 2 + """Memory-map a cuvs-bench binary file. + + Supports both the legacy 8-byte ``[uint32 n_rows, uint32 n_cols]`` and + the extended 16-byte ``[uint64 n_rows, uint64 n_cols]`` headers. In read + mode the layout is auto-detected from the file size; in write mode the + legacy layout is used unless ``force_uint64=True`` or one of the shape + dimensions exceeds ``UINT32_MAX``. + + Parameters + ---------- + bin_file : str or None + Path to the binary file. ``None`` short-circuits and returns ``None`` + (preserves the historical "skip optional file" behavior). + dtype : numpy dtype or None + Element dtype. If ``None``, inferred from the file extension via + :func:`dtype_from_filename`. + shape : tuple or None + Read mode: optionally override ``(n_rows, n_cols)`` from the header; + any ``None`` entries are filled in from the header value. Write mode: + required ``(n_rows, n_cols)`` of the file to create. + mode : str + Standard ``np.memmap`` mode string (``"r"``, ``"r+"``, ``"w+"``). + force_uint64 : bool + Write mode only: force the extended uint64 header even when the + shape would fit in uint32. Ignored in read mode (auto-detected). + """ if bin_file is None: return None if dtype is None: dtype = dtype_from_filename(bin_file) + itemsize = np.dtype(dtype).itemsize if mode[0] == "r": - a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,)) + n_rows, n_cols, header_bytes = read_bin_header(bin_file, itemsize) if shape is None: - shape = (a[0], a[1]) + final_shape = (n_rows, n_cols) else: - shape = tuple( - [ - aval if sval is None else sval - for aval, sval in zip(a, shape) - ] + header_dims = (n_rows, n_cols) + final_shape = tuple( + aval if sval is None else sval + for aval, sval in zip(header_dims, shape) ) - return np.memmap( - bin_file, mode=mode, dtype=dtype, offset=offset, shape=shape + bin_file, + mode=mode, + dtype=dtype, + offset=header_bytes, + shape=final_shape, ) elif mode[0] == "w": if shape is None: @@ -72,19 +101,29 @@ def memmap_bin_file( dirname = os.path.dirname(bin_file) if len(dirname) > 0: os.makedirs(dirname, exist_ok=True) - a = np.memmap(bin_file, mode=mode, dtype=size_dtype, shape=(2,)) - a[0] = shape[0] - a[1] = shape[1] - a.flush() - del a - fp = np.memmap( - bin_file, mode="r+", dtype=dtype, offset=offset, shape=shape + with open(bin_file, "wb") as f: + header_bytes = write_bin_header( + f, shape[0], shape[1], force_uint64=force_uint64 + ) + return np.memmap( + bin_file, + mode="r+", + dtype=dtype, + offset=header_bytes, + shape=shape, ) - return fp -def write_bin(fname, data): +def write_bin(fname, data, *, force_uint64=False): + """Write a 2-D numpy array to a cuvs-bench binary file. + + The legacy 8-byte uint32 header is used by default; pass + ``force_uint64=True`` (or supply a shape with a dimension exceeding + ``UINT32_MAX``) to write the extended 16-byte uint64 header instead. + """ print("writing", fname, data.shape, data.dtype, "...") with open(fname, "wb") as f: - np.asarray(data.shape, dtype=np.uint32).tofile(f) + write_bin_header( + f, data.shape[0], data.shape[1], force_uint64=force_uint64 + ) data.tofile(f) diff --git a/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py b/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py index 5647ece771..19c14e8aa8 100644 --- a/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py +++ b/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 @@ -9,21 +9,29 @@ import numpy as np +from cuvs_bench._bin_format import read_bin_header, write_bin_header + def read_fbin(fname): - shape = np.fromfile(fname, dtype=np.uint32, count=2) - if float(shape[0]) * shape[1] * 4 > 2_000_000_000: - data = np.memmap(fname, dtype=np.float32, offset=8, mode="r").reshape( - shape - ) + itemsize = np.dtype(np.float32).itemsize + n_rows, n_cols, header_bytes = read_bin_header(fname, itemsize) + shape = (n_rows, n_cols) + if float(n_rows) * n_cols * itemsize > 2_000_000_000: + data = np.memmap( + fname, dtype=np.float32, offset=header_bytes, mode="r" + ).reshape(shape) else: - data = np.fromfile(fname, dtype=np.float32, offset=8).reshape(shape) + data = np.fromfile( + fname, dtype=np.float32, offset=header_bytes + ).reshape(shape) return data -def write_bin(fname, data): +def write_bin(fname, data, *, force_uint64=False): with open(fname, "wb") as f: - np.asarray(data.shape, dtype=np.uint32).tofile(f) + write_bin_header( + f, data.shape[0], data.shape[1], force_uint64=force_uint64 + ) data.tofile(f) diff --git a/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py b/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py index af6a7aac31..34abacbf6d 100644 --- a/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py +++ b/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 @@ -8,15 +8,19 @@ import h5py import numpy as np +from cuvs_bench._bin_format import write_bin_header + def normalize(x): norm = np.linalg.norm(x, axis=1) return (x.T / norm).T -def write_bin(fname, data): +def write_bin(fname, data, *, force_uint64=False): with open(fname, "wb") as f: - np.asarray(data.shape, dtype=np.uint32).tofile(f) + write_bin_header( + f, data.shape[0], data.shape[1], force_uint64=force_uint64 + ) data.tofile(f) diff --git a/python/cuvs_bench/cuvs_bench/tests/test_utils.py b/python/cuvs_bench/cuvs_bench/tests/test_utils.py index db1fd7b137..a009684723 100644 --- a/python/cuvs_bench/cuvs_bench/tests/test_utils.py +++ b/python/cuvs_bench/cuvs_bench/tests/test_utils.py @@ -6,10 +6,19 @@ Unit tests for shared backend utilities and Dataset transparent loading. """ +import struct + import numpy as np import pytest import yaml +from cuvs_bench._bin_format import ( + EXTENDED_HEADER_BYTES, + LEGACY_HEADER_BYTES, + UINT32_MAX, + read_bin_header, + write_bin_header, +) from cuvs_bench.backends import Dataset from cuvs_bench.backends._utils import ( compute_recall, @@ -20,10 +29,12 @@ from cuvs_bench.orchestrator.config_loaders import CppGBenchConfigLoader -def _write_test_bin(path, data): - """Write a numpy array in big-ann-bench binary format.""" +def _write_test_bin(path, data, *, force_uint64=False): + """Write a numpy array in cuvs-bench binary format.""" with open(path, "wb") as f: - np.asarray(data.shape, dtype=np.uint32).tofile(f) + write_bin_header( + f, data.shape[0], data.shape[1], force_uint64=force_uint64 + ) data.tofile(f) @@ -170,7 +181,7 @@ def test_truncated_data(self, tmp_path): np.array([10, 4], dtype=np.uint32).tofile(f) np.random.rand(5, 4).astype(np.float32).tofile(f) - with pytest.raises(ValueError, match="File is truncated"): + with pytest.raises(ValueError, match="does not match either"): load_vectors(path) def test_file_not_found(self): @@ -178,6 +189,167 @@ def test_file_not_found(self): with pytest.raises(FileNotFoundError): load_vectors("/nonexistent/path/vectors.fbin") + def test_load_uint64_header(self, tmp_path): + """``load_vectors`` reads files written with the extended uint64 header.""" + data = np.random.rand(40, 16).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data, force_uint64=True) + + # Sanity check: file really uses the extended layout. + assert ( + tmp_path.joinpath("test.fbin").stat().st_size + == EXTENDED_HEADER_BYTES + data.nbytes + ) + loaded = load_vectors(path) + np.testing.assert_array_equal(loaded, data) + + def test_load_uint64_header_with_subset(self, tmp_path): + """``subset_size`` works regardless of which header layout was used.""" + data = np.random.rand(50, 8).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data, force_uint64=True) + + loaded = load_vectors(path, subset_size=12) + assert loaded.shape == (12, 8) + np.testing.assert_array_equal(loaded, data[:12]) + + @pytest.mark.parametrize( + "ext, dtype, force_uint64", + [ + (".fbin", np.float32, False), + (".fbin", np.float32, True), + (".ibin", np.int32, False), + (".ibin", np.int32, True), + (".u8bin", np.uint8, False), + (".u8bin", np.uint8, True), + (".i8bin", np.int8, False), + (".i8bin", np.int8, True), + ], + ) + def test_load_roundtrip_all_dtypes( + self, tmp_path, ext, dtype, force_uint64 + ): + """Round-trip every supported dtype through both header layouts.""" + if np.issubdtype(dtype, np.integer): + info = np.iinfo(dtype) + data = np.random.randint( + info.min, info.max, size=(25, 7), dtype=dtype + ) + else: + data = np.random.rand(25, 7).astype(dtype) + path = str(tmp_path / f"test{ext}") + _write_test_bin(path, data, force_uint64=force_uint64) + + loaded = load_vectors(path) + np.testing.assert_array_equal(loaded, data) + + +class TestBinHeaderHelpers: + """Tests for ``cuvs_bench._bin_format.read_bin_header`` / ``write_bin_header``.""" + + def test_write_legacy_returns_8_bytes(self, tmp_path): + """Small shapes should write the 8-byte uint32 header by default.""" + path = tmp_path / "h.bin" + with open(path, "wb") as f: + n = write_bin_header(f, 7, 3) + assert n == LEGACY_HEADER_BYTES + assert path.stat().st_size == LEGACY_HEADER_BYTES + + def test_write_force_uint64_returns_16_bytes(self, tmp_path): + """``force_uint64=True`` should write the 16-byte uint64 header.""" + path = tmp_path / "h.bin" + with open(path, "wb") as f: + n = write_bin_header(f, 7, 3, force_uint64=True) + assert n == EXTENDED_HEADER_BYTES + assert path.stat().st_size == EXTENDED_HEADER_BYTES + + def test_write_auto_promotes_to_uint64_when_overflowing(self, tmp_path): + """Shapes that don't fit in uint32 should auto-promote to uint64.""" + path = tmp_path / "h.bin" + with open(path, "wb") as f: + n = write_bin_header(f, UINT32_MAX + 1, 4) + assert n == EXTENDED_HEADER_BYTES + + def test_write_negative_raises(self, tmp_path): + """Negative dimensions are rejected.""" + path = tmp_path / "h.bin" + with open(path, "wb") as f: + with pytest.raises(ValueError, match="non-negative"): + write_bin_header(f, -1, 4) + + def test_read_legacy_round_trip(self, tmp_path): + """Legacy round-trip: write 8-byte header, read it back.""" + path = tmp_path / "x.fbin" + data = np.random.rand(11, 5).astype(np.float32) + with open(path, "wb") as f: + write_bin_header(f, data.shape[0], data.shape[1]) + data.tofile(f) + n_rows, n_cols, hbytes = read_bin_header(str(path), itemsize=4) + assert (n_rows, n_cols, hbytes) == (11, 5, LEGACY_HEADER_BYTES) + + def test_read_extended_round_trip(self, tmp_path): + """Extended round-trip: write 16-byte header, read it back.""" + path = tmp_path / "x.fbin" + data = np.random.rand(11, 5).astype(np.float32) + with open(path, "wb") as f: + write_bin_header( + f, data.shape[0], data.shape[1], force_uint64=True + ) + data.tofile(f) + n_rows, n_cols, hbytes = read_bin_header(str(path), itemsize=4) + assert (n_rows, n_cols, hbytes) == (11, 5, EXTENDED_HEADER_BYTES) + + def test_read_synthesized_huge_extended_header(self, tmp_path): + """A hand-crafted extended-header file with >UINT32_MAX rows reads correctly. + + We can't materialize the data section (>16 GB just for the dummy + bytes), so write only the header and pad with the exact number of + zero bytes ``read_bin_header`` expects to balance the size equation + -- using ``n_cols=0`` so the data section is empty. + """ + path = tmp_path / "huge.fbin" + n_rows = UINT32_MAX + 17 + n_cols = 0 + with open(path, "wb") as f: + write_bin_header(f, n_rows, n_cols) + assert path.stat().st_size == EXTENDED_HEADER_BYTES + + got_rows, got_cols, hbytes = read_bin_header(str(path), itemsize=4) + assert got_rows == n_rows + assert got_cols == n_cols + assert hbytes == EXTENDED_HEADER_BYTES + + def test_read_file_too_small_raises(self, tmp_path): + """A file shorter than the legacy header raises a clear error.""" + path = tmp_path / "x.fbin" + path.write_bytes(b"\x00\x00\x00") + with pytest.raises(ValueError, match="File too small"): + read_bin_header(str(path), itemsize=4) + + def test_read_size_mismatch_raises(self, tmp_path): + """Header values that don't balance the file size are rejected.""" + path = tmp_path / "x.fbin" + with open(path, "wb") as f: + f.write(struct.pack(" Date: Thu, 4 Jun 2026 18:02:56 +0000 Subject: [PATCH 2/6] coderabbit reviews --- python/cuvs_bench/cuvs_bench/_bin_format.py | 23 ++++++++--- .../cuvs_bench/cuvs_bench/backends/_utils.py | 4 +- .../cuvs_bench/generate_groundtruth/utils.py | 31 +++++++------- .../cuvs_bench/get_dataset/fbin_to_f16bin.py | 6 +-- .../cuvs_bench/get_dataset/hdf5_to_fbin.py | 6 +-- .../cuvs_bench/cuvs_bench/tests/test_utils.py | 40 +++++++++---------- 6 files changed, 58 insertions(+), 52 deletions(-) diff --git a/python/cuvs_bench/cuvs_bench/_bin_format.py b/python/cuvs_bench/cuvs_bench/_bin_format.py index 2611020fc5..8fb0d4d502 100644 --- a/python/cuvs_bench/cuvs_bench/_bin_format.py +++ b/python/cuvs_bench/cuvs_bench/_bin_format.py @@ -37,6 +37,8 @@ import struct from typing import BinaryIO, Tuple +import numpy as np + UINT32_MAX = (1 << 32) - 1 LEGACY_HEADER_BYTES = 8 @@ -75,6 +77,10 @@ def read_bin_header(path: str, itemsize: int) -> Tuple[int, int, int]: FileNotFoundError If ``path`` does not exist. """ + if itemsize < 1: + raise ValueError( + f"itemsize must be a positive integer, got {itemsize!r}" + ) file_size = os.path.getsize(path) with open(path, "rb") as f: head = f.read(EXTENDED_HEADER_BYTES) @@ -110,13 +116,13 @@ def write_bin_header( n_rows: int, n_cols: int, *, - force_uint64: bool = False, + size_dtype=np.uint32, ) -> int: """Write the canonical cuvs-bench binary header at the current position. The legacy 8-byte uint32 layout is used whenever both ``n_rows`` and ``n_cols`` fit in a ``uint32``. The 16-byte uint64 layout is used - otherwise, or when explicitly requested via ``force_uint64=True``. + otherwise, or when explicitly requested via ``size_dtype=np.uint64``. Parameters ---------- @@ -124,9 +130,9 @@ def write_bin_header( Open binary file handle, positioned where the header should go. n_rows, n_cols : int Header values to write. Must be non-negative. - force_uint64 : bool - If ``True``, always write the 16-byte uint64 layout regardless of - whether the values fit in ``uint32``. Defaults to ``False``. + size_dtype : numpy dtype + ``np.uint32`` for the legacy 8-byte header (default), or + ``np.uint64`` to force the extended 16-byte header. Returns ------- @@ -137,7 +143,12 @@ def write_bin_header( raise ValueError( f"n_rows and n_cols must be non-negative, got ({n_rows}, {n_cols})" ) - if force_uint64 or n_rows > UINT32_MAX or n_cols > UINT32_MAX: + use_uint64 = ( + np.dtype(size_dtype) == np.uint64 + or n_rows > UINT32_MAX + or n_cols > UINT32_MAX + ) + if use_uint64: f.write(struct.pack(" np.ndarray: """ dtype = dtype_from_filename(path) itemsize = np.dtype(dtype).itemsize - if subset_size is not None and subset_size < 1: + if subset_size is not None and ( + isinstance(subset_size, float) or subset_size < 1 + ): raise ValueError( f"subset_size must be a positive integer, got {subset_size}" ) diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py index 11f8ee1762..ad6f567705 100644 --- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py +++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py @@ -42,14 +42,14 @@ def suffix_from_dtype(dtype): def memmap_bin_file( - bin_file, dtype, shape=None, mode="r", *, force_uint64=False + bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32 ): """Memory-map a cuvs-bench binary file. Supports both the legacy 8-byte ``[uint32 n_rows, uint32 n_cols]`` and the extended 16-byte ``[uint64 n_rows, uint64 n_cols]`` headers. In read mode the layout is auto-detected from the file size; in write mode the - legacy layout is used unless ``force_uint64=True`` or one of the shape + legacy layout is used unless ``size_dtype=np.uint64`` or one of the shape dimensions exceeds ``UINT32_MAX``. Parameters @@ -66,9 +66,10 @@ def memmap_bin_file( required ``(n_rows, n_cols)`` of the file to create. mode : str Standard ``np.memmap`` mode string (``"r"``, ``"r+"``, ``"w+"``). - force_uint64 : bool - Write mode only: force the extended uint64 header even when the - shape would fit in uint32. Ignored in read mode (auto-detected). + size_dtype : numpy dtype + Write mode only: ``np.uint32`` for the legacy 8-byte header (default), + or ``np.uint64`` to force the extended 16-byte header. Ignored in read + mode (auto-detected). """ if bin_file is None: return None @@ -76,6 +77,11 @@ def memmap_bin_file( dtype = dtype_from_filename(bin_file) itemsize = np.dtype(dtype).itemsize + if shape is not None and len(shape) != 2: + raise ValueError( + f"shape must have exactly 2 dimensions (n_rows, n_cols), got {shape!r}" + ) + if mode[0] == "r": n_rows, n_cols, header_bytes = read_bin_header(bin_file, itemsize) if shape is None: @@ -103,7 +109,7 @@ def memmap_bin_file( os.makedirs(dirname, exist_ok=True) with open(bin_file, "wb") as f: header_bytes = write_bin_header( - f, shape[0], shape[1], force_uint64=force_uint64 + f, shape[0], shape[1], size_dtype=size_dtype ) return np.memmap( bin_file, @@ -114,16 +120,9 @@ def memmap_bin_file( ) -def write_bin(fname, data, *, force_uint64=False): - """Write a 2-D numpy array to a cuvs-bench binary file. - - The legacy 8-byte uint32 header is used by default; pass - ``force_uint64=True`` (or supply a shape with a dimension exceeding - ``UINT32_MAX``) to write the extended 16-byte uint64 header instead. - """ +def write_bin(fname, data): + """Write a 2-D numpy array to a cuvs-bench binary file.""" print("writing", fname, data.shape, data.dtype, "...") with open(fname, "wb") as f: - write_bin_header( - f, data.shape[0], data.shape[1], force_uint64=force_uint64 - ) + write_bin_header(f, data.shape[0], data.shape[1]) data.tofile(f) diff --git a/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py b/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py index 19c14e8aa8..0fba915cae 100644 --- a/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py +++ b/python/cuvs_bench/cuvs_bench/get_dataset/fbin_to_f16bin.py @@ -27,11 +27,9 @@ def read_fbin(fname): return data -def write_bin(fname, data, *, force_uint64=False): +def write_bin(fname, data): with open(fname, "wb") as f: - write_bin_header( - f, data.shape[0], data.shape[1], force_uint64=force_uint64 - ) + write_bin_header(f, data.shape[0], data.shape[1]) data.tofile(f) diff --git a/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py b/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py index 34abacbf6d..2998a5dcb2 100644 --- a/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py +++ b/python/cuvs_bench/cuvs_bench/get_dataset/hdf5_to_fbin.py @@ -16,11 +16,9 @@ def normalize(x): return (x.T / norm).T -def write_bin(fname, data, *, force_uint64=False): +def write_bin(fname, data): with open(fname, "wb") as f: - write_bin_header( - f, data.shape[0], data.shape[1], force_uint64=force_uint64 - ) + write_bin_header(f, data.shape[0], data.shape[1]) data.tofile(f) diff --git a/python/cuvs_bench/cuvs_bench/tests/test_utils.py b/python/cuvs_bench/cuvs_bench/tests/test_utils.py index a009684723..d5a3482385 100644 --- a/python/cuvs_bench/cuvs_bench/tests/test_utils.py +++ b/python/cuvs_bench/cuvs_bench/tests/test_utils.py @@ -29,11 +29,11 @@ from cuvs_bench.orchestrator.config_loaders import CppGBenchConfigLoader -def _write_test_bin(path, data, *, force_uint64=False): +def _write_test_bin(path, data, *, size_dtype=np.uint32): """Write a numpy array in cuvs-bench binary format.""" with open(path, "wb") as f: write_bin_header( - f, data.shape[0], data.shape[1], force_uint64=force_uint64 + f, data.shape[0], data.shape[1], size_dtype=size_dtype ) data.tofile(f) @@ -193,7 +193,7 @@ def test_load_uint64_header(self, tmp_path): """``load_vectors`` reads files written with the extended uint64 header.""" data = np.random.rand(40, 16).astype(np.float32) path = str(tmp_path / "test.fbin") - _write_test_bin(path, data, force_uint64=True) + _write_test_bin(path, data, size_dtype=np.uint64) # Sanity check: file really uses the extended layout. assert ( @@ -207,28 +207,26 @@ def test_load_uint64_header_with_subset(self, tmp_path): """``subset_size`` works regardless of which header layout was used.""" data = np.random.rand(50, 8).astype(np.float32) path = str(tmp_path / "test.fbin") - _write_test_bin(path, data, force_uint64=True) + _write_test_bin(path, data, size_dtype=np.uint64) loaded = load_vectors(path, subset_size=12) assert loaded.shape == (12, 8) np.testing.assert_array_equal(loaded, data[:12]) @pytest.mark.parametrize( - "ext, dtype, force_uint64", + "ext, dtype, size_dtype", [ - (".fbin", np.float32, False), - (".fbin", np.float32, True), - (".ibin", np.int32, False), - (".ibin", np.int32, True), - (".u8bin", np.uint8, False), - (".u8bin", np.uint8, True), - (".i8bin", np.int8, False), - (".i8bin", np.int8, True), + (".fbin", np.float32, np.uint32), + (".fbin", np.float32, np.uint64), + (".ibin", np.int32, np.uint32), + (".ibin", np.int32, np.uint64), + (".u8bin", np.uint8, np.uint32), + (".u8bin", np.uint8, np.uint64), + (".i8bin", np.int8, np.uint32), + (".i8bin", np.int8, np.uint64), ], ) - def test_load_roundtrip_all_dtypes( - self, tmp_path, ext, dtype, force_uint64 - ): + def test_load_roundtrip_all_dtypes(self, tmp_path, ext, dtype, size_dtype): """Round-trip every supported dtype through both header layouts.""" if np.issubdtype(dtype, np.integer): info = np.iinfo(dtype) @@ -238,7 +236,7 @@ def test_load_roundtrip_all_dtypes( else: data = np.random.rand(25, 7).astype(dtype) path = str(tmp_path / f"test{ext}") - _write_test_bin(path, data, force_uint64=force_uint64) + _write_test_bin(path, data, size_dtype=size_dtype) loaded = load_vectors(path) np.testing.assert_array_equal(loaded, data) @@ -255,11 +253,11 @@ def test_write_legacy_returns_8_bytes(self, tmp_path): assert n == LEGACY_HEADER_BYTES assert path.stat().st_size == LEGACY_HEADER_BYTES - def test_write_force_uint64_returns_16_bytes(self, tmp_path): - """``force_uint64=True`` should write the 16-byte uint64 header.""" + def test_write_size_dtype_uint64_returns_16_bytes(self, tmp_path): + """``size_dtype=np.uint64`` should write the 16-byte uint64 header.""" path = tmp_path / "h.bin" with open(path, "wb") as f: - n = write_bin_header(f, 7, 3, force_uint64=True) + n = write_bin_header(f, 7, 3, size_dtype=np.uint64) assert n == EXTENDED_HEADER_BYTES assert path.stat().st_size == EXTENDED_HEADER_BYTES @@ -293,7 +291,7 @@ def test_read_extended_round_trip(self, tmp_path): data = np.random.rand(11, 5).astype(np.float32) with open(path, "wb") as f: write_bin_header( - f, data.shape[0], data.shape[1], force_uint64=True + f, data.shape[0], data.shape[1], size_dtype=np.uint64 ) data.tofile(f) n_rows, n_cols, hbytes = read_bin_header(str(path), itemsize=4) From 8240e5209699a09ac8df93d9c65e0f501cd213d4 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Thu, 4 Jun 2026 21:02:04 +0000 Subject: [PATCH 3/6] simplify comment --- python/cuvs_bench/cuvs_bench/_bin_format.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/cuvs_bench/cuvs_bench/_bin_format.py b/python/cuvs_bench/cuvs_bench/_bin_format.py index 8fb0d4d502..e0c973524a 100644 --- a/python/cuvs_bench/cuvs_bench/_bin_format.py +++ b/python/cuvs_bench/cuvs_bench/_bin_format.py @@ -59,16 +59,13 @@ def read_bin_header(path: str, itemsize: int) -> Tuple[int, int, int]: Path to the binary file. itemsize : int Per-element size in bytes (e.g. ``4`` for ``float32``, ``1`` for - ``int8``). Used purely for the size-equation check; the file - contents are not inspected. + ``int8``) used for the size-equation check. Returns ------- (n_rows, n_cols, header_bytes) : Tuple[int, int, int] Row count, column count, and the number of bytes the header - occupies on disk (``8`` for legacy, ``16`` for extended). Callers - seeking to the data start should use ``header_bytes`` rather than - a hardcoded offset. + occupies on disk (``8`` for legacy, ``16`` for extended). Raises ------ From 751d0f3af1f102cf47ab48c3a54c15d64a64ed69 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Fri, 5 Jun 2026 00:37:31 +0000 Subject: [PATCH 4/6] fix/add tests --- .../cuvs_bench/cuvs_bench/tests/test_utils.py | 92 +++++++++++++++++-- 1 file changed, 84 insertions(+), 8 deletions(-) diff --git a/python/cuvs_bench/cuvs_bench/tests/test_utils.py b/python/cuvs_bench/cuvs_bench/tests/test_utils.py index d5a3482385..8c6587c2f2 100644 --- a/python/cuvs_bench/cuvs_bench/tests/test_utils.py +++ b/python/cuvs_bench/cuvs_bench/tests/test_utils.py @@ -26,6 +26,7 @@ expand_param_grid, load_vectors, ) +from cuvs_bench.generate_groundtruth.utils import memmap_bin_file from cuvs_bench.orchestrator.config_loaders import CppGBenchConfigLoader @@ -218,6 +219,8 @@ def test_load_uint64_header_with_subset(self, tmp_path): [ (".fbin", np.float32, np.uint32), (".fbin", np.float32, np.uint64), + (".f16bin", np.float16, np.uint32), + (".f16bin", np.float16, np.uint64), (".ibin", np.int32, np.uint32), (".ibin", np.int32, np.uint64), (".u8bin", np.uint8, np.uint32), @@ -298,21 +301,24 @@ def test_read_extended_round_trip(self, tmp_path): assert (n_rows, n_cols, hbytes) == (11, 5, EXTENDED_HEADER_BYTES) def test_read_synthesized_huge_extended_header(self, tmp_path): - """A hand-crafted extended-header file with >UINT32_MAX rows reads correctly. + """Extended-header file with >UINT32_MAX rows and positive n_cols. - We can't materialize the data section (>16 GB just for the dummy - bytes), so write only the header and pad with the exact number of - zero bytes ``read_bin_header`` expects to balance the size equation - -- using ``n_cols=0`` so the data section is empty. + We can't materialize the full data section, so write the header and + truncate to the exact file size ``read_bin_header`` expects. """ path = tmp_path / "huge.fbin" n_rows = UINT32_MAX + 17 - n_cols = 0 + n_cols = 4 + itemsize = 4 + expected_size = EXTENDED_HEADER_BYTES + n_rows * n_cols * itemsize with open(path, "wb") as f: write_bin_header(f, n_rows, n_cols) - assert path.stat().st_size == EXTENDED_HEADER_BYTES + f.truncate(expected_size) - got_rows, got_cols, hbytes = read_bin_header(str(path), itemsize=4) + assert path.stat().st_size == expected_size + got_rows, got_cols, hbytes = read_bin_header( + str(path), itemsize=itemsize + ) assert got_rows == n_rows assert got_cols == n_cols assert hbytes == EXTENDED_HEADER_BYTES @@ -349,6 +355,76 @@ def test_read_dispatch_prefers_legacy(self, tmp_path): assert hbytes == LEGACY_HEADER_BYTES +class TestMemmapBinFile: + """Tests for ``generate_groundtruth.utils.memmap_bin_file``.""" + + def test_read_legacy_header(self, tmp_path): + """Read mode auto-detects the legacy 8-byte header offset.""" + data = np.random.rand(30, 8).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data) + + mm = memmap_bin_file(path, np.float32, mode="r") + assert mm.shape == (30, 8) + np.testing.assert_array_equal(mm[:], data) + + def test_read_extended_header(self, tmp_path): + """Read mode auto-detects the extended 16-byte header offset.""" + data = np.random.rand(30, 8).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data, size_dtype=np.uint64) + + mm = memmap_bin_file(path, np.float32, mode="r") + assert mm.shape == (30, 8) + np.testing.assert_array_equal(mm[:], data) + + def test_read_partial_shape_override(self, tmp_path): + """Read mode fills ``None`` shape entries from the header.""" + data = np.random.rand(50, 8).astype(np.float32) + path = str(tmp_path / "test.fbin") + _write_test_bin(path, data) + + mm = memmap_bin_file(path, np.float32, shape=(10, None), mode="r") + assert mm.shape == (10, 8) + np.testing.assert_array_equal(mm[:], data[:10]) + + def test_write_read_roundtrip_legacy(self, tmp_path): + """Write mode with uint32 header, then read back via memmap.""" + path = str(tmp_path / "test.fbin") + shape = (20, 8) + data = np.random.rand(*shape).astype(np.float32) + + mm = memmap_bin_file(path, np.float32, shape=shape, mode="w+") + mm[:] = data + mm.flush() + del mm + + loaded = memmap_bin_file(path, np.float32, mode="r") + assert loaded.shape == shape + np.testing.assert_array_equal(loaded[:], data) + + def test_write_read_roundtrip_extended(self, tmp_path): + """Write mode with uint64 header, then read back via memmap.""" + path = str(tmp_path / "test.fbin") + shape = (20, 8) + data = np.random.rand(*shape).astype(np.float32) + + mm = memmap_bin_file( + path, np.float32, shape=shape, mode="w+", size_dtype=np.uint64 + ) + mm[:] = data + mm.flush() + del mm + + loaded = memmap_bin_file(path, np.float32, mode="r") + assert loaded.shape == shape + np.testing.assert_array_equal(loaded[:], data) + assert ( + tmp_path.joinpath("test.fbin").stat().st_size + == EXTENDED_HEADER_BYTES + data.nbytes + ) + + class TestDatasetLazyLoading: """Tests for Dataset transparent vector loading.""" From 9a5b9e0e8ded16557e0eeb93d58652e3046b4691 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Fri, 5 Jun 2026 01:25:01 +0000 Subject: [PATCH 5/6] gt uint64 support --- python/cuvs_bench/cuvs_bench/_bin_format.py | 2 +- .../cuvs_bench/cuvs_bench/backends/_utils.py | 4 +- .../generate_groundtruth/__main__.py | 14 ++++-- .../cuvs_bench/generate_groundtruth/utils.py | 18 ++++++++ .../cuvs_bench/cuvs_bench/tests/test_utils.py | 46 ++++++++++++++++++- 5 files changed, 78 insertions(+), 6 deletions(-) diff --git a/python/cuvs_bench/cuvs_bench/_bin_format.py b/python/cuvs_bench/cuvs_bench/_bin_format.py index e0c973524a..f92422474c 100644 --- a/python/cuvs_bench/cuvs_bench/_bin_format.py +++ b/python/cuvs_bench/cuvs_bench/_bin_format.py @@ -12,7 +12,7 @@ - **Legacy**: ``[uint32 n_rows, uint32 n_cols, data ...]`` (8-byte header). This is what every existing ``.fbin`` / ``.ibin`` / ``.u8bin`` / ``.i8bin`` - / ``.f16bin`` / ``.hbin`` file on disk uses today. + / ``.f16bin`` / ``.hbin`` / ``.u64bin`` file on disk uses today. - **Extended**: ``[uint64 n_rows, uint64 n_cols, data ...]`` (16-byte header). For datasets whose ``n_rows`` or ``n_cols`` exceeds ``UINT32_MAX`` (~4.29B). diff --git a/python/cuvs_bench/cuvs_bench/backends/_utils.py b/python/cuvs_bench/cuvs_bench/backends/_utils.py index 25dce65152..250e2d2f50 100644 --- a/python/cuvs_bench/cuvs_bench/backends/_utils.py +++ b/python/cuvs_bench/cuvs_bench/backends/_utils.py @@ -55,6 +55,8 @@ def dtype_from_filename(filename): return np.float16 elif ext == ".ibin": return np.int32 + elif ext == ".u64bin": + return np.uint64 elif ext == ".u8bin": return np.ubyte elif ext == ".i8bin": @@ -78,7 +80,7 @@ def load_vectors(path: str, subset_size: Optional[int] = None) -> np.ndarray: path : str Path to the binary file. The dtype is inferred from the extension: ``.fbin`` (float32), ``.f16bin`` (float16), ``.u8bin`` (uint8), - ``.i8bin`` (int8), ``.ibin`` (int32). + ``.i8bin`` (int8), ``.ibin`` (int32), ``.u64bin`` (uint64). subset_size : Optional[int] If provided, only the first ``subset_size`` rows are loaded. diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py index 43c03f4322..9baa9e2a37 100644 --- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py +++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py @@ -9,7 +9,13 @@ import sys import warnings -from .utils import memmap_bin_file, suffix_from_dtype, write_bin +from .utils import ( + groundtruth_neighbors_filename, + memmap_bin_file, + neighbor_index_dtype, + suffix_from_dtype, + write_bin, +) def import_with_fallback(primary_lib, secondary_lib=None, alias=None): @@ -358,9 +364,11 @@ def main(): print("Calculating true nearest neighbors") distances, indices = calc_truth(dataset, queries, args.k, args.metric) + n_base = dataset.shape[0] + neighbor_dtype = neighbor_index_dtype(n_base) write_bin( - os.path.join(args.output, "groundtruth.neighbors.ibin"), - indices.astype(xp.uint32), + os.path.join(args.output, groundtruth_neighbors_filename(n_base)), + xp.asarray(indices, dtype=neighbor_dtype), ) write_bin( os.path.join(args.output, "groundtruth.distances.fbin"), diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py index ad6f567705..0159211520 100644 --- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py +++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py @@ -18,6 +18,8 @@ def dtype_from_filename(filename): return np.float16 elif ext == ".ibin": return np.int32 + elif ext == ".u64bin": + return np.uint64 elif ext == ".u8bin": return np.ubyte elif ext == ".i8bin": @@ -33,6 +35,8 @@ def suffix_from_dtype(dtype): return ".hbin" elif dtype == np.int32: return ".ibin" + elif dtype == np.uint64: + return ".u64bin" elif dtype == np.ubyte: return ".u8bin" elif dtype == np.byte: @@ -41,6 +45,20 @@ def suffix_from_dtype(dtype): raise RuntimeError("Not supported dtype extension" + dtype) +def neighbor_index_dtype(n_base: int) -> np.dtype: + """Return the dtype used to store neighbor row IDs for a base set size.""" + if n_base > np.iinfo(np.int32).max: + return np.uint64 + return np.int32 + + +def groundtruth_neighbors_filename(n_base: int) -> str: + """Return the ground-truth neighbors filename for a base set size.""" + if n_base > np.iinfo(np.int32).max: + return "groundtruth.neighbors.u64bin" + return "groundtruth.neighbors.ibin" + + def memmap_bin_file( bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32 ): diff --git a/python/cuvs_bench/cuvs_bench/tests/test_utils.py b/python/cuvs_bench/cuvs_bench/tests/test_utils.py index 8c6587c2f2..6fffe59e5a 100644 --- a/python/cuvs_bench/cuvs_bench/tests/test_utils.py +++ b/python/cuvs_bench/cuvs_bench/tests/test_utils.py @@ -26,7 +26,11 @@ expand_param_grid, load_vectors, ) -from cuvs_bench.generate_groundtruth.utils import memmap_bin_file +from cuvs_bench.generate_groundtruth.utils import ( + groundtruth_neighbors_filename, + memmap_bin_file, + neighbor_index_dtype, +) from cuvs_bench.orchestrator.config_loaders import CppGBenchConfigLoader @@ -54,6 +58,10 @@ def test_ibin(self): """Test .ibin maps to int32.""" assert dtype_from_filename("groundtruth.ibin") == np.int32 + def test_u64bin(self): + """Test .u64bin maps to uint64.""" + assert dtype_from_filename("groundtruth.neighbors.u64bin") == np.uint64 + def test_u8bin(self): """Test .u8bin maps to uint8.""" assert dtype_from_filename("vectors.u8bin") == np.ubyte @@ -223,6 +231,8 @@ def test_load_uint64_header_with_subset(self, tmp_path): (".f16bin", np.float16, np.uint64), (".ibin", np.int32, np.uint32), (".ibin", np.int32, np.uint64), + (".u64bin", np.uint64, np.uint32), + (".u64bin", np.uint64, np.uint64), (".u8bin", np.uint8, np.uint32), (".u8bin", np.uint8, np.uint64), (".i8bin", np.int8, np.uint32), @@ -236,6 +246,8 @@ def test_load_roundtrip_all_dtypes(self, tmp_path, ext, dtype, size_dtype): data = np.random.randint( info.min, info.max, size=(25, 7), dtype=dtype ) + if dtype == np.uint64: + data[0, 0] = np.iinfo(np.int32).max + 42 else: data = np.random.rand(25, 7).astype(dtype) path = str(tmp_path / f"test{ext}") @@ -245,6 +257,38 @@ def test_load_roundtrip_all_dtypes(self, tmp_path, ext, dtype, size_dtype): np.testing.assert_array_equal(loaded, data) +class TestGroundtruthNeighborFormat: + """Tests for large-base ground-truth neighbor index format selection.""" + + def test_neighbor_index_dtype_small_base(self): + assert neighbor_index_dtype(1_000_000) == np.int32 + + def test_neighbor_index_dtype_large_base(self): + assert neighbor_index_dtype(np.iinfo(np.int32).max + 1) == np.uint64 + + def test_groundtruth_neighbors_filename_small_base(self): + assert ( + groundtruth_neighbors_filename(1_000_000) + == "groundtruth.neighbors.ibin" + ) + + def test_groundtruth_neighbors_filename_large_base(self): + assert ( + groundtruth_neighbors_filename(np.iinfo(np.int32).max + 1) + == "groundtruth.neighbors.u64bin" + ) + + def test_load_u64bin_preserves_large_indices(self, tmp_path): + """uint64 GT files preserve neighbor IDs above INT32_MAX.""" + large_id = np.iinfo(np.int32).max + 12345 + indices = np.array([[large_id, 0, 1]], dtype=np.uint64) + path = str(tmp_path / "gt.u64bin") + _write_test_bin(path, indices) + + loaded = load_vectors(path) + np.testing.assert_array_equal(loaded, indices) + + class TestBinHeaderHelpers: """Tests for ``cuvs_bench._bin_format.read_bin_header`` / ``write_bin_header``.""" From 44856bcc8b8acdf0808c097a17fb8a8260b6fcd1 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Fri, 5 Jun 2026 01:58:01 +0000 Subject: [PATCH 6/6] shift neighbor index support large dtype --- .../generate_groundtruth/__main__.py | 11 ++-- .../cuvs_bench/generate_groundtruth/utils.py | 25 +++++++++ .../cuvs_bench/cuvs_bench/tests/test_utils.py | 56 +++++++++++++++++++ 3 files changed, 87 insertions(+), 5 deletions(-) diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py index 9baa9e2a37..4315c8e3ac 100644 --- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py +++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py @@ -12,9 +12,10 @@ from .utils import ( groundtruth_neighbors_filename, memmap_bin_file, - neighbor_index_dtype, + offset_neighbor_indices, suffix_from_dtype, write_bin, + write_groundtruth_neighbors, ) @@ -199,7 +200,7 @@ def calc_truth(dataset, queries, k, metric="sqeuclidean"): D, Ind = cpu_search(X, queries, k, metric=metric) D, Ind = xp.asarray(D), xp.asarray(Ind) - Ind += i # shift neighbor index by offset i + Ind = offset_neighbor_indices(Ind, i, n_samples) if distances is None: distances = D @@ -365,10 +366,10 @@ def main(): distances, indices = calc_truth(dataset, queries, args.k, args.metric) n_base = dataset.shape[0] - neighbor_dtype = neighbor_index_dtype(n_base) - write_bin( + write_groundtruth_neighbors( os.path.join(args.output, groundtruth_neighbors_filename(n_base)), - xp.asarray(indices, dtype=neighbor_dtype), + indices, + n_base, ) write_bin( os.path.join(args.output, "groundtruth.distances.fbin"), diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py index 0159211520..d3ee4b3479 100644 --- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py +++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/utils.py @@ -52,6 +52,18 @@ def neighbor_index_dtype(n_base: int) -> np.dtype: return np.int32 +def neighbor_index_accumulator_dtype(n_base: int) -> np.dtype: + """Return the in-memory dtype for neighbor IDs during GT computation. + + cuVS brute-force search returns ``int64`` neighbors. Use ``int64`` for + large bases so batch offsets up to multi-billion row counts do not + overflow; cast to :func:`neighbor_index_dtype` only when writing files. + """ + if n_base > np.iinfo(np.int32).max: + return np.int64 + return np.int32 + + def groundtruth_neighbors_filename(n_base: int) -> str: """Return the ground-truth neighbors filename for a base set size.""" if n_base > np.iinfo(np.int32).max: @@ -59,6 +71,19 @@ def groundtruth_neighbors_filename(n_base: int) -> str: return "groundtruth.neighbors.ibin" +def offset_neighbor_indices(indices, batch_offset: int, n_base: int): + """Shift local neighbor IDs by a batch offset without integer overflow.""" + dtype = neighbor_index_accumulator_dtype(n_base) + return indices.astype(dtype) + batch_offset + + +def write_groundtruth_neighbors(path, indices, n_base: int): + """Write a ground-truth neighbor matrix using the correct on-disk dtype.""" + storage_dtype = neighbor_index_dtype(n_base) + data = np.asarray(indices, dtype=storage_dtype) + write_bin(path, data) + + def memmap_bin_file( bin_file, dtype, shape=None, mode="r", size_dtype=np.uint32 ): diff --git a/python/cuvs_bench/cuvs_bench/tests/test_utils.py b/python/cuvs_bench/cuvs_bench/tests/test_utils.py index 6fffe59e5a..01dd803a07 100644 --- a/python/cuvs_bench/cuvs_bench/tests/test_utils.py +++ b/python/cuvs_bench/cuvs_bench/tests/test_utils.py @@ -29,7 +29,10 @@ from cuvs_bench.generate_groundtruth.utils import ( groundtruth_neighbors_filename, memmap_bin_file, + neighbor_index_accumulator_dtype, neighbor_index_dtype, + offset_neighbor_indices, + write_groundtruth_neighbors, ) from cuvs_bench.orchestrator.config_loaders import CppGBenchConfigLoader @@ -266,6 +269,12 @@ def test_neighbor_index_dtype_small_base(self): def test_neighbor_index_dtype_large_base(self): assert neighbor_index_dtype(np.iinfo(np.int32).max + 1) == np.uint64 + def test_neighbor_index_accumulator_dtype_large_base(self): + assert ( + neighbor_index_accumulator_dtype(np.iinfo(np.int32).max + 1) + == np.int64 + ) + def test_groundtruth_neighbors_filename_small_base(self): assert ( groundtruth_neighbors_filename(1_000_000) @@ -288,6 +297,46 @@ def test_load_u64bin_preserves_large_indices(self, tmp_path): loaded = load_vectors(path) np.testing.assert_array_equal(loaded, indices) + def test_offset_neighbor_indices_small_base(self): + local = np.array([[0, 1, 2]], dtype=np.uint32) + offset = offset_neighbor_indices(local, 1000, 1_000_000) + assert offset.dtype == np.int32 + np.testing.assert_array_equal(offset, [[1000, 1001, 1002]]) + + def test_offset_neighbor_indices_large_batch_offset(self): + """Search-local IDs must not wrap when batch offset exceeds INT32_MAX.""" + batch_offset = np.iinfo(np.int32).max + 1 + n_base = batch_offset + 10 + local = np.array([[0, 1, 2]], dtype=np.int64) + offset = offset_neighbor_indices(local, batch_offset, n_base) + assert offset.dtype == np.int64 + np.testing.assert_array_equal( + offset, + [[batch_offset, batch_offset + 1, batch_offset + 2]], + ) + + def test_write_groundtruth_neighbors_round_trip(self, tmp_path): + """GT write/load preserves neighbor IDs above INT32_MAX.""" + n_base = np.iinfo(np.int32).max + 1 + large_id = n_base + 999 + indices = np.array([[large_id, large_id - 1, 0]], dtype=np.int64) + path = str(tmp_path / groundtruth_neighbors_filename(n_base)) + write_groundtruth_neighbors(path, indices, n_base) + + loaded = load_vectors(path) + assert loaded.dtype == np.uint64 + np.testing.assert_array_equal(loaded, indices.astype(np.uint64)) + + def test_dataset_lazy_load_u64bin_groundtruth(self, tmp_path): + """Dataset loads .u64bin ground truth with IDs above INT32_MAX.""" + large_id = np.iinfo(np.int32).max + 12345 + gt = np.array([[large_id, 0, 1]], dtype=np.uint64) + path = str(tmp_path / "groundtruth.neighbors.u64bin") + _write_test_bin(path, gt) + + dataset = Dataset(name="test", groundtruth_neighbors_file=path) + np.testing.assert_array_equal(dataset.groundtruth_neighbors, gt) + class TestBinHeaderHelpers: """Tests for ``cuvs_bench._bin_format.read_bin_header`` / ``write_bin_header``.""" @@ -720,3 +769,10 @@ def test_multiple_queries(self): groundtruth = np.array([[0, 1, 2], [3, 4, 5]]) recall = compute_recall(neighbors, groundtruth, k=3) assert abs(recall - 5.0 / 6.0) < 1e-9 + + def test_large_uint64_neighbor_ids(self): + """Recall works when GT neighbor IDs exceed INT32_MAX.""" + large_id = np.iinfo(np.int32).max + 999 + neighbors = np.array([[large_id, 0, 1]], dtype=np.int64) + groundtruth = np.array([[large_id, 0, 1]], dtype=np.uint64) + assert compute_recall(neighbors, groundtruth, k=3) == 1.0